Statistics
| Branch: | Tag: | Revision:

root / htools / Ganeti / HTools / Cluster.hs @ 129734d3

History | View | Annotate | Download (61.7 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
    (
31
     -- * Types
32
      AllocSolution(..)
33
    , EvacSolution(..)
34
    , Table(..)
35
    , CStats(..)
36
    , AllocStats
37
    -- * Generic functions
38
    , totalResources
39
    , computeAllocationDelta
40
    -- * First phase functions
41
    , computeBadItems
42
    -- * Second phase functions
43
    , printSolutionLine
44
    , formatCmds
45
    , involvedNodes
46
    , splitJobs
47
    -- * Display functions
48
    , printNodes
49
    , printInsts
50
    -- * Balacing functions
51
    , checkMove
52
    , doNextBalance
53
    , tryBalance
54
    , compCV
55
    , compCVNodes
56
    , compDetailedCV
57
    , printStats
58
    , iMoveToJob
59
    -- * IAllocator functions
60
    , genAllocNodes
61
    , tryAlloc
62
    , tryMGAlloc
63
    , tryReloc
64
    , tryNodeEvac
65
    , tryChangeGroup
66
    , collapseFailures
67
    -- * Allocation functions
68
    , iterateAlloc
69
    , tieredAlloc
70
     -- * Node group functions
71
    , instanceGroup
72
    , findSplitInstances
73
    , splitCluster
74
    ) where
75

    
76
import qualified Data.IntSet as IntSet
77
import Data.List
78
import Data.Maybe (fromJust, isNothing)
79
import Data.Ord (comparing)
80
import Text.Printf (printf)
81
import Control.Monad
82

    
83
import qualified Ganeti.HTools.Container as Container
84
import qualified Ganeti.HTools.Instance as Instance
85
import qualified Ganeti.HTools.Node as Node
86
import qualified Ganeti.HTools.Group as Group
87
import Ganeti.HTools.Types
88
import Ganeti.HTools.Utils
89
import Ganeti.HTools.Compat
90
import qualified Ganeti.OpCodes as OpCodes
91

    
92
-- * Types
93

    
94
-- | Allocation\/relocation solution.
95
data AllocSolution = AllocSolution
96
  { asFailures :: [FailMode]              -- ^ Failure counts
97
  , asAllocs   :: Int                     -- ^ Good allocation count
98
  , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
99
  , asLog      :: [String]                -- ^ Informational messages
100
  }
101

    
102
-- | Node evacuation/group change iallocator result type. This result
103
-- type consists of actual opcodes (a restricted subset) that are
104
-- transmitted back to Ganeti.
105
data EvacSolution = EvacSolution
106
    { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
107
    , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
108
                                        -- relocated
109
    , esOpCodes :: [[[OpCodes.OpCode]]] -- ^ List of lists of jobs
110
    }
111

    
112
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
113
type AllocResult = (FailStats, Node.List, Instance.List,
114
                    [Instance.Instance], [CStats])
115

    
116
-- | A type denoting the valid allocation mode/pairs.
117
--
118
-- For a one-node allocation, this will be a @Left ['Node.Node']@,
119
-- whereas for a two-node allocation, this will be a @Right
120
-- [('Node.Node', 'Node.Node')]@.
121
type AllocNodes = Either [Ndx] [(Ndx, Ndx)]
122

    
123
-- | The empty solution we start with when computing allocations.
124
emptyAllocSolution :: AllocSolution
125
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
126
                                   , asSolution = Nothing, asLog = [] }
127

    
128
-- | The empty evac solution.
129
emptyEvacSolution :: EvacSolution
130
emptyEvacSolution = EvacSolution { esMoved = []
131
                                 , esFailed = []
132
                                 , esOpCodes = []
133
                                 }
134

    
135
-- | The complete state for the balancing solution.
136
data Table = Table Node.List Instance.List Score [Placement]
137
             deriving (Show, Read)
138

    
139
-- | Cluster statistics data type.
140
data CStats = CStats { csFmem :: Integer -- ^ Cluster free mem
141
                     , csFdsk :: Integer -- ^ Cluster free disk
142
                     , csAmem :: Integer -- ^ Cluster allocatable mem
143
                     , csAdsk :: Integer -- ^ Cluster allocatable disk
144
                     , csAcpu :: Integer -- ^ Cluster allocatable cpus
145
                     , csMmem :: Integer -- ^ Max node allocatable mem
146
                     , csMdsk :: Integer -- ^ Max node allocatable disk
147
                     , csMcpu :: Integer -- ^ Max node allocatable cpu
148
                     , csImem :: Integer -- ^ Instance used mem
149
                     , csIdsk :: Integer -- ^ Instance used disk
150
                     , csIcpu :: Integer -- ^ Instance used cpu
151
                     , csTmem :: Double  -- ^ Cluster total mem
152
                     , csTdsk :: Double  -- ^ Cluster total disk
153
                     , csTcpu :: Double  -- ^ Cluster total cpus
154
                     , csVcpu :: Integer -- ^ Cluster virtual cpus (if
155
                                         -- node pCpu has been set,
156
                                         -- otherwise -1)
157
                     , csXmem :: Integer -- ^ Unnacounted for mem
158
                     , csNmem :: Integer -- ^ Node own memory
159
                     , csScore :: Score  -- ^ The cluster score
160
                     , csNinst :: Int    -- ^ The total number of instances
161
                     }
162
            deriving (Show, Read)
163

    
164
-- | Currently used, possibly to allocate, unallocable.
165
type AllocStats = (RSpec, RSpec, RSpec)
166

    
167
-- * Utility functions
168

    
169
-- | Verifies the N+1 status and return the affected nodes.
170
verifyN1 :: [Node.Node] -> [Node.Node]
171
verifyN1 = filter Node.failN1
172

    
173
{-| Computes the pair of bad nodes and instances.
174

    
175
The bad node list is computed via a simple 'verifyN1' check, and the
176
bad instance list is the list of primary and secondary instances of
177
those nodes.
178

    
179
-}
180
computeBadItems :: Node.List -> Instance.List ->
181
                   ([Node.Node], [Instance.Instance])
182
computeBadItems nl il =
183
  let bad_nodes = verifyN1 $ getOnline nl
184
      bad_instances = map (`Container.find` il) .
185
                      sort . nub $
186
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
187
  in
188
    (bad_nodes, bad_instances)
189

    
190
-- | Zero-initializer for the CStats type.
191
emptyCStats :: CStats
192
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
193

    
194
-- | Update stats with data from a new node.
195
updateCStats :: CStats -> Node.Node -> CStats
196
updateCStats cs node =
197
    let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
198
                 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
199
                 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
200
                 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
201
                 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
202
                 csVcpu = x_vcpu,
203
                 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
204
               }
205
            = cs
206
        inc_amem = Node.fMem node - Node.rMem node
207
        inc_amem' = if inc_amem > 0 then inc_amem else 0
208
        inc_adsk = Node.availDisk node
209
        inc_imem = truncate (Node.tMem node) - Node.nMem node
210
                   - Node.xMem node - Node.fMem node
211
        inc_icpu = Node.uCpu node
212
        inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
213
        inc_vcpu = Node.hiCpu node
214
        inc_acpu = Node.availCpu node
215

    
216
    in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
217
          , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
218
          , csAmem = x_amem + fromIntegral inc_amem'
219
          , csAdsk = x_adsk + fromIntegral inc_adsk
220
          , csAcpu = x_acpu + fromIntegral inc_acpu
221
          , csMmem = max x_mmem (fromIntegral inc_amem')
222
          , csMdsk = max x_mdsk (fromIntegral inc_adsk)
223
          , csMcpu = max x_mcpu (fromIntegral inc_acpu)
224
          , csImem = x_imem + fromIntegral inc_imem
225
          , csIdsk = x_idsk + fromIntegral inc_idsk
226
          , csIcpu = x_icpu + fromIntegral inc_icpu
227
          , csTmem = x_tmem + Node.tMem node
228
          , csTdsk = x_tdsk + Node.tDsk node
229
          , csTcpu = x_tcpu + Node.tCpu node
230
          , csVcpu = x_vcpu + fromIntegral inc_vcpu
231
          , csXmem = x_xmem + fromIntegral (Node.xMem node)
232
          , csNmem = x_nmem + fromIntegral (Node.nMem node)
233
          , csNinst = x_ninst + length (Node.pList node)
234
          }
235

    
236
-- | Compute the total free disk and memory in the cluster.
237
totalResources :: Node.List -> CStats
238
totalResources nl =
239
    let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
240
    in cs { csScore = compCV nl }
241

    
242
-- | Compute the delta between two cluster state.
243
--
244
-- This is used when doing allocations, to understand better the
245
-- available cluster resources. The return value is a triple of the
246
-- current used values, the delta that was still allocated, and what
247
-- was left unallocated.
248
computeAllocationDelta :: CStats -> CStats -> AllocStats
249
computeAllocationDelta cini cfin =
250
    let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
251
        CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
252
                csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
253
        rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem)
254
               (fromIntegral i_idsk)
255
        rfin = RSpec (fromIntegral (f_icpu - i_icpu))
256
               (fromIntegral (f_imem - i_imem))
257
               (fromIntegral (f_idsk - i_idsk))
258
        un_cpu = fromIntegral (v_cpu - f_icpu)::Int
259
        runa = RSpec un_cpu (truncate t_mem - fromIntegral f_imem)
260
               (truncate t_dsk - fromIntegral f_idsk)
261
    in (rini, rfin, runa)
262

    
263
-- | The names and weights of the individual elements in the CV list.
264
detailedCVInfo :: [(Double, String)]
265
detailedCVInfo = [ (1,  "free_mem_cv")
266
                 , (1,  "free_disk_cv")
267
                 , (1,  "n1_cnt")
268
                 , (1,  "reserved_mem_cv")
269
                 , (4,  "offline_all_cnt")
270
                 , (16, "offline_pri_cnt")
271
                 , (1,  "vcpu_ratio_cv")
272
                 , (1,  "cpu_load_cv")
273
                 , (1,  "mem_load_cv")
274
                 , (1,  "disk_load_cv")
275
                 , (1,  "net_load_cv")
276
                 , (2,  "pri_tags_score")
277
                 ]
278

    
279
-- | Holds the weights used by 'compCVNodes' for each metric.
280
detailedCVWeights :: [Double]
281
detailedCVWeights = map fst detailedCVInfo
282

    
283
-- | Compute the mem and disk covariance.
284
compDetailedCV :: [Node.Node] -> [Double]
285
compDetailedCV all_nodes =
286
    let
287
        (offline, nodes) = partition Node.offline all_nodes
288
        mem_l = map Node.pMem nodes
289
        dsk_l = map Node.pDsk nodes
290
        -- metric: memory covariance
291
        mem_cv = stdDev mem_l
292
        -- metric: disk covariance
293
        dsk_cv = stdDev dsk_l
294
        -- metric: count of instances living on N1 failing nodes
295
        n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
296
                                                   length (Node.pList n)) .
297
                   filter Node.failN1 $ nodes :: Double
298
        res_l = map Node.pRem nodes
299
        -- metric: reserved memory covariance
300
        res_cv = stdDev res_l
301
        -- offline instances metrics
302
        offline_ipri = sum . map (length . Node.pList) $ offline
303
        offline_isec = sum . map (length . Node.sList) $ offline
304
        -- metric: count of instances on offline nodes
305
        off_score = fromIntegral (offline_ipri + offline_isec)::Double
306
        -- metric: count of primary instances on offline nodes (this
307
        -- helps with evacuation/failover of primary instances on
308
        -- 2-node clusters with one node offline)
309
        off_pri_score = fromIntegral offline_ipri::Double
310
        cpu_l = map Node.pCpu nodes
311
        -- metric: covariance of vcpu/pcpu ratio
312
        cpu_cv = stdDev cpu_l
313
        -- metrics: covariance of cpu, memory, disk and network load
314
        (c_load, m_load, d_load, n_load) = unzip4 $
315
            map (\n ->
316
                     let DynUtil c1 m1 d1 n1 = Node.utilLoad n
317
                         DynUtil c2 m2 d2 n2 = Node.utilPool n
318
                     in (c1/c2, m1/m2, d1/d2, n1/n2)
319
                ) nodes
320
        -- metric: conflicting instance count
321
        pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
322
        pri_tags_score = fromIntegral pri_tags_inst::Double
323
    in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
324
       , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
325
       , pri_tags_score ]
326

    
327
-- | Compute the /total/ variance.
328
compCVNodes :: [Node.Node] -> Double
329
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
330

    
331
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
332
compCV :: Node.List -> Double
333
compCV = compCVNodes . Container.elems
334

    
335
-- | Compute online nodes from a 'Node.List'.
336
getOnline :: Node.List -> [Node.Node]
337
getOnline = filter (not . Node.offline) . Container.elems
338

    
339
-- * Balancing functions
340

    
341
-- | Compute best table. Note that the ordering of the arguments is important.
342
compareTables :: Table -> Table -> Table
343
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
344
    if a_cv > b_cv then b else a
345

    
346
-- | Applies an instance move to a given node list and instance.
347
applyMove :: Node.List -> Instance.Instance
348
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
349
-- Failover (f)
350
applyMove nl inst Failover =
351
    let old_pdx = Instance.pNode inst
352
        old_sdx = Instance.sNode inst
353
        old_p = Container.find old_pdx nl
354
        old_s = Container.find old_sdx nl
355
        int_p = Node.removePri old_p inst
356
        int_s = Node.removeSec old_s inst
357
        force_p = Node.offline old_p
358
        new_nl = do -- Maybe monad
359
          new_p <- Node.addPriEx force_p int_s inst
360
          new_s <- Node.addSec int_p inst old_sdx
361
          let new_inst = Instance.setBoth inst old_sdx old_pdx
362
          return (Container.addTwo old_pdx new_s old_sdx new_p nl,
363
                  new_inst, old_sdx, old_pdx)
364
    in new_nl
365

    
366
-- Replace the primary (f:, r:np, f)
367
applyMove nl inst (ReplacePrimary new_pdx) =
368
    let old_pdx = Instance.pNode inst
369
        old_sdx = Instance.sNode inst
370
        old_p = Container.find old_pdx nl
371
        old_s = Container.find old_sdx nl
372
        tgt_n = Container.find new_pdx nl
373
        int_p = Node.removePri old_p inst
374
        int_s = Node.removeSec old_s inst
375
        force_p = Node.offline old_p
376
        new_nl = do -- Maybe monad
377
          -- check that the current secondary can host the instance
378
          -- during the migration
379
          tmp_s <- Node.addPriEx force_p int_s inst
380
          let tmp_s' = Node.removePri tmp_s inst
381
          new_p <- Node.addPriEx force_p tgt_n inst
382
          new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
383
          let new_inst = Instance.setPri inst new_pdx
384
          return (Container.add new_pdx new_p $
385
                  Container.addTwo old_pdx int_p old_sdx new_s nl,
386
                  new_inst, new_pdx, old_sdx)
387
    in new_nl
388

    
389
-- Replace the secondary (r:ns)
390
applyMove nl inst (ReplaceSecondary new_sdx) =
391
    let old_pdx = Instance.pNode inst
392
        old_sdx = Instance.sNode inst
393
        old_s = Container.find old_sdx nl
394
        tgt_n = Container.find new_sdx nl
395
        int_s = Node.removeSec old_s inst
396
        force_s = Node.offline old_s
397
        new_inst = Instance.setSec inst new_sdx
398
        new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
399
                 \new_s -> return (Container.addTwo new_sdx
400
                                   new_s old_sdx int_s nl,
401
                                   new_inst, old_pdx, new_sdx)
402
    in new_nl
403

    
404
-- Replace the secondary and failover (r:np, f)
405
applyMove nl inst (ReplaceAndFailover new_pdx) =
406
    let old_pdx = Instance.pNode inst
407
        old_sdx = Instance.sNode inst
408
        old_p = Container.find old_pdx nl
409
        old_s = Container.find old_sdx nl
410
        tgt_n = Container.find new_pdx nl
411
        int_p = Node.removePri old_p inst
412
        int_s = Node.removeSec old_s inst
413
        force_s = Node.offline old_s
414
        new_nl = do -- Maybe monad
415
          new_p <- Node.addPri tgt_n inst
416
          new_s <- Node.addSecEx force_s int_p inst new_pdx
417
          let new_inst = Instance.setBoth inst new_pdx old_pdx
418
          return (Container.add new_pdx new_p $
419
                  Container.addTwo old_pdx new_s old_sdx int_s nl,
420
                  new_inst, new_pdx, old_pdx)
421
    in new_nl
422

    
423
-- Failver and replace the secondary (f, r:ns)
424
applyMove nl inst (FailoverAndReplace new_sdx) =
425
    let old_pdx = Instance.pNode inst
426
        old_sdx = Instance.sNode inst
427
        old_p = Container.find old_pdx nl
428
        old_s = Container.find old_sdx nl
429
        tgt_n = Container.find new_sdx nl
430
        int_p = Node.removePri old_p inst
431
        int_s = Node.removeSec old_s inst
432
        force_p = Node.offline old_p
433
        new_nl = do -- Maybe monad
434
          new_p <- Node.addPriEx force_p int_s inst
435
          new_s <- Node.addSecEx force_p tgt_n inst old_sdx
436
          let new_inst = Instance.setBoth inst old_sdx new_sdx
437
          return (Container.add new_sdx new_s $
438
                  Container.addTwo old_sdx new_p old_pdx int_p nl,
439
                  new_inst, old_sdx, new_sdx)
440
    in new_nl
441

    
442
-- | Tries to allocate an instance on one given node.
443
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
444
                 -> OpResult Node.AllocElement
445
allocateOnSingle nl inst new_pdx =
446
    let p = Container.find new_pdx nl
447
        new_inst = Instance.setBoth inst new_pdx Node.noSecondary
448
    in  Node.addPri p inst >>= \new_p -> do
449
      let new_nl = Container.add new_pdx new_p nl
450
          new_score = compCV nl
451
      return (new_nl, new_inst, [new_p], new_score)
452

    
453
-- | Tries to allocate an instance on a given pair of nodes.
454
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
455
               -> OpResult Node.AllocElement
456
allocateOnPair nl inst new_pdx new_sdx =
457
    let tgt_p = Container.find new_pdx nl
458
        tgt_s = Container.find new_sdx nl
459
    in do
460
      new_p <- Node.addPri tgt_p inst
461
      new_s <- Node.addSec tgt_s inst new_pdx
462
      let new_inst = Instance.setBoth inst new_pdx new_sdx
463
          new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
464
      return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
465

    
466
-- | Tries to perform an instance move and returns the best table
467
-- between the original one and the new one.
468
checkSingleStep :: Table -- ^ The original table
469
                -> Instance.Instance -- ^ The instance to move
470
                -> Table -- ^ The current best table
471
                -> IMove -- ^ The move to apply
472
                -> Table -- ^ The final best table
473
checkSingleStep ini_tbl target cur_tbl move =
474
    let
475
        Table ini_nl ini_il _ ini_plc = ini_tbl
476
        tmp_resu = applyMove ini_nl target move
477
    in
478
      case tmp_resu of
479
        OpFail _ -> cur_tbl
480
        OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
481
            let tgt_idx = Instance.idx target
482
                upd_cvar = compCV upd_nl
483
                upd_il = Container.add tgt_idx new_inst ini_il
484
                upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
485
                upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
486
            in
487
              compareTables cur_tbl upd_tbl
488

    
489
-- | Given the status of the current secondary as a valid new node and
490
-- the current candidate target node, generate the possible moves for
491
-- a instance.
492
possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
493
              -> Bool      -- ^ Whether we can change the primary node
494
              -> Ndx       -- ^ Target node candidate
495
              -> [IMove]   -- ^ List of valid result moves
496

    
497
possibleMoves _ False tdx =
498
    [ReplaceSecondary tdx]
499

    
500
possibleMoves True True tdx =
501
    [ReplaceSecondary tdx,
502
     ReplaceAndFailover tdx,
503
     ReplacePrimary tdx,
504
     FailoverAndReplace tdx]
505

    
506
possibleMoves False True tdx =
507
    [ReplaceSecondary tdx,
508
     ReplaceAndFailover tdx]
509

    
510
-- | Compute the best move for a given instance.
511
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
512
                  -> Bool              -- ^ Whether disk moves are allowed
513
                  -> Bool              -- ^ Whether instance moves are allowed
514
                  -> Table             -- ^ Original table
515
                  -> Instance.Instance -- ^ Instance to move
516
                  -> Table             -- ^ Best new table for this instance
517
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
518
    let
519
        opdx = Instance.pNode target
520
        osdx = Instance.sNode target
521
        nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
522
        use_secondary = elem osdx nodes_idx && inst_moves
523
        aft_failover = if use_secondary -- if allowed to failover
524
                       then checkSingleStep ini_tbl target ini_tbl Failover
525
                       else ini_tbl
526
        all_moves = if disk_moves
527
                    then concatMap
528
                         (possibleMoves use_secondary inst_moves) nodes
529
                    else []
530
    in
531
      -- iterate over the possible nodes for this instance
532
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
533

    
534
-- | Compute the best next move.
535
checkMove :: [Ndx]               -- ^ Allowed target node indices
536
          -> Bool                -- ^ Whether disk moves are allowed
537
          -> Bool                -- ^ Whether instance moves are allowed
538
          -> Table               -- ^ The current solution
539
          -> [Instance.Instance] -- ^ List of instances still to move
540
          -> Table               -- ^ The new solution
541
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
542
    let Table _ _ _ ini_plc = ini_tbl
543
        -- we're using rwhnf from the Control.Parallel.Strategies
544
        -- package; we don't need to use rnf as that would force too
545
        -- much evaluation in single-threaded cases, and in
546
        -- multi-threaded case the weak head normal form is enough to
547
        -- spark the evaluation
548
        tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
549
                               inst_moves ini_tbl)
550
                 victims
551
        -- iterate over all instances, computing the best move
552
        best_tbl = foldl' compareTables ini_tbl tables
553
        Table _ _ _ best_plc = best_tbl
554
    in if length best_plc == length ini_plc
555
       then ini_tbl -- no advancement
556
       else best_tbl
557

    
558
-- | Check if we are allowed to go deeper in the balancing.
559
doNextBalance :: Table     -- ^ The starting table
560
              -> Int       -- ^ Remaining length
561
              -> Score     -- ^ Score at which to stop
562
              -> Bool      -- ^ The resulting table and commands
563
doNextBalance ini_tbl max_rounds min_score =
564
    let Table _ _ ini_cv ini_plc = ini_tbl
565
        ini_plc_len = length ini_plc
566
    in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
567

    
568
-- | Run a balance move.
569
tryBalance :: Table       -- ^ The starting table
570
           -> Bool        -- ^ Allow disk moves
571
           -> Bool        -- ^ Allow instance moves
572
           -> Bool        -- ^ Only evacuate moves
573
           -> Score       -- ^ Min gain threshold
574
           -> Score       -- ^ Min gain
575
           -> Maybe Table -- ^ The resulting table and commands
576
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
577
    let Table ini_nl ini_il ini_cv _ = ini_tbl
578
        all_inst = Container.elems ini_il
579
        all_inst' = if evac_mode
580
                    then let bad_nodes = map Node.idx . filter Node.offline $
581
                                         Container.elems ini_nl
582
                         in filter (any (`elem` bad_nodes) . Instance.allNodes)
583
                            all_inst
584
                    else all_inst
585
        reloc_inst = filter Instance.movable all_inst'
586
        node_idx = map Node.idx . filter (not . Node.offline) $
587
                   Container.elems ini_nl
588
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
589
        (Table _ _ fin_cv _) = fin_tbl
590
    in
591
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
592
      then Just fin_tbl -- this round made success, return the new table
593
      else Nothing
594

    
595
-- * Allocation functions
596

    
597
-- | Build failure stats out of a list of failures.
598
collapseFailures :: [FailMode] -> FailStats
599
collapseFailures flst =
600
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
601
            [minBound..maxBound]
602

    
603
-- | Update current Allocation solution and failure stats with new
604
-- elements.
605
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
606
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
607

    
608
concatAllocs as (OpGood ns@(_, _, _, nscore)) =
609
    let -- Choose the old or new solution, based on the cluster score
610
        cntok = asAllocs as
611
        osols = asSolution as
612
        nsols = case osols of
613
                  Nothing -> Just ns
614
                  Just (_, _, _, oscore) ->
615
                      if oscore < nscore
616
                      then osols
617
                      else Just ns
618
        nsuc = cntok + 1
619
    -- Note: we force evaluation of nsols here in order to keep the
620
    -- memory profile low - we know that we will need nsols for sure
621
    -- in the next cycle, so we force evaluation of nsols, since the
622
    -- foldl' in the caller will only evaluate the tuple, but not the
623
    -- elements of the tuple
624
    in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
625

    
626
-- | Given a solution, generates a reasonable description for it.
627
describeSolution :: AllocSolution -> String
628
describeSolution as =
629
  let fcnt = asFailures as
630
      sols = asSolution as
631
      freasons =
632
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
633
        filter ((> 0) . snd) . collapseFailures $ fcnt
634
  in case sols of
635
     Nothing -> "No valid allocation solutions, failure reasons: " ++
636
                (if null fcnt then "unknown reasons" else freasons)
637
     Just (_, _, nodes, cv) ->
638
         printf ("score: %.8f, successes %d, failures %d (%s)" ++
639
                 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
640
               (intercalate "/" . map Node.name $ nodes)
641

    
642
-- | Annotates a solution with the appropriate string.
643
annotateSolution :: AllocSolution -> AllocSolution
644
annotateSolution as = as { asLog = describeSolution as : asLog as }
645

    
646
-- | Reverses an evacuation solution.
647
--
648
-- Rationale: we always concat the results to the top of the lists, so
649
-- for proper jobset execution, we should reverse all lists.
650
reverseEvacSolution :: EvacSolution -> EvacSolution
651
reverseEvacSolution (EvacSolution f m o) =
652
    EvacSolution (reverse f) (reverse m) (reverse o)
653

    
654
-- | Generate the valid node allocation singles or pairs for a new instance.
655
genAllocNodes :: Group.List        -- ^ Group list
656
              -> Node.List         -- ^ The node map
657
              -> Int               -- ^ The number of nodes required
658
              -> Bool              -- ^ Whether to drop or not
659
                                   -- unallocable nodes
660
              -> Result AllocNodes -- ^ The (monadic) result
661
genAllocNodes gl nl count drop_unalloc =
662
    let filter_fn = if drop_unalloc
663
                    then filter (Group.isAllocable .
664
                                 flip Container.find gl . Node.group)
665
                    else id
666
        all_nodes = filter_fn $ getOnline nl
667
        all_pairs = liftM2 (,) all_nodes all_nodes
668
        ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y &&
669
                                      Node.group x == Node.group y) all_pairs
670
    in case count of
671
         1 -> Ok (Left (map Node.idx all_nodes))
672
         2 -> Ok (Right (map (\(p, s) -> (Node.idx p, Node.idx s)) ok_pairs))
673
         _ -> Bad "Unsupported number of nodes, only one or two  supported"
674

    
675
-- | Try to allocate an instance on the cluster.
676
tryAlloc :: (Monad m) =>
677
            Node.List         -- ^ The node list
678
         -> Instance.List     -- ^ The instance list
679
         -> Instance.Instance -- ^ The instance to allocate
680
         -> AllocNodes        -- ^ The allocation targets
681
         -> m AllocSolution   -- ^ Possible solution list
682
tryAlloc nl _ inst (Right ok_pairs) =
683
    let sols = foldl' (\cstate (p, s) ->
684
                           concatAllocs cstate $ allocateOnPair nl inst p s
685
                      ) emptyAllocSolution ok_pairs
686

    
687
    in if null ok_pairs -- means we have just one node
688
       then fail "Not enough online nodes"
689
       else return $ annotateSolution sols
690

    
691
tryAlloc nl _ inst (Left all_nodes) =
692
    let sols = foldl' (\cstate ->
693
                           concatAllocs cstate . allocateOnSingle nl inst
694
                      ) emptyAllocSolution all_nodes
695
    in if null all_nodes
696
       then fail "No online nodes"
697
       else return $ annotateSolution sols
698

    
699
-- | Given a group/result, describe it as a nice (list of) messages.
700
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
701
solutionDescription gl (groupId, result) =
702
  case result of
703
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
704
    Bad message -> [printf "Group %s: error %s" gname message]
705
  where grp = Container.find groupId gl
706
        gname = Group.name grp
707
        pol = apolToString (Group.allocPolicy grp)
708

    
709
-- | From a list of possibly bad and possibly empty solutions, filter
710
-- only the groups with a valid result. Note that the result will be
711
-- reversed compared to the original list.
712
filterMGResults :: Group.List
713
                -> [(Gdx, Result AllocSolution)]
714
                -> [(Gdx, AllocSolution)]
715
filterMGResults gl = foldl' fn []
716
    where unallocable = not . Group.isAllocable . flip Container.find gl
717
          fn accu (gdx, rasol) =
718
              case rasol of
719
                Bad _ -> accu
720
                Ok sol | isNothing (asSolution sol) -> accu
721
                       | unallocable gdx -> accu
722
                       | otherwise -> (gdx, sol):accu
723

    
724
-- | Sort multigroup results based on policy and score.
725
sortMGResults :: Group.List
726
             -> [(Gdx, AllocSolution)]
727
             -> [(Gdx, AllocSolution)]
728
sortMGResults gl sols =
729
    let extractScore (_, _, _, x) = x
730
        solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
731
                               (extractScore . fromJust . asSolution) sol)
732
    in sortBy (comparing solScore) sols
733

    
734
-- | Finds the best group for an instance on a multi-group cluster.
735
--
736
-- Only solutions in @preferred@ and @last_resort@ groups will be
737
-- accepted as valid, and additionally if the allowed groups parameter
738
-- is not null then allocation will only be run for those group
739
-- indices.
740
findBestAllocGroup :: Group.List           -- ^ The group list
741
                   -> Node.List            -- ^ The node list
742
                   -> Instance.List        -- ^ The instance list
743
                   -> Maybe [Gdx]          -- ^ The allowed groups
744
                   -> Instance.Instance    -- ^ The instance to allocate
745
                   -> Int                  -- ^ Required number of nodes
746
                   -> Result (Gdx, AllocSolution, [String])
747
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
748
  let groups = splitCluster mgnl mgil
749
      groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
750
                allowed_gdxs
751
      sols = map (\(gid, (nl, il)) ->
752
                   (gid, genAllocNodes mggl nl cnt False >>=
753
                       tryAlloc nl il inst))
754
             groups'::[(Gdx, Result AllocSolution)]
755
      all_msgs = concatMap (solutionDescription mggl) sols
756
      goodSols = filterMGResults mggl sols
757
      sortedSols = sortMGResults mggl goodSols
758
  in if null sortedSols
759
     then Bad $ intercalate ", " all_msgs
760
     else let (final_group, final_sol) = head sortedSols
761
          in return (final_group, final_sol, all_msgs)
762

    
763
-- | Try to allocate an instance on a multi-group cluster.
764
tryMGAlloc :: Group.List           -- ^ The group list
765
           -> Node.List            -- ^ The node list
766
           -> Instance.List        -- ^ The instance list
767
           -> Instance.Instance    -- ^ The instance to allocate
768
           -> Int                  -- ^ Required number of nodes
769
           -> Result AllocSolution -- ^ Possible solution list
770
tryMGAlloc mggl mgnl mgil inst cnt = do
771
  (best_group, solution, all_msgs) <-
772
      findBestAllocGroup mggl mgnl mgil Nothing inst cnt
773
  let group_name = Group.name $ Container.find best_group mggl
774
      selmsg = "Selected group: " ++ group_name
775
  return $ solution { asLog = selmsg:all_msgs }
776

    
777
-- | Try to relocate an instance on the cluster.
778
tryReloc :: (Monad m) =>
779
            Node.List       -- ^ The node list
780
         -> Instance.List   -- ^ The instance list
781
         -> Idx             -- ^ The index of the instance to move
782
         -> Int             -- ^ The number of nodes required
783
         -> [Ndx]           -- ^ Nodes which should not be used
784
         -> m AllocSolution -- ^ Solution list
785
tryReloc nl il xid 1 ex_idx =
786
    let all_nodes = getOnline nl
787
        inst = Container.find xid il
788
        ex_idx' = Instance.pNode inst:ex_idx
789
        valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
790
        valid_idxes = map Node.idx valid_nodes
791
        sols1 = foldl' (\cstate x ->
792
                            let em = do
793
                                  (mnl, i, _, _) <-
794
                                      applyMove nl inst (ReplaceSecondary x)
795
                                  return (mnl, i, [Container.find x mnl],
796
                                          compCV mnl)
797
                            in concatAllocs cstate em
798
                       ) emptyAllocSolution valid_idxes
799
    in return sols1
800

    
801
tryReloc _ _ _ reqn _  = fail $ "Unsupported number of relocation \
802
                                \destinations required (" ++ show reqn ++
803
                                                  "), only one supported"
804

    
805
-- | Function which fails if the requested mode is change secondary.
806
--
807
-- This is useful since except DRBD, no other disk template can
808
-- execute change secondary; thus, we can just call this function
809
-- instead of always checking for secondary mode. After the call to
810
-- this function, whatever mode we have is just a primary change.
811
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
812
failOnSecondaryChange ChangeSecondary dt =
813
    fail $ "Instances with disk template '" ++ dtToString dt ++
814
         "' can't execute change secondary"
815
failOnSecondaryChange _ _ = return ()
816

    
817
-- | Run evacuation for a single instance.
818
--
819
-- /Note:/ this function should correctly execute both intra-group
820
-- evacuations (in all modes) and inter-group evacuations (in the
821
-- 'ChangeAll' mode). Of course, this requires that the correct list
822
-- of target nodes is passed.
823
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
824
                 -> Instance.List     -- ^ Instance list (cluster-wide)
825
                 -> EvacMode          -- ^ The evacuation mode
826
                 -> Instance.Instance -- ^ The instance to be evacuated
827
                 -> Gdx               -- ^ The group we're targetting
828
                 -> [Ndx]             -- ^ The list of available nodes
829
                                      -- for allocation
830
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
831
nodeEvacInstance _ _ mode (Instance.Instance
832
                           {Instance.diskTemplate = dt@DTDiskless}) _ _ =
833
                  failOnSecondaryChange mode dt >>
834
                  fail "Diskless relocations not implemented yet"
835

    
836
nodeEvacInstance _ _ _ (Instance.Instance
837
                        {Instance.diskTemplate = DTPlain}) _ _ =
838
                  fail "Instances of type plain cannot be relocated"
839

    
840
nodeEvacInstance _ _ _ (Instance.Instance
841
                        {Instance.diskTemplate = DTFile}) _ _ =
842
                  fail "Instances of type file cannot be relocated"
843

    
844
nodeEvacInstance _ _ mode  (Instance.Instance
845
                            {Instance.diskTemplate = dt@DTSharedFile}) _ _ =
846
                  failOnSecondaryChange mode dt >>
847
                  fail "Shared file relocations not implemented yet"
848

    
849
nodeEvacInstance _ _ mode (Instance.Instance
850
                           {Instance.diskTemplate = dt@DTBlock}) _ _ =
851
                  failOnSecondaryChange mode dt >>
852
                  fail "Block device relocations not implemented yet"
853

    
854
nodeEvacInstance nl il ChangePrimary
855
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
856
                 _ _ =
857
  do
858
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
859
    let idx = Instance.idx inst
860
        il' = Container.add idx inst' il
861
        ops = iMoveToJob nl' il' idx Failover
862
    return (nl', il', ops)
863

    
864
nodeEvacInstance nl il ChangeSecondary
865
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
866
                 gdx avail_nodes =
867
  do
868
    (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
869
                            eitherToResult $
870
                            foldl' (evacDrbdSecondaryInner nl inst gdx)
871
                            (Left "no nodes available") avail_nodes
872
    let idx = Instance.idx inst
873
        il' = Container.add idx inst' il
874
        ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx)
875
    return (nl', il', ops)
876

    
877
-- The algorithm for ChangeAll is as follows:
878
--
879
-- * generate all (primary, secondary) node pairs for the target groups
880
-- * for each pair, execute the needed moves (r:s, f, r:s) and compute
881
--   the final node list state and group score
882
-- * select the best choice via a foldl that uses the same Either
883
--   String solution as the ChangeSecondary mode
884
nodeEvacInstance nl il ChangeAll
885
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
886
                 gdx avail_nodes =
887
  do
888
    let no_nodes = Left "no nodes available"
889
        node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
890
    (nl', il', ops, _) <-
891
        annotateResult "Can't find any good nodes for relocation" $
892
        eitherToResult $
893
        foldl'
894
        (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
895
                          Bad msg ->
896
                              case accu of
897
                                Right _ -> accu
898
                                -- we don't need more details (which
899
                                -- nodes, etc.) as we only selected
900
                                -- this group if we can allocate on
901
                                -- it, hence failures will not
902
                                -- propagate out of this fold loop
903
                                Left _ -> Left $ "Allocation failed: " ++ msg
904
                          Ok result@(_, _, _, new_cv) ->
905
                              let new_accu = Right result in
906
                              case accu of
907
                                Left _ -> new_accu
908
                                Right (_, _, _, old_cv) ->
909
                                    if old_cv < new_cv
910
                                    then accu
911
                                    else new_accu
912
        ) no_nodes node_pairs
913

    
914
    return (nl', il', ops)
915

    
916
-- | Inner fold function for changing secondary of a DRBD instance.
917
--
918
-- The running solution is either a @Left String@, which means we
919
-- don't have yet a working solution, or a @Right (...)@, which
920
-- represents a valid solution; it holds the modified node list, the
921
-- modified instance (after evacuation), the score of that solution,
922
-- and the new secondary node index.
923
evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list
924
                       -> Instance.Instance -- ^ Instance being evacuated
925
                       -> Gdx -- ^ The group index of the instance
926
                       -> Either String ( Node.List
927
                                        , Instance.Instance
928
                                        , Score
929
                                        , Ndx)  -- ^ Current best solution
930
                       -> Ndx  -- ^ Node we're evaluating as new secondary
931
                       -> Either String ( Node.List
932
                                        , Instance.Instance
933
                                        , Score
934
                                        , Ndx) -- ^ New best solution
935
evacDrbdSecondaryInner nl inst gdx accu ndx =
936
    case applyMove nl inst (ReplaceSecondary ndx) of
937
      OpFail fm ->
938
          case accu of
939
            Right _ -> accu
940
            Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
941
                      " failed: " ++ show fm
942
      OpGood (nl', inst', _, _) ->
943
          let nodes = Container.elems nl'
944
              -- The fromJust below is ugly (it can fail nastily), but
945
              -- at this point we should have any internal mismatches,
946
              -- and adding a monad here would be quite involved
947
              grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
948
              new_cv = compCVNodes grpnodes
949
              new_accu = Right (nl', inst', new_cv, ndx)
950
          in case accu of
951
               Left _ -> new_accu
952
               Right (_, _, old_cv, _) ->
953
                   if old_cv < new_cv
954
                   then accu
955
                   else new_accu
956

    
957
-- | Compute result of changing all nodes of a DRBD instance.
958
--
959
-- Given the target primary and secondary node (which might be in a
960
-- different group or not), this function will 'execute' all the
961
-- required steps and assuming all operations succceed, will return
962
-- the modified node and instance lists, the opcodes needed for this
963
-- and the new group score.
964
evacDrbdAllInner :: Node.List         -- ^ Cluster node list
965
                 -> Instance.List     -- ^ Cluster instance list
966
                 -> Instance.Instance -- ^ The instance to be moved
967
                 -> Gdx               -- ^ The target group index
968
                                      -- (which can differ from the
969
                                      -- current group of the
970
                                      -- instance)
971
                 -> (Ndx, Ndx)        -- ^ Tuple of new
972
                                      -- primary\/secondary nodes
973
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
974
evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) =
975
  do
976
    let primary = Container.find (Instance.pNode inst) nl
977
        idx = Instance.idx inst
978
    -- if the primary is offline, then we first failover
979
    (nl1, inst1, ops1) <-
980
        if Node.offline primary
981
        then do
982
          (nl', inst', _, _) <-
983
              annotateResult "Failing over to the secondary" $
984
              opToResult $ applyMove nl inst Failover
985
          return (nl', inst', [Failover])
986
        else return (nl, inst, [])
987
    let (o1, o2, o3) = (ReplaceSecondary t_pdx,
988
                        Failover,
989
                        ReplaceSecondary t_sdx)
990
    -- we now need to execute a replace secondary to the future
991
    -- primary node
992
    (nl2, inst2, _, _) <-
993
        annotateResult "Changing secondary to new primary" $
994
        opToResult $
995
        applyMove nl1 inst1 o1
996
    let ops2 = o1:ops1
997
    -- we now execute another failover, the primary stays fixed now
998
    (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
999
                          opToResult $ applyMove nl2 inst2 o2
1000
    let ops3 = o2:ops2
1001
    -- and finally another replace secondary, to the final secondary
1002
    (nl4, inst4, _, _) <-
1003
        annotateResult "Changing secondary to final secondary" $
1004
        opToResult $
1005
        applyMove nl3 inst3 o3
1006
    let ops4 = o3:ops3
1007
        il' = Container.add idx inst4 il
1008
        ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1009
    let nodes = Container.elems nl4
1010
        -- The fromJust below is ugly (it can fail nastily), but
1011
        -- at this point we should have any internal mismatches,
1012
        -- and adding a monad here would be quite involved
1013
        grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1014
        new_cv = compCVNodes grpnodes
1015
    return (nl4, il', ops, new_cv)
1016

    
1017
-- | Computes the nodes in a given group which are available for
1018
-- allocation.
1019
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1020
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
1021
                    -> Gdx            -- ^ The group for which we
1022
                                      -- query the nodes
1023
                    -> Result [Ndx]   -- ^ List of available node indices
1024
availableGroupNodes group_nodes excl_ndx gdx = do
1025
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1026
                 Ok (lookup gdx group_nodes)
1027
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1028
  return avail_nodes
1029

    
1030
-- | Updates the evac solution with the results of an instance
1031
-- evacuation.
1032
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1033
                   -> Idx
1034
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1035
                   -> (Node.List, Instance.List, EvacSolution)
1036
updateEvacSolution (nl, il, es) idx (Bad msg) =
1037
    (nl, il, es { esFailed = (idx, msg):esFailed es})
1038
updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1039
    (nl, il, es { esMoved = new_elem:esMoved es
1040
                , esOpCodes = [opcodes]:esOpCodes es })
1041
     where inst = Container.find idx il
1042
           new_elem = (idx,
1043
                       instancePriGroup nl inst,
1044
                       Instance.allNodes inst)
1045

    
1046
-- | Node-evacuation IAllocator mode main function.
1047
tryNodeEvac :: Group.List    -- ^ The cluster groups
1048
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
1049
            -> Instance.List -- ^ Instance list (cluster-wide)
1050
            -> EvacMode      -- ^ The evacuation mode
1051
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
1052
            -> Result (Node.List, Instance.List, EvacSolution)
1053
tryNodeEvac _ ini_nl ini_il mode idxs =
1054
    let evac_ndx = nodesToEvacuate ini_il mode idxs
1055
        offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1056
        excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1057
        group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1058
                                             (Container.elems nl))) $
1059
                      splitCluster ini_nl ini_il
1060
        (fin_nl, fin_il, esol) =
1061
            foldl' (\state@(nl, il, _) inst ->
1062
                        let gdx = instancePriGroup nl inst
1063
                            pdx = Instance.pNode inst in
1064
                        updateEvacSolution state (Instance.idx inst) $
1065
                        availableGroupNodes group_ndx
1066
                          (IntSet.insert pdx excl_ndx) gdx >>=
1067
                        nodeEvacInstance nl il mode inst gdx
1068
                   )
1069
            (ini_nl, ini_il, emptyEvacSolution)
1070
            (map (`Container.find` ini_il) idxs)
1071
    in return (fin_nl, fin_il, reverseEvacSolution esol)
1072

    
1073
-- | Change-group IAllocator mode main function.
1074
--
1075
-- This is very similar to 'tryNodeEvac', the only difference is that
1076
-- we don't choose as target group the current instance group, but
1077
-- instead:
1078
--
1079
--   1. at the start of the function, we compute which are the target
1080
--   groups; either no groups were passed in, in which case we choose
1081
--   all groups out of which we don't evacuate instance, or there were
1082
--   some groups passed, in which case we use those
1083
--
1084
--   2. for each instance, we use 'findBestAllocGroup' to choose the
1085
--   best group to hold the instance, and then we do what
1086
--   'tryNodeEvac' does, except for this group instead of the current
1087
--   instance group.
1088
--
1089
-- Note that the correct behaviour of this function relies on the
1090
-- function 'nodeEvacInstance' to be able to do correctly both
1091
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1092
tryChangeGroup :: Group.List    -- ^ The cluster groups
1093
               -> Node.List     -- ^ The node list (cluster-wide)
1094
               -> Instance.List -- ^ Instance list (cluster-wide)
1095
               -> [Gdx]         -- ^ Target groups; if empty, any
1096
                                -- groups not being evacuated
1097
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
1098
               -> Result (Node.List, Instance.List, EvacSolution)
1099
tryChangeGroup gl ini_nl ini_il gdxs idxs =
1100
    let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1101
                               flip Container.find ini_il) idxs
1102
        target_gdxs = (if null gdxs
1103
                       then Container.keys gl
1104
                       else gdxs) \\ evac_gdxs
1105
        offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1106
        excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1107
        group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1108
                                             (Container.elems nl))) $
1109
                      splitCluster ini_nl ini_il
1110
        (fin_nl, fin_il, esol) =
1111
            foldl' (\state@(nl, il, _) inst ->
1112
                        let solution = do
1113
                              let ncnt = Instance.requiredNodes $
1114
                                         Instance.diskTemplate inst
1115
                              (gdx, _, _) <- findBestAllocGroup gl nl il
1116
                                             (Just target_gdxs) inst ncnt
1117
                              av_nodes <- availableGroupNodes group_ndx
1118
                                          excl_ndx gdx
1119
                              nodeEvacInstance nl il ChangeAll inst
1120
                                       gdx av_nodes
1121
                        in updateEvacSolution state
1122
                               (Instance.idx inst) solution
1123
                   )
1124
            (ini_nl, ini_il, emptyEvacSolution)
1125
            (map (`Container.find` ini_il) idxs)
1126
    in return (fin_nl, fin_il, reverseEvacSolution esol)
1127

    
1128
-- | Recursively place instances on the cluster until we're out of space.
1129
iterateAlloc :: Node.List
1130
             -> Instance.List
1131
             -> Maybe Int
1132
             -> Instance.Instance
1133
             -> AllocNodes
1134
             -> [Instance.Instance]
1135
             -> [CStats]
1136
             -> Result AllocResult
1137
iterateAlloc nl il limit newinst allocnodes ixes cstats =
1138
      let depth = length ixes
1139
          newname = printf "new-%d" depth::String
1140
          newidx = length (Container.elems il) + depth
1141
          newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1142
          newlimit = fmap (flip (-) 1) limit
1143
      in case tryAlloc nl il newi2 allocnodes of
1144
           Bad s -> Bad s
1145
           Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
1146
               let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1147
               case sols3 of
1148
                 Nothing -> newsol
1149
                 Just (xnl, xi, _, _) ->
1150
                     if limit == Just 0
1151
                     then newsol
1152
                     else iterateAlloc xnl (Container.add newidx xi il)
1153
                          newlimit newinst allocnodes (xi:ixes)
1154
                          (totalResources xnl:cstats)
1155

    
1156
-- | The core of the tiered allocation mode.
1157
tieredAlloc :: Node.List
1158
            -> Instance.List
1159
            -> Maybe Int
1160
            -> Instance.Instance
1161
            -> AllocNodes
1162
            -> [Instance.Instance]
1163
            -> [CStats]
1164
            -> Result AllocResult
1165
tieredAlloc nl il limit newinst allocnodes ixes cstats =
1166
    case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1167
      Bad s -> Bad s
1168
      Ok (errs, nl', il', ixes', cstats') ->
1169
          let newsol = Ok (errs, nl', il', ixes', cstats')
1170
              ixes_cnt = length ixes'
1171
              (stop, newlimit) = case limit of
1172
                                   Nothing -> (False, Nothing)
1173
                                   Just n -> (n <= ixes_cnt,
1174
                                              Just (n - ixes_cnt)) in
1175
          if stop then newsol else
1176
          case Instance.shrinkByType newinst . fst . last $
1177
               sortBy (comparing snd) errs of
1178
            Bad _ -> newsol
1179
            Ok newinst' -> tieredAlloc nl' il' newlimit
1180
                           newinst' allocnodes ixes' cstats'
1181

    
1182
-- * Formatting functions
1183

    
1184
-- | Given the original and final nodes, computes the relocation description.
1185
computeMoves :: Instance.Instance -- ^ The instance to be moved
1186
             -> String -- ^ The instance name
1187
             -> IMove  -- ^ The move being performed
1188
             -> String -- ^ New primary
1189
             -> String -- ^ New secondary
1190
             -> (String, [String])
1191
                -- ^ Tuple of moves and commands list; moves is containing
1192
                -- either @/f/@ for failover or @/r:name/@ for replace
1193
                -- secondary, while the command list holds gnt-instance
1194
                -- commands (without that prefix), e.g \"@failover instance1@\"
1195
computeMoves i inam mv c d =
1196
    case mv of
1197
      Failover -> ("f", [mig])
1198
      FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1199
      ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1200
      ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1201
      ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1202
    where morf = if Instance.running i then "migrate" else "failover"
1203
          mig = printf "%s -f %s" morf inam::String
1204
          rep n = printf "replace-disks -n %s %s" n inam
1205

    
1206
-- | Converts a placement to string format.
1207
printSolutionLine :: Node.List     -- ^ The node list
1208
                  -> Instance.List -- ^ The instance list
1209
                  -> Int           -- ^ Maximum node name length
1210
                  -> Int           -- ^ Maximum instance name length
1211
                  -> Placement     -- ^ The current placement
1212
                  -> Int           -- ^ The index of the placement in
1213
                                   -- the solution
1214
                  -> (String, [String])
1215
printSolutionLine nl il nmlen imlen plc pos =
1216
    let
1217
        pmlen = (2*nmlen + 1)
1218
        (i, p, s, mv, c) = plc
1219
        inst = Container.find i il
1220
        inam = Instance.alias inst
1221
        npri = Node.alias $ Container.find p nl
1222
        nsec = Node.alias $ Container.find s nl
1223
        opri = Node.alias $ Container.find (Instance.pNode inst) nl
1224
        osec = Node.alias $ Container.find (Instance.sNode inst) nl
1225
        (moves, cmds) =  computeMoves inst inam mv npri nsec
1226
        ostr = printf "%s:%s" opri osec::String
1227
        nstr = printf "%s:%s" npri nsec::String
1228
    in
1229
      (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
1230
       pos imlen inam pmlen ostr
1231
       pmlen nstr c moves,
1232
       cmds)
1233

    
1234
-- | Return the instance and involved nodes in an instance move.
1235
--
1236
-- Note that the output list length can vary, and is not required nor
1237
-- guaranteed to be of any specific length.
1238
involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
1239
                               -- the instance from its index; note
1240
                               -- that this /must/ be the original
1241
                               -- instance list, so that we can
1242
                               -- retrieve the old nodes
1243
              -> Placement     -- ^ The placement we're investigating,
1244
                               -- containing the new nodes and
1245
                               -- instance index
1246
              -> [Ndx]         -- ^ Resulting list of node indices
1247
involvedNodes il plc =
1248
    let (i, np, ns, _, _) = plc
1249
        inst = Container.find i il
1250
    in nub $ [np, ns] ++ Instance.allNodes inst
1251

    
1252
-- | Inner function for splitJobs, that either appends the next job to
1253
-- the current jobset, or starts a new jobset.
1254
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1255
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1256
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1257
    | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1258
    | otherwise = ([n]:cjs, ndx)
1259

    
1260
-- | Break a list of moves into independent groups. Note that this
1261
-- will reverse the order of jobs.
1262
splitJobs :: [MoveJob] -> [JobSet]
1263
splitJobs = fst . foldl mergeJobs ([], [])
1264

    
1265
-- | Given a list of commands, prefix them with @gnt-instance@ and
1266
-- also beautify the display a little.
1267
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1268
formatJob jsn jsl (sn, (_, _, _, cmds)) =
1269
    let out =
1270
            printf "  echo job %d/%d" jsn sn:
1271
            printf "  check":
1272
            map ("  gnt-instance " ++) cmds
1273
    in if sn == 1
1274
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1275
       else out
1276

    
1277
-- | Given a list of commands, prefix them with @gnt-instance@ and
1278
-- also beautify the display a little.
1279
formatCmds :: [JobSet] -> String
1280
formatCmds =
1281
    unlines .
1282
    concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1283
                             (zip [1..] js)) .
1284
    zip [1..]
1285

    
1286
-- | Print the node list.
1287
printNodes :: Node.List -> [String] -> String
1288
printNodes nl fs =
1289
    let fields = case fs of
1290
          [] -> Node.defaultFields
1291
          "+":rest -> Node.defaultFields ++ rest
1292
          _ -> fs
1293
        snl = sortBy (comparing Node.idx) (Container.elems nl)
1294
        (header, isnum) = unzip $ map Node.showHeader fields
1295
    in unlines . map ((:) ' ' .  intercalate " ") $
1296
       formatTable (header:map (Node.list fields) snl) isnum
1297

    
1298
-- | Print the instance list.
1299
printInsts :: Node.List -> Instance.List -> String
1300
printInsts nl il =
1301
    let sil = sortBy (comparing Instance.idx) (Container.elems il)
1302
        helper inst = [ if Instance.running inst then "R" else " "
1303
                      , Instance.name inst
1304
                      , Container.nameOf nl (Instance.pNode inst)
1305
                      , let sdx = Instance.sNode inst
1306
                        in if sdx == Node.noSecondary
1307
                           then  ""
1308
                           else Container.nameOf nl sdx
1309
                      , if Instance.autoBalance inst then "Y" else "N"
1310
                      , printf "%3d" $ Instance.vcpus inst
1311
                      , printf "%5d" $ Instance.mem inst
1312
                      , printf "%5d" $ Instance.dsk inst `div` 1024
1313
                      , printf "%5.3f" lC
1314
                      , printf "%5.3f" lM
1315
                      , printf "%5.3f" lD
1316
                      , printf "%5.3f" lN
1317
                      ]
1318
            where DynUtil lC lM lD lN = Instance.util inst
1319
        header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1320
                 , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1321
        isnum = False:False:False:False:False:repeat True
1322
    in unlines . map ((:) ' ' . intercalate " ") $
1323
       formatTable (header:map helper sil) isnum
1324

    
1325
-- | Shows statistics for a given node list.
1326
printStats :: Node.List -> String
1327
printStats nl =
1328
    let dcvs = compDetailedCV $ Container.elems nl
1329
        (weights, names) = unzip detailedCVInfo
1330
        hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1331
        formatted = map (\(w, header, val) ->
1332
                             printf "%s=%.8f(x%.2f)" header val w::String) hd
1333
    in intercalate ", " formatted
1334

    
1335
-- | Convert a placement into a list of OpCodes (basically a job).
1336
iMoveToJob :: Node.List        -- ^ The node list; only used for node
1337
                               -- names, so any version is good
1338
                               -- (before or after the operation)
1339
           -> Instance.List    -- ^ The instance list; also used for
1340
                               -- names only
1341
           -> Idx              -- ^ The index of the instance being
1342
                               -- moved
1343
           -> IMove            -- ^ The actual move to be described
1344
           -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
1345
                               -- the given move
1346
iMoveToJob nl il idx move =
1347
    let inst = Container.find idx il
1348
        iname = Instance.name inst
1349
        lookNode  = Just . Container.nameOf nl
1350
        opF = OpCodes.OpInstanceMigrate iname True False True Nothing
1351
        opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1352
                OpCodes.ReplaceNewSecondary [] Nothing
1353
    in case move of
1354
         Failover -> [ opF ]
1355
         ReplacePrimary np -> [ opF, opR np, opF ]
1356
         ReplaceSecondary ns -> [ opR ns ]
1357
         ReplaceAndFailover np -> [ opR np, opF ]
1358
         FailoverAndReplace ns -> [ opF, opR ns ]
1359

    
1360
-- * Node group functions
1361

    
1362
-- | Computes the group of an instance.
1363
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1364
instanceGroup nl i =
1365
  let sidx = Instance.sNode i
1366
      pnode = Container.find (Instance.pNode i) nl
1367
      snode = if sidx == Node.noSecondary
1368
              then pnode
1369
              else Container.find sidx nl
1370
      pgroup = Node.group pnode
1371
      sgroup = Node.group snode
1372
  in if pgroup /= sgroup
1373
     then fail ("Instance placed accross two node groups, primary " ++
1374
                show pgroup ++ ", secondary " ++ show sgroup)
1375
     else return pgroup
1376

    
1377
-- | Computes the group of an instance per the primary node.
1378
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1379
instancePriGroup nl i =
1380
  let pnode = Container.find (Instance.pNode i) nl
1381
  in  Node.group pnode
1382

    
1383
-- | Compute the list of badly allocated instances (split across node
1384
-- groups).
1385
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1386
findSplitInstances nl =
1387
  filter (not . isOk . instanceGroup nl) . Container.elems
1388

    
1389
-- | Splits a cluster into the component node groups.
1390
splitCluster :: Node.List -> Instance.List ->
1391
                [(Gdx, (Node.List, Instance.List))]
1392
splitCluster nl il =
1393
  let ngroups = Node.computeGroups (Container.elems nl)
1394
  in map (\(guuid, nodes) ->
1395
           let nidxs = map Node.idx nodes
1396
               nodes' = zip nidxs nodes
1397
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1398
           in (guuid, (Container.fromList nodes', instances))) ngroups
1399

    
1400
-- | Compute the list of nodes that are to be evacuated, given a list
1401
-- of instances and an evacuation mode.
1402
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1403
                -> EvacMode      -- ^ The evacuation mode we're using
1404
                -> [Idx]         -- ^ List of instance indices being evacuated
1405
                -> IntSet.IntSet -- ^ Set of node indices
1406
nodesToEvacuate il mode =
1407
    IntSet.delete Node.noSecondary .
1408
    foldl' (\ns idx ->
1409
                let i = Container.find idx il
1410
                    pdx = Instance.pNode i
1411
                    sdx = Instance.sNode i
1412
                    dt = Instance.diskTemplate i
1413
                    withSecondary = case dt of
1414
                                      DTDrbd8 -> IntSet.insert sdx ns
1415
                                      _ -> ns
1416
                in case mode of
1417
                     ChangePrimary   -> IntSet.insert pdx ns
1418
                     ChangeSecondary -> withSecondary
1419
                     ChangeAll       -> IntSet.insert pdx withSecondary
1420
           ) IntSet.empty