Statistics
| Branch: | Tag: | Revision:

root / htools / Ganeti / HTools / Cluster.hs @ b0631f10

History | View | Annotate | Download (62.8 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
    (
31
     -- * Types
32
      AllocSolution(..)
33
    , EvacSolution(..)
34
    , Table(..)
35
    , CStats(..)
36
    , AllocStats
37
    -- * Generic functions
38
    , totalResources
39
    , computeAllocationDelta
40
    -- * First phase functions
41
    , computeBadItems
42
    -- * Second phase functions
43
    , printSolutionLine
44
    , formatCmds
45
    , involvedNodes
46
    , splitJobs
47
    -- * Display functions
48
    , printNodes
49
    , printInsts
50
    -- * Balacing functions
51
    , checkMove
52
    , doNextBalance
53
    , tryBalance
54
    , compCV
55
    , compCVNodes
56
    , compDetailedCV
57
    , printStats
58
    , iMoveToJob
59
    -- * IAllocator functions
60
    , genAllocNodes
61
    , tryAlloc
62
    , tryMGAlloc
63
    , tryReloc
64
    , tryNodeEvac
65
    , tryChangeGroup
66
    , collapseFailures
67
    -- * Allocation functions
68
    , iterateAlloc
69
    , tieredAlloc
70
     -- * Node group functions
71
    , instanceGroup
72
    , findSplitInstances
73
    , splitCluster
74
    ) where
75

    
76
import qualified Data.IntSet as IntSet
77
import Data.List
78
import Data.Maybe (fromJust, isNothing)
79
import Data.Ord (comparing)
80
import Text.Printf (printf)
81

    
82
import qualified Ganeti.HTools.Container as Container
83
import qualified Ganeti.HTools.Instance as Instance
84
import qualified Ganeti.HTools.Node as Node
85
import qualified Ganeti.HTools.Group as Group
86
import Ganeti.HTools.Types
87
import Ganeti.HTools.Utils
88
import Ganeti.HTools.Compat
89
import qualified Ganeti.OpCodes as OpCodes
90

    
91
-- * Types
92

    
93
-- | Allocation\/relocation solution.
94
data AllocSolution = AllocSolution
95
  { asFailures :: [FailMode]              -- ^ Failure counts
96
  , asAllocs   :: Int                     -- ^ Good allocation count
97
  , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
98
  , asLog      :: [String]                -- ^ Informational messages
99
  }
100

    
101
-- | Node evacuation/group change iallocator result type. This result
102
-- type consists of actual opcodes (a restricted subset) that are
103
-- transmitted back to Ganeti.
104
data EvacSolution = EvacSolution
105
    { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
106
    , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
107
                                        -- relocated
108
    , esOpCodes :: [[[OpCodes.OpCode]]] -- ^ List of lists of jobs
109
    }
110

    
111
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
112
type AllocResult = (FailStats, Node.List, Instance.List,
113
                    [Instance.Instance], [CStats])
114

    
115
-- | A type denoting the valid allocation mode/pairs.
116
--
117
-- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
118
-- for a two-node allocation, this will be a @Right [('Ndx',
119
-- ['Ndx'])]@. In the latter case, the list is basically an
120
-- association list, grouped by primary node and holding the potential
121
-- secondary nodes in the sub-list.
122
type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
123

    
124
-- | The empty solution we start with when computing allocations.
125
emptyAllocSolution :: AllocSolution
126
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
127
                                   , asSolution = Nothing, asLog = [] }
128

    
129
-- | The empty evac solution.
130
emptyEvacSolution :: EvacSolution
131
emptyEvacSolution = EvacSolution { esMoved = []
132
                                 , esFailed = []
133
                                 , esOpCodes = []
134
                                 }
135

    
136
-- | The complete state for the balancing solution.
137
data Table = Table Node.List Instance.List Score [Placement]
138
             deriving (Show, Read)
139

    
140
-- | Cluster statistics data type.
141
data CStats = CStats { csFmem :: Integer -- ^ Cluster free mem
142
                     , csFdsk :: Integer -- ^ Cluster free disk
143
                     , csAmem :: Integer -- ^ Cluster allocatable mem
144
                     , csAdsk :: Integer -- ^ Cluster allocatable disk
145
                     , csAcpu :: Integer -- ^ Cluster allocatable cpus
146
                     , csMmem :: Integer -- ^ Max node allocatable mem
147
                     , csMdsk :: Integer -- ^ Max node allocatable disk
148
                     , csMcpu :: Integer -- ^ Max node allocatable cpu
149
                     , csImem :: Integer -- ^ Instance used mem
150
                     , csIdsk :: Integer -- ^ Instance used disk
151
                     , csIcpu :: Integer -- ^ Instance used cpu
152
                     , csTmem :: Double  -- ^ Cluster total mem
153
                     , csTdsk :: Double  -- ^ Cluster total disk
154
                     , csTcpu :: Double  -- ^ Cluster total cpus
155
                     , csVcpu :: Integer -- ^ Cluster virtual cpus (if
156
                                         -- node pCpu has been set,
157
                                         -- otherwise -1)
158
                     , csXmem :: Integer -- ^ Unnacounted for mem
159
                     , csNmem :: Integer -- ^ Node own memory
160
                     , csScore :: Score  -- ^ The cluster score
161
                     , csNinst :: Int    -- ^ The total number of instances
162
                     }
163
            deriving (Show, Read)
164

    
165
-- | Currently used, possibly to allocate, unallocable.
166
type AllocStats = (RSpec, RSpec, RSpec)
167

    
168
-- * Utility functions
169

    
170
-- | Verifies the N+1 status and return the affected nodes.
171
verifyN1 :: [Node.Node] -> [Node.Node]
172
verifyN1 = filter Node.failN1
173

    
174
{-| Computes the pair of bad nodes and instances.
175

    
176
The bad node list is computed via a simple 'verifyN1' check, and the
177
bad instance list is the list of primary and secondary instances of
178
those nodes.
179

    
180
-}
181
computeBadItems :: Node.List -> Instance.List ->
182
                   ([Node.Node], [Instance.Instance])
183
computeBadItems nl il =
184
  let bad_nodes = verifyN1 $ getOnline nl
185
      bad_instances = map (`Container.find` il) .
186
                      sort . nub $
187
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
188
  in
189
    (bad_nodes, bad_instances)
190

    
191
-- | Zero-initializer for the CStats type.
192
emptyCStats :: CStats
193
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
194

    
195
-- | Update stats with data from a new node.
196
updateCStats :: CStats -> Node.Node -> CStats
197
updateCStats cs node =
198
    let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
199
                 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
200
                 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
201
                 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
202
                 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
203
                 csVcpu = x_vcpu,
204
                 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
205
               }
206
            = cs
207
        inc_amem = Node.fMem node - Node.rMem node
208
        inc_amem' = if inc_amem > 0 then inc_amem else 0
209
        inc_adsk = Node.availDisk node
210
        inc_imem = truncate (Node.tMem node) - Node.nMem node
211
                   - Node.xMem node - Node.fMem node
212
        inc_icpu = Node.uCpu node
213
        inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
214
        inc_vcpu = Node.hiCpu node
215
        inc_acpu = Node.availCpu node
216

    
217
    in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
218
          , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
219
          , csAmem = x_amem + fromIntegral inc_amem'
220
          , csAdsk = x_adsk + fromIntegral inc_adsk
221
          , csAcpu = x_acpu + fromIntegral inc_acpu
222
          , csMmem = max x_mmem (fromIntegral inc_amem')
223
          , csMdsk = max x_mdsk (fromIntegral inc_adsk)
224
          , csMcpu = max x_mcpu (fromIntegral inc_acpu)
225
          , csImem = x_imem + fromIntegral inc_imem
226
          , csIdsk = x_idsk + fromIntegral inc_idsk
227
          , csIcpu = x_icpu + fromIntegral inc_icpu
228
          , csTmem = x_tmem + Node.tMem node
229
          , csTdsk = x_tdsk + Node.tDsk node
230
          , csTcpu = x_tcpu + Node.tCpu node
231
          , csVcpu = x_vcpu + fromIntegral inc_vcpu
232
          , csXmem = x_xmem + fromIntegral (Node.xMem node)
233
          , csNmem = x_nmem + fromIntegral (Node.nMem node)
234
          , csNinst = x_ninst + length (Node.pList node)
235
          }
236

    
237
-- | Compute the total free disk and memory in the cluster.
238
totalResources :: Node.List -> CStats
239
totalResources nl =
240
    let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
241
    in cs { csScore = compCV nl }
242

    
243
-- | Compute the delta between two cluster state.
244
--
245
-- This is used when doing allocations, to understand better the
246
-- available cluster resources. The return value is a triple of the
247
-- current used values, the delta that was still allocated, and what
248
-- was left unallocated.
249
computeAllocationDelta :: CStats -> CStats -> AllocStats
250
computeAllocationDelta cini cfin =
251
    let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
252
        CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
253
                csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
254
        rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem)
255
               (fromIntegral i_idsk)
256
        rfin = RSpec (fromIntegral (f_icpu - i_icpu))
257
               (fromIntegral (f_imem - i_imem))
258
               (fromIntegral (f_idsk - i_idsk))
259
        un_cpu = fromIntegral (v_cpu - f_icpu)::Int
260
        runa = RSpec un_cpu (truncate t_mem - fromIntegral f_imem)
261
               (truncate t_dsk - fromIntegral f_idsk)
262
    in (rini, rfin, runa)
263

    
264
-- | The names and weights of the individual elements in the CV list.
265
detailedCVInfo :: [(Double, String)]
266
detailedCVInfo = [ (1,  "free_mem_cv")
267
                 , (1,  "free_disk_cv")
268
                 , (1,  "n1_cnt")
269
                 , (1,  "reserved_mem_cv")
270
                 , (4,  "offline_all_cnt")
271
                 , (16, "offline_pri_cnt")
272
                 , (1,  "vcpu_ratio_cv")
273
                 , (1,  "cpu_load_cv")
274
                 , (1,  "mem_load_cv")
275
                 , (1,  "disk_load_cv")
276
                 , (1,  "net_load_cv")
277
                 , (2,  "pri_tags_score")
278
                 ]
279

    
280
-- | Holds the weights used by 'compCVNodes' for each metric.
281
detailedCVWeights :: [Double]
282
detailedCVWeights = map fst detailedCVInfo
283

    
284
-- | Compute the mem and disk covariance.
285
compDetailedCV :: [Node.Node] -> [Double]
286
compDetailedCV all_nodes =
287
    let
288
        (offline, nodes) = partition Node.offline all_nodes
289
        mem_l = map Node.pMem nodes
290
        dsk_l = map Node.pDsk nodes
291
        -- metric: memory covariance
292
        mem_cv = stdDev mem_l
293
        -- metric: disk covariance
294
        dsk_cv = stdDev dsk_l
295
        -- metric: count of instances living on N1 failing nodes
296
        n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
297
                                                   length (Node.pList n)) .
298
                   filter Node.failN1 $ nodes :: Double
299
        res_l = map Node.pRem nodes
300
        -- metric: reserved memory covariance
301
        res_cv = stdDev res_l
302
        -- offline instances metrics
303
        offline_ipri = sum . map (length . Node.pList) $ offline
304
        offline_isec = sum . map (length . Node.sList) $ offline
305
        -- metric: count of instances on offline nodes
306
        off_score = fromIntegral (offline_ipri + offline_isec)::Double
307
        -- metric: count of primary instances on offline nodes (this
308
        -- helps with evacuation/failover of primary instances on
309
        -- 2-node clusters with one node offline)
310
        off_pri_score = fromIntegral offline_ipri::Double
311
        cpu_l = map Node.pCpu nodes
312
        -- metric: covariance of vcpu/pcpu ratio
313
        cpu_cv = stdDev cpu_l
314
        -- metrics: covariance of cpu, memory, disk and network load
315
        (c_load, m_load, d_load, n_load) = unzip4 $
316
            map (\n ->
317
                     let DynUtil c1 m1 d1 n1 = Node.utilLoad n
318
                         DynUtil c2 m2 d2 n2 = Node.utilPool n
319
                     in (c1/c2, m1/m2, d1/d2, n1/n2)
320
                ) nodes
321
        -- metric: conflicting instance count
322
        pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
323
        pri_tags_score = fromIntegral pri_tags_inst::Double
324
    in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
325
       , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
326
       , pri_tags_score ]
327

    
328
-- | Compute the /total/ variance.
329
compCVNodes :: [Node.Node] -> Double
330
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
331

    
332
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
333
compCV :: Node.List -> Double
334
compCV = compCVNodes . Container.elems
335

    
336
-- | Compute online nodes from a 'Node.List'.
337
getOnline :: Node.List -> [Node.Node]
338
getOnline = filter (not . Node.offline) . Container.elems
339

    
340
-- * Balancing functions
341

    
342
-- | Compute best table. Note that the ordering of the arguments is important.
343
compareTables :: Table -> Table -> Table
344
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
345
    if a_cv > b_cv then b else a
346

    
347
-- | Applies an instance move to a given node list and instance.
348
applyMove :: Node.List -> Instance.Instance
349
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
350
-- Failover (f)
351
applyMove nl inst Failover =
352
    let old_pdx = Instance.pNode inst
353
        old_sdx = Instance.sNode inst
354
        old_p = Container.find old_pdx nl
355
        old_s = Container.find old_sdx nl
356
        int_p = Node.removePri old_p inst
357
        int_s = Node.removeSec old_s inst
358
        force_p = Node.offline old_p
359
        new_nl = do -- Maybe monad
360
          new_p <- Node.addPriEx force_p int_s inst
361
          new_s <- Node.addSec int_p inst old_sdx
362
          let new_inst = Instance.setBoth inst old_sdx old_pdx
363
          return (Container.addTwo old_pdx new_s old_sdx new_p nl,
364
                  new_inst, old_sdx, old_pdx)
365
    in new_nl
366

    
367
-- Replace the primary (f:, r:np, f)
368
applyMove nl inst (ReplacePrimary new_pdx) =
369
    let old_pdx = Instance.pNode inst
370
        old_sdx = Instance.sNode inst
371
        old_p = Container.find old_pdx nl
372
        old_s = Container.find old_sdx nl
373
        tgt_n = Container.find new_pdx nl
374
        int_p = Node.removePri old_p inst
375
        int_s = Node.removeSec old_s inst
376
        force_p = Node.offline old_p
377
        new_nl = do -- Maybe monad
378
          -- check that the current secondary can host the instance
379
          -- during the migration
380
          tmp_s <- Node.addPriEx force_p int_s inst
381
          let tmp_s' = Node.removePri tmp_s inst
382
          new_p <- Node.addPriEx force_p tgt_n inst
383
          new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
384
          let new_inst = Instance.setPri inst new_pdx
385
          return (Container.add new_pdx new_p $
386
                  Container.addTwo old_pdx int_p old_sdx new_s nl,
387
                  new_inst, new_pdx, old_sdx)
388
    in new_nl
389

    
390
-- Replace the secondary (r:ns)
391
applyMove nl inst (ReplaceSecondary new_sdx) =
392
    let old_pdx = Instance.pNode inst
393
        old_sdx = Instance.sNode inst
394
        old_s = Container.find old_sdx nl
395
        tgt_n = Container.find new_sdx nl
396
        int_s = Node.removeSec old_s inst
397
        force_s = Node.offline old_s
398
        new_inst = Instance.setSec inst new_sdx
399
        new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
400
                 \new_s -> return (Container.addTwo new_sdx
401
                                   new_s old_sdx int_s nl,
402
                                   new_inst, old_pdx, new_sdx)
403
    in new_nl
404

    
405
-- Replace the secondary and failover (r:np, f)
406
applyMove nl inst (ReplaceAndFailover new_pdx) =
407
    let old_pdx = Instance.pNode inst
408
        old_sdx = Instance.sNode inst
409
        old_p = Container.find old_pdx nl
410
        old_s = Container.find old_sdx nl
411
        tgt_n = Container.find new_pdx nl
412
        int_p = Node.removePri old_p inst
413
        int_s = Node.removeSec old_s inst
414
        force_s = Node.offline old_s
415
        new_nl = do -- Maybe monad
416
          new_p <- Node.addPri tgt_n inst
417
          new_s <- Node.addSecEx force_s int_p inst new_pdx
418
          let new_inst = Instance.setBoth inst new_pdx old_pdx
419
          return (Container.add new_pdx new_p $
420
                  Container.addTwo old_pdx new_s old_sdx int_s nl,
421
                  new_inst, new_pdx, old_pdx)
422
    in new_nl
423

    
424
-- Failver and replace the secondary (f, r:ns)
425
applyMove nl inst (FailoverAndReplace new_sdx) =
426
    let old_pdx = Instance.pNode inst
427
        old_sdx = Instance.sNode inst
428
        old_p = Container.find old_pdx nl
429
        old_s = Container.find old_sdx nl
430
        tgt_n = Container.find new_sdx nl
431
        int_p = Node.removePri old_p inst
432
        int_s = Node.removeSec old_s inst
433
        force_p = Node.offline old_p
434
        new_nl = do -- Maybe monad
435
          new_p <- Node.addPriEx force_p int_s inst
436
          new_s <- Node.addSecEx force_p tgt_n inst old_sdx
437
          let new_inst = Instance.setBoth inst old_sdx new_sdx
438
          return (Container.add new_sdx new_s $
439
                  Container.addTwo old_sdx new_p old_pdx int_p nl,
440
                  new_inst, old_sdx, new_sdx)
441
    in new_nl
442

    
443
-- | Tries to allocate an instance on one given node.
444
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
445
                 -> OpResult Node.AllocElement
446
allocateOnSingle nl inst new_pdx =
447
    let p = Container.find new_pdx nl
448
        new_inst = Instance.setBoth inst new_pdx Node.noSecondary
449
    in  Node.addPri p inst >>= \new_p -> do
450
      let new_nl = Container.add new_pdx new_p nl
451
          new_score = compCV nl
452
      return (new_nl, new_inst, [new_p], new_score)
453

    
454
-- | Tries to allocate an instance on a given pair of nodes.
455
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
456
               -> OpResult Node.AllocElement
457
allocateOnPair nl inst new_pdx new_sdx =
458
    let tgt_p = Container.find new_pdx nl
459
        tgt_s = Container.find new_sdx nl
460
    in do
461
      new_p <- Node.addPri tgt_p inst
462
      new_s <- Node.addSec tgt_s inst new_pdx
463
      let new_inst = Instance.setBoth inst new_pdx new_sdx
464
          new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
465
      return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
466

    
467
-- | Tries to perform an instance move and returns the best table
468
-- between the original one and the new one.
469
checkSingleStep :: Table -- ^ The original table
470
                -> Instance.Instance -- ^ The instance to move
471
                -> Table -- ^ The current best table
472
                -> IMove -- ^ The move to apply
473
                -> Table -- ^ The final best table
474
checkSingleStep ini_tbl target cur_tbl move =
475
    let
476
        Table ini_nl ini_il _ ini_plc = ini_tbl
477
        tmp_resu = applyMove ini_nl target move
478
    in
479
      case tmp_resu of
480
        OpFail _ -> cur_tbl
481
        OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
482
            let tgt_idx = Instance.idx target
483
                upd_cvar = compCV upd_nl
484
                upd_il = Container.add tgt_idx new_inst ini_il
485
                upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
486
                upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
487
            in
488
              compareTables cur_tbl upd_tbl
489

    
490
-- | Given the status of the current secondary as a valid new node and
491
-- the current candidate target node, generate the possible moves for
492
-- a instance.
493
possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
494
              -> Bool      -- ^ Whether we can change the primary node
495
              -> Ndx       -- ^ Target node candidate
496
              -> [IMove]   -- ^ List of valid result moves
497

    
498
possibleMoves _ False tdx =
499
    [ReplaceSecondary tdx]
500

    
501
possibleMoves True True tdx =
502
    [ReplaceSecondary tdx,
503
     ReplaceAndFailover tdx,
504
     ReplacePrimary tdx,
505
     FailoverAndReplace tdx]
506

    
507
possibleMoves False True tdx =
508
    [ReplaceSecondary tdx,
509
     ReplaceAndFailover tdx]
510

    
511
-- | Compute the best move for a given instance.
512
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
513
                  -> Bool              -- ^ Whether disk moves are allowed
514
                  -> Bool              -- ^ Whether instance moves are allowed
515
                  -> Table             -- ^ Original table
516
                  -> Instance.Instance -- ^ Instance to move
517
                  -> Table             -- ^ Best new table for this instance
518
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
519
    let
520
        opdx = Instance.pNode target
521
        osdx = Instance.sNode target
522
        nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
523
        use_secondary = elem osdx nodes_idx && inst_moves
524
        aft_failover = if use_secondary -- if allowed to failover
525
                       then checkSingleStep ini_tbl target ini_tbl Failover
526
                       else ini_tbl
527
        all_moves = if disk_moves
528
                    then concatMap
529
                         (possibleMoves use_secondary inst_moves) nodes
530
                    else []
531
    in
532
      -- iterate over the possible nodes for this instance
533
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
534

    
535
-- | Compute the best next move.
536
checkMove :: [Ndx]               -- ^ Allowed target node indices
537
          -> Bool                -- ^ Whether disk moves are allowed
538
          -> Bool                -- ^ Whether instance moves are allowed
539
          -> Table               -- ^ The current solution
540
          -> [Instance.Instance] -- ^ List of instances still to move
541
          -> Table               -- ^ The new solution
542
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
543
    let Table _ _ _ ini_plc = ini_tbl
544
        -- we're using rwhnf from the Control.Parallel.Strategies
545
        -- package; we don't need to use rnf as that would force too
546
        -- much evaluation in single-threaded cases, and in
547
        -- multi-threaded case the weak head normal form is enough to
548
        -- spark the evaluation
549
        tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
550
                               inst_moves ini_tbl)
551
                 victims
552
        -- iterate over all instances, computing the best move
553
        best_tbl = foldl' compareTables ini_tbl tables
554
        Table _ _ _ best_plc = best_tbl
555
    in if length best_plc == length ini_plc
556
       then ini_tbl -- no advancement
557
       else best_tbl
558

    
559
-- | Check if we are allowed to go deeper in the balancing.
560
doNextBalance :: Table     -- ^ The starting table
561
              -> Int       -- ^ Remaining length
562
              -> Score     -- ^ Score at which to stop
563
              -> Bool      -- ^ The resulting table and commands
564
doNextBalance ini_tbl max_rounds min_score =
565
    let Table _ _ ini_cv ini_plc = ini_tbl
566
        ini_plc_len = length ini_plc
567
    in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
568

    
569
-- | Run a balance move.
570
tryBalance :: Table       -- ^ The starting table
571
           -> Bool        -- ^ Allow disk moves
572
           -> Bool        -- ^ Allow instance moves
573
           -> Bool        -- ^ Only evacuate moves
574
           -> Score       -- ^ Min gain threshold
575
           -> Score       -- ^ Min gain
576
           -> Maybe Table -- ^ The resulting table and commands
577
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
578
    let Table ini_nl ini_il ini_cv _ = ini_tbl
579
        all_inst = Container.elems ini_il
580
        all_inst' = if evac_mode
581
                    then let bad_nodes = map Node.idx . filter Node.offline $
582
                                         Container.elems ini_nl
583
                         in filter (any (`elem` bad_nodes) . Instance.allNodes)
584
                            all_inst
585
                    else all_inst
586
        reloc_inst = filter Instance.movable all_inst'
587
        node_idx = map Node.idx . filter (not . Node.offline) $
588
                   Container.elems ini_nl
589
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
590
        (Table _ _ fin_cv _) = fin_tbl
591
    in
592
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
593
      then Just fin_tbl -- this round made success, return the new table
594
      else Nothing
595

    
596
-- * Allocation functions
597

    
598
-- | Build failure stats out of a list of failures.
599
collapseFailures :: [FailMode] -> FailStats
600
collapseFailures flst =
601
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
602
            [minBound..maxBound]
603

    
604
-- | Compares two Maybe AllocElement and chooses the besst score.
605
bestAllocElement :: Maybe Node.AllocElement
606
                 -> Maybe Node.AllocElement
607
                 -> Maybe Node.AllocElement
608
bestAllocElement a Nothing = a
609
bestAllocElement Nothing b = b
610
bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
611
    if ascore < bscore then a else b
612

    
613
-- | Update current Allocation solution and failure stats with new
614
-- elements.
615
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
616
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
617

    
618
concatAllocs as (OpGood ns) =
619
    let -- Choose the old or new solution, based on the cluster score
620
        cntok = asAllocs as
621
        osols = asSolution as
622
        nsols = bestAllocElement osols (Just ns)
623
        nsuc = cntok + 1
624
    -- Note: we force evaluation of nsols here in order to keep the
625
    -- memory profile low - we know that we will need nsols for sure
626
    -- in the next cycle, so we force evaluation of nsols, since the
627
    -- foldl' in the caller will only evaluate the tuple, but not the
628
    -- elements of the tuple
629
    in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
630

    
631
-- | Sums two 'AllocSolution' structures.
632
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
633
sumAllocs (AllocSolution aFails aAllocs aSols aLog)
634
          (AllocSolution bFails bAllocs bSols bLog) =
635
    -- note: we add b first, since usually it will be smaller; when
636
    -- fold'ing, a will grow and grow whereas b is the per-group
637
    -- result, hence smaller
638
    let nFails  = bFails ++ aFails
639
        nAllocs = aAllocs + bAllocs
640
        nSols   = bestAllocElement aSols bSols
641
        nLog    = bLog ++ aLog
642
    in AllocSolution nFails nAllocs nSols nLog
643

    
644
-- | Given a solution, generates a reasonable description for it.
645
describeSolution :: AllocSolution -> String
646
describeSolution as =
647
  let fcnt = asFailures as
648
      sols = asSolution as
649
      freasons =
650
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
651
        filter ((> 0) . snd) . collapseFailures $ fcnt
652
  in case sols of
653
     Nothing -> "No valid allocation solutions, failure reasons: " ++
654
                (if null fcnt then "unknown reasons" else freasons)
655
     Just (_, _, nodes, cv) ->
656
         printf ("score: %.8f, successes %d, failures %d (%s)" ++
657
                 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
658
               (intercalate "/" . map Node.name $ nodes)
659

    
660
-- | Annotates a solution with the appropriate string.
661
annotateSolution :: AllocSolution -> AllocSolution
662
annotateSolution as = as { asLog = describeSolution as : asLog as }
663

    
664
-- | Reverses an evacuation solution.
665
--
666
-- Rationale: we always concat the results to the top of the lists, so
667
-- for proper jobset execution, we should reverse all lists.
668
reverseEvacSolution :: EvacSolution -> EvacSolution
669
reverseEvacSolution (EvacSolution f m o) =
670
    EvacSolution (reverse f) (reverse m) (reverse o)
671

    
672
-- | Generate the valid node allocation singles or pairs for a new instance.
673
genAllocNodes :: Group.List        -- ^ Group list
674
              -> Node.List         -- ^ The node map
675
              -> Int               -- ^ The number of nodes required
676
              -> Bool              -- ^ Whether to drop or not
677
                                   -- unallocable nodes
678
              -> Result AllocNodes -- ^ The (monadic) result
679
genAllocNodes gl nl count drop_unalloc =
680
    let filter_fn = if drop_unalloc
681
                    then filter (Group.isAllocable .
682
                                 flip Container.find gl . Node.group)
683
                    else id
684
        all_nodes = filter_fn $ getOnline nl
685
        all_pairs = [(Node.idx p,
686
                      [Node.idx s | s <- all_nodes,
687
                                         Node.idx p /= Node.idx s,
688
                                         Node.group p == Node.group s]) |
689
                     p <- all_nodes]
690
    in case count of
691
         1 -> Ok (Left (map Node.idx all_nodes))
692
         2 -> Ok (Right (filter (not . null . snd) all_pairs))
693
         _ -> Bad "Unsupported number of nodes, only one or two  supported"
694

    
695
-- | Try to allocate an instance on the cluster.
696
tryAlloc :: (Monad m) =>
697
            Node.List         -- ^ The node list
698
         -> Instance.List     -- ^ The instance list
699
         -> Instance.Instance -- ^ The instance to allocate
700
         -> AllocNodes        -- ^ The allocation targets
701
         -> m AllocSolution   -- ^ Possible solution list
702
tryAlloc nl _ inst (Right ok_pairs) =
703
    let psols = parMap rwhnf (\(p, ss) ->
704
                                  foldl' (\cstate ->
705
                                          concatAllocs cstate .
706
                                          allocateOnPair nl inst p)
707
                                  emptyAllocSolution ss) ok_pairs
708
        sols = foldl' sumAllocs emptyAllocSolution psols
709
    in if null ok_pairs -- means we have just one node
710
       then fail "Not enough online nodes"
711
       else return $ annotateSolution sols
712

    
713
tryAlloc nl _ inst (Left all_nodes) =
714
    let sols = foldl' (\cstate ->
715
                           concatAllocs cstate . allocateOnSingle nl inst
716
                      ) emptyAllocSolution all_nodes
717
    in if null all_nodes
718
       then fail "No online nodes"
719
       else return $ annotateSolution sols
720

    
721
-- | Given a group/result, describe it as a nice (list of) messages.
722
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
723
solutionDescription gl (groupId, result) =
724
  case result of
725
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
726
    Bad message -> [printf "Group %s: error %s" gname message]
727
  where grp = Container.find groupId gl
728
        gname = Group.name grp
729
        pol = apolToString (Group.allocPolicy grp)
730

    
731
-- | From a list of possibly bad and possibly empty solutions, filter
732
-- only the groups with a valid result. Note that the result will be
733
-- reversed compared to the original list.
734
filterMGResults :: Group.List
735
                -> [(Gdx, Result AllocSolution)]
736
                -> [(Gdx, AllocSolution)]
737
filterMGResults gl = foldl' fn []
738
    where unallocable = not . Group.isAllocable . flip Container.find gl
739
          fn accu (gdx, rasol) =
740
              case rasol of
741
                Bad _ -> accu
742
                Ok sol | isNothing (asSolution sol) -> accu
743
                       | unallocable gdx -> accu
744
                       | otherwise -> (gdx, sol):accu
745

    
746
-- | Sort multigroup results based on policy and score.
747
sortMGResults :: Group.List
748
             -> [(Gdx, AllocSolution)]
749
             -> [(Gdx, AllocSolution)]
750
sortMGResults gl sols =
751
    let extractScore (_, _, _, x) = x
752
        solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
753
                               (extractScore . fromJust . asSolution) sol)
754
    in sortBy (comparing solScore) sols
755

    
756
-- | Finds the best group for an instance on a multi-group cluster.
757
--
758
-- Only solutions in @preferred@ and @last_resort@ groups will be
759
-- accepted as valid, and additionally if the allowed groups parameter
760
-- is not null then allocation will only be run for those group
761
-- indices.
762
findBestAllocGroup :: Group.List           -- ^ The group list
763
                   -> Node.List            -- ^ The node list
764
                   -> Instance.List        -- ^ The instance list
765
                   -> Maybe [Gdx]          -- ^ The allowed groups
766
                   -> Instance.Instance    -- ^ The instance to allocate
767
                   -> Int                  -- ^ Required number of nodes
768
                   -> Result (Gdx, AllocSolution, [String])
769
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
770
  let groups = splitCluster mgnl mgil
771
      groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
772
                allowed_gdxs
773
      sols = map (\(gid, (nl, il)) ->
774
                   (gid, genAllocNodes mggl nl cnt False >>=
775
                       tryAlloc nl il inst))
776
             groups'::[(Gdx, Result AllocSolution)]
777
      all_msgs = concatMap (solutionDescription mggl) sols
778
      goodSols = filterMGResults mggl sols
779
      sortedSols = sortMGResults mggl goodSols
780
  in if null sortedSols
781
     then Bad $ intercalate ", " all_msgs
782
     else let (final_group, final_sol) = head sortedSols
783
          in return (final_group, final_sol, all_msgs)
784

    
785
-- | Try to allocate an instance on a multi-group cluster.
786
tryMGAlloc :: Group.List           -- ^ The group list
787
           -> Node.List            -- ^ The node list
788
           -> Instance.List        -- ^ The instance list
789
           -> Instance.Instance    -- ^ The instance to allocate
790
           -> Int                  -- ^ Required number of nodes
791
           -> Result AllocSolution -- ^ Possible solution list
792
tryMGAlloc mggl mgnl mgil inst cnt = do
793
  (best_group, solution, all_msgs) <-
794
      findBestAllocGroup mggl mgnl mgil Nothing inst cnt
795
  let group_name = Group.name $ Container.find best_group mggl
796
      selmsg = "Selected group: " ++ group_name
797
  return $ solution { asLog = selmsg:all_msgs }
798

    
799
-- | Try to relocate an instance on the cluster.
800
tryReloc :: (Monad m) =>
801
            Node.List       -- ^ The node list
802
         -> Instance.List   -- ^ The instance list
803
         -> Idx             -- ^ The index of the instance to move
804
         -> Int             -- ^ The number of nodes required
805
         -> [Ndx]           -- ^ Nodes which should not be used
806
         -> m AllocSolution -- ^ Solution list
807
tryReloc nl il xid 1 ex_idx =
808
    let all_nodes = getOnline nl
809
        inst = Container.find xid il
810
        ex_idx' = Instance.pNode inst:ex_idx
811
        valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
812
        valid_idxes = map Node.idx valid_nodes
813
        sols1 = foldl' (\cstate x ->
814
                            let em = do
815
                                  (mnl, i, _, _) <-
816
                                      applyMove nl inst (ReplaceSecondary x)
817
                                  return (mnl, i, [Container.find x mnl],
818
                                          compCV mnl)
819
                            in concatAllocs cstate em
820
                       ) emptyAllocSolution valid_idxes
821
    in return sols1
822

    
823
tryReloc _ _ _ reqn _  = fail $ "Unsupported number of relocation \
824
                                \destinations required (" ++ show reqn ++
825
                                                  "), only one supported"
826

    
827
-- | Function which fails if the requested mode is change secondary.
828
--
829
-- This is useful since except DRBD, no other disk template can
830
-- execute change secondary; thus, we can just call this function
831
-- instead of always checking for secondary mode. After the call to
832
-- this function, whatever mode we have is just a primary change.
833
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
834
failOnSecondaryChange ChangeSecondary dt =
835
    fail $ "Instances with disk template '" ++ dtToString dt ++
836
         "' can't execute change secondary"
837
failOnSecondaryChange _ _ = return ()
838

    
839
-- | Run evacuation for a single instance.
840
--
841
-- /Note:/ this function should correctly execute both intra-group
842
-- evacuations (in all modes) and inter-group evacuations (in the
843
-- 'ChangeAll' mode). Of course, this requires that the correct list
844
-- of target nodes is passed.
845
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
846
                 -> Instance.List     -- ^ Instance list (cluster-wide)
847
                 -> EvacMode          -- ^ The evacuation mode
848
                 -> Instance.Instance -- ^ The instance to be evacuated
849
                 -> Gdx               -- ^ The group we're targetting
850
                 -> [Ndx]             -- ^ The list of available nodes
851
                                      -- for allocation
852
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
853
nodeEvacInstance _ _ mode (Instance.Instance
854
                           {Instance.diskTemplate = dt@DTDiskless}) _ _ =
855
                  failOnSecondaryChange mode dt >>
856
                  fail "Diskless relocations not implemented yet"
857

    
858
nodeEvacInstance _ _ _ (Instance.Instance
859
                        {Instance.diskTemplate = DTPlain}) _ _ =
860
                  fail "Instances of type plain cannot be relocated"
861

    
862
nodeEvacInstance _ _ _ (Instance.Instance
863
                        {Instance.diskTemplate = DTFile}) _ _ =
864
                  fail "Instances of type file cannot be relocated"
865

    
866
nodeEvacInstance _ _ mode  (Instance.Instance
867
                            {Instance.diskTemplate = dt@DTSharedFile}) _ _ =
868
                  failOnSecondaryChange mode dt >>
869
                  fail "Shared file relocations not implemented yet"
870

    
871
nodeEvacInstance _ _ mode (Instance.Instance
872
                           {Instance.diskTemplate = dt@DTBlock}) _ _ =
873
                  failOnSecondaryChange mode dt >>
874
                  fail "Block device relocations not implemented yet"
875

    
876
nodeEvacInstance nl il ChangePrimary
877
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
878
                 _ _ =
879
  do
880
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
881
    let idx = Instance.idx inst
882
        il' = Container.add idx inst' il
883
        ops = iMoveToJob nl' il' idx Failover
884
    return (nl', il', ops)
885

    
886
nodeEvacInstance nl il ChangeSecondary
887
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
888
                 gdx avail_nodes =
889
  do
890
    (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
891
                            eitherToResult $
892
                            foldl' (evacDrbdSecondaryInner nl inst gdx)
893
                            (Left "no nodes available") avail_nodes
894
    let idx = Instance.idx inst
895
        il' = Container.add idx inst' il
896
        ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx)
897
    return (nl', il', ops)
898

    
899
-- The algorithm for ChangeAll is as follows:
900
--
901
-- * generate all (primary, secondary) node pairs for the target groups
902
-- * for each pair, execute the needed moves (r:s, f, r:s) and compute
903
--   the final node list state and group score
904
-- * select the best choice via a foldl that uses the same Either
905
--   String solution as the ChangeSecondary mode
906
nodeEvacInstance nl il ChangeAll
907
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
908
                 gdx avail_nodes =
909
  do
910
    let no_nodes = Left "no nodes available"
911
        node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
912
    (nl', il', ops, _) <-
913
        annotateResult "Can't find any good nodes for relocation" $
914
        eitherToResult $
915
        foldl'
916
        (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
917
                          Bad msg ->
918
                              case accu of
919
                                Right _ -> accu
920
                                -- we don't need more details (which
921
                                -- nodes, etc.) as we only selected
922
                                -- this group if we can allocate on
923
                                -- it, hence failures will not
924
                                -- propagate out of this fold loop
925
                                Left _ -> Left $ "Allocation failed: " ++ msg
926
                          Ok result@(_, _, _, new_cv) ->
927
                              let new_accu = Right result in
928
                              case accu of
929
                                Left _ -> new_accu
930
                                Right (_, _, _, old_cv) ->
931
                                    if old_cv < new_cv
932
                                    then accu
933
                                    else new_accu
934
        ) no_nodes node_pairs
935

    
936
    return (nl', il', ops)
937

    
938
-- | Inner fold function for changing secondary of a DRBD instance.
939
--
940
-- The running solution is either a @Left String@, which means we
941
-- don't have yet a working solution, or a @Right (...)@, which
942
-- represents a valid solution; it holds the modified node list, the
943
-- modified instance (after evacuation), the score of that solution,
944
-- and the new secondary node index.
945
evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list
946
                       -> Instance.Instance -- ^ Instance being evacuated
947
                       -> Gdx -- ^ The group index of the instance
948
                       -> Either String ( Node.List
949
                                        , Instance.Instance
950
                                        , Score
951
                                        , Ndx)  -- ^ Current best solution
952
                       -> Ndx  -- ^ Node we're evaluating as new secondary
953
                       -> Either String ( Node.List
954
                                        , Instance.Instance
955
                                        , Score
956
                                        , Ndx) -- ^ New best solution
957
evacDrbdSecondaryInner nl inst gdx accu ndx =
958
    case applyMove nl inst (ReplaceSecondary ndx) of
959
      OpFail fm ->
960
          case accu of
961
            Right _ -> accu
962
            Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
963
                      " failed: " ++ show fm
964
      OpGood (nl', inst', _, _) ->
965
          let nodes = Container.elems nl'
966
              -- The fromJust below is ugly (it can fail nastily), but
967
              -- at this point we should have any internal mismatches,
968
              -- and adding a monad here would be quite involved
969
              grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
970
              new_cv = compCVNodes grpnodes
971
              new_accu = Right (nl', inst', new_cv, ndx)
972
          in case accu of
973
               Left _ -> new_accu
974
               Right (_, _, old_cv, _) ->
975
                   if old_cv < new_cv
976
                   then accu
977
                   else new_accu
978

    
979
-- | Compute result of changing all nodes of a DRBD instance.
980
--
981
-- Given the target primary and secondary node (which might be in a
982
-- different group or not), this function will 'execute' all the
983
-- required steps and assuming all operations succceed, will return
984
-- the modified node and instance lists, the opcodes needed for this
985
-- and the new group score.
986
evacDrbdAllInner :: Node.List         -- ^ Cluster node list
987
                 -> Instance.List     -- ^ Cluster instance list
988
                 -> Instance.Instance -- ^ The instance to be moved
989
                 -> Gdx               -- ^ The target group index
990
                                      -- (which can differ from the
991
                                      -- current group of the
992
                                      -- instance)
993
                 -> (Ndx, Ndx)        -- ^ Tuple of new
994
                                      -- primary\/secondary nodes
995
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
996
evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) =
997
  do
998
    let primary = Container.find (Instance.pNode inst) nl
999
        idx = Instance.idx inst
1000
    -- if the primary is offline, then we first failover
1001
    (nl1, inst1, ops1) <-
1002
        if Node.offline primary
1003
        then do
1004
          (nl', inst', _, _) <-
1005
              annotateResult "Failing over to the secondary" $
1006
              opToResult $ applyMove nl inst Failover
1007
          return (nl', inst', [Failover])
1008
        else return (nl, inst, [])
1009
    let (o1, o2, o3) = (ReplaceSecondary t_pdx,
1010
                        Failover,
1011
                        ReplaceSecondary t_sdx)
1012
    -- we now need to execute a replace secondary to the future
1013
    -- primary node
1014
    (nl2, inst2, _, _) <-
1015
        annotateResult "Changing secondary to new primary" $
1016
        opToResult $
1017
        applyMove nl1 inst1 o1
1018
    let ops2 = o1:ops1
1019
    -- we now execute another failover, the primary stays fixed now
1020
    (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
1021
                          opToResult $ applyMove nl2 inst2 o2
1022
    let ops3 = o2:ops2
1023
    -- and finally another replace secondary, to the final secondary
1024
    (nl4, inst4, _, _) <-
1025
        annotateResult "Changing secondary to final secondary" $
1026
        opToResult $
1027
        applyMove nl3 inst3 o3
1028
    let ops4 = o3:ops3
1029
        il' = Container.add idx inst4 il
1030
        ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1031
    let nodes = Container.elems nl4
1032
        -- The fromJust below is ugly (it can fail nastily), but
1033
        -- at this point we should have any internal mismatches,
1034
        -- and adding a monad here would be quite involved
1035
        grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1036
        new_cv = compCVNodes grpnodes
1037
    return (nl4, il', ops, new_cv)
1038

    
1039
-- | Computes the nodes in a given group which are available for
1040
-- allocation.
1041
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1042
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
1043
                    -> Gdx            -- ^ The group for which we
1044
                                      -- query the nodes
1045
                    -> Result [Ndx]   -- ^ List of available node indices
1046
availableGroupNodes group_nodes excl_ndx gdx = do
1047
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1048
                 Ok (lookup gdx group_nodes)
1049
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1050
  return avail_nodes
1051

    
1052
-- | Updates the evac solution with the results of an instance
1053
-- evacuation.
1054
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1055
                   -> Idx
1056
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1057
                   -> (Node.List, Instance.List, EvacSolution)
1058
updateEvacSolution (nl, il, es) idx (Bad msg) =
1059
    (nl, il, es { esFailed = (idx, msg):esFailed es})
1060
updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1061
    (nl, il, es { esMoved = new_elem:esMoved es
1062
                , esOpCodes = [opcodes]:esOpCodes es })
1063
     where inst = Container.find idx il
1064
           new_elem = (idx,
1065
                       instancePriGroup nl inst,
1066
                       Instance.allNodes inst)
1067

    
1068
-- | Node-evacuation IAllocator mode main function.
1069
tryNodeEvac :: Group.List    -- ^ The cluster groups
1070
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
1071
            -> Instance.List -- ^ Instance list (cluster-wide)
1072
            -> EvacMode      -- ^ The evacuation mode
1073
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
1074
            -> Result (Node.List, Instance.List, EvacSolution)
1075
tryNodeEvac _ ini_nl ini_il mode idxs =
1076
    let evac_ndx = nodesToEvacuate ini_il mode idxs
1077
        offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1078
        excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1079
        group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1080
                                             (Container.elems nl))) $
1081
                      splitCluster ini_nl ini_il
1082
        (fin_nl, fin_il, esol) =
1083
            foldl' (\state@(nl, il, _) inst ->
1084
                        let gdx = instancePriGroup nl inst
1085
                            pdx = Instance.pNode inst in
1086
                        updateEvacSolution state (Instance.idx inst) $
1087
                        availableGroupNodes group_ndx
1088
                          (IntSet.insert pdx excl_ndx) gdx >>=
1089
                        nodeEvacInstance nl il mode inst gdx
1090
                   )
1091
            (ini_nl, ini_il, emptyEvacSolution)
1092
            (map (`Container.find` ini_il) idxs)
1093
    in return (fin_nl, fin_il, reverseEvacSolution esol)
1094

    
1095
-- | Change-group IAllocator mode main function.
1096
--
1097
-- This is very similar to 'tryNodeEvac', the only difference is that
1098
-- we don't choose as target group the current instance group, but
1099
-- instead:
1100
--
1101
--   1. at the start of the function, we compute which are the target
1102
--   groups; either no groups were passed in, in which case we choose
1103
--   all groups out of which we don't evacuate instance, or there were
1104
--   some groups passed, in which case we use those
1105
--
1106
--   2. for each instance, we use 'findBestAllocGroup' to choose the
1107
--   best group to hold the instance, and then we do what
1108
--   'tryNodeEvac' does, except for this group instead of the current
1109
--   instance group.
1110
--
1111
-- Note that the correct behaviour of this function relies on the
1112
-- function 'nodeEvacInstance' to be able to do correctly both
1113
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1114
tryChangeGroup :: Group.List    -- ^ The cluster groups
1115
               -> Node.List     -- ^ The node list (cluster-wide)
1116
               -> Instance.List -- ^ Instance list (cluster-wide)
1117
               -> [Gdx]         -- ^ Target groups; if empty, any
1118
                                -- groups not being evacuated
1119
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
1120
               -> Result (Node.List, Instance.List, EvacSolution)
1121
tryChangeGroup gl ini_nl ini_il gdxs idxs =
1122
    let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1123
                               flip Container.find ini_il) idxs
1124
        target_gdxs = (if null gdxs
1125
                       then Container.keys gl
1126
                       else gdxs) \\ evac_gdxs
1127
        offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1128
        excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1129
        group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1130
                                             (Container.elems nl))) $
1131
                      splitCluster ini_nl ini_il
1132
        (fin_nl, fin_il, esol) =
1133
            foldl' (\state@(nl, il, _) inst ->
1134
                        let solution = do
1135
                              let ncnt = Instance.requiredNodes $
1136
                                         Instance.diskTemplate inst
1137
                              (gdx, _, _) <- findBestAllocGroup gl nl il
1138
                                             (Just target_gdxs) inst ncnt
1139
                              av_nodes <- availableGroupNodes group_ndx
1140
                                          excl_ndx gdx
1141
                              nodeEvacInstance nl il ChangeAll inst
1142
                                       gdx av_nodes
1143
                        in updateEvacSolution state
1144
                               (Instance.idx inst) solution
1145
                   )
1146
            (ini_nl, ini_il, emptyEvacSolution)
1147
            (map (`Container.find` ini_il) idxs)
1148
    in return (fin_nl, fin_il, reverseEvacSolution esol)
1149

    
1150
-- | Recursively place instances on the cluster until we're out of space.
1151
iterateAlloc :: Node.List
1152
             -> Instance.List
1153
             -> Maybe Int
1154
             -> Instance.Instance
1155
             -> AllocNodes
1156
             -> [Instance.Instance]
1157
             -> [CStats]
1158
             -> Result AllocResult
1159
iterateAlloc nl il limit newinst allocnodes ixes cstats =
1160
      let depth = length ixes
1161
          newname = printf "new-%d" depth::String
1162
          newidx = length (Container.elems il) + depth
1163
          newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1164
          newlimit = fmap (flip (-) 1) limit
1165
      in case tryAlloc nl il newi2 allocnodes of
1166
           Bad s -> Bad s
1167
           Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
1168
               let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1169
               case sols3 of
1170
                 Nothing -> newsol
1171
                 Just (xnl, xi, _, _) ->
1172
                     if limit == Just 0
1173
                     then newsol
1174
                     else iterateAlloc xnl (Container.add newidx xi il)
1175
                          newlimit newinst allocnodes (xi:ixes)
1176
                          (totalResources xnl:cstats)
1177

    
1178
-- | The core of the tiered allocation mode.
1179
tieredAlloc :: Node.List
1180
            -> Instance.List
1181
            -> Maybe Int
1182
            -> Instance.Instance
1183
            -> AllocNodes
1184
            -> [Instance.Instance]
1185
            -> [CStats]
1186
            -> Result AllocResult
1187
tieredAlloc nl il limit newinst allocnodes ixes cstats =
1188
    case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1189
      Bad s -> Bad s
1190
      Ok (errs, nl', il', ixes', cstats') ->
1191
          let newsol = Ok (errs, nl', il', ixes', cstats')
1192
              ixes_cnt = length ixes'
1193
              (stop, newlimit) = case limit of
1194
                                   Nothing -> (False, Nothing)
1195
                                   Just n -> (n <= ixes_cnt,
1196
                                              Just (n - ixes_cnt)) in
1197
          if stop then newsol else
1198
          case Instance.shrinkByType newinst . fst . last $
1199
               sortBy (comparing snd) errs of
1200
            Bad _ -> newsol
1201
            Ok newinst' -> tieredAlloc nl' il' newlimit
1202
                           newinst' allocnodes ixes' cstats'
1203

    
1204
-- * Formatting functions
1205

    
1206
-- | Given the original and final nodes, computes the relocation description.
1207
computeMoves :: Instance.Instance -- ^ The instance to be moved
1208
             -> String -- ^ The instance name
1209
             -> IMove  -- ^ The move being performed
1210
             -> String -- ^ New primary
1211
             -> String -- ^ New secondary
1212
             -> (String, [String])
1213
                -- ^ Tuple of moves and commands list; moves is containing
1214
                -- either @/f/@ for failover or @/r:name/@ for replace
1215
                -- secondary, while the command list holds gnt-instance
1216
                -- commands (without that prefix), e.g \"@failover instance1@\"
1217
computeMoves i inam mv c d =
1218
    case mv of
1219
      Failover -> ("f", [mig])
1220
      FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1221
      ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1222
      ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1223
      ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1224
    where morf = if Instance.running i then "migrate" else "failover"
1225
          mig = printf "%s -f %s" morf inam::String
1226
          rep n = printf "replace-disks -n %s %s" n inam
1227

    
1228
-- | Converts a placement to string format.
1229
printSolutionLine :: Node.List     -- ^ The node list
1230
                  -> Instance.List -- ^ The instance list
1231
                  -> Int           -- ^ Maximum node name length
1232
                  -> Int           -- ^ Maximum instance name length
1233
                  -> Placement     -- ^ The current placement
1234
                  -> Int           -- ^ The index of the placement in
1235
                                   -- the solution
1236
                  -> (String, [String])
1237
printSolutionLine nl il nmlen imlen plc pos =
1238
    let
1239
        pmlen = (2*nmlen + 1)
1240
        (i, p, s, mv, c) = plc
1241
        inst = Container.find i il
1242
        inam = Instance.alias inst
1243
        npri = Node.alias $ Container.find p nl
1244
        nsec = Node.alias $ Container.find s nl
1245
        opri = Node.alias $ Container.find (Instance.pNode inst) nl
1246
        osec = Node.alias $ Container.find (Instance.sNode inst) nl
1247
        (moves, cmds) =  computeMoves inst inam mv npri nsec
1248
        ostr = printf "%s:%s" opri osec::String
1249
        nstr = printf "%s:%s" npri nsec::String
1250
    in
1251
      (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
1252
       pos imlen inam pmlen ostr
1253
       pmlen nstr c moves,
1254
       cmds)
1255

    
1256
-- | Return the instance and involved nodes in an instance move.
1257
--
1258
-- Note that the output list length can vary, and is not required nor
1259
-- guaranteed to be of any specific length.
1260
involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
1261
                               -- the instance from its index; note
1262
                               -- that this /must/ be the original
1263
                               -- instance list, so that we can
1264
                               -- retrieve the old nodes
1265
              -> Placement     -- ^ The placement we're investigating,
1266
                               -- containing the new nodes and
1267
                               -- instance index
1268
              -> [Ndx]         -- ^ Resulting list of node indices
1269
involvedNodes il plc =
1270
    let (i, np, ns, _, _) = plc
1271
        inst = Container.find i il
1272
    in nub $ [np, ns] ++ Instance.allNodes inst
1273

    
1274
-- | Inner function for splitJobs, that either appends the next job to
1275
-- the current jobset, or starts a new jobset.
1276
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1277
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1278
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1279
    | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1280
    | otherwise = ([n]:cjs, ndx)
1281

    
1282
-- | Break a list of moves into independent groups. Note that this
1283
-- will reverse the order of jobs.
1284
splitJobs :: [MoveJob] -> [JobSet]
1285
splitJobs = fst . foldl mergeJobs ([], [])
1286

    
1287
-- | Given a list of commands, prefix them with @gnt-instance@ and
1288
-- also beautify the display a little.
1289
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1290
formatJob jsn jsl (sn, (_, _, _, cmds)) =
1291
    let out =
1292
            printf "  echo job %d/%d" jsn sn:
1293
            printf "  check":
1294
            map ("  gnt-instance " ++) cmds
1295
    in if sn == 1
1296
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1297
       else out
1298

    
1299
-- | Given a list of commands, prefix them with @gnt-instance@ and
1300
-- also beautify the display a little.
1301
formatCmds :: [JobSet] -> String
1302
formatCmds =
1303
    unlines .
1304
    concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1305
                             (zip [1..] js)) .
1306
    zip [1..]
1307

    
1308
-- | Print the node list.
1309
printNodes :: Node.List -> [String] -> String
1310
printNodes nl fs =
1311
    let fields = case fs of
1312
          [] -> Node.defaultFields
1313
          "+":rest -> Node.defaultFields ++ rest
1314
          _ -> fs
1315
        snl = sortBy (comparing Node.idx) (Container.elems nl)
1316
        (header, isnum) = unzip $ map Node.showHeader fields
1317
    in unlines . map ((:) ' ' .  intercalate " ") $
1318
       formatTable (header:map (Node.list fields) snl) isnum
1319

    
1320
-- | Print the instance list.
1321
printInsts :: Node.List -> Instance.List -> String
1322
printInsts nl il =
1323
    let sil = sortBy (comparing Instance.idx) (Container.elems il)
1324
        helper inst = [ if Instance.running inst then "R" else " "
1325
                      , Instance.name inst
1326
                      , Container.nameOf nl (Instance.pNode inst)
1327
                      , let sdx = Instance.sNode inst
1328
                        in if sdx == Node.noSecondary
1329
                           then  ""
1330
                           else Container.nameOf nl sdx
1331
                      , if Instance.autoBalance inst then "Y" else "N"
1332
                      , printf "%3d" $ Instance.vcpus inst
1333
                      , printf "%5d" $ Instance.mem inst
1334
                      , printf "%5d" $ Instance.dsk inst `div` 1024
1335
                      , printf "%5.3f" lC
1336
                      , printf "%5.3f" lM
1337
                      , printf "%5.3f" lD
1338
                      , printf "%5.3f" lN
1339
                      ]
1340
            where DynUtil lC lM lD lN = Instance.util inst
1341
        header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1342
                 , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1343
        isnum = False:False:False:False:False:repeat True
1344
    in unlines . map ((:) ' ' . intercalate " ") $
1345
       formatTable (header:map helper sil) isnum
1346

    
1347
-- | Shows statistics for a given node list.
1348
printStats :: Node.List -> String
1349
printStats nl =
1350
    let dcvs = compDetailedCV $ Container.elems nl
1351
        (weights, names) = unzip detailedCVInfo
1352
        hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1353
        formatted = map (\(w, header, val) ->
1354
                             printf "%s=%.8f(x%.2f)" header val w::String) hd
1355
    in intercalate ", " formatted
1356

    
1357
-- | Convert a placement into a list of OpCodes (basically a job).
1358
iMoveToJob :: Node.List        -- ^ The node list; only used for node
1359
                               -- names, so any version is good
1360
                               -- (before or after the operation)
1361
           -> Instance.List    -- ^ The instance list; also used for
1362
                               -- names only
1363
           -> Idx              -- ^ The index of the instance being
1364
                               -- moved
1365
           -> IMove            -- ^ The actual move to be described
1366
           -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
1367
                               -- the given move
1368
iMoveToJob nl il idx move =
1369
    let inst = Container.find idx il
1370
        iname = Instance.name inst
1371
        lookNode  = Just . Container.nameOf nl
1372
        opF = OpCodes.OpInstanceMigrate iname True False True Nothing
1373
        opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1374
                OpCodes.ReplaceNewSecondary [] Nothing
1375
    in case move of
1376
         Failover -> [ opF ]
1377
         ReplacePrimary np -> [ opF, opR np, opF ]
1378
         ReplaceSecondary ns -> [ opR ns ]
1379
         ReplaceAndFailover np -> [ opR np, opF ]
1380
         FailoverAndReplace ns -> [ opF, opR ns ]
1381

    
1382
-- * Node group functions
1383

    
1384
-- | Computes the group of an instance.
1385
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1386
instanceGroup nl i =
1387
  let sidx = Instance.sNode i
1388
      pnode = Container.find (Instance.pNode i) nl
1389
      snode = if sidx == Node.noSecondary
1390
              then pnode
1391
              else Container.find sidx nl
1392
      pgroup = Node.group pnode
1393
      sgroup = Node.group snode
1394
  in if pgroup /= sgroup
1395
     then fail ("Instance placed accross two node groups, primary " ++
1396
                show pgroup ++ ", secondary " ++ show sgroup)
1397
     else return pgroup
1398

    
1399
-- | Computes the group of an instance per the primary node.
1400
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1401
instancePriGroup nl i =
1402
  let pnode = Container.find (Instance.pNode i) nl
1403
  in  Node.group pnode
1404

    
1405
-- | Compute the list of badly allocated instances (split across node
1406
-- groups).
1407
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1408
findSplitInstances nl =
1409
  filter (not . isOk . instanceGroup nl) . Container.elems
1410

    
1411
-- | Splits a cluster into the component node groups.
1412
splitCluster :: Node.List -> Instance.List ->
1413
                [(Gdx, (Node.List, Instance.List))]
1414
splitCluster nl il =
1415
  let ngroups = Node.computeGroups (Container.elems nl)
1416
  in map (\(guuid, nodes) ->
1417
           let nidxs = map Node.idx nodes
1418
               nodes' = zip nidxs nodes
1419
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1420
           in (guuid, (Container.fromList nodes', instances))) ngroups
1421

    
1422
-- | Compute the list of nodes that are to be evacuated, given a list
1423
-- of instances and an evacuation mode.
1424
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1425
                -> EvacMode      -- ^ The evacuation mode we're using
1426
                -> [Idx]         -- ^ List of instance indices being evacuated
1427
                -> IntSet.IntSet -- ^ Set of node indices
1428
nodesToEvacuate il mode =
1429
    IntSet.delete Node.noSecondary .
1430
    foldl' (\ns idx ->
1431
                let i = Container.find idx il
1432
                    pdx = Instance.pNode i
1433
                    sdx = Instance.sNode i
1434
                    dt = Instance.diskTemplate i
1435
                    withSecondary = case dt of
1436
                                      DTDrbd8 -> IntSet.insert sdx ns
1437
                                      _ -> ns
1438
                in case mode of
1439
                     ChangePrimary   -> IntSet.insert pdx ns
1440
                     ChangeSecondary -> withSecondary
1441
                     ChangeAll       -> IntSet.insert pdx withSecondary
1442
           ) IntSet.empty