Statistics
| Branch: | Tag: | Revision:

root / htools / Ganeti / HTools / Cluster.hs @ 756df409

History | View | Annotate | Download (60.1 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
    (
31
     -- * Types
32
      AllocSolution(..)
33
    , EvacSolution(..)
34
    , Table(..)
35
    , CStats(..)
36
    , AllocStats
37
    -- * Generic functions
38
    , totalResources
39
    , computeAllocationDelta
40
    -- * First phase functions
41
    , computeBadItems
42
    -- * Second phase functions
43
    , printSolutionLine
44
    , formatCmds
45
    , involvedNodes
46
    , splitJobs
47
    -- * Display functions
48
    , printNodes
49
    , printInsts
50
    -- * Balacing functions
51
    , checkMove
52
    , doNextBalance
53
    , tryBalance
54
    , compCV
55
    , compCVNodes
56
    , compDetailedCV
57
    , printStats
58
    , iMoveToJob
59
    -- * IAllocator functions
60
    , genAllocNodes
61
    , tryAlloc
62
    , tryMGAlloc
63
    , tryReloc
64
    , tryEvac
65
    , tryNodeEvac
66
    , tryChangeGroup
67
    , collapseFailures
68
    -- * Allocation functions
69
    , iterateAlloc
70
    , tieredAlloc
71
     -- * Node group functions
72
    , instanceGroup
73
    , findSplitInstances
74
    , splitCluster
75
    ) where
76

    
77
import qualified Data.IntSet as IntSet
78
import Data.List
79
import Data.Maybe (fromJust)
80
import Data.Ord (comparing)
81
import Text.Printf (printf)
82
import Control.Monad
83

    
84
import qualified Ganeti.HTools.Container as Container
85
import qualified Ganeti.HTools.Instance as Instance
86
import qualified Ganeti.HTools.Node as Node
87
import qualified Ganeti.HTools.Group as Group
88
import Ganeti.HTools.Types
89
import Ganeti.HTools.Utils
90
import Ganeti.HTools.Compat
91
import qualified Ganeti.OpCodes as OpCodes
92

    
93
-- * Types
94

    
95
-- | Allocation\/relocation solution.
96
data AllocSolution = AllocSolution
97
  { asFailures  :: [FailMode]          -- ^ Failure counts
98
  , asAllocs    :: Int                 -- ^ Good allocation count
99
  , asSolutions :: [Node.AllocElement] -- ^ The actual result, length
100
                                       -- of the list depends on the
101
                                       -- allocation/relocation mode
102
  , asLog       :: [String]            -- ^ A list of informational messages
103
  }
104

    
105
-- | Node evacuation/group change iallocator result type. This result
106
-- type consists of actual opcodes (a restricted subset) that are
107
-- transmitted back to Ganeti.
108
data EvacSolution = EvacSolution
109
    { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
110
    , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
111
                                        -- relocated
112
    , esOpCodes :: [[[OpCodes.OpCode]]] -- ^ List of lists of jobs
113
    }
114

    
115
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
116
type AllocResult = (FailStats, Node.List, Instance.List,
117
                    [Instance.Instance], [CStats])
118

    
119
-- | A type denoting the valid allocation mode/pairs.
120
--
121
-- For a one-node allocation, this will be a @Left ['Node.Node']@,
122
-- whereas for a two-node allocation, this will be a @Right
123
-- [('Node.Node', 'Node.Node')]@.
124
type AllocNodes = Either [Ndx] [(Ndx, Ndx)]
125

    
126
-- | The empty solution we start with when computing allocations.
127
emptyAllocSolution :: AllocSolution
128
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
129
                                   , asSolutions = [], asLog = [] }
130

    
131
-- | The empty evac solution.
132
emptyEvacSolution :: EvacSolution
133
emptyEvacSolution = EvacSolution { esMoved = []
134
                                 , esFailed = []
135
                                 , esOpCodes = []
136
                                 }
137

    
138
-- | The complete state for the balancing solution.
139
data Table = Table Node.List Instance.List Score [Placement]
140
             deriving (Show, Read)
141

    
142
data CStats = CStats { csFmem :: Integer -- ^ Cluster free mem
143
                     , csFdsk :: Integer -- ^ Cluster free disk
144
                     , csAmem :: Integer -- ^ Cluster allocatable mem
145
                     , csAdsk :: Integer -- ^ Cluster allocatable disk
146
                     , csAcpu :: Integer -- ^ Cluster allocatable cpus
147
                     , csMmem :: Integer -- ^ Max node allocatable mem
148
                     , csMdsk :: Integer -- ^ Max node allocatable disk
149
                     , csMcpu :: Integer -- ^ Max node allocatable cpu
150
                     , csImem :: Integer -- ^ Instance used mem
151
                     , csIdsk :: Integer -- ^ Instance used disk
152
                     , csIcpu :: Integer -- ^ Instance used cpu
153
                     , csTmem :: Double  -- ^ Cluster total mem
154
                     , csTdsk :: Double  -- ^ Cluster total disk
155
                     , csTcpu :: Double  -- ^ Cluster total cpus
156
                     , csVcpu :: Integer -- ^ Cluster virtual cpus (if
157
                                         -- node pCpu has been set,
158
                                         -- otherwise -1)
159
                     , csXmem :: Integer -- ^ Unnacounted for mem
160
                     , csNmem :: Integer -- ^ Node own memory
161
                     , csScore :: Score  -- ^ The cluster score
162
                     , csNinst :: Int    -- ^ The total number of instances
163
                     }
164
            deriving (Show, Read)
165

    
166
-- | Currently used, possibly to allocate, unallocable.
167
type AllocStats = (RSpec, RSpec, RSpec)
168

    
169
-- * Utility functions
170

    
171
-- | Verifies the N+1 status and return the affected nodes.
172
verifyN1 :: [Node.Node] -> [Node.Node]
173
verifyN1 = filter Node.failN1
174

    
175
{-| Computes the pair of bad nodes and instances.
176

    
177
The bad node list is computed via a simple 'verifyN1' check, and the
178
bad instance list is the list of primary and secondary instances of
179
those nodes.
180

    
181
-}
182
computeBadItems :: Node.List -> Instance.List ->
183
                   ([Node.Node], [Instance.Instance])
184
computeBadItems nl il =
185
  let bad_nodes = verifyN1 $ getOnline nl
186
      bad_instances = map (`Container.find` il) .
187
                      sort . nub $
188
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
189
  in
190
    (bad_nodes, bad_instances)
191

    
192
-- | Zero-initializer for the CStats type.
193
emptyCStats :: CStats
194
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
195

    
196
-- | Update stats with data from a new node.
197
updateCStats :: CStats -> Node.Node -> CStats
198
updateCStats cs node =
199
    let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
200
                 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
201
                 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
202
                 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
203
                 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
204
                 csVcpu = x_vcpu,
205
                 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
206
               }
207
            = cs
208
        inc_amem = Node.fMem node - Node.rMem node
209
        inc_amem' = if inc_amem > 0 then inc_amem else 0
210
        inc_adsk = Node.availDisk node
211
        inc_imem = truncate (Node.tMem node) - Node.nMem node
212
                   - Node.xMem node - Node.fMem node
213
        inc_icpu = Node.uCpu node
214
        inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
215
        inc_vcpu = Node.hiCpu node
216
        inc_acpu = Node.availCpu node
217

    
218
    in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
219
          , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
220
          , csAmem = x_amem + fromIntegral inc_amem'
221
          , csAdsk = x_adsk + fromIntegral inc_adsk
222
          , csAcpu = x_acpu + fromIntegral inc_acpu
223
          , csMmem = max x_mmem (fromIntegral inc_amem')
224
          , csMdsk = max x_mdsk (fromIntegral inc_adsk)
225
          , csMcpu = max x_mcpu (fromIntegral inc_acpu)
226
          , csImem = x_imem + fromIntegral inc_imem
227
          , csIdsk = x_idsk + fromIntegral inc_idsk
228
          , csIcpu = x_icpu + fromIntegral inc_icpu
229
          , csTmem = x_tmem + Node.tMem node
230
          , csTdsk = x_tdsk + Node.tDsk node
231
          , csTcpu = x_tcpu + Node.tCpu node
232
          , csVcpu = x_vcpu + fromIntegral inc_vcpu
233
          , csXmem = x_xmem + fromIntegral (Node.xMem node)
234
          , csNmem = x_nmem + fromIntegral (Node.nMem node)
235
          , csNinst = x_ninst + length (Node.pList node)
236
          }
237

    
238
-- | Compute the total free disk and memory in the cluster.
239
totalResources :: Node.List -> CStats
240
totalResources nl =
241
    let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
242
    in cs { csScore = compCV nl }
243

    
244
-- | Compute the delta between two cluster state.
245
--
246
-- This is used when doing allocations, to understand better the
247
-- available cluster resources. The return value is a triple of the
248
-- current used values, the delta that was still allocated, and what
249
-- was left unallocated.
250
computeAllocationDelta :: CStats -> CStats -> AllocStats
251
computeAllocationDelta cini cfin =
252
    let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
253
        CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
254
                csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
255
        rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem)
256
               (fromIntegral i_idsk)
257
        rfin = RSpec (fromIntegral (f_icpu - i_icpu))
258
               (fromIntegral (f_imem - i_imem))
259
               (fromIntegral (f_idsk - i_idsk))
260
        un_cpu = fromIntegral (v_cpu - f_icpu)::Int
261
        runa = RSpec un_cpu (truncate t_mem - fromIntegral f_imem)
262
               (truncate t_dsk - fromIntegral f_idsk)
263
    in (rini, rfin, runa)
264

    
265
-- | The names and weights of the individual elements in the CV list.
266
detailedCVInfo :: [(Double, String)]
267
detailedCVInfo = [ (1,  "free_mem_cv")
268
                 , (1,  "free_disk_cv")
269
                 , (1,  "n1_cnt")
270
                 , (1,  "reserved_mem_cv")
271
                 , (4,  "offline_all_cnt")
272
                 , (16, "offline_pri_cnt")
273
                 , (1,  "vcpu_ratio_cv")
274
                 , (1,  "cpu_load_cv")
275
                 , (1,  "mem_load_cv")
276
                 , (1,  "disk_load_cv")
277
                 , (1,  "net_load_cv")
278
                 , (2,  "pri_tags_score")
279
                 ]
280

    
281
detailedCVWeights :: [Double]
282
detailedCVWeights = map fst detailedCVInfo
283

    
284
-- | Compute the mem and disk covariance.
285
compDetailedCV :: [Node.Node] -> [Double]
286
compDetailedCV all_nodes =
287
    let
288
        (offline, nodes) = partition Node.offline all_nodes
289
        mem_l = map Node.pMem nodes
290
        dsk_l = map Node.pDsk nodes
291
        -- metric: memory covariance
292
        mem_cv = stdDev mem_l
293
        -- metric: disk covariance
294
        dsk_cv = stdDev dsk_l
295
        -- metric: count of instances living on N1 failing nodes
296
        n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
297
                                                   length (Node.pList n)) .
298
                   filter Node.failN1 $ nodes :: Double
299
        res_l = map Node.pRem nodes
300
        -- metric: reserved memory covariance
301
        res_cv = stdDev res_l
302
        -- offline instances metrics
303
        offline_ipri = sum . map (length . Node.pList) $ offline
304
        offline_isec = sum . map (length . Node.sList) $ offline
305
        -- metric: count of instances on offline nodes
306
        off_score = fromIntegral (offline_ipri + offline_isec)::Double
307
        -- metric: count of primary instances on offline nodes (this
308
        -- helps with evacuation/failover of primary instances on
309
        -- 2-node clusters with one node offline)
310
        off_pri_score = fromIntegral offline_ipri::Double
311
        cpu_l = map Node.pCpu nodes
312
        -- metric: covariance of vcpu/pcpu ratio
313
        cpu_cv = stdDev cpu_l
314
        -- metrics: covariance of cpu, memory, disk and network load
315
        (c_load, m_load, d_load, n_load) = unzip4 $
316
            map (\n ->
317
                     let DynUtil c1 m1 d1 n1 = Node.utilLoad n
318
                         DynUtil c2 m2 d2 n2 = Node.utilPool n
319
                     in (c1/c2, m1/m2, d1/d2, n1/n2)
320
                ) nodes
321
        -- metric: conflicting instance count
322
        pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
323
        pri_tags_score = fromIntegral pri_tags_inst::Double
324
    in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
325
       , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
326
       , pri_tags_score ]
327

    
328
-- | Compute the /total/ variance.
329
compCVNodes :: [Node.Node] -> Double
330
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
331

    
332
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
333
compCV :: Node.List -> Double
334
compCV = compCVNodes . Container.elems
335

    
336

    
337
-- | Compute online nodes from a 'Node.List'.
338
getOnline :: Node.List -> [Node.Node]
339
getOnline = filter (not . Node.offline) . Container.elems
340

    
341
-- * Balancing functions
342

    
343
-- | Compute best table. Note that the ordering of the arguments is important.
344
compareTables :: Table -> Table -> Table
345
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
346
    if a_cv > b_cv then b else a
347

    
348
-- | Applies an instance move to a given node list and instance.
349
applyMove :: Node.List -> Instance.Instance
350
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
351
-- Failover (f)
352
applyMove nl inst Failover =
353
    let old_pdx = Instance.pNode inst
354
        old_sdx = Instance.sNode inst
355
        old_p = Container.find old_pdx nl
356
        old_s = Container.find old_sdx nl
357
        int_p = Node.removePri old_p inst
358
        int_s = Node.removeSec old_s inst
359
        force_p = Node.offline old_p
360
        new_nl = do -- Maybe monad
361
          new_p <- Node.addPriEx force_p int_s inst
362
          new_s <- Node.addSec int_p inst old_sdx
363
          let new_inst = Instance.setBoth inst old_sdx old_pdx
364
          return (Container.addTwo old_pdx new_s old_sdx new_p nl,
365
                  new_inst, old_sdx, old_pdx)
366
    in new_nl
367

    
368
-- Replace the primary (f:, r:np, f)
369
applyMove nl inst (ReplacePrimary new_pdx) =
370
    let old_pdx = Instance.pNode inst
371
        old_sdx = Instance.sNode inst
372
        old_p = Container.find old_pdx nl
373
        old_s = Container.find old_sdx nl
374
        tgt_n = Container.find new_pdx nl
375
        int_p = Node.removePri old_p inst
376
        int_s = Node.removeSec old_s inst
377
        force_p = Node.offline old_p
378
        new_nl = do -- Maybe monad
379
          -- check that the current secondary can host the instance
380
          -- during the migration
381
          tmp_s <- Node.addPriEx force_p int_s inst
382
          let tmp_s' = Node.removePri tmp_s inst
383
          new_p <- Node.addPriEx force_p tgt_n inst
384
          new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
385
          let new_inst = Instance.setPri inst new_pdx
386
          return (Container.add new_pdx new_p $
387
                  Container.addTwo old_pdx int_p old_sdx new_s nl,
388
                  new_inst, new_pdx, old_sdx)
389
    in new_nl
390

    
391
-- Replace the secondary (r:ns)
392
applyMove nl inst (ReplaceSecondary new_sdx) =
393
    let old_pdx = Instance.pNode inst
394
        old_sdx = Instance.sNode inst
395
        old_s = Container.find old_sdx nl
396
        tgt_n = Container.find new_sdx nl
397
        int_s = Node.removeSec old_s inst
398
        force_s = Node.offline old_s
399
        new_inst = Instance.setSec inst new_sdx
400
        new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
401
                 \new_s -> return (Container.addTwo new_sdx
402
                                   new_s old_sdx int_s nl,
403
                                   new_inst, old_pdx, new_sdx)
404
    in new_nl
405

    
406
-- Replace the secondary and failover (r:np, f)
407
applyMove nl inst (ReplaceAndFailover new_pdx) =
408
    let old_pdx = Instance.pNode inst
409
        old_sdx = Instance.sNode inst
410
        old_p = Container.find old_pdx nl
411
        old_s = Container.find old_sdx nl
412
        tgt_n = Container.find new_pdx nl
413
        int_p = Node.removePri old_p inst
414
        int_s = Node.removeSec old_s inst
415
        force_s = Node.offline old_s
416
        new_nl = do -- Maybe monad
417
          new_p <- Node.addPri tgt_n inst
418
          new_s <- Node.addSecEx force_s int_p inst new_pdx
419
          let new_inst = Instance.setBoth inst new_pdx old_pdx
420
          return (Container.add new_pdx new_p $
421
                  Container.addTwo old_pdx new_s old_sdx int_s nl,
422
                  new_inst, new_pdx, old_pdx)
423
    in new_nl
424

    
425
-- Failver and replace the secondary (f, r:ns)
426
applyMove nl inst (FailoverAndReplace new_sdx) =
427
    let old_pdx = Instance.pNode inst
428
        old_sdx = Instance.sNode inst
429
        old_p = Container.find old_pdx nl
430
        old_s = Container.find old_sdx nl
431
        tgt_n = Container.find new_sdx nl
432
        int_p = Node.removePri old_p inst
433
        int_s = Node.removeSec old_s inst
434
        force_p = Node.offline old_p
435
        new_nl = do -- Maybe monad
436
          new_p <- Node.addPriEx force_p int_s inst
437
          new_s <- Node.addSecEx force_p tgt_n inst old_sdx
438
          let new_inst = Instance.setBoth inst old_sdx new_sdx
439
          return (Container.add new_sdx new_s $
440
                  Container.addTwo old_sdx new_p old_pdx int_p nl,
441
                  new_inst, old_sdx, new_sdx)
442
    in new_nl
443

    
444
-- | Tries to allocate an instance on one given node.
445
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
446
                 -> OpResult Node.AllocElement
447
allocateOnSingle nl inst new_pdx =
448
    let p = Container.find new_pdx nl
449
        new_inst = Instance.setBoth inst new_pdx Node.noSecondary
450
    in  Node.addPri p inst >>= \new_p -> do
451
      let new_nl = Container.add new_pdx new_p nl
452
          new_score = compCV nl
453
      return (new_nl, new_inst, [new_p], new_score)
454

    
455
-- | Tries to allocate an instance on a given pair of nodes.
456
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
457
               -> OpResult Node.AllocElement
458
allocateOnPair nl inst new_pdx new_sdx =
459
    let tgt_p = Container.find new_pdx nl
460
        tgt_s = Container.find new_sdx nl
461
    in do
462
      new_p <- Node.addPri tgt_p inst
463
      new_s <- Node.addSec tgt_s inst new_pdx
464
      let new_inst = Instance.setBoth inst new_pdx new_sdx
465
          new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
466
      return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
467

    
468
-- | Tries to perform an instance move and returns the best table
469
-- between the original one and the new one.
470
checkSingleStep :: Table -- ^ The original table
471
                -> Instance.Instance -- ^ The instance to move
472
                -> Table -- ^ The current best table
473
                -> IMove -- ^ The move to apply
474
                -> Table -- ^ The final best table
475
checkSingleStep ini_tbl target cur_tbl move =
476
    let
477
        Table ini_nl ini_il _ ini_plc = ini_tbl
478
        tmp_resu = applyMove ini_nl target move
479
    in
480
      case tmp_resu of
481
        OpFail _ -> cur_tbl
482
        OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
483
            let tgt_idx = Instance.idx target
484
                upd_cvar = compCV upd_nl
485
                upd_il = Container.add tgt_idx new_inst ini_il
486
                upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
487
                upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
488
            in
489
              compareTables cur_tbl upd_tbl
490

    
491
-- | Given the status of the current secondary as a valid new node and
492
-- the current candidate target node, generate the possible moves for
493
-- a instance.
494
possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
495
              -> Bool      -- ^ Whether we can change the primary node
496
              -> Ndx       -- ^ Target node candidate
497
              -> [IMove]   -- ^ List of valid result moves
498

    
499
possibleMoves _ False tdx =
500
    [ReplaceSecondary tdx]
501

    
502
possibleMoves True True tdx =
503
    [ReplaceSecondary tdx,
504
     ReplaceAndFailover tdx,
505
     ReplacePrimary tdx,
506
     FailoverAndReplace tdx]
507

    
508
possibleMoves False True tdx =
509
    [ReplaceSecondary tdx,
510
     ReplaceAndFailover tdx]
511

    
512
-- | Compute the best move for a given instance.
513
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
514
                  -> Bool              -- ^ Whether disk moves are allowed
515
                  -> Bool              -- ^ Whether instance moves are allowed
516
                  -> Table             -- ^ Original table
517
                  -> Instance.Instance -- ^ Instance to move
518
                  -> Table             -- ^ Best new table for this instance
519
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
520
    let
521
        opdx = Instance.pNode target
522
        osdx = Instance.sNode target
523
        nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
524
        use_secondary = elem osdx nodes_idx && inst_moves
525
        aft_failover = if use_secondary -- if allowed to failover
526
                       then checkSingleStep ini_tbl target ini_tbl Failover
527
                       else ini_tbl
528
        all_moves = if disk_moves
529
                    then concatMap
530
                         (possibleMoves use_secondary inst_moves) nodes
531
                    else []
532
    in
533
      -- iterate over the possible nodes for this instance
534
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
535

    
536
-- | Compute the best next move.
537
checkMove :: [Ndx]               -- ^ Allowed target node indices
538
          -> Bool                -- ^ Whether disk moves are allowed
539
          -> Bool                -- ^ Whether instance moves are allowed
540
          -> Table               -- ^ The current solution
541
          -> [Instance.Instance] -- ^ List of instances still to move
542
          -> Table               -- ^ The new solution
543
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
544
    let Table _ _ _ ini_plc = ini_tbl
545
        -- we're using rwhnf from the Control.Parallel.Strategies
546
        -- package; we don't need to use rnf as that would force too
547
        -- much evaluation in single-threaded cases, and in
548
        -- multi-threaded case the weak head normal form is enough to
549
        -- spark the evaluation
550
        tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
551
                               inst_moves ini_tbl)
552
                 victims
553
        -- iterate over all instances, computing the best move
554
        best_tbl = foldl' compareTables ini_tbl tables
555
        Table _ _ _ best_plc = best_tbl
556
    in if length best_plc == length ini_plc
557
       then ini_tbl -- no advancement
558
       else best_tbl
559

    
560
-- | Check if we are allowed to go deeper in the balancing.
561
doNextBalance :: Table     -- ^ The starting table
562
              -> Int       -- ^ Remaining length
563
              -> Score     -- ^ Score at which to stop
564
              -> Bool      -- ^ The resulting table and commands
565
doNextBalance ini_tbl max_rounds min_score =
566
    let Table _ _ ini_cv ini_plc = ini_tbl
567
        ini_plc_len = length ini_plc
568
    in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
569

    
570
-- | Run a balance move.
571
tryBalance :: Table       -- ^ The starting table
572
           -> Bool        -- ^ Allow disk moves
573
           -> Bool        -- ^ Allow instance moves
574
           -> Bool        -- ^ Only evacuate moves
575
           -> Score       -- ^ Min gain threshold
576
           -> Score       -- ^ Min gain
577
           -> Maybe Table -- ^ The resulting table and commands
578
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
579
    let Table ini_nl ini_il ini_cv _ = ini_tbl
580
        all_inst = Container.elems ini_il
581
        all_inst' = if evac_mode
582
                    then let bad_nodes = map Node.idx . filter Node.offline $
583
                                         Container.elems ini_nl
584
                         in filter (\e -> Instance.sNode e `elem` bad_nodes ||
585
                                          Instance.pNode e `elem` bad_nodes)
586
                            all_inst
587
                    else all_inst
588
        reloc_inst = filter Instance.movable all_inst'
589
        node_idx = map Node.idx . filter (not . Node.offline) $
590
                   Container.elems ini_nl
591
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
592
        (Table _ _ fin_cv _) = fin_tbl
593
    in
594
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
595
      then Just fin_tbl -- this round made success, return the new table
596
      else Nothing
597

    
598
-- * Allocation functions
599

    
600
-- | Build failure stats out of a list of failures.
601
collapseFailures :: [FailMode] -> FailStats
602
collapseFailures flst =
603
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
604
            [minBound..maxBound]
605

    
606
-- | Update current Allocation solution and failure stats with new
607
-- elements.
608
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
609
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
610

    
611
concatAllocs as (OpGood ns@(_, _, _, nscore)) =
612
    let -- Choose the old or new solution, based on the cluster score
613
        cntok = asAllocs as
614
        osols = asSolutions as
615
        nsols = case osols of
616
                  [] -> [ns]
617
                  (_, _, _, oscore):[] ->
618
                      if oscore < nscore
619
                      then osols
620
                      else [ns]
621
                  -- FIXME: here we simply concat to lists with more
622
                  -- than one element; we should instead abort, since
623
                  -- this is not a valid usage of this function
624
                  xs -> ns:xs
625
        nsuc = cntok + 1
626
    -- Note: we force evaluation of nsols here in order to keep the
627
    -- memory profile low - we know that we will need nsols for sure
628
    -- in the next cycle, so we force evaluation of nsols, since the
629
    -- foldl' in the caller will only evaluate the tuple, but not the
630
    -- elements of the tuple
631
    in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolutions = nsols }
632

    
633
-- | Given a solution, generates a reasonable description for it.
634
describeSolution :: AllocSolution -> String
635
describeSolution as =
636
  let fcnt = asFailures as
637
      sols = asSolutions as
638
      freasons =
639
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
640
        filter ((> 0) . snd) . collapseFailures $ fcnt
641
  in if null sols
642
     then "No valid allocation solutions, failure reasons: " ++
643
          (if null fcnt
644
           then "unknown reasons"
645
           else freasons)
646
     else let (_, _, nodes, cv) = head sols
647
          in printf ("score: %.8f, successes %d, failures %d (%s)" ++
648
                     " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
649
             (intercalate "/" . map Node.name $ nodes)
650

    
651
-- | Annotates a solution with the appropriate string.
652
annotateSolution :: AllocSolution -> AllocSolution
653
annotateSolution as = as { asLog = describeSolution as : asLog as }
654

    
655
-- | Reverses an evacuation solution.
656
--
657
-- Rationale: we always concat the results to the top of the lists, so
658
-- for proper jobset execution, we should reverse all lists.
659
reverseEvacSolution :: EvacSolution -> EvacSolution
660
reverseEvacSolution (EvacSolution f m o) =
661
    EvacSolution (reverse f) (reverse m) (reverse o)
662

    
663
-- | Generate the valid node allocation singles or pairs for a new instance.
664
genAllocNodes :: Group.List        -- ^ Group list
665
              -> Node.List         -- ^ The node map
666
              -> Int               -- ^ The number of nodes required
667
              -> Bool              -- ^ Whether to drop or not
668
                                   -- unallocable nodes
669
              -> Result AllocNodes -- ^ The (monadic) result
670
genAllocNodes gl nl count drop_unalloc =
671
    let filter_fn = if drop_unalloc
672
                    then filter (Group.isAllocable .
673
                                 flip Container.find gl . Node.group)
674
                    else id
675
        all_nodes = filter_fn $ getOnline nl
676
        all_pairs = liftM2 (,) all_nodes all_nodes
677
        ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y &&
678
                                      Node.group x == Node.group y) all_pairs
679
    in case count of
680
         1 -> Ok (Left (map Node.idx all_nodes))
681
         2 -> Ok (Right (map (\(p, s) -> (Node.idx p, Node.idx s)) ok_pairs))
682
         _ -> Bad "Unsupported number of nodes, only one or two  supported"
683

    
684
-- | Try to allocate an instance on the cluster.
685
tryAlloc :: (Monad m) =>
686
            Node.List         -- ^ The node list
687
         -> Instance.List     -- ^ The instance list
688
         -> Instance.Instance -- ^ The instance to allocate
689
         -> AllocNodes        -- ^ The allocation targets
690
         -> m AllocSolution   -- ^ Possible solution list
691
tryAlloc nl _ inst (Right ok_pairs) =
692
    let sols = foldl' (\cstate (p, s) ->
693
                           concatAllocs cstate $ allocateOnPair nl inst p s
694
                      ) emptyAllocSolution ok_pairs
695

    
696
    in if null ok_pairs -- means we have just one node
697
       then fail "Not enough online nodes"
698
       else return $ annotateSolution sols
699

    
700
tryAlloc nl _ inst (Left all_nodes) =
701
    let sols = foldl' (\cstate ->
702
                           concatAllocs cstate . allocateOnSingle nl inst
703
                      ) emptyAllocSolution all_nodes
704
    in if null all_nodes
705
       then fail "No online nodes"
706
       else return $ annotateSolution sols
707

    
708
-- | Given a group/result, describe it as a nice (list of) messages.
709
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
710
solutionDescription gl (groupId, result) =
711
  case result of
712
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
713
    Bad message -> [printf "Group %s: error %s" gname message]
714
  where grp = Container.find groupId gl
715
        gname = Group.name grp
716
        pol = apolToString (Group.allocPolicy grp)
717

    
718
-- | From a list of possibly bad and possibly empty solutions, filter
719
-- only the groups with a valid result. Note that the result will be
720
-- reversed compared to the original list.
721
filterMGResults :: Group.List
722
                -> [(Gdx, Result AllocSolution)]
723
                -> [(Gdx, AllocSolution)]
724
filterMGResults gl = foldl' fn []
725
    where unallocable = not . Group.isAllocable . flip Container.find gl
726
          fn accu (gdx, rasol) =
727
              case rasol of
728
                Bad _ -> accu
729
                Ok sol | null (asSolutions sol) -> accu
730
                       | unallocable gdx -> accu
731
                       | otherwise -> (gdx, sol):accu
732

    
733
-- | Sort multigroup results based on policy and score.
734
sortMGResults :: Group.List
735
             -> [(Gdx, AllocSolution)]
736
             -> [(Gdx, AllocSolution)]
737
sortMGResults gl sols =
738
    let extractScore (_, _, _, x) = x
739
        solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
740
                               (extractScore . head . asSolutions) sol)
741
    in sortBy (comparing solScore) sols
742

    
743
-- | Finds the best group for an instance on a multi-group cluster.
744
--
745
-- Only solutions in @preferred@ and @last_resort@ groups will be
746
-- accepted as valid, and additionally if the allowed groups parameter
747
-- is not null then allocation will only be run for those group
748
-- indices.
749
findBestAllocGroup :: Group.List           -- ^ The group list
750
                   -> Node.List            -- ^ The node list
751
                   -> Instance.List        -- ^ The instance list
752
                   -> Maybe [Gdx]          -- ^ The allowed groups
753
                   -> Instance.Instance    -- ^ The instance to allocate
754
                   -> Int                  -- ^ Required number of nodes
755
                   -> Result (Gdx, AllocSolution, [String])
756
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
757
  let groups = splitCluster mgnl mgil
758
      groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
759
                allowed_gdxs
760
      sols = map (\(gid, (nl, il)) ->
761
                   (gid, genAllocNodes mggl nl cnt False >>=
762
                       tryAlloc nl il inst))
763
             groups'::[(Gdx, Result AllocSolution)]
764
      all_msgs = concatMap (solutionDescription mggl) sols
765
      goodSols = filterMGResults mggl sols
766
      sortedSols = sortMGResults mggl goodSols
767
  in if null sortedSols
768
     then Bad $ intercalate ", " all_msgs
769
     else let (final_group, final_sol) = head sortedSols
770
          in return (final_group, final_sol, all_msgs)
771

    
772
-- | Try to allocate an instance on a multi-group cluster.
773
tryMGAlloc :: Group.List           -- ^ The group list
774
           -> Node.List            -- ^ The node list
775
           -> Instance.List        -- ^ The instance list
776
           -> Instance.Instance    -- ^ The instance to allocate
777
           -> Int                  -- ^ Required number of nodes
778
           -> Result AllocSolution -- ^ Possible solution list
779
tryMGAlloc mggl mgnl mgil inst cnt = do
780
  (best_group, solution, all_msgs) <-
781
      findBestAllocGroup mggl mgnl mgil Nothing inst cnt
782
  let group_name = Group.name $ Container.find best_group mggl
783
      selmsg = "Selected group: " ++ group_name
784
  return $ solution { asLog = selmsg:all_msgs }
785

    
786
-- | Try to relocate an instance on the cluster.
787
tryReloc :: (Monad m) =>
788
            Node.List       -- ^ The node list
789
         -> Instance.List   -- ^ The instance list
790
         -> Idx             -- ^ The index of the instance to move
791
         -> Int             -- ^ The number of nodes required
792
         -> [Ndx]           -- ^ Nodes which should not be used
793
         -> m AllocSolution -- ^ Solution list
794
tryReloc nl il xid 1 ex_idx =
795
    let all_nodes = getOnline nl
796
        inst = Container.find xid il
797
        ex_idx' = Instance.pNode inst:ex_idx
798
        valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
799
        valid_idxes = map Node.idx valid_nodes
800
        sols1 = foldl' (\cstate x ->
801
                            let em = do
802
                                  (mnl, i, _, _) <-
803
                                      applyMove nl inst (ReplaceSecondary x)
804
                                  return (mnl, i, [Container.find x mnl],
805
                                          compCV mnl)
806
                            in concatAllocs cstate em
807
                       ) emptyAllocSolution valid_idxes
808
    in return sols1
809

    
810
tryReloc _ _ _ reqn _  = fail $ "Unsupported number of relocation \
811
                                \destinations required (" ++ show reqn ++
812
                                                  "), only one supported"
813

    
814
-- | Change an instance's secondary node.
815
evacInstance :: (Monad m) =>
816
                [Ndx]                      -- ^ Excluded nodes
817
             -> Instance.List              -- ^ The current instance list
818
             -> (Node.List, AllocSolution) -- ^ The current state
819
             -> Idx                        -- ^ The instance to evacuate
820
             -> m (Node.List, AllocSolution)
821
evacInstance ex_ndx il (nl, old_as) idx = do
822
  -- FIXME: hardcoded one node here
823

    
824
  -- Longer explanation: evacuation is currently hardcoded to DRBD
825
  -- instances (which have one secondary); hence, even if the
826
  -- IAllocator protocol can request N nodes for an instance, and all
827
  -- the message parsing/loading pass this, this implementation only
828
  -- supports one; this situation needs to be revisited if we ever
829
  -- support more than one secondary, or if we change the storage
830
  -- model
831
  new_as <- tryReloc nl il idx 1 ex_ndx
832
  case asSolutions new_as of
833
    -- an individual relocation succeeded, we kind of compose the data
834
    -- from the two solutions
835
    csol@(nl', _, _, _):_ ->
836
        return (nl', new_as { asSolutions = csol:asSolutions old_as })
837
    -- this relocation failed, so we fail the entire evac
838
    _ -> fail $ "Can't evacuate instance " ++
839
         Instance.name (Container.find idx il) ++
840
             ": " ++ describeSolution new_as
841

    
842
-- | Try to evacuate a list of nodes.
843
tryEvac :: (Monad m) =>
844
            Node.List       -- ^ The node list
845
         -> Instance.List   -- ^ The instance list
846
         -> [Idx]           -- ^ Instances to be evacuated
847
         -> [Ndx]           -- ^ Restricted nodes (the ones being evacuated)
848
         -> m AllocSolution -- ^ Solution list
849
tryEvac nl il idxs ex_ndx = do
850
  (_, sol) <- foldM (evacInstance ex_ndx il) (nl, emptyAllocSolution) idxs
851
  return sol
852

    
853
-- | Function which fails if the requested mode is change secondary.
854
--
855
-- This is useful since except DRBD, no other disk template can
856
-- execute change secondary; thus, we can just call this function
857
-- instead of always checking for secondary mode. After the call to
858
-- this function, whatever mode we have is just a primary change.
859
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
860
failOnSecondaryChange ChangeSecondary dt =
861
    fail $ "Instances with disk template '" ++ dtToString dt ++
862
         "' can't execute change secondary"
863
failOnSecondaryChange _ _ = return ()
864

    
865
-- | Run evacuation for a single instance.
866
--
867
-- /Note:/ this function should correctly execute both intra-group
868
-- evacuations (in all modes) and inter-group evacuations (in the
869
-- 'ChangeAll' mode). Of course, this requires that the correct list
870
-- of target nodes is passed.
871
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
872
                 -> Instance.List     -- ^ Instance list (cluster-wide)
873
                 -> EvacMode          -- ^ The evacuation mode
874
                 -> Instance.Instance -- ^ The instance to be evacuated
875
                 -> Gdx               -- ^ The group we're targetting
876
                 -> [Ndx]             -- ^ The list of available nodes
877
                                      -- for allocation
878
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
879
nodeEvacInstance _ _ mode (Instance.Instance
880
                           {Instance.diskTemplate = dt@DTDiskless}) _ _ =
881
                  failOnSecondaryChange mode dt >>
882
                  fail "Diskless relocations not implemented yet"
883

    
884
nodeEvacInstance _ _ _ (Instance.Instance
885
                        {Instance.diskTemplate = DTPlain}) _ _ =
886
                  fail "Instances of type plain cannot be relocated"
887

    
888
nodeEvacInstance _ _ _ (Instance.Instance
889
                        {Instance.diskTemplate = DTFile}) _ _ =
890
                  fail "Instances of type file cannot be relocated"
891

    
892
nodeEvacInstance _ _ mode  (Instance.Instance
893
                            {Instance.diskTemplate = dt@DTSharedFile}) _ _ =
894
                  failOnSecondaryChange mode dt >>
895
                  fail "Shared file relocations not implemented yet"
896

    
897
nodeEvacInstance _ _ mode (Instance.Instance
898
                           {Instance.diskTemplate = dt@DTBlock}) _ _ =
899
                  failOnSecondaryChange mode dt >>
900
                  fail "Block device relocations not implemented yet"
901

    
902
nodeEvacInstance nl il ChangePrimary
903
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
904
                 _ _ =
905
  do
906
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
907
    let idx = Instance.idx inst
908
        il' = Container.add idx inst' il
909
        ops = iMoveToJob nl' il' idx Failover
910
    return (nl', il', ops)
911

    
912
nodeEvacInstance nl il ChangeSecondary
913
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
914
                 gdx avail_nodes =
915
  do
916
    (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
917
                            eitherToResult $
918
                            foldl' (evacDrbdSecondaryInner nl inst gdx)
919
                            (Left "no nodes available") avail_nodes
920
    let idx = Instance.idx inst
921
        il' = Container.add idx inst' il
922
        ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx)
923
    return (nl', il', ops)
924

    
925
nodeEvacInstance nl il ChangeAll
926
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
927
                 gdx avail_nodes =
928
  do
929
    let primary = Container.find (Instance.pNode inst) nl
930
        idx = Instance.idx inst
931
        no_nodes = Left "no nodes available"
932
    -- if the primary is offline, then we first failover
933
    (nl1, inst1, ops1) <-
934
        if Node.offline primary
935
        then do
936
          (nl', inst', _, _) <-
937
              annotateResult "Failing over to the secondary" $
938
              opToResult $ applyMove nl inst Failover
939
          return (nl', inst', [Failover])
940
        else return (nl, inst, [])
941
    -- we now need to execute a replace secondary to the future
942
    -- primary node
943
    (nl2, inst2, _, new_pdx) <- annotateResult "Searching for a new primary" $
944
                                eitherToResult $
945
                                foldl' (evacDrbdSecondaryInner nl1 inst1 gdx)
946
                                no_nodes avail_nodes
947
    let ops2 = ReplaceSecondary new_pdx:ops1
948
    -- since we chose the new primary, we remove it from the list of
949
    -- available nodes
950
    let avail_nodes_sec = new_pdx `delete` avail_nodes
951
    -- we now execute another failover, the primary stays fixed now
952
    (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
953
                          opToResult $ applyMove nl2 inst2 Failover
954
    let ops3 = Failover:ops2
955
    -- and finally another replace secondary, to the final secondary
956
    (nl4, inst4, _, new_sdx) <-
957
        annotateResult "Searching for a new secondary" $
958
        eitherToResult $
959
        foldl' (evacDrbdSecondaryInner nl3 inst3 gdx) no_nodes avail_nodes_sec
960
    let ops4 = ReplaceSecondary new_sdx:ops3
961
        il' = Container.add idx inst4 il
962
        ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
963
    return (nl4, il', ops)
964

    
965
-- | Inner fold function for changing secondary of a DRBD instance.
966
--
967
-- The "running" solution is either a @Left String@, which means we
968
-- don't have yet a working solution, or a @Right (...)@, which
969
-- represents a valid solution; it holds the modified node list, the
970
-- modified instance (after evacuation), the score of that solution,
971
-- and the new secondary node index.
972
evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list
973
                       -> Instance.Instance -- ^ Instance being evacuated
974
                       -> Gdx -- ^ The group index of the instance
975
                       -> Either String ( Node.List
976
                                        , Instance.Instance
977
                                        , Score
978
                                        , Ndx)  -- ^ Current best solution
979
                       -> Ndx  -- ^ Node we're evaluating as new secondary
980
                       -> Either String ( Node.List
981
                                        , Instance.Instance
982
                                        , Score
983
                                        , Ndx) -- ^ New best solution
984
evacDrbdSecondaryInner nl inst gdx accu ndx =
985
    case applyMove nl inst (ReplaceSecondary ndx) of
986
      OpFail fm ->
987
          case accu of
988
            Right _ -> accu
989
            Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
990
                      " failed: " ++ show fm
991
      OpGood (nl', inst', _, _) ->
992
          let nodes = Container.elems nl'
993
              -- The fromJust below is ugly (it can fail nastily), but
994
              -- at this point we should have any internal mismatches,
995
              -- and adding a monad here would be quite involved
996
              grpnodes = fromJust (gdx `lookup` (Node.computeGroups nodes))
997
              new_cv = compCVNodes grpnodes
998
              new_accu = Right (nl', inst', new_cv, ndx)
999
          in case accu of
1000
               Left _ -> new_accu
1001
               Right (_, _, old_cv, _) ->
1002
                   if old_cv < new_cv
1003
                   then accu
1004
                   else new_accu
1005

    
1006
-- | Computes the nodes in a given group which are available for
1007
-- allocation.
1008
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1009
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
1010
                    -> Gdx            -- ^ The group for which we
1011
                                      -- query the nodes
1012
                    -> Result [Ndx]   -- ^ List of available node indices
1013
availableGroupNodes group_nodes excl_ndx gdx = do
1014
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1015
                 Ok (lookup gdx group_nodes)
1016
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1017
  return avail_nodes
1018

    
1019
-- | Updates the evac solution with the results of an instance
1020
-- evacuation.
1021
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1022
                   -> Idx
1023
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1024
                   -> (Node.List, Instance.List, EvacSolution)
1025
updateEvacSolution (nl, il, es) idx (Bad msg) =
1026
    (nl, il, es { esFailed = (idx, msg):esFailed es})
1027
updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1028
    (nl, il, es { esMoved = new_elem:esMoved es
1029
                , esOpCodes = [opcodes]:esOpCodes es })
1030
     where inst = Container.find idx il
1031
           new_elem = (idx,
1032
                       instancePriGroup nl inst,
1033
                       Instance.allNodes inst)
1034

    
1035
-- | Node-evacuation IAllocator mode main function.
1036
tryNodeEvac :: Group.List    -- ^ The cluster groups
1037
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
1038
            -> Instance.List -- ^ Instance list (cluster-wide)
1039
            -> EvacMode      -- ^ The evacuation mode
1040
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
1041
            -> Result (Node.List, Instance.List, EvacSolution)
1042
tryNodeEvac _ ini_nl ini_il mode idxs =
1043
    let evac_ndx = nodesToEvacuate ini_il mode idxs
1044
        offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1045
        excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1046
        group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1047
                                             (Container.elems nl))) $
1048
                      splitCluster ini_nl ini_il
1049
        (fin_nl, fin_il, esol) =
1050
            foldl' (\state@(nl, il, _) inst ->
1051
                        let gdx = instancePriGroup nl inst in
1052
                        updateEvacSolution state (Instance.idx inst) $
1053
                        availableGroupNodes group_ndx
1054
                          excl_ndx gdx >>=
1055
                        nodeEvacInstance nl il mode inst gdx
1056
                   )
1057
            (ini_nl, ini_il, emptyEvacSolution)
1058
            (map (`Container.find` ini_il) idxs)
1059
    in return (fin_nl, fin_il, reverseEvacSolution esol)
1060

    
1061
-- | Change-group IAllocator mode main function.
1062
--
1063
-- This is very similar to 'tryNodeEvac', the only difference is that
1064
-- we don't choose as target group the current instance group, but
1065
-- instead:
1066
--
1067
--   1. at the start of the function, we compute which are the target
1068
--   groups; either no groups were passed in, in which case we choose
1069
--   all groups out of which we don't evacuate instance, or there were
1070
--   some groups passed, in which case we use those
1071
--
1072
--   2. for each instance, we use 'findBestAllocGroup' to choose the
1073
--   best group to hold the instance, and then we do what
1074
--   'tryNodeEvac' does, except for this group instead of the current
1075
--   instance group.
1076
--
1077
-- Note that the correct behaviour of this function relies on the
1078
-- function 'nodeEvacInstance' to be able to do correctly both
1079
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1080
tryChangeGroup :: Group.List    -- ^ The cluster groups
1081
               -> Node.List     -- ^ The node list (cluster-wide)
1082
               -> Instance.List -- ^ Instance list (cluster-wide)
1083
               -> [Gdx]         -- ^ Target groups; if empty, any
1084
                                -- groups not being evacuated
1085
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
1086
               -> Result (Node.List, Instance.List, EvacSolution)
1087
tryChangeGroup gl ini_nl ini_il gdxs idxs =
1088
    let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1089
                               flip Container.find ini_il) idxs
1090
        target_gdxs = (if null gdxs
1091
                       then Container.keys gl
1092
                       else gdxs) \\ evac_gdxs
1093
        offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1094
        excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1095
        group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1096
                                             (Container.elems nl))) $
1097
                      splitCluster ini_nl ini_il
1098
        (fin_nl, fin_il, esol) =
1099
            foldl' (\state@(nl, il, _) inst ->
1100
                        let solution = do
1101
                              let ncnt = Instance.requiredNodes $
1102
                                         Instance.diskTemplate inst
1103
                              (gdx, _, _) <- findBestAllocGroup gl nl il
1104
                                             (Just target_gdxs) inst ncnt
1105
                              av_nodes <- availableGroupNodes group_ndx
1106
                                          excl_ndx gdx
1107
                              nodeEvacInstance nl il ChangeAll inst
1108
                                       gdx av_nodes
1109
                        in updateEvacSolution state
1110
                               (Instance.idx inst) solution
1111
                   )
1112
            (ini_nl, ini_il, emptyEvacSolution)
1113
            (map (`Container.find` ini_il) idxs)
1114
    in return (fin_nl, fin_il, reverseEvacSolution esol)
1115

    
1116
-- | Recursively place instances on the cluster until we're out of space.
1117
iterateAlloc :: Node.List
1118
             -> Instance.List
1119
             -> Maybe Int
1120
             -> Instance.Instance
1121
             -> AllocNodes
1122
             -> [Instance.Instance]
1123
             -> [CStats]
1124
             -> Result AllocResult
1125
iterateAlloc nl il limit newinst allocnodes ixes cstats =
1126
      let depth = length ixes
1127
          newname = printf "new-%d" depth::String
1128
          newidx = length (Container.elems il) + depth
1129
          newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1130
          newlimit = fmap (flip (-) 1) limit
1131
      in case tryAlloc nl il newi2 allocnodes of
1132
           Bad s -> Bad s
1133
           Ok (AllocSolution { asFailures = errs, asSolutions = sols3 }) ->
1134
               let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1135
               case sols3 of
1136
                 [] -> newsol
1137
                 (xnl, xi, _, _):[] ->
1138
                     if limit == Just 0
1139
                     then newsol
1140
                     else iterateAlloc xnl (Container.add newidx xi il)
1141
                          newlimit newinst allocnodes (xi:ixes)
1142
                          (totalResources xnl:cstats)
1143
                 _ -> Bad "Internal error: multiple solutions for single\
1144
                          \ allocation"
1145

    
1146
-- | The core of the tiered allocation mode.
1147
tieredAlloc :: Node.List
1148
            -> Instance.List
1149
            -> Maybe Int
1150
            -> Instance.Instance
1151
            -> AllocNodes
1152
            -> [Instance.Instance]
1153
            -> [CStats]
1154
            -> Result AllocResult
1155
tieredAlloc nl il limit newinst allocnodes ixes cstats =
1156
    case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1157
      Bad s -> Bad s
1158
      Ok (errs, nl', il', ixes', cstats') ->
1159
          let newsol = Ok (errs, nl', il', ixes', cstats')
1160
              ixes_cnt = length ixes'
1161
              (stop, newlimit) = case limit of
1162
                                   Nothing -> (False, Nothing)
1163
                                   Just n -> (n <= ixes_cnt,
1164
                                              Just (n - ixes_cnt)) in
1165
          if stop then newsol else
1166
          case Instance.shrinkByType newinst . fst . last $
1167
               sortBy (comparing snd) errs of
1168
            Bad _ -> newsol
1169
            Ok newinst' -> tieredAlloc nl' il' newlimit
1170
                           newinst' allocnodes ixes' cstats'
1171

    
1172
-- * Formatting functions
1173

    
1174
-- | Given the original and final nodes, computes the relocation description.
1175
computeMoves :: Instance.Instance -- ^ The instance to be moved
1176
             -> String -- ^ The instance name
1177
             -> IMove  -- ^ The move being performed
1178
             -> String -- ^ New primary
1179
             -> String -- ^ New secondary
1180
             -> (String, [String])
1181
                -- ^ Tuple of moves and commands list; moves is containing
1182
                -- either @/f/@ for failover or @/r:name/@ for replace
1183
                -- secondary, while the command list holds gnt-instance
1184
                -- commands (without that prefix), e.g \"@failover instance1@\"
1185
computeMoves i inam mv c d =
1186
    case mv of
1187
      Failover -> ("f", [mig])
1188
      FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1189
      ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1190
      ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1191
      ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1192
    where morf = if Instance.running i then "migrate" else "failover"
1193
          mig = printf "%s -f %s" morf inam::String
1194
          rep n = printf "replace-disks -n %s %s" n inam
1195

    
1196
-- | Converts a placement to string format.
1197
printSolutionLine :: Node.List     -- ^ The node list
1198
                  -> Instance.List -- ^ The instance list
1199
                  -> Int           -- ^ Maximum node name length
1200
                  -> Int           -- ^ Maximum instance name length
1201
                  -> Placement     -- ^ The current placement
1202
                  -> Int           -- ^ The index of the placement in
1203
                                   -- the solution
1204
                  -> (String, [String])
1205
printSolutionLine nl il nmlen imlen plc pos =
1206
    let
1207
        pmlen = (2*nmlen + 1)
1208
        (i, p, s, mv, c) = plc
1209
        inst = Container.find i il
1210
        inam = Instance.alias inst
1211
        npri = Node.alias $ Container.find p nl
1212
        nsec = Node.alias $ Container.find s nl
1213
        opri = Node.alias $ Container.find (Instance.pNode inst) nl
1214
        osec = Node.alias $ Container.find (Instance.sNode inst) nl
1215
        (moves, cmds) =  computeMoves inst inam mv npri nsec
1216
        ostr = printf "%s:%s" opri osec::String
1217
        nstr = printf "%s:%s" npri nsec::String
1218
    in
1219
      (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
1220
       pos imlen inam pmlen ostr
1221
       pmlen nstr c moves,
1222
       cmds)
1223

    
1224
-- | Return the instance and involved nodes in an instance move.
1225
involvedNodes :: Instance.List -> Placement -> [Ndx]
1226
involvedNodes il plc =
1227
    let (i, np, ns, _, _) = plc
1228
        inst = Container.find i il
1229
        op = Instance.pNode inst
1230
        os = Instance.sNode inst
1231
    in nub [np, ns, op, os]
1232

    
1233
-- | Inner function for splitJobs, that either appends the next job to
1234
-- the current jobset, or starts a new jobset.
1235
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1236
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1237
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1238
    | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1239
    | otherwise = ([n]:cjs, ndx)
1240

    
1241
-- | Break a list of moves into independent groups. Note that this
1242
-- will reverse the order of jobs.
1243
splitJobs :: [MoveJob] -> [JobSet]
1244
splitJobs = fst . foldl mergeJobs ([], [])
1245

    
1246
-- | Given a list of commands, prefix them with @gnt-instance@ and
1247
-- also beautify the display a little.
1248
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1249
formatJob jsn jsl (sn, (_, _, _, cmds)) =
1250
    let out =
1251
            printf "  echo job %d/%d" jsn sn:
1252
            printf "  check":
1253
            map ("  gnt-instance " ++) cmds
1254
    in if sn == 1
1255
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1256
       else out
1257

    
1258
-- | Given a list of commands, prefix them with @gnt-instance@ and
1259
-- also beautify the display a little.
1260
formatCmds :: [JobSet] -> String
1261
formatCmds =
1262
    unlines .
1263
    concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1264
                             (zip [1..] js)) .
1265
    zip [1..]
1266

    
1267
-- | Print the node list.
1268
printNodes :: Node.List -> [String] -> String
1269
printNodes nl fs =
1270
    let fields = case fs of
1271
          [] -> Node.defaultFields
1272
          "+":rest -> Node.defaultFields ++ rest
1273
          _ -> fs
1274
        snl = sortBy (comparing Node.idx) (Container.elems nl)
1275
        (header, isnum) = unzip $ map Node.showHeader fields
1276
    in unlines . map ((:) ' ' .  intercalate " ") $
1277
       formatTable (header:map (Node.list fields) snl) isnum
1278

    
1279
-- | Print the instance list.
1280
printInsts :: Node.List -> Instance.List -> String
1281
printInsts nl il =
1282
    let sil = sortBy (comparing Instance.idx) (Container.elems il)
1283
        helper inst = [ if Instance.running inst then "R" else " "
1284
                      , Instance.name inst
1285
                      , Container.nameOf nl (Instance.pNode inst)
1286
                      , let sdx = Instance.sNode inst
1287
                        in if sdx == Node.noSecondary
1288
                           then  ""
1289
                           else Container.nameOf nl sdx
1290
                      , if Instance.autoBalance inst then "Y" else "N"
1291
                      , printf "%3d" $ Instance.vcpus inst
1292
                      , printf "%5d" $ Instance.mem inst
1293
                      , printf "%5d" $ Instance.dsk inst `div` 1024
1294
                      , printf "%5.3f" lC
1295
                      , printf "%5.3f" lM
1296
                      , printf "%5.3f" lD
1297
                      , printf "%5.3f" lN
1298
                      ]
1299
            where DynUtil lC lM lD lN = Instance.util inst
1300
        header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1301
                 , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1302
        isnum = False:False:False:False:False:repeat True
1303
    in unlines . map ((:) ' ' . intercalate " ") $
1304
       formatTable (header:map helper sil) isnum
1305

    
1306
-- | Shows statistics for a given node list.
1307
printStats :: Node.List -> String
1308
printStats nl =
1309
    let dcvs = compDetailedCV $ Container.elems nl
1310
        (weights, names) = unzip detailedCVInfo
1311
        hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1312
        formatted = map (\(w, header, val) ->
1313
                             printf "%s=%.8f(x%.2f)" header val w::String) hd
1314
    in intercalate ", " formatted
1315

    
1316
-- | Convert a placement into a list of OpCodes (basically a job).
1317
iMoveToJob :: Node.List -> Instance.List
1318
          -> Idx -> IMove -> [OpCodes.OpCode]
1319
iMoveToJob nl il idx move =
1320
    let inst = Container.find idx il
1321
        iname = Instance.name inst
1322
        lookNode  = Just . Container.nameOf nl
1323
        opF = OpCodes.OpInstanceMigrate iname True False True
1324
        opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1325
                OpCodes.ReplaceNewSecondary [] Nothing
1326
    in case move of
1327
         Failover -> [ opF ]
1328
         ReplacePrimary np -> [ opF, opR np, opF ]
1329
         ReplaceSecondary ns -> [ opR ns ]
1330
         ReplaceAndFailover np -> [ opR np, opF ]
1331
         FailoverAndReplace ns -> [ opF, opR ns ]
1332

    
1333
-- * Node group functions
1334

    
1335
-- | Computes the group of an instance.
1336
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1337
instanceGroup nl i =
1338
  let sidx = Instance.sNode i
1339
      pnode = Container.find (Instance.pNode i) nl
1340
      snode = if sidx == Node.noSecondary
1341
              then pnode
1342
              else Container.find sidx nl
1343
      pgroup = Node.group pnode
1344
      sgroup = Node.group snode
1345
  in if pgroup /= sgroup
1346
     then fail ("Instance placed accross two node groups, primary " ++
1347
                show pgroup ++ ", secondary " ++ show sgroup)
1348
     else return pgroup
1349

    
1350
-- | Computes the group of an instance per the primary node.
1351
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1352
instancePriGroup nl i =
1353
  let pnode = Container.find (Instance.pNode i) nl
1354
  in  Node.group pnode
1355

    
1356
-- | Compute the list of badly allocated instances (split across node
1357
-- groups).
1358
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1359
findSplitInstances nl =
1360
  filter (not . isOk . instanceGroup nl) . Container.elems
1361

    
1362
-- | Splits a cluster into the component node groups.
1363
splitCluster :: Node.List -> Instance.List ->
1364
                [(Gdx, (Node.List, Instance.List))]
1365
splitCluster nl il =
1366
  let ngroups = Node.computeGroups (Container.elems nl)
1367
  in map (\(guuid, nodes) ->
1368
           let nidxs = map Node.idx nodes
1369
               nodes' = zip nidxs nodes
1370
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1371
           in (guuid, (Container.fromList nodes', instances))) ngroups
1372

    
1373
-- | Compute the list of nodes that are to be evacuated, given a list
1374
-- of instances and an evacuation mode.
1375
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1376
                -> EvacMode      -- ^ The evacuation mode we're using
1377
                -> [Idx]         -- ^ List of instance indices being evacuated
1378
                -> IntSet.IntSet -- ^ Set of node indices
1379
nodesToEvacuate il mode =
1380
    IntSet.delete Node.noSecondary .
1381
    foldl' (\ns idx ->
1382
                let i = Container.find idx il
1383
                    pdx = Instance.pNode i
1384
                    sdx = Instance.sNode i
1385
                    dt = Instance.diskTemplate i
1386
                    withSecondary = case dt of
1387
                                      DTDrbd8 -> IntSet.insert sdx ns
1388
                                      _ -> ns
1389
                in case mode of
1390
                     ChangePrimary   -> IntSet.insert pdx ns
1391
                     ChangeSecondary -> withSecondary
1392
                     ChangeAll       -> IntSet.insert pdx withSecondary
1393
           ) IntSet.empty