Statistics
| Branch: | Tag: | Revision:

root / htools / Ganeti / HTools / Cluster.hs @ a86fbf36

History | View | Annotate | Download (62.9 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
    (
31
     -- * Types
32
      AllocSolution(..)
33
    , EvacSolution(..)
34
    , Table(..)
35
    , CStats(..)
36
    , AllocStats
37
    -- * Generic functions
38
    , totalResources
39
    , computeAllocationDelta
40
    -- * First phase functions
41
    , computeBadItems
42
    -- * Second phase functions
43
    , printSolutionLine
44
    , formatCmds
45
    , involvedNodes
46
    , splitJobs
47
    -- * Display functions
48
    , printNodes
49
    , printInsts
50
    -- * Balacing functions
51
    , checkMove
52
    , doNextBalance
53
    , tryBalance
54
    , compCV
55
    , compCVNodes
56
    , compDetailedCV
57
    , printStats
58
    , iMoveToJob
59
    -- * IAllocator functions
60
    , genAllocNodes
61
    , tryAlloc
62
    , tryMGAlloc
63
    , tryReloc
64
    , tryMGReloc
65
    , tryEvac
66
    , tryMGEvac
67
    , tryNodeEvac
68
    , tryChangeGroup
69
    , collapseFailures
70
    -- * Allocation functions
71
    , iterateAlloc
72
    , tieredAlloc
73
    , tieredSpecMap
74
     -- * Node group functions
75
    , instanceGroup
76
    , findSplitInstances
77
    , splitCluster
78
    ) where
79

    
80
import Data.Function (on)
81
import qualified Data.IntSet as IntSet
82
import Data.List
83
import Data.Maybe (fromJust)
84
import Data.Ord (comparing)
85
import Text.Printf (printf)
86
import Control.Monad
87
import Control.Parallel.Strategies
88

    
89
import qualified Ganeti.HTools.Container as Container
90
import qualified Ganeti.HTools.Instance as Instance
91
import qualified Ganeti.HTools.Node as Node
92
import qualified Ganeti.HTools.Group as Group
93
import Ganeti.HTools.Types
94
import Ganeti.HTools.Utils
95
import qualified Ganeti.OpCodes as OpCodes
96

    
97
-- * Types
98

    
99
-- | Allocation\/relocation solution.
100
data AllocSolution = AllocSolution
101
  { asFailures  :: [FailMode]          -- ^ Failure counts
102
  , asAllocs    :: Int                 -- ^ Good allocation count
103
  , asSolutions :: [Node.AllocElement] -- ^ The actual result, length
104
                                       -- of the list depends on the
105
                                       -- allocation/relocation mode
106
  , asLog       :: [String]            -- ^ A list of informational messages
107
  }
108

    
109
-- | Node evacuation/group change iallocator result type. This result
110
-- type consists of actual opcodes (a restricted subset) that are
111
-- transmitted back to Ganeti.
112
data EvacSolution = EvacSolution
113
    { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
114
    , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
115
                                        -- relocated
116
    , esOpCodes :: [[[OpCodes.OpCode]]] -- ^ List of lists of jobs
117
    }
118

    
119
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
120
type AllocResult = (FailStats, Node.List, Instance.List,
121
                    [Instance.Instance], [CStats])
122

    
123
-- | A type denoting the valid allocation mode/pairs.
124
--
125
-- For a one-node allocation, this will be a @Left ['Node.Node']@,
126
-- whereas for a two-node allocation, this will be a @Right
127
-- [('Node.Node', 'Node.Node')]@.
128
type AllocNodes = Either [Ndx] [(Ndx, Ndx)]
129

    
130
-- | The empty solution we start with when computing allocations.
131
emptyAllocSolution :: AllocSolution
132
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
133
                                   , asSolutions = [], asLog = [] }
134

    
135
-- | The empty evac solution.
136
emptyEvacSolution :: EvacSolution
137
emptyEvacSolution = EvacSolution { esMoved = []
138
                                 , esFailed = []
139
                                 , esOpCodes = []
140
                                 }
141

    
142
-- | The complete state for the balancing solution.
143
data Table = Table Node.List Instance.List Score [Placement]
144
             deriving (Show, Read)
145

    
146
data CStats = CStats { csFmem :: Integer -- ^ Cluster free mem
147
                     , csFdsk :: Integer -- ^ Cluster free disk
148
                     , csAmem :: Integer -- ^ Cluster allocatable mem
149
                     , csAdsk :: Integer -- ^ Cluster allocatable disk
150
                     , csAcpu :: Integer -- ^ Cluster allocatable cpus
151
                     , csMmem :: Integer -- ^ Max node allocatable mem
152
                     , csMdsk :: Integer -- ^ Max node allocatable disk
153
                     , csMcpu :: Integer -- ^ Max node allocatable cpu
154
                     , csImem :: Integer -- ^ Instance used mem
155
                     , csIdsk :: Integer -- ^ Instance used disk
156
                     , csIcpu :: Integer -- ^ Instance used cpu
157
                     , csTmem :: Double  -- ^ Cluster total mem
158
                     , csTdsk :: Double  -- ^ Cluster total disk
159
                     , csTcpu :: Double  -- ^ Cluster total cpus
160
                     , csVcpu :: Integer -- ^ Cluster virtual cpus (if
161
                                         -- node pCpu has been set,
162
                                         -- otherwise -1)
163
                     , csXmem :: Integer -- ^ Unnacounted for mem
164
                     , csNmem :: Integer -- ^ Node own memory
165
                     , csScore :: Score  -- ^ The cluster score
166
                     , csNinst :: Int    -- ^ The total number of instances
167
                     }
168
            deriving (Show, Read)
169

    
170
-- | Currently used, possibly to allocate, unallocable.
171
type AllocStats = (RSpec, RSpec, RSpec)
172

    
173
-- * Utility functions
174

    
175
-- | Verifies the N+1 status and return the affected nodes.
176
verifyN1 :: [Node.Node] -> [Node.Node]
177
verifyN1 = filter Node.failN1
178

    
179
{-| Computes the pair of bad nodes and instances.
180

    
181
The bad node list is computed via a simple 'verifyN1' check, and the
182
bad instance list is the list of primary and secondary instances of
183
those nodes.
184

    
185
-}
186
computeBadItems :: Node.List -> Instance.List ->
187
                   ([Node.Node], [Instance.Instance])
188
computeBadItems nl il =
189
  let bad_nodes = verifyN1 $ getOnline nl
190
      bad_instances = map (`Container.find` il) .
191
                      sort . nub $
192
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
193
  in
194
    (bad_nodes, bad_instances)
195

    
196
-- | Zero-initializer for the CStats type.
197
emptyCStats :: CStats
198
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
199

    
200
-- | Update stats with data from a new node.
201
updateCStats :: CStats -> Node.Node -> CStats
202
updateCStats cs node =
203
    let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
204
                 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
205
                 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
206
                 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
207
                 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
208
                 csVcpu = x_vcpu,
209
                 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
210
               }
211
            = cs
212
        inc_amem = Node.fMem node - Node.rMem node
213
        inc_amem' = if inc_amem > 0 then inc_amem else 0
214
        inc_adsk = Node.availDisk node
215
        inc_imem = truncate (Node.tMem node) - Node.nMem node
216
                   - Node.xMem node - Node.fMem node
217
        inc_icpu = Node.uCpu node
218
        inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
219
        inc_vcpu = Node.hiCpu node
220
        inc_acpu = Node.availCpu node
221

    
222
    in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
223
          , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
224
          , csAmem = x_amem + fromIntegral inc_amem'
225
          , csAdsk = x_adsk + fromIntegral inc_adsk
226
          , csAcpu = x_acpu + fromIntegral inc_acpu
227
          , csMmem = max x_mmem (fromIntegral inc_amem')
228
          , csMdsk = max x_mdsk (fromIntegral inc_adsk)
229
          , csMcpu = max x_mcpu (fromIntegral inc_acpu)
230
          , csImem = x_imem + fromIntegral inc_imem
231
          , csIdsk = x_idsk + fromIntegral inc_idsk
232
          , csIcpu = x_icpu + fromIntegral inc_icpu
233
          , csTmem = x_tmem + Node.tMem node
234
          , csTdsk = x_tdsk + Node.tDsk node
235
          , csTcpu = x_tcpu + Node.tCpu node
236
          , csVcpu = x_vcpu + fromIntegral inc_vcpu
237
          , csXmem = x_xmem + fromIntegral (Node.xMem node)
238
          , csNmem = x_nmem + fromIntegral (Node.nMem node)
239
          , csNinst = x_ninst + length (Node.pList node)
240
          }
241

    
242
-- | Compute the total free disk and memory in the cluster.
243
totalResources :: Node.List -> CStats
244
totalResources nl =
245
    let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
246
    in cs { csScore = compCV nl }
247

    
248
-- | Compute the delta between two cluster state.
249
--
250
-- This is used when doing allocations, to understand better the
251
-- available cluster resources. The return value is a triple of the
252
-- current used values, the delta that was still allocated, and what
253
-- was left unallocated.
254
computeAllocationDelta :: CStats -> CStats -> AllocStats
255
computeAllocationDelta cini cfin =
256
    let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
257
        CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
258
                csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
259
        rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem)
260
               (fromIntegral i_idsk)
261
        rfin = RSpec (fromIntegral (f_icpu - i_icpu))
262
               (fromIntegral (f_imem - i_imem))
263
               (fromIntegral (f_idsk - i_idsk))
264
        un_cpu = fromIntegral (v_cpu - f_icpu)::Int
265
        runa = RSpec un_cpu (truncate t_mem - fromIntegral f_imem)
266
               (truncate t_dsk - fromIntegral f_idsk)
267
    in (rini, rfin, runa)
268

    
269
-- | The names and weights of the individual elements in the CV list.
270
detailedCVInfo :: [(Double, String)]
271
detailedCVInfo = [ (1,  "free_mem_cv")
272
                 , (1,  "free_disk_cv")
273
                 , (1,  "n1_cnt")
274
                 , (1,  "reserved_mem_cv")
275
                 , (4,  "offline_all_cnt")
276
                 , (16, "offline_pri_cnt")
277
                 , (1,  "vcpu_ratio_cv")
278
                 , (1,  "cpu_load_cv")
279
                 , (1,  "mem_load_cv")
280
                 , (1,  "disk_load_cv")
281
                 , (1,  "net_load_cv")
282
                 , (2,  "pri_tags_score")
283
                 ]
284

    
285
detailedCVWeights :: [Double]
286
detailedCVWeights = map fst detailedCVInfo
287

    
288
-- | Compute the mem and disk covariance.
289
compDetailedCV :: [Node.Node] -> [Double]
290
compDetailedCV all_nodes =
291
    let
292
        (offline, nodes) = partition Node.offline all_nodes
293
        mem_l = map Node.pMem nodes
294
        dsk_l = map Node.pDsk nodes
295
        -- metric: memory covariance
296
        mem_cv = stdDev mem_l
297
        -- metric: disk covariance
298
        dsk_cv = stdDev dsk_l
299
        -- metric: count of instances living on N1 failing nodes
300
        n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
301
                                                   length (Node.pList n)) .
302
                   filter Node.failN1 $ nodes :: Double
303
        res_l = map Node.pRem nodes
304
        -- metric: reserved memory covariance
305
        res_cv = stdDev res_l
306
        -- offline instances metrics
307
        offline_ipri = sum . map (length . Node.pList) $ offline
308
        offline_isec = sum . map (length . Node.sList) $ offline
309
        -- metric: count of instances on offline nodes
310
        off_score = fromIntegral (offline_ipri + offline_isec)::Double
311
        -- metric: count of primary instances on offline nodes (this
312
        -- helps with evacuation/failover of primary instances on
313
        -- 2-node clusters with one node offline)
314
        off_pri_score = fromIntegral offline_ipri::Double
315
        cpu_l = map Node.pCpu nodes
316
        -- metric: covariance of vcpu/pcpu ratio
317
        cpu_cv = stdDev cpu_l
318
        -- metrics: covariance of cpu, memory, disk and network load
319
        (c_load, m_load, d_load, n_load) = unzip4 $
320
            map (\n ->
321
                     let DynUtil c1 m1 d1 n1 = Node.utilLoad n
322
                         DynUtil c2 m2 d2 n2 = Node.utilPool n
323
                     in (c1/c2, m1/m2, d1/d2, n1/n2)
324
                ) nodes
325
        -- metric: conflicting instance count
326
        pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
327
        pri_tags_score = fromIntegral pri_tags_inst::Double
328
    in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
329
       , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
330
       , pri_tags_score ]
331

    
332
-- | Compute the /total/ variance.
333
compCVNodes :: [Node.Node] -> Double
334
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
335

    
336
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
337
compCV :: Node.List -> Double
338
compCV = compCVNodes . Container.elems
339

    
340

    
341
-- | Compute online nodes from a 'Node.List'.
342
getOnline :: Node.List -> [Node.Node]
343
getOnline = filter (not . Node.offline) . Container.elems
344

    
345
-- * Balancing functions
346

    
347
-- | Compute best table. Note that the ordering of the arguments is important.
348
compareTables :: Table -> Table -> Table
349
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
350
    if a_cv > b_cv then b else a
351

    
352
-- | Applies an instance move to a given node list and instance.
353
applyMove :: Node.List -> Instance.Instance
354
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
355
-- Failover (f)
356
applyMove nl inst Failover =
357
    let old_pdx = Instance.pNode inst
358
        old_sdx = Instance.sNode inst
359
        old_p = Container.find old_pdx nl
360
        old_s = Container.find old_sdx nl
361
        int_p = Node.removePri old_p inst
362
        int_s = Node.removeSec old_s inst
363
        force_p = Node.offline old_p
364
        new_nl = do -- Maybe monad
365
          new_p <- Node.addPriEx force_p int_s inst
366
          new_s <- Node.addSec int_p inst old_sdx
367
          let new_inst = Instance.setBoth inst old_sdx old_pdx
368
          return (Container.addTwo old_pdx new_s old_sdx new_p nl,
369
                  new_inst, old_sdx, old_pdx)
370
    in new_nl
371

    
372
-- Replace the primary (f:, r:np, f)
373
applyMove nl inst (ReplacePrimary new_pdx) =
374
    let old_pdx = Instance.pNode inst
375
        old_sdx = Instance.sNode inst
376
        old_p = Container.find old_pdx nl
377
        old_s = Container.find old_sdx nl
378
        tgt_n = Container.find new_pdx nl
379
        int_p = Node.removePri old_p inst
380
        int_s = Node.removeSec old_s inst
381
        force_p = Node.offline old_p
382
        new_nl = do -- Maybe monad
383
          -- check that the current secondary can host the instance
384
          -- during the migration
385
          tmp_s <- Node.addPriEx force_p int_s inst
386
          let tmp_s' = Node.removePri tmp_s inst
387
          new_p <- Node.addPriEx force_p tgt_n inst
388
          new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
389
          let new_inst = Instance.setPri inst new_pdx
390
          return (Container.add new_pdx new_p $
391
                  Container.addTwo old_pdx int_p old_sdx new_s nl,
392
                  new_inst, new_pdx, old_sdx)
393
    in new_nl
394

    
395
-- Replace the secondary (r:ns)
396
applyMove nl inst (ReplaceSecondary new_sdx) =
397
    let old_pdx = Instance.pNode inst
398
        old_sdx = Instance.sNode inst
399
        old_s = Container.find old_sdx nl
400
        tgt_n = Container.find new_sdx nl
401
        int_s = Node.removeSec old_s inst
402
        force_s = Node.offline old_s
403
        new_inst = Instance.setSec inst new_sdx
404
        new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
405
                 \new_s -> return (Container.addTwo new_sdx
406
                                   new_s old_sdx int_s nl,
407
                                   new_inst, old_pdx, new_sdx)
408
    in new_nl
409

    
410
-- Replace the secondary and failover (r:np, f)
411
applyMove nl inst (ReplaceAndFailover new_pdx) =
412
    let old_pdx = Instance.pNode inst
413
        old_sdx = Instance.sNode inst
414
        old_p = Container.find old_pdx nl
415
        old_s = Container.find old_sdx nl
416
        tgt_n = Container.find new_pdx nl
417
        int_p = Node.removePri old_p inst
418
        int_s = Node.removeSec old_s inst
419
        force_s = Node.offline old_s
420
        new_nl = do -- Maybe monad
421
          new_p <- Node.addPri tgt_n inst
422
          new_s <- Node.addSecEx force_s int_p inst new_pdx
423
          let new_inst = Instance.setBoth inst new_pdx old_pdx
424
          return (Container.add new_pdx new_p $
425
                  Container.addTwo old_pdx new_s old_sdx int_s nl,
426
                  new_inst, new_pdx, old_pdx)
427
    in new_nl
428

    
429
-- Failver and replace the secondary (f, r:ns)
430
applyMove nl inst (FailoverAndReplace new_sdx) =
431
    let old_pdx = Instance.pNode inst
432
        old_sdx = Instance.sNode inst
433
        old_p = Container.find old_pdx nl
434
        old_s = Container.find old_sdx nl
435
        tgt_n = Container.find new_sdx nl
436
        int_p = Node.removePri old_p inst
437
        int_s = Node.removeSec old_s inst
438
        force_p = Node.offline old_p
439
        new_nl = do -- Maybe monad
440
          new_p <- Node.addPriEx force_p int_s inst
441
          new_s <- Node.addSecEx force_p tgt_n inst old_sdx
442
          let new_inst = Instance.setBoth inst old_sdx new_sdx
443
          return (Container.add new_sdx new_s $
444
                  Container.addTwo old_sdx new_p old_pdx int_p nl,
445
                  new_inst, old_sdx, new_sdx)
446
    in new_nl
447

    
448
-- | Tries to allocate an instance on one given node.
449
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
450
                 -> OpResult Node.AllocElement
451
allocateOnSingle nl inst new_pdx =
452
    let p = Container.find new_pdx nl
453
        new_inst = Instance.setBoth inst new_pdx Node.noSecondary
454
    in  Node.addPri p inst >>= \new_p -> do
455
      let new_nl = Container.add new_pdx new_p nl
456
          new_score = compCV nl
457
      return (new_nl, new_inst, [new_p], new_score)
458

    
459
-- | Tries to allocate an instance on a given pair of nodes.
460
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
461
               -> OpResult Node.AllocElement
462
allocateOnPair nl inst new_pdx new_sdx =
463
    let tgt_p = Container.find new_pdx nl
464
        tgt_s = Container.find new_sdx nl
465
    in do
466
      new_p <- Node.addPri tgt_p inst
467
      new_s <- Node.addSec tgt_s inst new_pdx
468
      let new_inst = Instance.setBoth inst new_pdx new_sdx
469
          new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
470
      return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
471

    
472
-- | Tries to perform an instance move and returns the best table
473
-- between the original one and the new one.
474
checkSingleStep :: Table -- ^ The original table
475
                -> Instance.Instance -- ^ The instance to move
476
                -> Table -- ^ The current best table
477
                -> IMove -- ^ The move to apply
478
                -> Table -- ^ The final best table
479
checkSingleStep ini_tbl target cur_tbl move =
480
    let
481
        Table ini_nl ini_il _ ini_plc = ini_tbl
482
        tmp_resu = applyMove ini_nl target move
483
    in
484
      case tmp_resu of
485
        OpFail _ -> cur_tbl
486
        OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
487
            let tgt_idx = Instance.idx target
488
                upd_cvar = compCV upd_nl
489
                upd_il = Container.add tgt_idx new_inst ini_il
490
                upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
491
                upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
492
            in
493
              compareTables cur_tbl upd_tbl
494

    
495
-- | Given the status of the current secondary as a valid new node and
496
-- the current candidate target node, generate the possible moves for
497
-- a instance.
498
possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
499
              -> Bool      -- ^ Whether we can change the primary node
500
              -> Ndx       -- ^ Target node candidate
501
              -> [IMove]   -- ^ List of valid result moves
502

    
503
possibleMoves _ False tdx =
504
    [ReplaceSecondary tdx]
505

    
506
possibleMoves True True tdx =
507
    [ReplaceSecondary tdx,
508
     ReplaceAndFailover tdx,
509
     ReplacePrimary tdx,
510
     FailoverAndReplace tdx]
511

    
512
possibleMoves False True tdx =
513
    [ReplaceSecondary tdx,
514
     ReplaceAndFailover tdx]
515

    
516
-- | Compute the best move for a given instance.
517
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
518
                  -> Bool              -- ^ Whether disk moves are allowed
519
                  -> Bool              -- ^ Whether instance moves are allowed
520
                  -> Table             -- ^ Original table
521
                  -> Instance.Instance -- ^ Instance to move
522
                  -> Table             -- ^ Best new table for this instance
523
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
524
    let
525
        opdx = Instance.pNode target
526
        osdx = Instance.sNode target
527
        nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
528
        use_secondary = elem osdx nodes_idx && inst_moves
529
        aft_failover = if use_secondary -- if allowed to failover
530
                       then checkSingleStep ini_tbl target ini_tbl Failover
531
                       else ini_tbl
532
        all_moves = if disk_moves
533
                    then concatMap
534
                         (possibleMoves use_secondary inst_moves) nodes
535
                    else []
536
    in
537
      -- iterate over the possible nodes for this instance
538
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
539

    
540
-- | Compute the best next move.
541
checkMove :: [Ndx]               -- ^ Allowed target node indices
542
          -> Bool                -- ^ Whether disk moves are allowed
543
          -> Bool                -- ^ Whether instance moves are allowed
544
          -> Table               -- ^ The current solution
545
          -> [Instance.Instance] -- ^ List of instances still to move
546
          -> Table               -- ^ The new solution
547
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
548
    let Table _ _ _ ini_plc = ini_tbl
549
        -- we're using rwhnf from the Control.Parallel.Strategies
550
        -- package; we don't need to use rnf as that would force too
551
        -- much evaluation in single-threaded cases, and in
552
        -- multi-threaded case the weak head normal form is enough to
553
        -- spark the evaluation
554
        tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
555
                               inst_moves ini_tbl)
556
                 victims
557
        -- iterate over all instances, computing the best move
558
        best_tbl = foldl' compareTables ini_tbl tables
559
        Table _ _ _ best_plc = best_tbl
560
    in if length best_plc == length ini_plc
561
       then ini_tbl -- no advancement
562
       else best_tbl
563

    
564
-- | Check if we are allowed to go deeper in the balancing.
565
doNextBalance :: Table     -- ^ The starting table
566
              -> Int       -- ^ Remaining length
567
              -> Score     -- ^ Score at which to stop
568
              -> Bool      -- ^ The resulting table and commands
569
doNextBalance ini_tbl max_rounds min_score =
570
    let Table _ _ ini_cv ini_plc = ini_tbl
571
        ini_plc_len = length ini_plc
572
    in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
573

    
574
-- | Run a balance move.
575
tryBalance :: Table       -- ^ The starting table
576
           -> Bool        -- ^ Allow disk moves
577
           -> Bool        -- ^ Allow instance moves
578
           -> Bool        -- ^ Only evacuate moves
579
           -> Score       -- ^ Min gain threshold
580
           -> Score       -- ^ Min gain
581
           -> Maybe Table -- ^ The resulting table and commands
582
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
583
    let Table ini_nl ini_il ini_cv _ = ini_tbl
584
        all_inst = Container.elems ini_il
585
        all_inst' = if evac_mode
586
                    then let bad_nodes = map Node.idx . filter Node.offline $
587
                                         Container.elems ini_nl
588
                         in filter (\e -> Instance.sNode e `elem` bad_nodes ||
589
                                          Instance.pNode e `elem` bad_nodes)
590
                            all_inst
591
                    else all_inst
592
        reloc_inst = filter Instance.movable all_inst'
593
        node_idx = map Node.idx . filter (not . Node.offline) $
594
                   Container.elems ini_nl
595
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
596
        (Table _ _ fin_cv _) = fin_tbl
597
    in
598
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
599
      then Just fin_tbl -- this round made success, return the new table
600
      else Nothing
601

    
602
-- * Allocation functions
603

    
604
-- | Build failure stats out of a list of failures.
605
collapseFailures :: [FailMode] -> FailStats
606
collapseFailures flst =
607
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
608
            [minBound..maxBound]
609

    
610
-- | Update current Allocation solution and failure stats with new
611
-- elements.
612
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
613
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
614

    
615
concatAllocs as (OpGood ns@(_, _, _, nscore)) =
616
    let -- Choose the old or new solution, based on the cluster score
617
        cntok = asAllocs as
618
        osols = asSolutions as
619
        nsols = case osols of
620
                  [] -> [ns]
621
                  (_, _, _, oscore):[] ->
622
                      if oscore < nscore
623
                      then osols
624
                      else [ns]
625
                  -- FIXME: here we simply concat to lists with more
626
                  -- than one element; we should instead abort, since
627
                  -- this is not a valid usage of this function
628
                  xs -> ns:xs
629
        nsuc = cntok + 1
630
    -- Note: we force evaluation of nsols here in order to keep the
631
    -- memory profile low - we know that we will need nsols for sure
632
    -- in the next cycle, so we force evaluation of nsols, since the
633
    -- foldl' in the caller will only evaluate the tuple, but not the
634
    -- elements of the tuple
635
    in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolutions = nsols }
636

    
637
-- | Sums two allocation solutions (e.g. for two separate node groups).
638
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
639
sumAllocs (AllocSolution af aa as al) (AllocSolution bf ba bs bl) =
640
    AllocSolution (af ++ bf) (aa + ba) (as ++ bs) (al ++ bl)
641

    
642
-- | Given a solution, generates a reasonable description for it.
643
describeSolution :: AllocSolution -> String
644
describeSolution as =
645
  let fcnt = asFailures as
646
      sols = asSolutions as
647
      freasons =
648
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
649
        filter ((> 0) . snd) . collapseFailures $ fcnt
650
  in if null sols
651
     then "No valid allocation solutions, failure reasons: " ++
652
          (if null fcnt
653
           then "unknown reasons"
654
           else freasons)
655
     else let (_, _, nodes, cv) = head sols
656
          in printf ("score: %.8f, successes %d, failures %d (%s)" ++
657
                     " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
658
             (intercalate "/" . map Node.name $ nodes)
659

    
660
-- | Annotates a solution with the appropriate string.
661
annotateSolution :: AllocSolution -> AllocSolution
662
annotateSolution as = as { asLog = describeSolution as : asLog as }
663

    
664
-- | Reverses an evacuation solution.
665
--
666
-- Rationale: we always concat the results to the top of the lists, so
667
-- for proper jobset execution, we should reverse all lists.
668
reverseEvacSolution :: EvacSolution -> EvacSolution
669
reverseEvacSolution (EvacSolution f m o) =
670
    EvacSolution (reverse f) (reverse m) (reverse o)
671

    
672
-- | Generate the valid node allocation singles or pairs for a new instance.
673
genAllocNodes :: Group.List        -- ^ Group list
674
              -> Node.List         -- ^ The node map
675
              -> Int               -- ^ The number of nodes required
676
              -> Bool              -- ^ Whether to drop or not
677
                                   -- unallocable nodes
678
              -> Result AllocNodes -- ^ The (monadic) result
679
genAllocNodes gl nl count drop_unalloc =
680
    let filter_fn = if drop_unalloc
681
                    then filter (Group.isAllocable .
682
                                 flip Container.find gl . Node.group)
683
                    else id
684
        all_nodes = filter_fn $ getOnline nl
685
        all_pairs = liftM2 (,) all_nodes all_nodes
686
        ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y &&
687
                                      Node.group x == Node.group y) all_pairs
688
    in case count of
689
         1 -> Ok (Left (map Node.idx all_nodes))
690
         2 -> Ok (Right (map (\(p, s) -> (Node.idx p, Node.idx s)) ok_pairs))
691
         _ -> Bad "Unsupported number of nodes, only one or two  supported"
692

    
693
-- | Try to allocate an instance on the cluster.
694
tryAlloc :: (Monad m) =>
695
            Node.List         -- ^ The node list
696
         -> Instance.List     -- ^ The instance list
697
         -> Instance.Instance -- ^ The instance to allocate
698
         -> AllocNodes        -- ^ The allocation targets
699
         -> m AllocSolution   -- ^ Possible solution list
700
tryAlloc nl _ inst (Right ok_pairs) =
701
    let sols = foldl' (\cstate (p, s) ->
702
                           concatAllocs cstate $ allocateOnPair nl inst p s
703
                      ) emptyAllocSolution ok_pairs
704

    
705
    in if null ok_pairs -- means we have just one node
706
       then fail "Not enough online nodes"
707
       else return $ annotateSolution sols
708

    
709
tryAlloc nl _ inst (Left all_nodes) =
710
    let sols = foldl' (\cstate ->
711
                           concatAllocs cstate . allocateOnSingle nl inst
712
                      ) emptyAllocSolution all_nodes
713
    in if null all_nodes
714
       then fail "No online nodes"
715
       else return $ annotateSolution sols
716

    
717
-- | Given a group/result, describe it as a nice (list of) messages.
718
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
719
solutionDescription gl (groupId, result) =
720
  case result of
721
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
722
    Bad message -> [printf "Group %s: error %s" gname message]
723
  where grp = Container.find groupId gl
724
        gname = Group.name grp
725
        pol = apolToString (Group.allocPolicy grp)
726

    
727
-- | From a list of possibly bad and possibly empty solutions, filter
728
-- only the groups with a valid result. Note that the result will be
729
-- reversed compared to the original list.
730
filterMGResults :: Group.List
731
                -> [(Gdx, Result AllocSolution)]
732
                -> [(Gdx, AllocSolution)]
733
filterMGResults gl = foldl' fn []
734
    where unallocable = not . Group.isAllocable . flip Container.find gl
735
          fn accu (gdx, rasol) =
736
              case rasol of
737
                Bad _ -> accu
738
                Ok sol | null (asSolutions sol) -> accu
739
                       | unallocable gdx -> accu
740
                       | otherwise -> (gdx, sol):accu
741

    
742
-- | Sort multigroup results based on policy and score.
743
sortMGResults :: Group.List
744
             -> [(Gdx, AllocSolution)]
745
             -> [(Gdx, AllocSolution)]
746
sortMGResults gl sols =
747
    let extractScore (_, _, _, x) = x
748
        solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
749
                               (extractScore . head . asSolutions) sol)
750
    in sortBy (comparing solScore) sols
751

    
752
-- | Finds the best group for an instance on a multi-group cluster.
753
--
754
-- Only solutions in @preferred@ and @last_resort@ groups will be
755
-- accepted as valid, and additionally if the allowed groups parameter
756
-- is not null then allocation will only be run for those group
757
-- indices.
758
findBestAllocGroup :: Group.List           -- ^ The group list
759
                   -> Node.List            -- ^ The node list
760
                   -> Instance.List        -- ^ The instance list
761
                   -> Maybe [Gdx]          -- ^ The allowed groups
762
                   -> Instance.Instance    -- ^ The instance to allocate
763
                   -> Int                  -- ^ Required number of nodes
764
                   -> Result (Gdx, AllocSolution, [String])
765
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
766
  let groups = splitCluster mgnl mgil
767
      groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
768
                allowed_gdxs
769
      sols = map (\(gid, (nl, il)) ->
770
                   (gid, genAllocNodes mggl nl cnt False >>=
771
                       tryAlloc nl il inst))
772
             groups'::[(Gdx, Result AllocSolution)]
773
      all_msgs = concatMap (solutionDescription mggl) sols
774
      goodSols = filterMGResults mggl sols
775
      sortedSols = sortMGResults mggl goodSols
776
  in if null sortedSols
777
     then Bad $ intercalate ", " all_msgs
778
     else let (final_group, final_sol) = head sortedSols
779
          in return (final_group, final_sol, all_msgs)
780

    
781
-- | Try to allocate an instance on a multi-group cluster.
782
tryMGAlloc :: Group.List           -- ^ The group list
783
           -> Node.List            -- ^ The node list
784
           -> Instance.List        -- ^ The instance list
785
           -> Instance.Instance    -- ^ The instance to allocate
786
           -> Int                  -- ^ Required number of nodes
787
           -> Result AllocSolution -- ^ Possible solution list
788
tryMGAlloc mggl mgnl mgil inst cnt = do
789
  (best_group, solution, all_msgs) <-
790
      findBestAllocGroup mggl mgnl mgil Nothing inst cnt
791
  let group_name = Group.name $ Container.find best_group mggl
792
      selmsg = "Selected group: " ++ group_name
793
  return $ solution { asLog = selmsg:all_msgs }
794

    
795
-- | Try to relocate an instance on the cluster.
796
tryReloc :: (Monad m) =>
797
            Node.List       -- ^ The node list
798
         -> Instance.List   -- ^ The instance list
799
         -> Idx             -- ^ The index of the instance to move
800
         -> Int             -- ^ The number of nodes required
801
         -> [Ndx]           -- ^ Nodes which should not be used
802
         -> m AllocSolution -- ^ Solution list
803
tryReloc nl il xid 1 ex_idx =
804
    let all_nodes = getOnline nl
805
        inst = Container.find xid il
806
        ex_idx' = Instance.pNode inst:ex_idx
807
        valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
808
        valid_idxes = map Node.idx valid_nodes
809
        sols1 = foldl' (\cstate x ->
810
                            let em = do
811
                                  (mnl, i, _, _) <-
812
                                      applyMove nl inst (ReplaceSecondary x)
813
                                  return (mnl, i, [Container.find x mnl],
814
                                          compCV mnl)
815
                            in concatAllocs cstate em
816
                       ) emptyAllocSolution valid_idxes
817
    in return sols1
818

    
819
tryReloc _ _ _ reqn _  = fail $ "Unsupported number of relocation \
820
                                \destinations required (" ++ show reqn ++
821
                                                  "), only one supported"
822

    
823
tryMGReloc :: (Monad m) =>
824
              Group.List      -- ^ The group list
825
           -> Node.List       -- ^ The node list
826
           -> Instance.List   -- ^ The instance list
827
           -> Idx             -- ^ The index of the instance to move
828
           -> Int             -- ^ The number of nodes required
829
           -> [Ndx]           -- ^ Nodes which should not be used
830
           -> m AllocSolution -- ^ Solution list
831
tryMGReloc _ mgnl mgil xid ncount ex_ndx = do
832
  let groups = splitCluster mgnl mgil
833
      -- TODO: we only relocate inside the group for now
834
      inst = Container.find xid mgil
835
  (nl, il) <- case lookup (instancePriGroup mgnl inst) groups of
836
                Nothing -> fail $ "Cannot find group for instance " ++
837
                           Instance.name inst
838
                Just v -> return v
839
  tryReloc nl il xid ncount ex_ndx
840

    
841
-- | Change an instance's secondary node.
842
evacInstance :: (Monad m) =>
843
                [Ndx]                      -- ^ Excluded nodes
844
             -> Instance.List              -- ^ The current instance list
845
             -> (Node.List, AllocSolution) -- ^ The current state
846
             -> Idx                        -- ^ The instance to evacuate
847
             -> m (Node.List, AllocSolution)
848
evacInstance ex_ndx il (nl, old_as) idx = do
849
  -- FIXME: hardcoded one node here
850

    
851
  -- Longer explanation: evacuation is currently hardcoded to DRBD
852
  -- instances (which have one secondary); hence, even if the
853
  -- IAllocator protocol can request N nodes for an instance, and all
854
  -- the message parsing/loading pass this, this implementation only
855
  -- supports one; this situation needs to be revisited if we ever
856
  -- support more than one secondary, or if we change the storage
857
  -- model
858
  new_as <- tryReloc nl il idx 1 ex_ndx
859
  case asSolutions new_as of
860
    -- an individual relocation succeeded, we kind of compose the data
861
    -- from the two solutions
862
    csol@(nl', _, _, _):_ ->
863
        return (nl', new_as { asSolutions = csol:asSolutions old_as })
864
    -- this relocation failed, so we fail the entire evac
865
    _ -> fail $ "Can't evacuate instance " ++
866
         Instance.name (Container.find idx il) ++
867
             ": " ++ describeSolution new_as
868

    
869
-- | Try to evacuate a list of nodes.
870
tryEvac :: (Monad m) =>
871
            Node.List       -- ^ The node list
872
         -> Instance.List   -- ^ The instance list
873
         -> [Idx]           -- ^ Instances to be evacuated
874
         -> [Ndx]           -- ^ Restricted nodes (the ones being evacuated)
875
         -> m AllocSolution -- ^ Solution list
876
tryEvac nl il idxs ex_ndx = do
877
  (_, sol) <- foldM (evacInstance ex_ndx il) (nl, emptyAllocSolution) idxs
878
  return sol
879

    
880
-- | Multi-group evacuation of a list of nodes.
881
tryMGEvac :: (Monad m) =>
882
             Group.List -- ^ The group list
883
          -> Node.List       -- ^ The node list
884
          -> Instance.List   -- ^ The instance list
885
          -> [Ndx]           -- ^ Nodes to be evacuated
886
          -> m AllocSolution -- ^ Solution list
887
tryMGEvac _ nl il ex_ndx =
888
    let ex_nodes = map (`Container.find` nl) ex_ndx
889
        all_insts = nub . concatMap Node.sList $ ex_nodes
890
        all_insts' = associateIdxs all_insts $ splitCluster nl il
891
    in do
892
      results <- mapM (\(_, (gnl, gil, idxs)) -> tryEvac gnl gil idxs ex_ndx)
893
                 all_insts'
894
      let sol = foldl' sumAllocs emptyAllocSolution results
895
      return $ annotateSolution sol
896

    
897
-- | Function which fails if the requested mode is change secondary.
898
--
899
-- This is useful since except DRBD, no other disk template can
900
-- execute change secondary; thus, we can just call this function
901
-- instead of always checking for secondary mode. After the call to
902
-- this function, whatever mode we have is just a primary change.
903
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
904
failOnSecondaryChange ChangeSecondary dt =
905
    fail $ "Instances with disk template '" ++ dtToString dt ++
906
         "' can't execute change secondary"
907
failOnSecondaryChange _ _ = return ()
908

    
909
-- | Run evacuation for a single instance.
910
--
911
-- /Note:/ this function should correctly execute both intra-group
912
-- evacuations (in all modes) and inter-group evacuations (in the
913
-- 'ChangeAll' mode). Of course, this requires that the correct list
914
-- of target nodes is passed.
915
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
916
                 -> Instance.List     -- ^ Instance list (cluster-wide)
917
                 -> EvacMode          -- ^ The evacuation mode
918
                 -> Instance.Instance -- ^ The instance to be evacuated
919
                 -> Gdx               -- ^ The group we're targetting
920
                 -> [Ndx]             -- ^ The list of available nodes
921
                                      -- for allocation
922
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
923
nodeEvacInstance _ _ mode (Instance.Instance
924
                           {Instance.diskTemplate = dt@DTDiskless}) _ _ =
925
                  failOnSecondaryChange mode dt >>
926
                  fail "Diskless relocations not implemented yet"
927

    
928
nodeEvacInstance _ _ _ (Instance.Instance
929
                        {Instance.diskTemplate = DTPlain}) _ _ =
930
                  fail "Instances of type plain cannot be relocated"
931

    
932
nodeEvacInstance _ _ _ (Instance.Instance
933
                        {Instance.diskTemplate = DTFile}) _ _ =
934
                  fail "Instances of type file cannot be relocated"
935

    
936
nodeEvacInstance _ _ mode  (Instance.Instance
937
                            {Instance.diskTemplate = dt@DTSharedFile}) _ _ =
938
                  failOnSecondaryChange mode dt >>
939
                  fail "Shared file relocations not implemented yet"
940

    
941
nodeEvacInstance _ _ mode (Instance.Instance
942
                           {Instance.diskTemplate = dt@DTBlock}) _ _ =
943
                  failOnSecondaryChange mode dt >>
944
                  fail "Block device relocations not implemented yet"
945

    
946
nodeEvacInstance nl il ChangePrimary
947
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
948
                 _ _ =
949
  do
950
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
951
    let idx = Instance.idx inst
952
        il' = Container.add idx inst' il
953
        ops = iMoveToJob nl' il' idx Failover
954
    return (nl', il', ops)
955

    
956
nodeEvacInstance nl il ChangeSecondary
957
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
958
                 gdx avail_nodes =
959
  do
960
    (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
961
                            eitherToResult $
962
                            foldl' (evacDrbdSecondaryInner nl inst gdx)
963
                            (Left "no nodes available") avail_nodes
964
    let idx = Instance.idx inst
965
        il' = Container.add idx inst' il
966
        ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx)
967
    return (nl', il', ops)
968

    
969
nodeEvacInstance nl il ChangeAll
970
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
971
                 gdx avail_nodes =
972
  do
973
    let primary = Container.find (Instance.pNode inst) nl
974
        idx = Instance.idx inst
975
        no_nodes = Left "no nodes available"
976
    -- if the primary is offline, then we first failover
977
    (nl1, inst1, ops1) <-
978
        if Node.offline primary
979
        then do
980
          (nl', inst', _, _) <-
981
              annotateResult "Failing over to the secondary" $
982
              opToResult $ applyMove nl inst Failover
983
          return (nl', inst', [Failover])
984
        else return (nl, inst, [])
985
    -- we now need to execute a replace secondary to the future
986
    -- primary node
987
    (nl2, inst2, _, new_pdx) <- annotateResult "Searching for a new primary" $
988
                                eitherToResult $
989
                                foldl' (evacDrbdSecondaryInner nl1 inst1 gdx)
990
                                no_nodes avail_nodes
991
    let ops2 = ReplaceSecondary new_pdx:ops1
992
    -- since we chose the new primary, we remove it from the list of
993
    -- available nodes
994
    let avail_nodes_sec = new_pdx `delete` avail_nodes
995
    -- we now execute another failover, the primary stays fixed now
996
    (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
997
                          opToResult $ applyMove nl2 inst2 Failover
998
    let ops3 = Failover:ops2
999
    -- and finally another replace secondary, to the final secondary
1000
    (nl4, inst4, _, new_sdx) <-
1001
        annotateResult "Searching for a new secondary" $
1002
        eitherToResult $
1003
        foldl' (evacDrbdSecondaryInner nl3 inst3 gdx) no_nodes avail_nodes_sec
1004
    let ops4 = ReplaceSecondary new_sdx:ops3
1005
        il' = Container.add idx inst4 il
1006
        ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1007
    return (nl4, il', ops)
1008

    
1009
-- | Inner fold function for changing secondary of a DRBD instance.
1010
--
1011
-- The "running" solution is either a @Left String@, which means we
1012
-- don't have yet a working solution, or a @Right (...)@, which
1013
-- represents a valid solution; it holds the modified node list, the
1014
-- modified instance (after evacuation), the score of that solution,
1015
-- and the new secondary node index.
1016
evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list
1017
                       -> Instance.Instance -- ^ Instance being evacuated
1018
                       -> Gdx -- ^ The group index of the instance
1019
                       -> Either String ( Node.List
1020
                                        , Instance.Instance
1021
                                        , Score
1022
                                        , Ndx)  -- ^ Current best solution
1023
                       -> Ndx  -- ^ Node we're evaluating as new secondary
1024
                       -> Either String ( Node.List
1025
                                        , Instance.Instance
1026
                                        , Score
1027
                                        , Ndx) -- ^ New best solution
1028
evacDrbdSecondaryInner nl inst gdx accu ndx =
1029
    case applyMove nl inst (ReplaceSecondary ndx) of
1030
      OpFail fm ->
1031
          case accu of
1032
            Right _ -> accu
1033
            Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
1034
                      " failed: " ++ show fm
1035
      OpGood (nl', inst', _, _) ->
1036
          let nodes = Container.elems nl'
1037
              -- The fromJust below is ugly (it can fail nastily), but
1038
              -- at this point we should have any internal mismatches,
1039
              -- and adding a monad here would be quite involved
1040
              grpnodes = fromJust (gdx `lookup` (Node.computeGroups nodes))
1041
              new_cv = compCVNodes grpnodes
1042
              new_accu = Right (nl', inst', new_cv, ndx)
1043
          in case accu of
1044
               Left _ -> new_accu
1045
               Right (_, _, old_cv, _) ->
1046
                   if old_cv < new_cv
1047
                   then accu
1048
                   else new_accu
1049

    
1050
-- | Computes the nodes in a given group which are available for
1051
-- allocation.
1052
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1053
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
1054
                    -> Gdx            -- ^ The group for which we
1055
                                      -- query the nodes
1056
                    -> Result [Ndx]   -- ^ List of available node indices
1057
availableGroupNodes group_nodes excl_ndx gdx = do
1058
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1059
                 Ok (lookup gdx group_nodes)
1060
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1061
  return avail_nodes
1062

    
1063
-- | Updates the evac solution with the results of an instance
1064
-- evacuation.
1065
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1066
                   -> Idx
1067
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1068
                   -> (Node.List, Instance.List, EvacSolution)
1069
updateEvacSolution (nl, il, es) idx (Bad msg) =
1070
    (nl, il, es { esFailed = (idx, msg):esFailed es})
1071
updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1072
    (nl, il, es { esMoved = new_elem:esMoved es
1073
                , esOpCodes = [opcodes]:esOpCodes es })
1074
     where inst = Container.find idx il
1075
           new_elem = (idx,
1076
                       instancePriGroup nl inst,
1077
                       Instance.allNodes inst)
1078

    
1079
-- | Node-evacuation IAllocator mode main function.
1080
tryNodeEvac :: Group.List    -- ^ The cluster groups
1081
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
1082
            -> Instance.List -- ^ Instance list (cluster-wide)
1083
            -> EvacMode      -- ^ The evacuation mode
1084
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
1085
            -> Result (Node.List, Instance.List, EvacSolution)
1086
tryNodeEvac _ ini_nl ini_il mode idxs =
1087
    let evac_ndx = nodesToEvacuate ini_il mode idxs
1088
        offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1089
        excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1090
        group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1091
                                             (Container.elems nl))) $
1092
                      splitCluster ini_nl ini_il
1093
        (fin_nl, fin_il, esol) =
1094
            foldl' (\state@(nl, il, _) inst ->
1095
                        let gdx = instancePriGroup nl inst in
1096
                        updateEvacSolution state (Instance.idx inst) $
1097
                        availableGroupNodes group_ndx
1098
                          excl_ndx gdx >>=
1099
                        nodeEvacInstance nl il mode inst gdx
1100
                   )
1101
            (ini_nl, ini_il, emptyEvacSolution)
1102
            (map (`Container.find` ini_il) idxs)
1103
    in return (fin_nl, fin_il, reverseEvacSolution esol)
1104

    
1105
-- | Change-group IAllocator mode main function.
1106
--
1107
-- This is very similar to 'tryNodeEvac', the only difference is that
1108
-- we don't choose as target group the current instance group, but
1109
-- instead:
1110
--
1111
--   1. at the start of the function, we compute which are the target
1112
--   groups; either no groups were passed in, in which case we choose
1113
--   all groups out of which we don't evacuate instance, or there were
1114
--   some groups passed, in which case we use those
1115
--
1116
--   2. for each instance, we use 'findBestAllocGroup' to choose the
1117
--   best group to hold the instance, and then we do what
1118
--   'tryNodeEvac' does, except for this group instead of the current
1119
--   instance group.
1120
--
1121
-- Note that the correct behaviour of this function relies on the
1122
-- function 'nodeEvacInstance' to be able to do correctly both
1123
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1124
tryChangeGroup :: Group.List    -- ^ The cluster groups
1125
               -> Node.List     -- ^ The node list (cluster-wide)
1126
               -> Instance.List -- ^ Instance list (cluster-wide)
1127
               -> [Gdx]         -- ^ Target groups; if empty, any
1128
                                -- groups not being evacuated
1129
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
1130
               -> Result (Node.List, Instance.List, EvacSolution)
1131
tryChangeGroup gl ini_nl ini_il gdxs idxs =
1132
    let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1133
                               flip Container.find ini_il) idxs
1134
        target_gdxs = (if null gdxs
1135
                       then Container.keys gl
1136
                       else gdxs) \\ evac_gdxs
1137
        offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1138
        excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1139
        group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1140
                                             (Container.elems nl))) $
1141
                      splitCluster ini_nl ini_il
1142
        (fin_nl, fin_il, esol) =
1143
            foldl' (\state@(nl, il, _) inst ->
1144
                        let solution = do
1145
                              let ncnt = Instance.requiredNodes $
1146
                                         Instance.diskTemplate inst
1147
                              (gdx, _, _) <- findBestAllocGroup gl nl il
1148
                                             (Just target_gdxs) inst ncnt
1149
                              av_nodes <- availableGroupNodes group_ndx
1150
                                          excl_ndx gdx
1151
                              nodeEvacInstance nl il ChangeAll inst
1152
                                       gdx av_nodes
1153
                        in updateEvacSolution state
1154
                               (Instance.idx inst) solution
1155
                   )
1156
            (ini_nl, ini_il, emptyEvacSolution)
1157
            (map (`Container.find` ini_il) idxs)
1158
    in return (fin_nl, fin_il, reverseEvacSolution esol)
1159

    
1160
-- | Recursively place instances on the cluster until we're out of space.
1161
iterateAlloc :: Node.List
1162
             -> Instance.List
1163
             -> Maybe Int
1164
             -> Instance.Instance
1165
             -> AllocNodes
1166
             -> [Instance.Instance]
1167
             -> [CStats]
1168
             -> Result AllocResult
1169
iterateAlloc nl il limit newinst allocnodes ixes cstats =
1170
      let depth = length ixes
1171
          newname = printf "new-%d" depth::String
1172
          newidx = length (Container.elems il) + depth
1173
          newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1174
          newlimit = fmap (flip (-) 1) limit
1175
      in case tryAlloc nl il newi2 allocnodes of
1176
           Bad s -> Bad s
1177
           Ok (AllocSolution { asFailures = errs, asSolutions = sols3 }) ->
1178
               let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1179
               case sols3 of
1180
                 [] -> newsol
1181
                 (xnl, xi, _, _):[] ->
1182
                     if limit == Just 0
1183
                     then newsol
1184
                     else iterateAlloc xnl (Container.add newidx xi il)
1185
                          newlimit newinst allocnodes (xi:ixes)
1186
                          (totalResources xnl:cstats)
1187
                 _ -> Bad "Internal error: multiple solutions for single\
1188
                          \ allocation"
1189

    
1190
-- | The core of the tiered allocation mode.
1191
tieredAlloc :: Node.List
1192
            -> Instance.List
1193
            -> Maybe Int
1194
            -> Instance.Instance
1195
            -> AllocNodes
1196
            -> [Instance.Instance]
1197
            -> [CStats]
1198
            -> Result AllocResult
1199
tieredAlloc nl il limit newinst allocnodes ixes cstats =
1200
    case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1201
      Bad s -> Bad s
1202
      Ok (errs, nl', il', ixes', cstats') ->
1203
          let newsol = Ok (errs, nl', il', ixes', cstats')
1204
              ixes_cnt = length ixes'
1205
              (stop, newlimit) = case limit of
1206
                                   Nothing -> (False, Nothing)
1207
                                   Just n -> (n <= ixes_cnt,
1208
                                              Just (n - ixes_cnt)) in
1209
          if stop then newsol else
1210
          case Instance.shrinkByType newinst . fst . last $
1211
               sortBy (comparing snd) errs of
1212
            Bad _ -> newsol
1213
            Ok newinst' -> tieredAlloc nl' il' newlimit
1214
                           newinst' allocnodes ixes' cstats'
1215

    
1216
-- | Compute the tiered spec string description from a list of
1217
-- allocated instances.
1218
tieredSpecMap :: [Instance.Instance]
1219
              -> [String]
1220
tieredSpecMap trl_ixes =
1221
    let fin_trl_ixes = reverse trl_ixes
1222
        ix_byspec = groupBy ((==) `on` Instance.specOf) fin_trl_ixes
1223
        spec_map = map (\ixs -> (Instance.specOf $ head ixs, length ixs))
1224
                   ix_byspec
1225
    in  map (\(spec, cnt) -> printf "%d,%d,%d=%d" (rspecMem spec)
1226
                             (rspecDsk spec) (rspecCpu spec) cnt) spec_map
1227

    
1228
-- * Formatting functions
1229

    
1230
-- | Given the original and final nodes, computes the relocation description.
1231
computeMoves :: Instance.Instance -- ^ The instance to be moved
1232
             -> String -- ^ The instance name
1233
             -> IMove  -- ^ The move being performed
1234
             -> String -- ^ New primary
1235
             -> String -- ^ New secondary
1236
             -> (String, [String])
1237
                -- ^ Tuple of moves and commands list; moves is containing
1238
                -- either @/f/@ for failover or @/r:name/@ for replace
1239
                -- secondary, while the command list holds gnt-instance
1240
                -- commands (without that prefix), e.g \"@failover instance1@\"
1241
computeMoves i inam mv c d =
1242
    case mv of
1243
      Failover -> ("f", [mig])
1244
      FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1245
      ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1246
      ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1247
      ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1248
    where morf = if Instance.running i then "migrate" else "failover"
1249
          mig = printf "%s -f %s" morf inam::String
1250
          rep n = printf "replace-disks -n %s %s" n inam
1251

    
1252
-- | Converts a placement to string format.
1253
printSolutionLine :: Node.List     -- ^ The node list
1254
                  -> Instance.List -- ^ The instance list
1255
                  -> Int           -- ^ Maximum node name length
1256
                  -> Int           -- ^ Maximum instance name length
1257
                  -> Placement     -- ^ The current placement
1258
                  -> Int           -- ^ The index of the placement in
1259
                                   -- the solution
1260
                  -> (String, [String])
1261
printSolutionLine nl il nmlen imlen plc pos =
1262
    let
1263
        pmlen = (2*nmlen + 1)
1264
        (i, p, s, mv, c) = plc
1265
        inst = Container.find i il
1266
        inam = Instance.alias inst
1267
        npri = Node.alias $ Container.find p nl
1268
        nsec = Node.alias $ Container.find s nl
1269
        opri = Node.alias $ Container.find (Instance.pNode inst) nl
1270
        osec = Node.alias $ Container.find (Instance.sNode inst) nl
1271
        (moves, cmds) =  computeMoves inst inam mv npri nsec
1272
        ostr = printf "%s:%s" opri osec::String
1273
        nstr = printf "%s:%s" npri nsec::String
1274
    in
1275
      (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
1276
       pos imlen inam pmlen ostr
1277
       pmlen nstr c moves,
1278
       cmds)
1279

    
1280
-- | Return the instance and involved nodes in an instance move.
1281
involvedNodes :: Instance.List -> Placement -> [Ndx]
1282
involvedNodes il plc =
1283
    let (i, np, ns, _, _) = plc
1284
        inst = Container.find i il
1285
        op = Instance.pNode inst
1286
        os = Instance.sNode inst
1287
    in nub [np, ns, op, os]
1288

    
1289
-- | Inner function for splitJobs, that either appends the next job to
1290
-- the current jobset, or starts a new jobset.
1291
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1292
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1293
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1294
    | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1295
    | otherwise = ([n]:cjs, ndx)
1296

    
1297
-- | Break a list of moves into independent groups. Note that this
1298
-- will reverse the order of jobs.
1299
splitJobs :: [MoveJob] -> [JobSet]
1300
splitJobs = fst . foldl mergeJobs ([], [])
1301

    
1302
-- | Given a list of commands, prefix them with @gnt-instance@ and
1303
-- also beautify the display a little.
1304
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1305
formatJob jsn jsl (sn, (_, _, _, cmds)) =
1306
    let out =
1307
            printf "  echo job %d/%d" jsn sn:
1308
            printf "  check":
1309
            map ("  gnt-instance " ++) cmds
1310
    in if sn == 1
1311
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1312
       else out
1313

    
1314
-- | Given a list of commands, prefix them with @gnt-instance@ and
1315
-- also beautify the display a little.
1316
formatCmds :: [JobSet] -> String
1317
formatCmds =
1318
    unlines .
1319
    concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1320
                             (zip [1..] js)) .
1321
    zip [1..]
1322

    
1323
-- | Print the node list.
1324
printNodes :: Node.List -> [String] -> String
1325
printNodes nl fs =
1326
    let fields = case fs of
1327
          [] -> Node.defaultFields
1328
          "+":rest -> Node.defaultFields ++ rest
1329
          _ -> fs
1330
        snl = sortBy (comparing Node.idx) (Container.elems nl)
1331
        (header, isnum) = unzip $ map Node.showHeader fields
1332
    in unlines . map ((:) ' ' .  intercalate " ") $
1333
       formatTable (header:map (Node.list fields) snl) isnum
1334

    
1335
-- | Print the instance list.
1336
printInsts :: Node.List -> Instance.List -> String
1337
printInsts nl il =
1338
    let sil = sortBy (comparing Instance.idx) (Container.elems il)
1339
        helper inst = [ if Instance.running inst then "R" else " "
1340
                      , Instance.name inst
1341
                      , Container.nameOf nl (Instance.pNode inst)
1342
                      , let sdx = Instance.sNode inst
1343
                        in if sdx == Node.noSecondary
1344
                           then  ""
1345
                           else Container.nameOf nl sdx
1346
                      , if Instance.autoBalance inst then "Y" else "N"
1347
                      , printf "%3d" $ Instance.vcpus inst
1348
                      , printf "%5d" $ Instance.mem inst
1349
                      , printf "%5d" $ Instance.dsk inst `div` 1024
1350
                      , printf "%5.3f" lC
1351
                      , printf "%5.3f" lM
1352
                      , printf "%5.3f" lD
1353
                      , printf "%5.3f" lN
1354
                      ]
1355
            where DynUtil lC lM lD lN = Instance.util inst
1356
        header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1357
                 , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1358
        isnum = False:False:False:False:False:repeat True
1359
    in unlines . map ((:) ' ' . intercalate " ") $
1360
       formatTable (header:map helper sil) isnum
1361

    
1362
-- | Shows statistics for a given node list.
1363
printStats :: Node.List -> String
1364
printStats nl =
1365
    let dcvs = compDetailedCV $ Container.elems nl
1366
        (weights, names) = unzip detailedCVInfo
1367
        hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1368
        formatted = map (\(w, header, val) ->
1369
                             printf "%s=%.8f(x%.2f)" header val w::String) hd
1370
    in intercalate ", " formatted
1371

    
1372
-- | Convert a placement into a list of OpCodes (basically a job).
1373
iMoveToJob :: Node.List -> Instance.List
1374
          -> Idx -> IMove -> [OpCodes.OpCode]
1375
iMoveToJob nl il idx move =
1376
    let inst = Container.find idx il
1377
        iname = Instance.name inst
1378
        lookNode  = Just . Container.nameOf nl
1379
        opF = OpCodes.OpInstanceMigrate iname True False True
1380
        opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1381
                OpCodes.ReplaceNewSecondary [] Nothing
1382
    in case move of
1383
         Failover -> [ opF ]
1384
         ReplacePrimary np -> [ opF, opR np, opF ]
1385
         ReplaceSecondary ns -> [ opR ns ]
1386
         ReplaceAndFailover np -> [ opR np, opF ]
1387
         FailoverAndReplace ns -> [ opF, opR ns ]
1388

    
1389
-- * Node group functions
1390

    
1391
-- | Computes the group of an instance.
1392
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1393
instanceGroup nl i =
1394
  let sidx = Instance.sNode i
1395
      pnode = Container.find (Instance.pNode i) nl
1396
      snode = if sidx == Node.noSecondary
1397
              then pnode
1398
              else Container.find sidx nl
1399
      pgroup = Node.group pnode
1400
      sgroup = Node.group snode
1401
  in if pgroup /= sgroup
1402
     then fail ("Instance placed accross two node groups, primary " ++
1403
                show pgroup ++ ", secondary " ++ show sgroup)
1404
     else return pgroup
1405

    
1406
-- | Computes the group of an instance per the primary node.
1407
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1408
instancePriGroup nl i =
1409
  let pnode = Container.find (Instance.pNode i) nl
1410
  in  Node.group pnode
1411

    
1412
-- | Compute the list of badly allocated instances (split across node
1413
-- groups).
1414
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1415
findSplitInstances nl =
1416
  filter (not . isOk . instanceGroup nl) . Container.elems
1417

    
1418
-- | Splits a cluster into the component node groups.
1419
splitCluster :: Node.List -> Instance.List ->
1420
                [(Gdx, (Node.List, Instance.List))]
1421
splitCluster nl il =
1422
  let ngroups = Node.computeGroups (Container.elems nl)
1423
  in map (\(guuid, nodes) ->
1424
           let nidxs = map Node.idx nodes
1425
               nodes' = zip nidxs nodes
1426
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1427
           in (guuid, (Container.fromList nodes', instances))) ngroups
1428

    
1429
-- | Split a global instance index map into per-group, and associate
1430
-- it with the group/node/instance lists.
1431
associateIdxs :: [Idx] -- ^ Instance indices to be split/associated
1432
              -> [(Gdx, (Node.List, Instance.List))]        -- ^ Input groups
1433
              -> [(Gdx, (Node.List, Instance.List, [Idx]))] -- ^ Result
1434
associateIdxs idxs =
1435
    map (\(gdx, (nl, il)) ->
1436
             (gdx, (nl, il, filter (`Container.member` il) idxs)))
1437

    
1438
-- | Compute the list of nodes that are to be evacuated, given a list
1439
-- of instances and an evacuation mode.
1440
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1441
                -> EvacMode      -- ^ The evacuation mode we're using
1442
                -> [Idx]         -- ^ List of instance indices being evacuated
1443
                -> IntSet.IntSet -- ^ Set of node indices
1444
nodesToEvacuate il mode =
1445
    IntSet.delete Node.noSecondary .
1446
    foldl' (\ns idx ->
1447
                let i = Container.find idx il
1448
                    pdx = Instance.pNode i
1449
                    sdx = Instance.sNode i
1450
                    dt = Instance.diskTemplate i
1451
                    withSecondary = case dt of
1452
                                      DTDrbd8 -> IntSet.insert sdx ns
1453
                                      _ -> ns
1454
                in case mode of
1455
                     ChangePrimary   -> IntSet.insert pdx ns
1456
                     ChangeSecondary -> withSecondary
1457
                     ChangeAll       -> IntSet.insert pdx withSecondary
1458
           ) IntSet.empty