Statistics
| Branch: | Tag: | Revision:

root / htools / Ganeti / HTools / Cluster.hs @ 8fd09137

History | View | Annotate | Download (59 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
    (
31
     -- * Types
32
      AllocSolution(..)
33
    , EvacSolution(..)
34
    , Table(..)
35
    , CStats(..)
36
    , AllocStats
37
    -- * Generic functions
38
    , totalResources
39
    , computeAllocationDelta
40
    -- * First phase functions
41
    , computeBadItems
42
    -- * Second phase functions
43
    , printSolutionLine
44
    , formatCmds
45
    , involvedNodes
46
    , splitJobs
47
    -- * Display functions
48
    , printNodes
49
    , printInsts
50
    -- * Balacing functions
51
    , checkMove
52
    , doNextBalance
53
    , tryBalance
54
    , compCV
55
    , compCVNodes
56
    , compDetailedCV
57
    , printStats
58
    , iMoveToJob
59
    -- * IAllocator functions
60
    , genAllocNodes
61
    , tryAlloc
62
    , tryMGAlloc
63
    , tryReloc
64
    , tryMGReloc
65
    , tryEvac
66
    , tryMGEvac
67
    , tryNodeEvac
68
    , collapseFailures
69
    -- * Allocation functions
70
    , iterateAlloc
71
    , tieredAlloc
72
    , tieredSpecMap
73
     -- * Node group functions
74
    , instanceGroup
75
    , findSplitInstances
76
    , splitCluster
77
    ) where
78

    
79
import Data.Function (on)
80
import qualified Data.IntSet as IntSet
81
import Data.List
82
import Data.Maybe (fromJust)
83
import Data.Ord (comparing)
84
import Text.Printf (printf)
85
import Control.Monad
86
import Control.Parallel.Strategies
87

    
88
import qualified Ganeti.HTools.Container as Container
89
import qualified Ganeti.HTools.Instance as Instance
90
import qualified Ganeti.HTools.Node as Node
91
import qualified Ganeti.HTools.Group as Group
92
import Ganeti.HTools.Types
93
import Ganeti.HTools.Utils
94
import qualified Ganeti.OpCodes as OpCodes
95

    
96
-- * Types
97

    
98
-- | Allocation\/relocation solution.
99
data AllocSolution = AllocSolution
100
  { asFailures  :: [FailMode]          -- ^ Failure counts
101
  , asAllocs    :: Int                 -- ^ Good allocation count
102
  , asSolutions :: [Node.AllocElement] -- ^ The actual result, length
103
                                       -- of the list depends on the
104
                                       -- allocation/relocation mode
105
  , asLog       :: [String]            -- ^ A list of informational messages
106
  }
107

    
108
-- | Node evacuation/group change iallocator result type. This result
109
-- type consists of actual opcodes (a restricted subset) that are
110
-- transmitted back to Ganeti.
111
data EvacSolution = EvacSolution
112
    { esMoved   :: [String]             -- ^ Instance moved successfully
113
    , esFailed  :: [String]             -- ^ Instance which were not
114
                                        -- relocated
115
    , esOpCodes :: [[[OpCodes.OpCode]]] -- ^ List of lists of jobs
116
    }
117

    
118
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
119
type AllocResult = (FailStats, Node.List, Instance.List,
120
                    [Instance.Instance], [CStats])
121

    
122
-- | A type denoting the valid allocation mode/pairs.
123
--
124
-- For a one-node allocation, this will be a @Left ['Node.Node']@,
125
-- whereas for a two-node allocation, this will be a @Right
126
-- [('Node.Node', 'Node.Node')]@.
127
type AllocNodes = Either [Ndx] [(Ndx, Ndx)]
128

    
129
-- | The empty solution we start with when computing allocations.
130
emptyAllocSolution :: AllocSolution
131
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
132
                                   , asSolutions = [], asLog = [] }
133

    
134
-- | The empty evac solution.
135
emptyEvacSolution :: EvacSolution
136
emptyEvacSolution = EvacSolution { esMoved = []
137
                                 , esFailed = []
138
                                 , esOpCodes = []
139
                                 }
140

    
141
-- | The complete state for the balancing solution.
142
data Table = Table Node.List Instance.List Score [Placement]
143
             deriving (Show, Read)
144

    
145
data CStats = CStats { csFmem :: Integer -- ^ Cluster free mem
146
                     , csFdsk :: Integer -- ^ Cluster free disk
147
                     , csAmem :: Integer -- ^ Cluster allocatable mem
148
                     , csAdsk :: Integer -- ^ Cluster allocatable disk
149
                     , csAcpu :: Integer -- ^ Cluster allocatable cpus
150
                     , csMmem :: Integer -- ^ Max node allocatable mem
151
                     , csMdsk :: Integer -- ^ Max node allocatable disk
152
                     , csMcpu :: Integer -- ^ Max node allocatable cpu
153
                     , csImem :: Integer -- ^ Instance used mem
154
                     , csIdsk :: Integer -- ^ Instance used disk
155
                     , csIcpu :: Integer -- ^ Instance used cpu
156
                     , csTmem :: Double  -- ^ Cluster total mem
157
                     , csTdsk :: Double  -- ^ Cluster total disk
158
                     , csTcpu :: Double  -- ^ Cluster total cpus
159
                     , csVcpu :: Integer -- ^ Cluster virtual cpus (if
160
                                         -- node pCpu has been set,
161
                                         -- otherwise -1)
162
                     , csXmem :: Integer -- ^ Unnacounted for mem
163
                     , csNmem :: Integer -- ^ Node own memory
164
                     , csScore :: Score  -- ^ The cluster score
165
                     , csNinst :: Int    -- ^ The total number of instances
166
                     }
167
            deriving (Show, Read)
168

    
169
-- | Currently used, possibly to allocate, unallocable.
170
type AllocStats = (RSpec, RSpec, RSpec)
171

    
172
-- * Utility functions
173

    
174
-- | Verifies the N+1 status and return the affected nodes.
175
verifyN1 :: [Node.Node] -> [Node.Node]
176
verifyN1 = filter Node.failN1
177

    
178
{-| Computes the pair of bad nodes and instances.
179

    
180
The bad node list is computed via a simple 'verifyN1' check, and the
181
bad instance list is the list of primary and secondary instances of
182
those nodes.
183

    
184
-}
185
computeBadItems :: Node.List -> Instance.List ->
186
                   ([Node.Node], [Instance.Instance])
187
computeBadItems nl il =
188
  let bad_nodes = verifyN1 $ getOnline nl
189
      bad_instances = map (`Container.find` il) .
190
                      sort . nub $
191
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
192
  in
193
    (bad_nodes, bad_instances)
194

    
195
-- | Zero-initializer for the CStats type.
196
emptyCStats :: CStats
197
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
198

    
199
-- | Update stats with data from a new node.
200
updateCStats :: CStats -> Node.Node -> CStats
201
updateCStats cs node =
202
    let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
203
                 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
204
                 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
205
                 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
206
                 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
207
                 csVcpu = x_vcpu,
208
                 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
209
               }
210
            = cs
211
        inc_amem = Node.fMem node - Node.rMem node
212
        inc_amem' = if inc_amem > 0 then inc_amem else 0
213
        inc_adsk = Node.availDisk node
214
        inc_imem = truncate (Node.tMem node) - Node.nMem node
215
                   - Node.xMem node - Node.fMem node
216
        inc_icpu = Node.uCpu node
217
        inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
218
        inc_vcpu = Node.hiCpu node
219
        inc_acpu = Node.availCpu node
220

    
221
    in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
222
          , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
223
          , csAmem = x_amem + fromIntegral inc_amem'
224
          , csAdsk = x_adsk + fromIntegral inc_adsk
225
          , csAcpu = x_acpu + fromIntegral inc_acpu
226
          , csMmem = max x_mmem (fromIntegral inc_amem')
227
          , csMdsk = max x_mdsk (fromIntegral inc_adsk)
228
          , csMcpu = max x_mcpu (fromIntegral inc_acpu)
229
          , csImem = x_imem + fromIntegral inc_imem
230
          , csIdsk = x_idsk + fromIntegral inc_idsk
231
          , csIcpu = x_icpu + fromIntegral inc_icpu
232
          , csTmem = x_tmem + Node.tMem node
233
          , csTdsk = x_tdsk + Node.tDsk node
234
          , csTcpu = x_tcpu + Node.tCpu node
235
          , csVcpu = x_vcpu + fromIntegral inc_vcpu
236
          , csXmem = x_xmem + fromIntegral (Node.xMem node)
237
          , csNmem = x_nmem + fromIntegral (Node.nMem node)
238
          , csNinst = x_ninst + length (Node.pList node)
239
          }
240

    
241
-- | Compute the total free disk and memory in the cluster.
242
totalResources :: Node.List -> CStats
243
totalResources nl =
244
    let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
245
    in cs { csScore = compCV nl }
246

    
247
-- | Compute the delta between two cluster state.
248
--
249
-- This is used when doing allocations, to understand better the
250
-- available cluster resources. The return value is a triple of the
251
-- current used values, the delta that was still allocated, and what
252
-- was left unallocated.
253
computeAllocationDelta :: CStats -> CStats -> AllocStats
254
computeAllocationDelta cini cfin =
255
    let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
256
        CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
257
                csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
258
        rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem)
259
               (fromIntegral i_idsk)
260
        rfin = RSpec (fromIntegral (f_icpu - i_icpu))
261
               (fromIntegral (f_imem - i_imem))
262
               (fromIntegral (f_idsk - i_idsk))
263
        un_cpu = fromIntegral (v_cpu - f_icpu)::Int
264
        runa = RSpec un_cpu (truncate t_mem - fromIntegral f_imem)
265
               (truncate t_dsk - fromIntegral f_idsk)
266
    in (rini, rfin, runa)
267

    
268
-- | The names and weights of the individual elements in the CV list.
269
detailedCVInfo :: [(Double, String)]
270
detailedCVInfo = [ (1,  "free_mem_cv")
271
                 , (1,  "free_disk_cv")
272
                 , (1,  "n1_cnt")
273
                 , (1,  "reserved_mem_cv")
274
                 , (4,  "offline_all_cnt")
275
                 , (16, "offline_pri_cnt")
276
                 , (1,  "vcpu_ratio_cv")
277
                 , (1,  "cpu_load_cv")
278
                 , (1,  "mem_load_cv")
279
                 , (1,  "disk_load_cv")
280
                 , (1,  "net_load_cv")
281
                 , (2,  "pri_tags_score")
282
                 ]
283

    
284
detailedCVWeights :: [Double]
285
detailedCVWeights = map fst detailedCVInfo
286

    
287
-- | Compute the mem and disk covariance.
288
compDetailedCV :: [Node.Node] -> [Double]
289
compDetailedCV all_nodes =
290
    let
291
        (offline, nodes) = partition Node.offline all_nodes
292
        mem_l = map Node.pMem nodes
293
        dsk_l = map Node.pDsk nodes
294
        -- metric: memory covariance
295
        mem_cv = stdDev mem_l
296
        -- metric: disk covariance
297
        dsk_cv = stdDev dsk_l
298
        -- metric: count of instances living on N1 failing nodes
299
        n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
300
                                                   length (Node.pList n)) .
301
                   filter Node.failN1 $ nodes :: Double
302
        res_l = map Node.pRem nodes
303
        -- metric: reserved memory covariance
304
        res_cv = stdDev res_l
305
        -- offline instances metrics
306
        offline_ipri = sum . map (length . Node.pList) $ offline
307
        offline_isec = sum . map (length . Node.sList) $ offline
308
        -- metric: count of instances on offline nodes
309
        off_score = fromIntegral (offline_ipri + offline_isec)::Double
310
        -- metric: count of primary instances on offline nodes (this
311
        -- helps with evacuation/failover of primary instances on
312
        -- 2-node clusters with one node offline)
313
        off_pri_score = fromIntegral offline_ipri::Double
314
        cpu_l = map Node.pCpu nodes
315
        -- metric: covariance of vcpu/pcpu ratio
316
        cpu_cv = stdDev cpu_l
317
        -- metrics: covariance of cpu, memory, disk and network load
318
        (c_load, m_load, d_load, n_load) = unzip4 $
319
            map (\n ->
320
                     let DynUtil c1 m1 d1 n1 = Node.utilLoad n
321
                         DynUtil c2 m2 d2 n2 = Node.utilPool n
322
                     in (c1/c2, m1/m2, d1/d2, n1/n2)
323
                ) nodes
324
        -- metric: conflicting instance count
325
        pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
326
        pri_tags_score = fromIntegral pri_tags_inst::Double
327
    in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
328
       , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
329
       , pri_tags_score ]
330

    
331
-- | Compute the /total/ variance.
332
compCVNodes :: [Node.Node] -> Double
333
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
334

    
335
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
336
compCV :: Node.List -> Double
337
compCV = compCVNodes . Container.elems
338

    
339

    
340
-- | Compute online nodes from a 'Node.List'.
341
getOnline :: Node.List -> [Node.Node]
342
getOnline = filter (not . Node.offline) . Container.elems
343

    
344
-- * Balancing functions
345

    
346
-- | Compute best table. Note that the ordering of the arguments is important.
347
compareTables :: Table -> Table -> Table
348
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
349
    if a_cv > b_cv then b else a
350

    
351
-- | Applies an instance move to a given node list and instance.
352
applyMove :: Node.List -> Instance.Instance
353
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
354
-- Failover (f)
355
applyMove nl inst Failover =
356
    let old_pdx = Instance.pNode inst
357
        old_sdx = Instance.sNode inst
358
        old_p = Container.find old_pdx nl
359
        old_s = Container.find old_sdx nl
360
        int_p = Node.removePri old_p inst
361
        int_s = Node.removeSec old_s inst
362
        force_p = Node.offline old_p
363
        new_nl = do -- Maybe monad
364
          new_p <- Node.addPriEx force_p int_s inst
365
          new_s <- Node.addSec int_p inst old_sdx
366
          let new_inst = Instance.setBoth inst old_sdx old_pdx
367
          return (Container.addTwo old_pdx new_s old_sdx new_p nl,
368
                  new_inst, old_sdx, old_pdx)
369
    in new_nl
370

    
371
-- Replace the primary (f:, r:np, f)
372
applyMove nl inst (ReplacePrimary new_pdx) =
373
    let old_pdx = Instance.pNode inst
374
        old_sdx = Instance.sNode inst
375
        old_p = Container.find old_pdx nl
376
        old_s = Container.find old_sdx nl
377
        tgt_n = Container.find new_pdx nl
378
        int_p = Node.removePri old_p inst
379
        int_s = Node.removeSec old_s inst
380
        force_p = Node.offline old_p
381
        new_nl = do -- Maybe monad
382
          -- check that the current secondary can host the instance
383
          -- during the migration
384
          tmp_s <- Node.addPriEx force_p int_s inst
385
          let tmp_s' = Node.removePri tmp_s inst
386
          new_p <- Node.addPriEx force_p tgt_n inst
387
          new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
388
          let new_inst = Instance.setPri inst new_pdx
389
          return (Container.add new_pdx new_p $
390
                  Container.addTwo old_pdx int_p old_sdx new_s nl,
391
                  new_inst, new_pdx, old_sdx)
392
    in new_nl
393

    
394
-- Replace the secondary (r:ns)
395
applyMove nl inst (ReplaceSecondary new_sdx) =
396
    let old_pdx = Instance.pNode inst
397
        old_sdx = Instance.sNode inst
398
        old_s = Container.find old_sdx nl
399
        tgt_n = Container.find new_sdx nl
400
        int_s = Node.removeSec old_s inst
401
        force_s = Node.offline old_s
402
        new_inst = Instance.setSec inst new_sdx
403
        new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
404
                 \new_s -> return (Container.addTwo new_sdx
405
                                   new_s old_sdx int_s nl,
406
                                   new_inst, old_pdx, new_sdx)
407
    in new_nl
408

    
409
-- Replace the secondary and failover (r:np, f)
410
applyMove nl inst (ReplaceAndFailover new_pdx) =
411
    let old_pdx = Instance.pNode inst
412
        old_sdx = Instance.sNode inst
413
        old_p = Container.find old_pdx nl
414
        old_s = Container.find old_sdx nl
415
        tgt_n = Container.find new_pdx nl
416
        int_p = Node.removePri old_p inst
417
        int_s = Node.removeSec old_s inst
418
        force_s = Node.offline old_s
419
        new_nl = do -- Maybe monad
420
          new_p <- Node.addPri tgt_n inst
421
          new_s <- Node.addSecEx force_s int_p inst new_pdx
422
          let new_inst = Instance.setBoth inst new_pdx old_pdx
423
          return (Container.add new_pdx new_p $
424
                  Container.addTwo old_pdx new_s old_sdx int_s nl,
425
                  new_inst, new_pdx, old_pdx)
426
    in new_nl
427

    
428
-- Failver and replace the secondary (f, r:ns)
429
applyMove nl inst (FailoverAndReplace new_sdx) =
430
    let old_pdx = Instance.pNode inst
431
        old_sdx = Instance.sNode inst
432
        old_p = Container.find old_pdx nl
433
        old_s = Container.find old_sdx nl
434
        tgt_n = Container.find new_sdx nl
435
        int_p = Node.removePri old_p inst
436
        int_s = Node.removeSec old_s inst
437
        force_p = Node.offline old_p
438
        new_nl = do -- Maybe monad
439
          new_p <- Node.addPriEx force_p int_s inst
440
          new_s <- Node.addSecEx force_p tgt_n inst old_sdx
441
          let new_inst = Instance.setBoth inst old_sdx new_sdx
442
          return (Container.add new_sdx new_s $
443
                  Container.addTwo old_sdx new_p old_pdx int_p nl,
444
                  new_inst, old_sdx, new_sdx)
445
    in new_nl
446

    
447
-- | Tries to allocate an instance on one given node.
448
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
449
                 -> OpResult Node.AllocElement
450
allocateOnSingle nl inst new_pdx =
451
    let p = Container.find new_pdx nl
452
        new_inst = Instance.setBoth inst new_pdx Node.noSecondary
453
    in  Node.addPri p inst >>= \new_p -> do
454
      let new_nl = Container.add new_pdx new_p nl
455
          new_score = compCV nl
456
      return (new_nl, new_inst, [new_p], new_score)
457

    
458
-- | Tries to allocate an instance on a given pair of nodes.
459
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
460
               -> OpResult Node.AllocElement
461
allocateOnPair nl inst new_pdx new_sdx =
462
    let tgt_p = Container.find new_pdx nl
463
        tgt_s = Container.find new_sdx nl
464
    in do
465
      new_p <- Node.addPri tgt_p inst
466
      new_s <- Node.addSec tgt_s inst new_pdx
467
      let new_inst = Instance.setBoth inst new_pdx new_sdx
468
          new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
469
      return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
470

    
471
-- | Tries to perform an instance move and returns the best table
472
-- between the original one and the new one.
473
checkSingleStep :: Table -- ^ The original table
474
                -> Instance.Instance -- ^ The instance to move
475
                -> Table -- ^ The current best table
476
                -> IMove -- ^ The move to apply
477
                -> Table -- ^ The final best table
478
checkSingleStep ini_tbl target cur_tbl move =
479
    let
480
        Table ini_nl ini_il _ ini_plc = ini_tbl
481
        tmp_resu = applyMove ini_nl target move
482
    in
483
      case tmp_resu of
484
        OpFail _ -> cur_tbl
485
        OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
486
            let tgt_idx = Instance.idx target
487
                upd_cvar = compCV upd_nl
488
                upd_il = Container.add tgt_idx new_inst ini_il
489
                upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
490
                upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
491
            in
492
              compareTables cur_tbl upd_tbl
493

    
494
-- | Given the status of the current secondary as a valid new node and
495
-- the current candidate target node, generate the possible moves for
496
-- a instance.
497
possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
498
              -> Bool      -- ^ Whether we can change the primary node
499
              -> Ndx       -- ^ Target node candidate
500
              -> [IMove]   -- ^ List of valid result moves
501

    
502
possibleMoves _ False tdx =
503
    [ReplaceSecondary tdx]
504

    
505
possibleMoves True True tdx =
506
    [ReplaceSecondary tdx,
507
     ReplaceAndFailover tdx,
508
     ReplacePrimary tdx,
509
     FailoverAndReplace tdx]
510

    
511
possibleMoves False True tdx =
512
    [ReplaceSecondary tdx,
513
     ReplaceAndFailover tdx]
514

    
515
-- | Compute the best move for a given instance.
516
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
517
                  -> Bool              -- ^ Whether disk moves are allowed
518
                  -> Bool              -- ^ Whether instance moves are allowed
519
                  -> Table             -- ^ Original table
520
                  -> Instance.Instance -- ^ Instance to move
521
                  -> Table             -- ^ Best new table for this instance
522
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
523
    let
524
        opdx = Instance.pNode target
525
        osdx = Instance.sNode target
526
        nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
527
        use_secondary = elem osdx nodes_idx && inst_moves
528
        aft_failover = if use_secondary -- if allowed to failover
529
                       then checkSingleStep ini_tbl target ini_tbl Failover
530
                       else ini_tbl
531
        all_moves = if disk_moves
532
                    then concatMap
533
                         (possibleMoves use_secondary inst_moves) nodes
534
                    else []
535
    in
536
      -- iterate over the possible nodes for this instance
537
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
538

    
539
-- | Compute the best next move.
540
checkMove :: [Ndx]               -- ^ Allowed target node indices
541
          -> Bool                -- ^ Whether disk moves are allowed
542
          -> Bool                -- ^ Whether instance moves are allowed
543
          -> Table               -- ^ The current solution
544
          -> [Instance.Instance] -- ^ List of instances still to move
545
          -> Table               -- ^ The new solution
546
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
547
    let Table _ _ _ ini_plc = ini_tbl
548
        -- we're using rwhnf from the Control.Parallel.Strategies
549
        -- package; we don't need to use rnf as that would force too
550
        -- much evaluation in single-threaded cases, and in
551
        -- multi-threaded case the weak head normal form is enough to
552
        -- spark the evaluation
553
        tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
554
                               inst_moves ini_tbl)
555
                 victims
556
        -- iterate over all instances, computing the best move
557
        best_tbl = foldl' compareTables ini_tbl tables
558
        Table _ _ _ best_plc = best_tbl
559
    in if length best_plc == length ini_plc
560
       then ini_tbl -- no advancement
561
       else best_tbl
562

    
563
-- | Check if we are allowed to go deeper in the balancing.
564
doNextBalance :: Table     -- ^ The starting table
565
              -> Int       -- ^ Remaining length
566
              -> Score     -- ^ Score at which to stop
567
              -> Bool      -- ^ The resulting table and commands
568
doNextBalance ini_tbl max_rounds min_score =
569
    let Table _ _ ini_cv ini_plc = ini_tbl
570
        ini_plc_len = length ini_plc
571
    in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
572

    
573
-- | Run a balance move.
574
tryBalance :: Table       -- ^ The starting table
575
           -> Bool        -- ^ Allow disk moves
576
           -> Bool        -- ^ Allow instance moves
577
           -> Bool        -- ^ Only evacuate moves
578
           -> Score       -- ^ Min gain threshold
579
           -> Score       -- ^ Min gain
580
           -> Maybe Table -- ^ The resulting table and commands
581
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
582
    let Table ini_nl ini_il ini_cv _ = ini_tbl
583
        all_inst = Container.elems ini_il
584
        all_inst' = if evac_mode
585
                    then let bad_nodes = map Node.idx . filter Node.offline $
586
                                         Container.elems ini_nl
587
                         in filter (\e -> Instance.sNode e `elem` bad_nodes ||
588
                                          Instance.pNode e `elem` bad_nodes)
589
                            all_inst
590
                    else all_inst
591
        reloc_inst = filter Instance.movable all_inst'
592
        node_idx = map Node.idx . filter (not . Node.offline) $
593
                   Container.elems ini_nl
594
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
595
        (Table _ _ fin_cv _) = fin_tbl
596
    in
597
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
598
      then Just fin_tbl -- this round made success, return the new table
599
      else Nothing
600

    
601
-- * Allocation functions
602

    
603
-- | Build failure stats out of a list of failures.
604
collapseFailures :: [FailMode] -> FailStats
605
collapseFailures flst =
606
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
607
            [minBound..maxBound]
608

    
609
-- | Update current Allocation solution and failure stats with new
610
-- elements.
611
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
612
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
613

    
614
concatAllocs as (OpGood ns@(_, _, _, nscore)) =
615
    let -- Choose the old or new solution, based on the cluster score
616
        cntok = asAllocs as
617
        osols = asSolutions as
618
        nsols = case osols of
619
                  [] -> [ns]
620
                  (_, _, _, oscore):[] ->
621
                      if oscore < nscore
622
                      then osols
623
                      else [ns]
624
                  -- FIXME: here we simply concat to lists with more
625
                  -- than one element; we should instead abort, since
626
                  -- this is not a valid usage of this function
627
                  xs -> ns:xs
628
        nsuc = cntok + 1
629
    -- Note: we force evaluation of nsols here in order to keep the
630
    -- memory profile low - we know that we will need nsols for sure
631
    -- in the next cycle, so we force evaluation of nsols, since the
632
    -- foldl' in the caller will only evaluate the tuple, but not the
633
    -- elements of the tuple
634
    in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolutions = nsols }
635

    
636
-- | Sums two allocation solutions (e.g. for two separate node groups).
637
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
638
sumAllocs (AllocSolution af aa as al) (AllocSolution bf ba bs bl) =
639
    AllocSolution (af ++ bf) (aa + ba) (as ++ bs) (al ++ bl)
640

    
641
-- | Given a solution, generates a reasonable description for it.
642
describeSolution :: AllocSolution -> String
643
describeSolution as =
644
  let fcnt = asFailures as
645
      sols = asSolutions as
646
      freasons =
647
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
648
        filter ((> 0) . snd) . collapseFailures $ fcnt
649
  in if null sols
650
     then "No valid allocation solutions, failure reasons: " ++
651
          (if null fcnt
652
           then "unknown reasons"
653
           else freasons)
654
     else let (_, _, nodes, cv) = head sols
655
          in printf ("score: %.8f, successes %d, failures %d (%s)" ++
656
                     " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
657
             (intercalate "/" . map Node.name $ nodes)
658

    
659
-- | Annotates a solution with the appropriate string.
660
annotateSolution :: AllocSolution -> AllocSolution
661
annotateSolution as = as { asLog = describeSolution as : asLog as }
662

    
663
-- | Reverses an evacuation solution.
664
--
665
-- Rationale: we always concat the results to the top of the lists, so
666
-- for proper jobset execution, we should reverse all lists.
667
reverseEvacSolution :: EvacSolution -> EvacSolution
668
reverseEvacSolution (EvacSolution f m o) =
669
    EvacSolution (reverse f) (reverse m) (reverse o)
670

    
671
-- | Generate the valid node allocation singles or pairs for a new instance.
672
genAllocNodes :: Group.List        -- ^ Group list
673
              -> Node.List         -- ^ The node map
674
              -> Int               -- ^ The number of nodes required
675
              -> Bool              -- ^ Whether to drop or not
676
                                   -- unallocable nodes
677
              -> Result AllocNodes -- ^ The (monadic) result
678
genAllocNodes gl nl count drop_unalloc =
679
    let filter_fn = if drop_unalloc
680
                    then filter (Group.isAllocable .
681
                                 flip Container.find gl . Node.group)
682
                    else id
683
        all_nodes = filter_fn $ getOnline nl
684
        all_pairs = liftM2 (,) all_nodes all_nodes
685
        ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y &&
686
                                      Node.group x == Node.group y) all_pairs
687
    in case count of
688
         1 -> Ok (Left (map Node.idx all_nodes))
689
         2 -> Ok (Right (map (\(p, s) -> (Node.idx p, Node.idx s)) ok_pairs))
690
         _ -> Bad "Unsupported number of nodes, only one or two  supported"
691

    
692
-- | Try to allocate an instance on the cluster.
693
tryAlloc :: (Monad m) =>
694
            Node.List         -- ^ The node list
695
         -> Instance.List     -- ^ The instance list
696
         -> Instance.Instance -- ^ The instance to allocate
697
         -> AllocNodes        -- ^ The allocation targets
698
         -> m AllocSolution   -- ^ Possible solution list
699
tryAlloc nl _ inst (Right ok_pairs) =
700
    let sols = foldl' (\cstate (p, s) ->
701
                           concatAllocs cstate $ allocateOnPair nl inst p s
702
                      ) emptyAllocSolution ok_pairs
703

    
704
    in if null ok_pairs -- means we have just one node
705
       then fail "Not enough online nodes"
706
       else return $ annotateSolution sols
707

    
708
tryAlloc nl _ inst (Left all_nodes) =
709
    let sols = foldl' (\cstate ->
710
                           concatAllocs cstate . allocateOnSingle nl inst
711
                      ) emptyAllocSolution all_nodes
712
    in if null all_nodes
713
       then fail "No online nodes"
714
       else return $ annotateSolution sols
715

    
716
-- | Given a group/result, describe it as a nice (list of) messages.
717
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
718
solutionDescription gl (groupId, result) =
719
  case result of
720
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
721
    Bad message -> [printf "Group %s: error %s" gname message]
722
  where grp = Container.find groupId gl
723
        gname = Group.name grp
724
        pol = apolToString (Group.allocPolicy grp)
725

    
726
-- | From a list of possibly bad and possibly empty solutions, filter
727
-- only the groups with a valid result. Note that the result will be
728
-- reversed compared to the original list.
729
filterMGResults :: Group.List
730
                -> [(Gdx, Result AllocSolution)]
731
                -> [(Gdx, AllocSolution)]
732
filterMGResults gl = foldl' fn []
733
    where unallocable = not . Group.isAllocable . flip Container.find gl
734
          fn accu (gdx, rasol) =
735
              case rasol of
736
                Bad _ -> accu
737
                Ok sol | null (asSolutions sol) -> accu
738
                       | unallocable gdx -> accu
739
                       | otherwise -> (gdx, sol):accu
740

    
741
-- | Sort multigroup results based on policy and score.
742
sortMGResults :: Group.List
743
             -> [(Gdx, AllocSolution)]
744
             -> [(Gdx, AllocSolution)]
745
sortMGResults gl sols =
746
    let extractScore (_, _, _, x) = x
747
        solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
748
                               (extractScore . head . asSolutions) sol)
749
    in sortBy (comparing solScore) sols
750

    
751
-- | Finds the best group for an instance on a multi-group cluster.
752
findBestAllocGroup :: Group.List           -- ^ The group list
753
                   -> Node.List            -- ^ The node list
754
                   -> Instance.List        -- ^ The instance list
755
                   -> Instance.Instance    -- ^ The instance to allocate
756
                   -> Int                  -- ^ Required number of nodes
757
                   -> Result (Gdx, AllocSolution, [String])
758
findBestAllocGroup mggl mgnl mgil inst cnt =
759
  let groups = splitCluster mgnl mgil
760
      sols = map (\(gid, (nl, il)) ->
761
                   (gid, genAllocNodes mggl nl cnt False >>=
762
                       tryAlloc nl il inst))
763
             groups::[(Gdx, Result AllocSolution)]
764
      all_msgs = concatMap (solutionDescription mggl) sols
765
      goodSols = filterMGResults mggl sols
766
      sortedSols = sortMGResults mggl goodSols
767
  in if null sortedSols
768
     then Bad $ intercalate ", " all_msgs
769
     else let (final_group, final_sol) = head sortedSols
770
          in return (final_group, final_sol, all_msgs)
771

    
772
-- | Try to allocate an instance on a multi-group cluster.
773
tryMGAlloc :: Group.List           -- ^ The group list
774
           -> Node.List            -- ^ The node list
775
           -> Instance.List        -- ^ The instance list
776
           -> Instance.Instance    -- ^ The instance to allocate
777
           -> Int                  -- ^ Required number of nodes
778
           -> Result AllocSolution -- ^ Possible solution list
779
tryMGAlloc mggl mgnl mgil inst cnt = do
780
  (best_group, solution, all_msgs) <-
781
      findBestAllocGroup mggl mgnl mgil inst cnt
782
  let group_name = Group.name $ Container.find best_group mggl
783
      selmsg = "Selected group: " ++ group_name
784
  return $ solution { asLog = selmsg:all_msgs }
785

    
786
-- | Try to relocate an instance on the cluster.
787
tryReloc :: (Monad m) =>
788
            Node.List       -- ^ The node list
789
         -> Instance.List   -- ^ The instance list
790
         -> Idx             -- ^ The index of the instance to move
791
         -> Int             -- ^ The number of nodes required
792
         -> [Ndx]           -- ^ Nodes which should not be used
793
         -> m AllocSolution -- ^ Solution list
794
tryReloc nl il xid 1 ex_idx =
795
    let all_nodes = getOnline nl
796
        inst = Container.find xid il
797
        ex_idx' = Instance.pNode inst:ex_idx
798
        valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
799
        valid_idxes = map Node.idx valid_nodes
800
        sols1 = foldl' (\cstate x ->
801
                            let em = do
802
                                  (mnl, i, _, _) <-
803
                                      applyMove nl inst (ReplaceSecondary x)
804
                                  return (mnl, i, [Container.find x mnl],
805
                                          compCV mnl)
806
                            in concatAllocs cstate em
807
                       ) emptyAllocSolution valid_idxes
808
    in return sols1
809

    
810
tryReloc _ _ _ reqn _  = fail $ "Unsupported number of relocation \
811
                                \destinations required (" ++ show reqn ++
812
                                                  "), only one supported"
813

    
814
tryMGReloc :: (Monad m) =>
815
              Group.List      -- ^ The group list
816
           -> Node.List       -- ^ The node list
817
           -> Instance.List   -- ^ The instance list
818
           -> Idx             -- ^ The index of the instance to move
819
           -> Int             -- ^ The number of nodes required
820
           -> [Ndx]           -- ^ Nodes which should not be used
821
           -> m AllocSolution -- ^ Solution list
822
tryMGReloc _ mgnl mgil xid ncount ex_ndx = do
823
  let groups = splitCluster mgnl mgil
824
      -- TODO: we only relocate inside the group for now
825
      inst = Container.find xid mgil
826
  (nl, il) <- case lookup (instancePriGroup mgnl inst) groups of
827
                Nothing -> fail $ "Cannot find group for instance " ++
828
                           Instance.name inst
829
                Just v -> return v
830
  tryReloc nl il xid ncount ex_ndx
831

    
832
-- | Change an instance's secondary node.
833
evacInstance :: (Monad m) =>
834
                [Ndx]                      -- ^ Excluded nodes
835
             -> Instance.List              -- ^ The current instance list
836
             -> (Node.List, AllocSolution) -- ^ The current state
837
             -> Idx                        -- ^ The instance to evacuate
838
             -> m (Node.List, AllocSolution)
839
evacInstance ex_ndx il (nl, old_as) idx = do
840
  -- FIXME: hardcoded one node here
841

    
842
  -- Longer explanation: evacuation is currently hardcoded to DRBD
843
  -- instances (which have one secondary); hence, even if the
844
  -- IAllocator protocol can request N nodes for an instance, and all
845
  -- the message parsing/loading pass this, this implementation only
846
  -- supports one; this situation needs to be revisited if we ever
847
  -- support more than one secondary, or if we change the storage
848
  -- model
849
  new_as <- tryReloc nl il idx 1 ex_ndx
850
  case asSolutions new_as of
851
    -- an individual relocation succeeded, we kind of compose the data
852
    -- from the two solutions
853
    csol@(nl', _, _, _):_ ->
854
        return (nl', new_as { asSolutions = csol:asSolutions old_as })
855
    -- this relocation failed, so we fail the entire evac
856
    _ -> fail $ "Can't evacuate instance " ++
857
         Instance.name (Container.find idx il) ++
858
             ": " ++ describeSolution new_as
859

    
860
-- | Try to evacuate a list of nodes.
861
tryEvac :: (Monad m) =>
862
            Node.List       -- ^ The node list
863
         -> Instance.List   -- ^ The instance list
864
         -> [Idx]           -- ^ Instances to be evacuated
865
         -> [Ndx]           -- ^ Restricted nodes (the ones being evacuated)
866
         -> m AllocSolution -- ^ Solution list
867
tryEvac nl il idxs ex_ndx = do
868
  (_, sol) <- foldM (evacInstance ex_ndx il) (nl, emptyAllocSolution) idxs
869
  return sol
870

    
871
-- | Multi-group evacuation of a list of nodes.
872
tryMGEvac :: (Monad m) =>
873
             Group.List -- ^ The group list
874
          -> Node.List       -- ^ The node list
875
          -> Instance.List   -- ^ The instance list
876
          -> [Ndx]           -- ^ Nodes to be evacuated
877
          -> m AllocSolution -- ^ Solution list
878
tryMGEvac _ nl il ex_ndx =
879
    let ex_nodes = map (`Container.find` nl) ex_ndx
880
        all_insts = nub . concatMap Node.sList $ ex_nodes
881
        all_insts' = associateIdxs all_insts $ splitCluster nl il
882
    in do
883
      results <- mapM (\(_, (gnl, gil, idxs)) -> tryEvac gnl gil idxs ex_ndx)
884
                 all_insts'
885
      let sol = foldl' sumAllocs emptyAllocSolution results
886
      return $ annotateSolution sol
887

    
888
-- | Function which fails if the requested mode is change secondary.
889
--
890
-- This is useful since except DRBD, no other disk template can
891
-- execute change secondary; thus, we can just call this function
892
-- instead of always checking for secondary mode. After the call to
893
-- this function, whatever mode we have is just a primary change.
894
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
895
failOnSecondaryChange ChangeSecondary dt =
896
    fail $ "Instances with disk template '" ++ dtToString dt ++
897
         "' can't execute change secondary"
898
failOnSecondaryChange _ _ = return ()
899

    
900
-- | Run evacuation for a single instance.
901
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
902
                 -> Instance.List     -- ^ Instance list (cluster-wide)
903
                 -> EvacMode          -- ^ The evacuation mode
904
                 -> Instance.Instance -- ^ The instance to be evacuated
905
                 -> [Ndx]             -- ^ The list of available nodes
906
                                      -- for allocation
907
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
908
nodeEvacInstance _ _ mode (Instance.Instance
909
                           {Instance.diskTemplate = dt@DTDiskless}) _ =
910
                  failOnSecondaryChange mode dt >>
911
                  fail "Diskless relocations not implemented yet"
912

    
913
nodeEvacInstance _ _ _ (Instance.Instance
914
                        {Instance.diskTemplate = DTPlain}) _ =
915
                  fail "Instances of type plain cannot be relocated"
916

    
917
nodeEvacInstance _ _ _ (Instance.Instance
918
                        {Instance.diskTemplate = DTFile}) _ =
919
                  fail "Instances of type file cannot be relocated"
920

    
921
nodeEvacInstance _ _ mode  (Instance.Instance
922
                            {Instance.diskTemplate = dt@DTSharedFile}) _ =
923
                  failOnSecondaryChange mode dt >>
924
                  fail "Shared file relocations not implemented yet"
925

    
926
nodeEvacInstance _ _ mode (Instance.Instance
927
                           {Instance.diskTemplate = dt@DTBlock}) _ =
928
                  failOnSecondaryChange mode dt >>
929
                  fail "Block device relocations not implemented yet"
930

    
931
nodeEvacInstance nl il ChangePrimary
932
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8}) _ =
933
  do
934
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
935
    let idx = Instance.idx inst
936
        il' = Container.add idx inst' il
937
        ops = iMoveToJob nl' il' idx Failover
938
    return (nl', il', ops)
939

    
940
nodeEvacInstance nl il ChangeSecondary
941
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
942
                 avail_nodes =
943
  do
944
    let gdx = instancePriGroup nl inst
945
    (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
946
                            eitherToResult $
947
                            foldl' (evacDrbdSecondaryInner nl inst gdx)
948
                            (Left "no nodes available") avail_nodes
949
    let idx = Instance.idx inst
950
        il' = Container.add idx inst' il
951
        ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx)
952
    return (nl', il', ops)
953

    
954
nodeEvacInstance nl il ChangeAll
955
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
956
                 avail_nodes =
957
  do
958
    let primary = Container.find (Instance.pNode inst) nl
959
        idx = Instance.idx inst
960
        gdx = instancePriGroup nl inst
961
        no_nodes = Left "no nodes available"
962
    -- if the primary is offline, then we first failover
963
    (nl1, inst1, ops1) <-
964
        if Node.offline primary
965
        then do
966
          (nl', inst', _, _) <-
967
              annotateResult "Failing over to the secondary" $
968
              opToResult $ applyMove nl inst Failover
969
          return (nl', inst', [Failover])
970
        else return (nl, inst, [])
971
    -- we now need to execute a replace secondary to the future
972
    -- primary node
973
    (nl2, inst2, _, new_pdx) <- annotateResult "Searching for a new primary" $
974
                                eitherToResult $
975
                                foldl' (evacDrbdSecondaryInner nl1 inst1 gdx)
976
                                no_nodes avail_nodes
977
    let ops2 = ReplaceSecondary new_pdx:ops1
978
    -- since we chose the new primary, we remove it from the list of
979
    -- available nodes
980
    let avail_nodes_sec = new_pdx `delete` avail_nodes
981
    -- we now execute another failover, the primary stays fixed now
982
    (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
983
                          opToResult $ applyMove nl2 inst2 Failover
984
    let ops3 = Failover:ops2
985
    -- and finally another replace secondary, to the final secondary
986
    (nl4, inst4, _, new_sdx) <-
987
        annotateResult "Searching for a new secondary" $
988
        eitherToResult $
989
        foldl' (evacDrbdSecondaryInner nl3 inst3 gdx) no_nodes avail_nodes_sec
990
    let ops4 = ReplaceSecondary new_sdx:ops3
991
        il' = Container.add idx inst4 il
992
        ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
993
    return (nl4, il', ops)
994

    
995
-- | Inner fold function for changing secondary of a DRBD instance.
996
--
997
-- The "running" solution is either a @Left String@, which means we
998
-- don't have yet a working solution, or a @Right (...)@, which
999
-- represents a valid solution; it holds the modified node list, the
1000
-- modified instance (after evacuation), the score of that solution,
1001
-- and the new secondary node index.
1002
evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list
1003
                       -> Instance.Instance -- ^ Instance being evacuated
1004
                       -> Gdx -- ^ The group index of the instance
1005
                       -> Either String ( Node.List
1006
                                        , Instance.Instance
1007
                                        , Score
1008
                                        , Ndx)  -- ^ Current best solution
1009
                       -> Ndx  -- ^ Node we're evaluating as new secondary
1010
                       -> Either String ( Node.List
1011
                                        , Instance.Instance
1012
                                        , Score
1013
                                        , Ndx) -- ^ New best solution
1014
evacDrbdSecondaryInner nl inst gdx accu ndx =
1015
    case applyMove nl inst (ReplaceSecondary ndx) of
1016
      OpFail fm ->
1017
          case accu of
1018
            Right _ -> accu
1019
            Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
1020
                      " failed: " ++ show fm
1021
      OpGood (nl', inst', _, _) ->
1022
          let nodes = Container.elems nl'
1023
              -- The fromJust below is ugly (it can fail nastily), but
1024
              -- at this point we should have any internal mismatches,
1025
              -- and adding a monad here would be quite involved
1026
              grpnodes = fromJust (gdx `lookup` (Node.computeGroups nodes))
1027
              new_cv = compCVNodes grpnodes
1028
              new_accu = Right (nl', inst', new_cv, ndx)
1029
          in case accu of
1030
               Left _ -> new_accu
1031
               Right (_, _, old_cv, _) ->
1032
                   if old_cv < new_cv
1033
                   then accu
1034
                   else new_accu
1035

    
1036
-- | Computes the local nodes of a given instance which are available
1037
-- for allocation.
1038
availableLocalNodes :: Node.List
1039
                    -> [(Gdx, [Ndx])]
1040
                    -> IntSet.IntSet
1041
                    -> Instance.Instance
1042
                    -> Result [Ndx]
1043
availableLocalNodes nl group_nodes excl_ndx inst = do
1044
  let gdx = instancePriGroup nl inst
1045
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1046
                 Ok (lookup gdx group_nodes)
1047
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1048
  return avail_nodes
1049

    
1050
-- | Updates the evac solution with the results of an instance
1051
-- evacuation.
1052
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1053
                   -> Instance.Instance
1054
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1055
                   -> (Node.List, Instance.List, EvacSolution)
1056
updateEvacSolution (nl, il, es) inst (Bad msg) =
1057
    (nl, il, es { esFailed = (Instance.name inst ++ ": " ++ msg):esFailed es})
1058
updateEvacSolution (_, _, es) inst (Ok (nl, il, opcodes)) =
1059
    (nl, il, es { esMoved = Instance.name inst:esMoved es
1060
                , esOpCodes = [opcodes]:esOpCodes es })
1061

    
1062
-- | Node-evacuation IAllocator mode main function.
1063
tryNodeEvac :: Group.List    -- ^ The cluster groups
1064
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
1065
            -> Instance.List -- ^ Instance list (cluster-wide)
1066
            -> EvacMode      -- ^ The evacuation mode
1067
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
1068
            -> Result EvacSolution
1069
tryNodeEvac _ ini_nl ini_il mode idxs =
1070
    let evac_ndx = nodesToEvacuate ini_il mode idxs
1071
        offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1072
        excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1073
        group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1074
                                             (Container.elems nl))) $
1075
                      splitCluster ini_nl ini_il
1076
        (_, _, esol) =
1077
            foldl' (\state@(nl, il, _) inst ->
1078
                        updateEvacSolution state inst $
1079
                        availableLocalNodes nl group_ndx excl_ndx inst >>=
1080
                        nodeEvacInstance nl il mode inst
1081
                   )
1082
            (ini_nl, ini_il, emptyEvacSolution)
1083
            (map (`Container.find` ini_il) idxs)
1084
    in return $ reverseEvacSolution esol
1085

    
1086
-- | Recursively place instances on the cluster until we're out of space.
1087
iterateAlloc :: Node.List
1088
             -> Instance.List
1089
             -> Maybe Int
1090
             -> Instance.Instance
1091
             -> AllocNodes
1092
             -> [Instance.Instance]
1093
             -> [CStats]
1094
             -> Result AllocResult
1095
iterateAlloc nl il limit newinst allocnodes ixes cstats =
1096
      let depth = length ixes
1097
          newname = printf "new-%d" depth::String
1098
          newidx = length (Container.elems il) + depth
1099
          newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1100
          newlimit = fmap (flip (-) 1) limit
1101
      in case tryAlloc nl il newi2 allocnodes of
1102
           Bad s -> Bad s
1103
           Ok (AllocSolution { asFailures = errs, asSolutions = sols3 }) ->
1104
               let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1105
               case sols3 of
1106
                 [] -> newsol
1107
                 (xnl, xi, _, _):[] ->
1108
                     if limit == Just 0
1109
                     then newsol
1110
                     else iterateAlloc xnl (Container.add newidx xi il)
1111
                          newlimit newinst allocnodes (xi:ixes)
1112
                          (totalResources xnl:cstats)
1113
                 _ -> Bad "Internal error: multiple solutions for single\
1114
                          \ allocation"
1115

    
1116
-- | The core of the tiered allocation mode.
1117
tieredAlloc :: Node.List
1118
            -> Instance.List
1119
            -> Maybe Int
1120
            -> Instance.Instance
1121
            -> AllocNodes
1122
            -> [Instance.Instance]
1123
            -> [CStats]
1124
            -> Result AllocResult
1125
tieredAlloc nl il limit newinst allocnodes ixes cstats =
1126
    case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1127
      Bad s -> Bad s
1128
      Ok (errs, nl', il', ixes', cstats') ->
1129
          let newsol = Ok (errs, nl', il', ixes', cstats')
1130
              ixes_cnt = length ixes'
1131
              (stop, newlimit) = case limit of
1132
                                   Nothing -> (False, Nothing)
1133
                                   Just n -> (n <= ixes_cnt,
1134
                                              Just (n - ixes_cnt)) in
1135
          if stop then newsol else
1136
          case Instance.shrinkByType newinst . fst . last $
1137
               sortBy (comparing snd) errs of
1138
            Bad _ -> newsol
1139
            Ok newinst' -> tieredAlloc nl' il' newlimit
1140
                           newinst' allocnodes ixes' cstats'
1141

    
1142
-- | Compute the tiered spec string description from a list of
1143
-- allocated instances.
1144
tieredSpecMap :: [Instance.Instance]
1145
              -> [String]
1146
tieredSpecMap trl_ixes =
1147
    let fin_trl_ixes = reverse trl_ixes
1148
        ix_byspec = groupBy ((==) `on` Instance.specOf) fin_trl_ixes
1149
        spec_map = map (\ixs -> (Instance.specOf $ head ixs, length ixs))
1150
                   ix_byspec
1151
    in  map (\(spec, cnt) -> printf "%d,%d,%d=%d" (rspecMem spec)
1152
                             (rspecDsk spec) (rspecCpu spec) cnt) spec_map
1153

    
1154
-- * Formatting functions
1155

    
1156
-- | Given the original and final nodes, computes the relocation description.
1157
computeMoves :: Instance.Instance -- ^ The instance to be moved
1158
             -> String -- ^ The instance name
1159
             -> IMove  -- ^ The move being performed
1160
             -> String -- ^ New primary
1161
             -> String -- ^ New secondary
1162
             -> (String, [String])
1163
                -- ^ Tuple of moves and commands list; moves is containing
1164
                -- either @/f/@ for failover or @/r:name/@ for replace
1165
                -- secondary, while the command list holds gnt-instance
1166
                -- commands (without that prefix), e.g \"@failover instance1@\"
1167
computeMoves i inam mv c d =
1168
    case mv of
1169
      Failover -> ("f", [mig])
1170
      FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1171
      ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1172
      ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1173
      ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1174
    where morf = if Instance.running i then "migrate" else "failover"
1175
          mig = printf "%s -f %s" morf inam::String
1176
          rep n = printf "replace-disks -n %s %s" n inam
1177

    
1178
-- | Converts a placement to string format.
1179
printSolutionLine :: Node.List     -- ^ The node list
1180
                  -> Instance.List -- ^ The instance list
1181
                  -> Int           -- ^ Maximum node name length
1182
                  -> Int           -- ^ Maximum instance name length
1183
                  -> Placement     -- ^ The current placement
1184
                  -> Int           -- ^ The index of the placement in
1185
                                   -- the solution
1186
                  -> (String, [String])
1187
printSolutionLine nl il nmlen imlen plc pos =
1188
    let
1189
        pmlen = (2*nmlen + 1)
1190
        (i, p, s, mv, c) = plc
1191
        inst = Container.find i il
1192
        inam = Instance.alias inst
1193
        npri = Node.alias $ Container.find p nl
1194
        nsec = Node.alias $ Container.find s nl
1195
        opri = Node.alias $ Container.find (Instance.pNode inst) nl
1196
        osec = Node.alias $ Container.find (Instance.sNode inst) nl
1197
        (moves, cmds) =  computeMoves inst inam mv npri nsec
1198
        ostr = printf "%s:%s" opri osec::String
1199
        nstr = printf "%s:%s" npri nsec::String
1200
    in
1201
      (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
1202
       pos imlen inam pmlen ostr
1203
       pmlen nstr c moves,
1204
       cmds)
1205

    
1206
-- | Return the instance and involved nodes in an instance move.
1207
involvedNodes :: Instance.List -> Placement -> [Ndx]
1208
involvedNodes il plc =
1209
    let (i, np, ns, _, _) = plc
1210
        inst = Container.find i il
1211
        op = Instance.pNode inst
1212
        os = Instance.sNode inst
1213
    in nub [np, ns, op, os]
1214

    
1215
-- | Inner function for splitJobs, that either appends the next job to
1216
-- the current jobset, or starts a new jobset.
1217
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1218
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1219
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1220
    | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1221
    | otherwise = ([n]:cjs, ndx)
1222

    
1223
-- | Break a list of moves into independent groups. Note that this
1224
-- will reverse the order of jobs.
1225
splitJobs :: [MoveJob] -> [JobSet]
1226
splitJobs = fst . foldl mergeJobs ([], [])
1227

    
1228
-- | Given a list of commands, prefix them with @gnt-instance@ and
1229
-- also beautify the display a little.
1230
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1231
formatJob jsn jsl (sn, (_, _, _, cmds)) =
1232
    let out =
1233
            printf "  echo job %d/%d" jsn sn:
1234
            printf "  check":
1235
            map ("  gnt-instance " ++) cmds
1236
    in if sn == 1
1237
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1238
       else out
1239

    
1240
-- | Given a list of commands, prefix them with @gnt-instance@ and
1241
-- also beautify the display a little.
1242
formatCmds :: [JobSet] -> String
1243
formatCmds =
1244
    unlines .
1245
    concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1246
                             (zip [1..] js)) .
1247
    zip [1..]
1248

    
1249
-- | Print the node list.
1250
printNodes :: Node.List -> [String] -> String
1251
printNodes nl fs =
1252
    let fields = case fs of
1253
          [] -> Node.defaultFields
1254
          "+":rest -> Node.defaultFields ++ rest
1255
          _ -> fs
1256
        snl = sortBy (comparing Node.idx) (Container.elems nl)
1257
        (header, isnum) = unzip $ map Node.showHeader fields
1258
    in unlines . map ((:) ' ' .  intercalate " ") $
1259
       formatTable (header:map (Node.list fields) snl) isnum
1260

    
1261
-- | Print the instance list.
1262
printInsts :: Node.List -> Instance.List -> String
1263
printInsts nl il =
1264
    let sil = sortBy (comparing Instance.idx) (Container.elems il)
1265
        helper inst = [ if Instance.running inst then "R" else " "
1266
                      , Instance.name inst
1267
                      , Container.nameOf nl (Instance.pNode inst)
1268
                      , let sdx = Instance.sNode inst
1269
                        in if sdx == Node.noSecondary
1270
                           then  ""
1271
                           else Container.nameOf nl sdx
1272
                      , if Instance.autoBalance inst then "Y" else "N"
1273
                      , printf "%3d" $ Instance.vcpus inst
1274
                      , printf "%5d" $ Instance.mem inst
1275
                      , printf "%5d" $ Instance.dsk inst `div` 1024
1276
                      , printf "%5.3f" lC
1277
                      , printf "%5.3f" lM
1278
                      , printf "%5.3f" lD
1279
                      , printf "%5.3f" lN
1280
                      ]
1281
            where DynUtil lC lM lD lN = Instance.util inst
1282
        header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1283
                 , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1284
        isnum = False:False:False:False:False:repeat True
1285
    in unlines . map ((:) ' ' . intercalate " ") $
1286
       formatTable (header:map helper sil) isnum
1287

    
1288
-- | Shows statistics for a given node list.
1289
printStats :: Node.List -> String
1290
printStats nl =
1291
    let dcvs = compDetailedCV $ Container.elems nl
1292
        (weights, names) = unzip detailedCVInfo
1293
        hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1294
        formatted = map (\(w, header, val) ->
1295
                             printf "%s=%.8f(x%.2f)" header val w::String) hd
1296
    in intercalate ", " formatted
1297

    
1298
-- | Convert a placement into a list of OpCodes (basically a job).
1299
iMoveToJob :: Node.List -> Instance.List
1300
          -> Idx -> IMove -> [OpCodes.OpCode]
1301
iMoveToJob nl il idx move =
1302
    let inst = Container.find idx il
1303
        iname = Instance.name inst
1304
        lookNode  = Just . Container.nameOf nl
1305
        opF = OpCodes.OpInstanceMigrate iname True False True
1306
        opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1307
                OpCodes.ReplaceNewSecondary [] Nothing
1308
    in case move of
1309
         Failover -> [ opF ]
1310
         ReplacePrimary np -> [ opF, opR np, opF ]
1311
         ReplaceSecondary ns -> [ opR ns ]
1312
         ReplaceAndFailover np -> [ opR np, opF ]
1313
         FailoverAndReplace ns -> [ opF, opR ns ]
1314

    
1315
-- * Node group functions
1316

    
1317
-- | Computes the group of an instance.
1318
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1319
instanceGroup nl i =
1320
  let sidx = Instance.sNode i
1321
      pnode = Container.find (Instance.pNode i) nl
1322
      snode = if sidx == Node.noSecondary
1323
              then pnode
1324
              else Container.find sidx nl
1325
      pgroup = Node.group pnode
1326
      sgroup = Node.group snode
1327
  in if pgroup /= sgroup
1328
     then fail ("Instance placed accross two node groups, primary " ++
1329
                show pgroup ++ ", secondary " ++ show sgroup)
1330
     else return pgroup
1331

    
1332
-- | Computes the group of an instance per the primary node.
1333
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1334
instancePriGroup nl i =
1335
  let pnode = Container.find (Instance.pNode i) nl
1336
  in  Node.group pnode
1337

    
1338
-- | Compute the list of badly allocated instances (split across node
1339
-- groups).
1340
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1341
findSplitInstances nl =
1342
  filter (not . isOk . instanceGroup nl) . Container.elems
1343

    
1344
-- | Splits a cluster into the component node groups.
1345
splitCluster :: Node.List -> Instance.List ->
1346
                [(Gdx, (Node.List, Instance.List))]
1347
splitCluster nl il =
1348
  let ngroups = Node.computeGroups (Container.elems nl)
1349
  in map (\(guuid, nodes) ->
1350
           let nidxs = map Node.idx nodes
1351
               nodes' = zip nidxs nodes
1352
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1353
           in (guuid, (Container.fromList nodes', instances))) ngroups
1354

    
1355
-- | Split a global instance index map into per-group, and associate
1356
-- it with the group/node/instance lists.
1357
associateIdxs :: [Idx] -- ^ Instance indices to be split/associated
1358
              -> [(Gdx, (Node.List, Instance.List))]        -- ^ Input groups
1359
              -> [(Gdx, (Node.List, Instance.List, [Idx]))] -- ^ Result
1360
associateIdxs idxs =
1361
    map (\(gdx, (nl, il)) ->
1362
             (gdx, (nl, il, filter (`Container.member` il) idxs)))
1363

    
1364
-- | Compute the list of nodes that are to be evacuated, given a list
1365
-- of instances and an evacuation mode.
1366
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1367
                -> EvacMode      -- ^ The evacuation mode we're using
1368
                -> [Idx]         -- ^ List of instance indices being evacuated
1369
                -> IntSet.IntSet -- ^ Set of node indices
1370
nodesToEvacuate il mode =
1371
    IntSet.delete Node.noSecondary .
1372
    foldl' (\ns idx ->
1373
                let i = Container.find idx il
1374
                    pdx = Instance.pNode i
1375
                    sdx = Instance.sNode i
1376
                    dt = Instance.diskTemplate i
1377
                    withSecondary = case dt of
1378
                                      DTDrbd8 -> IntSet.insert sdx ns
1379
                                      _ -> ns
1380
                in case mode of
1381
                     ChangePrimary   -> IntSet.insert pdx ns
1382
                     ChangeSecondary -> withSecondary
1383
                     ChangeAll       -> IntSet.insert pdx withSecondary
1384
           ) IntSet.empty