Statistics
| Branch: | Tag: | Revision:

root / htools / Ganeti / HTools / Cluster.hs @ 20b376ff

History | View | Annotate | Download (62.5 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
    (
31
     -- * Types
32
      AllocSolution(..)
33
    , EvacSolution(..)
34
    , Table(..)
35
    , CStats(..)
36
    , AllocStats
37
    -- * Generic functions
38
    , totalResources
39
    , computeAllocationDelta
40
    -- * First phase functions
41
    , computeBadItems
42
    -- * Second phase functions
43
    , printSolutionLine
44
    , formatCmds
45
    , involvedNodes
46
    , splitJobs
47
    -- * Display functions
48
    , printNodes
49
    , printInsts
50
    -- * Balacing functions
51
    , checkMove
52
    , doNextBalance
53
    , tryBalance
54
    , compCV
55
    , compCVNodes
56
    , compDetailedCV
57
    , printStats
58
    , iMoveToJob
59
    -- * IAllocator functions
60
    , genAllocNodes
61
    , tryAlloc
62
    , tryMGAlloc
63
    , tryReloc
64
    , tryMGReloc
65
    , tryEvac
66
    , tryMGEvac
67
    , tryNodeEvac
68
    , tryChangeGroup
69
    , collapseFailures
70
    -- * Allocation functions
71
    , iterateAlloc
72
    , tieredAlloc
73
    , tieredSpecMap
74
     -- * Node group functions
75
    , instanceGroup
76
    , findSplitInstances
77
    , splitCluster
78
    ) where
79

    
80
import Data.Function (on)
81
import qualified Data.IntSet as IntSet
82
import Data.List
83
import Data.Maybe (fromJust)
84
import Data.Ord (comparing)
85
import Text.Printf (printf)
86
import Control.Monad
87
import Control.Parallel.Strategies
88

    
89
import qualified Ganeti.HTools.Container as Container
90
import qualified Ganeti.HTools.Instance as Instance
91
import qualified Ganeti.HTools.Node as Node
92
import qualified Ganeti.HTools.Group as Group
93
import Ganeti.HTools.Types
94
import Ganeti.HTools.Utils
95
import qualified Ganeti.OpCodes as OpCodes
96

    
97
-- * Types
98

    
99
-- | Allocation\/relocation solution.
100
data AllocSolution = AllocSolution
101
  { asFailures  :: [FailMode]          -- ^ Failure counts
102
  , asAllocs    :: Int                 -- ^ Good allocation count
103
  , asSolutions :: [Node.AllocElement] -- ^ The actual result, length
104
                                       -- of the list depends on the
105
                                       -- allocation/relocation mode
106
  , asLog       :: [String]            -- ^ A list of informational messages
107
  }
108

    
109
-- | Node evacuation/group change iallocator result type. This result
110
-- type consists of actual opcodes (a restricted subset) that are
111
-- transmitted back to Ganeti.
112
data EvacSolution = EvacSolution
113
    { esMoved   :: [String]             -- ^ Instance moved successfully
114
    , esFailed  :: [String]             -- ^ Instance which were not
115
                                        -- relocated
116
    , esOpCodes :: [[[OpCodes.OpCode]]] -- ^ List of lists of jobs
117
    }
118

    
119
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
120
type AllocResult = (FailStats, Node.List, Instance.List,
121
                    [Instance.Instance], [CStats])
122

    
123
-- | A type denoting the valid allocation mode/pairs.
124
--
125
-- For a one-node allocation, this will be a @Left ['Node.Node']@,
126
-- whereas for a two-node allocation, this will be a @Right
127
-- [('Node.Node', 'Node.Node')]@.
128
type AllocNodes = Either [Ndx] [(Ndx, Ndx)]
129

    
130
-- | The empty solution we start with when computing allocations.
131
emptyAllocSolution :: AllocSolution
132
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
133
                                   , asSolutions = [], asLog = [] }
134

    
135
-- | The empty evac solution.
136
emptyEvacSolution :: EvacSolution
137
emptyEvacSolution = EvacSolution { esMoved = []
138
                                 , esFailed = []
139
                                 , esOpCodes = []
140
                                 }
141

    
142
-- | The complete state for the balancing solution.
143
data Table = Table Node.List Instance.List Score [Placement]
144
             deriving (Show, Read)
145

    
146
data CStats = CStats { csFmem :: Integer -- ^ Cluster free mem
147
                     , csFdsk :: Integer -- ^ Cluster free disk
148
                     , csAmem :: Integer -- ^ Cluster allocatable mem
149
                     , csAdsk :: Integer -- ^ Cluster allocatable disk
150
                     , csAcpu :: Integer -- ^ Cluster allocatable cpus
151
                     , csMmem :: Integer -- ^ Max node allocatable mem
152
                     , csMdsk :: Integer -- ^ Max node allocatable disk
153
                     , csMcpu :: Integer -- ^ Max node allocatable cpu
154
                     , csImem :: Integer -- ^ Instance used mem
155
                     , csIdsk :: Integer -- ^ Instance used disk
156
                     , csIcpu :: Integer -- ^ Instance used cpu
157
                     , csTmem :: Double  -- ^ Cluster total mem
158
                     , csTdsk :: Double  -- ^ Cluster total disk
159
                     , csTcpu :: Double  -- ^ Cluster total cpus
160
                     , csVcpu :: Integer -- ^ Cluster virtual cpus (if
161
                                         -- node pCpu has been set,
162
                                         -- otherwise -1)
163
                     , csXmem :: Integer -- ^ Unnacounted for mem
164
                     , csNmem :: Integer -- ^ Node own memory
165
                     , csScore :: Score  -- ^ The cluster score
166
                     , csNinst :: Int    -- ^ The total number of instances
167
                     }
168
            deriving (Show, Read)
169

    
170
-- | Currently used, possibly to allocate, unallocable.
171
type AllocStats = (RSpec, RSpec, RSpec)
172

    
173
-- * Utility functions
174

    
175
-- | Verifies the N+1 status and return the affected nodes.
176
verifyN1 :: [Node.Node] -> [Node.Node]
177
verifyN1 = filter Node.failN1
178

    
179
{-| Computes the pair of bad nodes and instances.
180

    
181
The bad node list is computed via a simple 'verifyN1' check, and the
182
bad instance list is the list of primary and secondary instances of
183
those nodes.
184

    
185
-}
186
computeBadItems :: Node.List -> Instance.List ->
187
                   ([Node.Node], [Instance.Instance])
188
computeBadItems nl il =
189
  let bad_nodes = verifyN1 $ getOnline nl
190
      bad_instances = map (`Container.find` il) .
191
                      sort . nub $
192
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
193
  in
194
    (bad_nodes, bad_instances)
195

    
196
-- | Zero-initializer for the CStats type.
197
emptyCStats :: CStats
198
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
199

    
200
-- | Update stats with data from a new node.
201
updateCStats :: CStats -> Node.Node -> CStats
202
updateCStats cs node =
203
    let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
204
                 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
205
                 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
206
                 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
207
                 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
208
                 csVcpu = x_vcpu,
209
                 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
210
               }
211
            = cs
212
        inc_amem = Node.fMem node - Node.rMem node
213
        inc_amem' = if inc_amem > 0 then inc_amem else 0
214
        inc_adsk = Node.availDisk node
215
        inc_imem = truncate (Node.tMem node) - Node.nMem node
216
                   - Node.xMem node - Node.fMem node
217
        inc_icpu = Node.uCpu node
218
        inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
219
        inc_vcpu = Node.hiCpu node
220
        inc_acpu = Node.availCpu node
221

    
222
    in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
223
          , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
224
          , csAmem = x_amem + fromIntegral inc_amem'
225
          , csAdsk = x_adsk + fromIntegral inc_adsk
226
          , csAcpu = x_acpu + fromIntegral inc_acpu
227
          , csMmem = max x_mmem (fromIntegral inc_amem')
228
          , csMdsk = max x_mdsk (fromIntegral inc_adsk)
229
          , csMcpu = max x_mcpu (fromIntegral inc_acpu)
230
          , csImem = x_imem + fromIntegral inc_imem
231
          , csIdsk = x_idsk + fromIntegral inc_idsk
232
          , csIcpu = x_icpu + fromIntegral inc_icpu
233
          , csTmem = x_tmem + Node.tMem node
234
          , csTdsk = x_tdsk + Node.tDsk node
235
          , csTcpu = x_tcpu + Node.tCpu node
236
          , csVcpu = x_vcpu + fromIntegral inc_vcpu
237
          , csXmem = x_xmem + fromIntegral (Node.xMem node)
238
          , csNmem = x_nmem + fromIntegral (Node.nMem node)
239
          , csNinst = x_ninst + length (Node.pList node)
240
          }
241

    
242
-- | Compute the total free disk and memory in the cluster.
243
totalResources :: Node.List -> CStats
244
totalResources nl =
245
    let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
246
    in cs { csScore = compCV nl }
247

    
248
-- | Compute the delta between two cluster state.
249
--
250
-- This is used when doing allocations, to understand better the
251
-- available cluster resources. The return value is a triple of the
252
-- current used values, the delta that was still allocated, and what
253
-- was left unallocated.
254
computeAllocationDelta :: CStats -> CStats -> AllocStats
255
computeAllocationDelta cini cfin =
256
    let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
257
        CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
258
                csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
259
        rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem)
260
               (fromIntegral i_idsk)
261
        rfin = RSpec (fromIntegral (f_icpu - i_icpu))
262
               (fromIntegral (f_imem - i_imem))
263
               (fromIntegral (f_idsk - i_idsk))
264
        un_cpu = fromIntegral (v_cpu - f_icpu)::Int
265
        runa = RSpec un_cpu (truncate t_mem - fromIntegral f_imem)
266
               (truncate t_dsk - fromIntegral f_idsk)
267
    in (rini, rfin, runa)
268

    
269
-- | The names and weights of the individual elements in the CV list.
270
detailedCVInfo :: [(Double, String)]
271
detailedCVInfo = [ (1,  "free_mem_cv")
272
                 , (1,  "free_disk_cv")
273
                 , (1,  "n1_cnt")
274
                 , (1,  "reserved_mem_cv")
275
                 , (4,  "offline_all_cnt")
276
                 , (16, "offline_pri_cnt")
277
                 , (1,  "vcpu_ratio_cv")
278
                 , (1,  "cpu_load_cv")
279
                 , (1,  "mem_load_cv")
280
                 , (1,  "disk_load_cv")
281
                 , (1,  "net_load_cv")
282
                 , (2,  "pri_tags_score")
283
                 ]
284

    
285
detailedCVWeights :: [Double]
286
detailedCVWeights = map fst detailedCVInfo
287

    
288
-- | Compute the mem and disk covariance.
289
compDetailedCV :: [Node.Node] -> [Double]
290
compDetailedCV all_nodes =
291
    let
292
        (offline, nodes) = partition Node.offline all_nodes
293
        mem_l = map Node.pMem nodes
294
        dsk_l = map Node.pDsk nodes
295
        -- metric: memory covariance
296
        mem_cv = stdDev mem_l
297
        -- metric: disk covariance
298
        dsk_cv = stdDev dsk_l
299
        -- metric: count of instances living on N1 failing nodes
300
        n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
301
                                                   length (Node.pList n)) .
302
                   filter Node.failN1 $ nodes :: Double
303
        res_l = map Node.pRem nodes
304
        -- metric: reserved memory covariance
305
        res_cv = stdDev res_l
306
        -- offline instances metrics
307
        offline_ipri = sum . map (length . Node.pList) $ offline
308
        offline_isec = sum . map (length . Node.sList) $ offline
309
        -- metric: count of instances on offline nodes
310
        off_score = fromIntegral (offline_ipri + offline_isec)::Double
311
        -- metric: count of primary instances on offline nodes (this
312
        -- helps with evacuation/failover of primary instances on
313
        -- 2-node clusters with one node offline)
314
        off_pri_score = fromIntegral offline_ipri::Double
315
        cpu_l = map Node.pCpu nodes
316
        -- metric: covariance of vcpu/pcpu ratio
317
        cpu_cv = stdDev cpu_l
318
        -- metrics: covariance of cpu, memory, disk and network load
319
        (c_load, m_load, d_load, n_load) = unzip4 $
320
            map (\n ->
321
                     let DynUtil c1 m1 d1 n1 = Node.utilLoad n
322
                         DynUtil c2 m2 d2 n2 = Node.utilPool n
323
                     in (c1/c2, m1/m2, d1/d2, n1/n2)
324
                ) nodes
325
        -- metric: conflicting instance count
326
        pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
327
        pri_tags_score = fromIntegral pri_tags_inst::Double
328
    in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
329
       , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
330
       , pri_tags_score ]
331

    
332
-- | Compute the /total/ variance.
333
compCVNodes :: [Node.Node] -> Double
334
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
335

    
336
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
337
compCV :: Node.List -> Double
338
compCV = compCVNodes . Container.elems
339

    
340

    
341
-- | Compute online nodes from a 'Node.List'.
342
getOnline :: Node.List -> [Node.Node]
343
getOnline = filter (not . Node.offline) . Container.elems
344

    
345
-- * Balancing functions
346

    
347
-- | Compute best table. Note that the ordering of the arguments is important.
348
compareTables :: Table -> Table -> Table
349
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
350
    if a_cv > b_cv then b else a
351

    
352
-- | Applies an instance move to a given node list and instance.
353
applyMove :: Node.List -> Instance.Instance
354
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
355
-- Failover (f)
356
applyMove nl inst Failover =
357
    let old_pdx = Instance.pNode inst
358
        old_sdx = Instance.sNode inst
359
        old_p = Container.find old_pdx nl
360
        old_s = Container.find old_sdx nl
361
        int_p = Node.removePri old_p inst
362
        int_s = Node.removeSec old_s inst
363
        force_p = Node.offline old_p
364
        new_nl = do -- Maybe monad
365
          new_p <- Node.addPriEx force_p int_s inst
366
          new_s <- Node.addSec int_p inst old_sdx
367
          let new_inst = Instance.setBoth inst old_sdx old_pdx
368
          return (Container.addTwo old_pdx new_s old_sdx new_p nl,
369
                  new_inst, old_sdx, old_pdx)
370
    in new_nl
371

    
372
-- Replace the primary (f:, r:np, f)
373
applyMove nl inst (ReplacePrimary new_pdx) =
374
    let old_pdx = Instance.pNode inst
375
        old_sdx = Instance.sNode inst
376
        old_p = Container.find old_pdx nl
377
        old_s = Container.find old_sdx nl
378
        tgt_n = Container.find new_pdx nl
379
        int_p = Node.removePri old_p inst
380
        int_s = Node.removeSec old_s inst
381
        force_p = Node.offline old_p
382
        new_nl = do -- Maybe monad
383
          -- check that the current secondary can host the instance
384
          -- during the migration
385
          tmp_s <- Node.addPriEx force_p int_s inst
386
          let tmp_s' = Node.removePri tmp_s inst
387
          new_p <- Node.addPriEx force_p tgt_n inst
388
          new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
389
          let new_inst = Instance.setPri inst new_pdx
390
          return (Container.add new_pdx new_p $
391
                  Container.addTwo old_pdx int_p old_sdx new_s nl,
392
                  new_inst, new_pdx, old_sdx)
393
    in new_nl
394

    
395
-- Replace the secondary (r:ns)
396
applyMove nl inst (ReplaceSecondary new_sdx) =
397
    let old_pdx = Instance.pNode inst
398
        old_sdx = Instance.sNode inst
399
        old_s = Container.find old_sdx nl
400
        tgt_n = Container.find new_sdx nl
401
        int_s = Node.removeSec old_s inst
402
        force_s = Node.offline old_s
403
        new_inst = Instance.setSec inst new_sdx
404
        new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
405
                 \new_s -> return (Container.addTwo new_sdx
406
                                   new_s old_sdx int_s nl,
407
                                   new_inst, old_pdx, new_sdx)
408
    in new_nl
409

    
410
-- Replace the secondary and failover (r:np, f)
411
applyMove nl inst (ReplaceAndFailover new_pdx) =
412
    let old_pdx = Instance.pNode inst
413
        old_sdx = Instance.sNode inst
414
        old_p = Container.find old_pdx nl
415
        old_s = Container.find old_sdx nl
416
        tgt_n = Container.find new_pdx nl
417
        int_p = Node.removePri old_p inst
418
        int_s = Node.removeSec old_s inst
419
        force_s = Node.offline old_s
420
        new_nl = do -- Maybe monad
421
          new_p <- Node.addPri tgt_n inst
422
          new_s <- Node.addSecEx force_s int_p inst new_pdx
423
          let new_inst = Instance.setBoth inst new_pdx old_pdx
424
          return (Container.add new_pdx new_p $
425
                  Container.addTwo old_pdx new_s old_sdx int_s nl,
426
                  new_inst, new_pdx, old_pdx)
427
    in new_nl
428

    
429
-- Failver and replace the secondary (f, r:ns)
430
applyMove nl inst (FailoverAndReplace new_sdx) =
431
    let old_pdx = Instance.pNode inst
432
        old_sdx = Instance.sNode inst
433
        old_p = Container.find old_pdx nl
434
        old_s = Container.find old_sdx nl
435
        tgt_n = Container.find new_sdx nl
436
        int_p = Node.removePri old_p inst
437
        int_s = Node.removeSec old_s inst
438
        force_p = Node.offline old_p
439
        new_nl = do -- Maybe monad
440
          new_p <- Node.addPriEx force_p int_s inst
441
          new_s <- Node.addSecEx force_p tgt_n inst old_sdx
442
          let new_inst = Instance.setBoth inst old_sdx new_sdx
443
          return (Container.add new_sdx new_s $
444
                  Container.addTwo old_sdx new_p old_pdx int_p nl,
445
                  new_inst, old_sdx, new_sdx)
446
    in new_nl
447

    
448
-- | Tries to allocate an instance on one given node.
449
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
450
                 -> OpResult Node.AllocElement
451
allocateOnSingle nl inst new_pdx =
452
    let p = Container.find new_pdx nl
453
        new_inst = Instance.setBoth inst new_pdx Node.noSecondary
454
    in  Node.addPri p inst >>= \new_p -> do
455
      let new_nl = Container.add new_pdx new_p nl
456
          new_score = compCV nl
457
      return (new_nl, new_inst, [new_p], new_score)
458

    
459
-- | Tries to allocate an instance on a given pair of nodes.
460
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
461
               -> OpResult Node.AllocElement
462
allocateOnPair nl inst new_pdx new_sdx =
463
    let tgt_p = Container.find new_pdx nl
464
        tgt_s = Container.find new_sdx nl
465
    in do
466
      new_p <- Node.addPri tgt_p inst
467
      new_s <- Node.addSec tgt_s inst new_pdx
468
      let new_inst = Instance.setBoth inst new_pdx new_sdx
469
          new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
470
      return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
471

    
472
-- | Tries to perform an instance move and returns the best table
473
-- between the original one and the new one.
474
checkSingleStep :: Table -- ^ The original table
475
                -> Instance.Instance -- ^ The instance to move
476
                -> Table -- ^ The current best table
477
                -> IMove -- ^ The move to apply
478
                -> Table -- ^ The final best table
479
checkSingleStep ini_tbl target cur_tbl move =
480
    let
481
        Table ini_nl ini_il _ ini_plc = ini_tbl
482
        tmp_resu = applyMove ini_nl target move
483
    in
484
      case tmp_resu of
485
        OpFail _ -> cur_tbl
486
        OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
487
            let tgt_idx = Instance.idx target
488
                upd_cvar = compCV upd_nl
489
                upd_il = Container.add tgt_idx new_inst ini_il
490
                upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
491
                upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
492
            in
493
              compareTables cur_tbl upd_tbl
494

    
495
-- | Given the status of the current secondary as a valid new node and
496
-- the current candidate target node, generate the possible moves for
497
-- a instance.
498
possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
499
              -> Bool      -- ^ Whether we can change the primary node
500
              -> Ndx       -- ^ Target node candidate
501
              -> [IMove]   -- ^ List of valid result moves
502

    
503
possibleMoves _ False tdx =
504
    [ReplaceSecondary tdx]
505

    
506
possibleMoves True True tdx =
507
    [ReplaceSecondary tdx,
508
     ReplaceAndFailover tdx,
509
     ReplacePrimary tdx,
510
     FailoverAndReplace tdx]
511

    
512
possibleMoves False True tdx =
513
    [ReplaceSecondary tdx,
514
     ReplaceAndFailover tdx]
515

    
516
-- | Compute the best move for a given instance.
517
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
518
                  -> Bool              -- ^ Whether disk moves are allowed
519
                  -> Bool              -- ^ Whether instance moves are allowed
520
                  -> Table             -- ^ Original table
521
                  -> Instance.Instance -- ^ Instance to move
522
                  -> Table             -- ^ Best new table for this instance
523
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
524
    let
525
        opdx = Instance.pNode target
526
        osdx = Instance.sNode target
527
        nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
528
        use_secondary = elem osdx nodes_idx && inst_moves
529
        aft_failover = if use_secondary -- if allowed to failover
530
                       then checkSingleStep ini_tbl target ini_tbl Failover
531
                       else ini_tbl
532
        all_moves = if disk_moves
533
                    then concatMap
534
                         (possibleMoves use_secondary inst_moves) nodes
535
                    else []
536
    in
537
      -- iterate over the possible nodes for this instance
538
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
539

    
540
-- | Compute the best next move.
541
checkMove :: [Ndx]               -- ^ Allowed target node indices
542
          -> Bool                -- ^ Whether disk moves are allowed
543
          -> Bool                -- ^ Whether instance moves are allowed
544
          -> Table               -- ^ The current solution
545
          -> [Instance.Instance] -- ^ List of instances still to move
546
          -> Table               -- ^ The new solution
547
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
548
    let Table _ _ _ ini_plc = ini_tbl
549
        -- we're using rwhnf from the Control.Parallel.Strategies
550
        -- package; we don't need to use rnf as that would force too
551
        -- much evaluation in single-threaded cases, and in
552
        -- multi-threaded case the weak head normal form is enough to
553
        -- spark the evaluation
554
        tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
555
                               inst_moves ini_tbl)
556
                 victims
557
        -- iterate over all instances, computing the best move
558
        best_tbl = foldl' compareTables ini_tbl tables
559
        Table _ _ _ best_plc = best_tbl
560
    in if length best_plc == length ini_plc
561
       then ini_tbl -- no advancement
562
       else best_tbl
563

    
564
-- | Check if we are allowed to go deeper in the balancing.
565
doNextBalance :: Table     -- ^ The starting table
566
              -> Int       -- ^ Remaining length
567
              -> Score     -- ^ Score at which to stop
568
              -> Bool      -- ^ The resulting table and commands
569
doNextBalance ini_tbl max_rounds min_score =
570
    let Table _ _ ini_cv ini_plc = ini_tbl
571
        ini_plc_len = length ini_plc
572
    in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
573

    
574
-- | Run a balance move.
575
tryBalance :: Table       -- ^ The starting table
576
           -> Bool        -- ^ Allow disk moves
577
           -> Bool        -- ^ Allow instance moves
578
           -> Bool        -- ^ Only evacuate moves
579
           -> Score       -- ^ Min gain threshold
580
           -> Score       -- ^ Min gain
581
           -> Maybe Table -- ^ The resulting table and commands
582
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
583
    let Table ini_nl ini_il ini_cv _ = ini_tbl
584
        all_inst = Container.elems ini_il
585
        all_inst' = if evac_mode
586
                    then let bad_nodes = map Node.idx . filter Node.offline $
587
                                         Container.elems ini_nl
588
                         in filter (\e -> Instance.sNode e `elem` bad_nodes ||
589
                                          Instance.pNode e `elem` bad_nodes)
590
                            all_inst
591
                    else all_inst
592
        reloc_inst = filter Instance.movable all_inst'
593
        node_idx = map Node.idx . filter (not . Node.offline) $
594
                   Container.elems ini_nl
595
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
596
        (Table _ _ fin_cv _) = fin_tbl
597
    in
598
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
599
      then Just fin_tbl -- this round made success, return the new table
600
      else Nothing
601

    
602
-- * Allocation functions
603

    
604
-- | Build failure stats out of a list of failures.
605
collapseFailures :: [FailMode] -> FailStats
606
collapseFailures flst =
607
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
608
            [minBound..maxBound]
609

    
610
-- | Update current Allocation solution and failure stats with new
611
-- elements.
612
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
613
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
614

    
615
concatAllocs as (OpGood ns@(_, _, _, nscore)) =
616
    let -- Choose the old or new solution, based on the cluster score
617
        cntok = asAllocs as
618
        osols = asSolutions as
619
        nsols = case osols of
620
                  [] -> [ns]
621
                  (_, _, _, oscore):[] ->
622
                      if oscore < nscore
623
                      then osols
624
                      else [ns]
625
                  -- FIXME: here we simply concat to lists with more
626
                  -- than one element; we should instead abort, since
627
                  -- this is not a valid usage of this function
628
                  xs -> ns:xs
629
        nsuc = cntok + 1
630
    -- Note: we force evaluation of nsols here in order to keep the
631
    -- memory profile low - we know that we will need nsols for sure
632
    -- in the next cycle, so we force evaluation of nsols, since the
633
    -- foldl' in the caller will only evaluate the tuple, but not the
634
    -- elements of the tuple
635
    in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolutions = nsols }
636

    
637
-- | Sums two allocation solutions (e.g. for two separate node groups).
638
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
639
sumAllocs (AllocSolution af aa as al) (AllocSolution bf ba bs bl) =
640
    AllocSolution (af ++ bf) (aa + ba) (as ++ bs) (al ++ bl)
641

    
642
-- | Given a solution, generates a reasonable description for it.
643
describeSolution :: AllocSolution -> String
644
describeSolution as =
645
  let fcnt = asFailures as
646
      sols = asSolutions as
647
      freasons =
648
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
649
        filter ((> 0) . snd) . collapseFailures $ fcnt
650
  in if null sols
651
     then "No valid allocation solutions, failure reasons: " ++
652
          (if null fcnt
653
           then "unknown reasons"
654
           else freasons)
655
     else let (_, _, nodes, cv) = head sols
656
          in printf ("score: %.8f, successes %d, failures %d (%s)" ++
657
                     " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
658
             (intercalate "/" . map Node.name $ nodes)
659

    
660
-- | Annotates a solution with the appropriate string.
661
annotateSolution :: AllocSolution -> AllocSolution
662
annotateSolution as = as { asLog = describeSolution as : asLog as }
663

    
664
-- | Reverses an evacuation solution.
665
--
666
-- Rationale: we always concat the results to the top of the lists, so
667
-- for proper jobset execution, we should reverse all lists.
668
reverseEvacSolution :: EvacSolution -> EvacSolution
669
reverseEvacSolution (EvacSolution f m o) =
670
    EvacSolution (reverse f) (reverse m) (reverse o)
671

    
672
-- | Generate the valid node allocation singles or pairs for a new instance.
673
genAllocNodes :: Group.List        -- ^ Group list
674
              -> Node.List         -- ^ The node map
675
              -> Int               -- ^ The number of nodes required
676
              -> Bool              -- ^ Whether to drop or not
677
                                   -- unallocable nodes
678
              -> Result AllocNodes -- ^ The (monadic) result
679
genAllocNodes gl nl count drop_unalloc =
680
    let filter_fn = if drop_unalloc
681
                    then filter (Group.isAllocable .
682
                                 flip Container.find gl . Node.group)
683
                    else id
684
        all_nodes = filter_fn $ getOnline nl
685
        all_pairs = liftM2 (,) all_nodes all_nodes
686
        ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y &&
687
                                      Node.group x == Node.group y) all_pairs
688
    in case count of
689
         1 -> Ok (Left (map Node.idx all_nodes))
690
         2 -> Ok (Right (map (\(p, s) -> (Node.idx p, Node.idx s)) ok_pairs))
691
         _ -> Bad "Unsupported number of nodes, only one or two  supported"
692

    
693
-- | Try to allocate an instance on the cluster.
694
tryAlloc :: (Monad m) =>
695
            Node.List         -- ^ The node list
696
         -> Instance.List     -- ^ The instance list
697
         -> Instance.Instance -- ^ The instance to allocate
698
         -> AllocNodes        -- ^ The allocation targets
699
         -> m AllocSolution   -- ^ Possible solution list
700
tryAlloc nl _ inst (Right ok_pairs) =
701
    let sols = foldl' (\cstate (p, s) ->
702
                           concatAllocs cstate $ allocateOnPair nl inst p s
703
                      ) emptyAllocSolution ok_pairs
704

    
705
    in if null ok_pairs -- means we have just one node
706
       then fail "Not enough online nodes"
707
       else return $ annotateSolution sols
708

    
709
tryAlloc nl _ inst (Left all_nodes) =
710
    let sols = foldl' (\cstate ->
711
                           concatAllocs cstate . allocateOnSingle nl inst
712
                      ) emptyAllocSolution all_nodes
713
    in if null all_nodes
714
       then fail "No online nodes"
715
       else return $ annotateSolution sols
716

    
717
-- | Given a group/result, describe it as a nice (list of) messages.
718
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
719
solutionDescription gl (groupId, result) =
720
  case result of
721
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
722
    Bad message -> [printf "Group %s: error %s" gname message]
723
  where grp = Container.find groupId gl
724
        gname = Group.name grp
725
        pol = apolToString (Group.allocPolicy grp)
726

    
727
-- | From a list of possibly bad and possibly empty solutions, filter
728
-- only the groups with a valid result. Note that the result will be
729
-- reversed compared to the original list.
730
filterMGResults :: Group.List
731
                -> [(Gdx, Result AllocSolution)]
732
                -> [(Gdx, AllocSolution)]
733
filterMGResults gl = foldl' fn []
734
    where unallocable = not . Group.isAllocable . flip Container.find gl
735
          fn accu (gdx, rasol) =
736
              case rasol of
737
                Bad _ -> accu
738
                Ok sol | null (asSolutions sol) -> accu
739
                       | unallocable gdx -> accu
740
                       | otherwise -> (gdx, sol):accu
741

    
742
-- | Sort multigroup results based on policy and score.
743
sortMGResults :: Group.List
744
             -> [(Gdx, AllocSolution)]
745
             -> [(Gdx, AllocSolution)]
746
sortMGResults gl sols =
747
    let extractScore (_, _, _, x) = x
748
        solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
749
                               (extractScore . head . asSolutions) sol)
750
    in sortBy (comparing solScore) sols
751

    
752
-- | Finds the best group for an instance on a multi-group cluster.
753
--
754
-- Only solutions in @preferred@ and @last_resort@ groups will be
755
-- accepted as valid, and additionally if the allowed groups parameter
756
-- is not null then allocation will only be run for those group
757
-- indices.
758
findBestAllocGroup :: Group.List           -- ^ The group list
759
                   -> Node.List            -- ^ The node list
760
                   -> Instance.List        -- ^ The instance list
761
                   -> Maybe [Gdx]          -- ^ The allowed groups
762
                   -> Instance.Instance    -- ^ The instance to allocate
763
                   -> Int                  -- ^ Required number of nodes
764
                   -> Result (Gdx, AllocSolution, [String])
765
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
766
  let groups = splitCluster mgnl mgil
767
      groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
768
                allowed_gdxs
769
      sols = map (\(gid, (nl, il)) ->
770
                   (gid, genAllocNodes mggl nl cnt False >>=
771
                       tryAlloc nl il inst))
772
             groups'::[(Gdx, Result AllocSolution)]
773
      all_msgs = concatMap (solutionDescription mggl) sols
774
      goodSols = filterMGResults mggl sols
775
      sortedSols = sortMGResults mggl goodSols
776
  in if null sortedSols
777
     then Bad $ intercalate ", " all_msgs
778
     else let (final_group, final_sol) = head sortedSols
779
          in return (final_group, final_sol, all_msgs)
780

    
781
-- | Try to allocate an instance on a multi-group cluster.
782
tryMGAlloc :: Group.List           -- ^ The group list
783
           -> Node.List            -- ^ The node list
784
           -> Instance.List        -- ^ The instance list
785
           -> Instance.Instance    -- ^ The instance to allocate
786
           -> Int                  -- ^ Required number of nodes
787
           -> Result AllocSolution -- ^ Possible solution list
788
tryMGAlloc mggl mgnl mgil inst cnt = do
789
  (best_group, solution, all_msgs) <-
790
      findBestAllocGroup mggl mgnl mgil Nothing inst cnt
791
  let group_name = Group.name $ Container.find best_group mggl
792
      selmsg = "Selected group: " ++ group_name
793
  return $ solution { asLog = selmsg:all_msgs }
794

    
795
-- | Try to relocate an instance on the cluster.
796
tryReloc :: (Monad m) =>
797
            Node.List       -- ^ The node list
798
         -> Instance.List   -- ^ The instance list
799
         -> Idx             -- ^ The index of the instance to move
800
         -> Int             -- ^ The number of nodes required
801
         -> [Ndx]           -- ^ Nodes which should not be used
802
         -> m AllocSolution -- ^ Solution list
803
tryReloc nl il xid 1 ex_idx =
804
    let all_nodes = getOnline nl
805
        inst = Container.find xid il
806
        ex_idx' = Instance.pNode inst:ex_idx
807
        valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
808
        valid_idxes = map Node.idx valid_nodes
809
        sols1 = foldl' (\cstate x ->
810
                            let em = do
811
                                  (mnl, i, _, _) <-
812
                                      applyMove nl inst (ReplaceSecondary x)
813
                                  return (mnl, i, [Container.find x mnl],
814
                                          compCV mnl)
815
                            in concatAllocs cstate em
816
                       ) emptyAllocSolution valid_idxes
817
    in return sols1
818

    
819
tryReloc _ _ _ reqn _  = fail $ "Unsupported number of relocation \
820
                                \destinations required (" ++ show reqn ++
821
                                                  "), only one supported"
822

    
823
tryMGReloc :: (Monad m) =>
824
              Group.List      -- ^ The group list
825
           -> Node.List       -- ^ The node list
826
           -> Instance.List   -- ^ The instance list
827
           -> Idx             -- ^ The index of the instance to move
828
           -> Int             -- ^ The number of nodes required
829
           -> [Ndx]           -- ^ Nodes which should not be used
830
           -> m AllocSolution -- ^ Solution list
831
tryMGReloc _ mgnl mgil xid ncount ex_ndx = do
832
  let groups = splitCluster mgnl mgil
833
      -- TODO: we only relocate inside the group for now
834
      inst = Container.find xid mgil
835
  (nl, il) <- case lookup (instancePriGroup mgnl inst) groups of
836
                Nothing -> fail $ "Cannot find group for instance " ++
837
                           Instance.name inst
838
                Just v -> return v
839
  tryReloc nl il xid ncount ex_ndx
840

    
841
-- | Change an instance's secondary node.
842
evacInstance :: (Monad m) =>
843
                [Ndx]                      -- ^ Excluded nodes
844
             -> Instance.List              -- ^ The current instance list
845
             -> (Node.List, AllocSolution) -- ^ The current state
846
             -> Idx                        -- ^ The instance to evacuate
847
             -> m (Node.List, AllocSolution)
848
evacInstance ex_ndx il (nl, old_as) idx = do
849
  -- FIXME: hardcoded one node here
850

    
851
  -- Longer explanation: evacuation is currently hardcoded to DRBD
852
  -- instances (which have one secondary); hence, even if the
853
  -- IAllocator protocol can request N nodes for an instance, and all
854
  -- the message parsing/loading pass this, this implementation only
855
  -- supports one; this situation needs to be revisited if we ever
856
  -- support more than one secondary, or if we change the storage
857
  -- model
858
  new_as <- tryReloc nl il idx 1 ex_ndx
859
  case asSolutions new_as of
860
    -- an individual relocation succeeded, we kind of compose the data
861
    -- from the two solutions
862
    csol@(nl', _, _, _):_ ->
863
        return (nl', new_as { asSolutions = csol:asSolutions old_as })
864
    -- this relocation failed, so we fail the entire evac
865
    _ -> fail $ "Can't evacuate instance " ++
866
         Instance.name (Container.find idx il) ++
867
             ": " ++ describeSolution new_as
868

    
869
-- | Try to evacuate a list of nodes.
870
tryEvac :: (Monad m) =>
871
            Node.List       -- ^ The node list
872
         -> Instance.List   -- ^ The instance list
873
         -> [Idx]           -- ^ Instances to be evacuated
874
         -> [Ndx]           -- ^ Restricted nodes (the ones being evacuated)
875
         -> m AllocSolution -- ^ Solution list
876
tryEvac nl il idxs ex_ndx = do
877
  (_, sol) <- foldM (evacInstance ex_ndx il) (nl, emptyAllocSolution) idxs
878
  return sol
879

    
880
-- | Multi-group evacuation of a list of nodes.
881
tryMGEvac :: (Monad m) =>
882
             Group.List -- ^ The group list
883
          -> Node.List       -- ^ The node list
884
          -> Instance.List   -- ^ The instance list
885
          -> [Ndx]           -- ^ Nodes to be evacuated
886
          -> m AllocSolution -- ^ Solution list
887
tryMGEvac _ nl il ex_ndx =
888
    let ex_nodes = map (`Container.find` nl) ex_ndx
889
        all_insts = nub . concatMap Node.sList $ ex_nodes
890
        all_insts' = associateIdxs all_insts $ splitCluster nl il
891
    in do
892
      results <- mapM (\(_, (gnl, gil, idxs)) -> tryEvac gnl gil idxs ex_ndx)
893
                 all_insts'
894
      let sol = foldl' sumAllocs emptyAllocSolution results
895
      return $ annotateSolution sol
896

    
897
-- | Function which fails if the requested mode is change secondary.
898
--
899
-- This is useful since except DRBD, no other disk template can
900
-- execute change secondary; thus, we can just call this function
901
-- instead of always checking for secondary mode. After the call to
902
-- this function, whatever mode we have is just a primary change.
903
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
904
failOnSecondaryChange ChangeSecondary dt =
905
    fail $ "Instances with disk template '" ++ dtToString dt ++
906
         "' can't execute change secondary"
907
failOnSecondaryChange _ _ = return ()
908

    
909
-- | Run evacuation for a single instance.
910
--
911
-- /Note:/ this function should correctly execute both intra-group
912
-- evacuations (in all modes) and inter-group evacuations (in the
913
-- 'ChangeAll' mode). Of course, this requires that the correct list
914
-- of target nodes is passed.
915
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
916
                 -> Instance.List     -- ^ Instance list (cluster-wide)
917
                 -> EvacMode          -- ^ The evacuation mode
918
                 -> Instance.Instance -- ^ The instance to be evacuated
919
                 -> [Ndx]             -- ^ The list of available nodes
920
                                      -- for allocation
921
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
922
nodeEvacInstance _ _ mode (Instance.Instance
923
                           {Instance.diskTemplate = dt@DTDiskless}) _ =
924
                  failOnSecondaryChange mode dt >>
925
                  fail "Diskless relocations not implemented yet"
926

    
927
nodeEvacInstance _ _ _ (Instance.Instance
928
                        {Instance.diskTemplate = DTPlain}) _ =
929
                  fail "Instances of type plain cannot be relocated"
930

    
931
nodeEvacInstance _ _ _ (Instance.Instance
932
                        {Instance.diskTemplate = DTFile}) _ =
933
                  fail "Instances of type file cannot be relocated"
934

    
935
nodeEvacInstance _ _ mode  (Instance.Instance
936
                            {Instance.diskTemplate = dt@DTSharedFile}) _ =
937
                  failOnSecondaryChange mode dt >>
938
                  fail "Shared file relocations not implemented yet"
939

    
940
nodeEvacInstance _ _ mode (Instance.Instance
941
                           {Instance.diskTemplate = dt@DTBlock}) _ =
942
                  failOnSecondaryChange mode dt >>
943
                  fail "Block device relocations not implemented yet"
944

    
945
nodeEvacInstance nl il ChangePrimary
946
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8}) _ =
947
  do
948
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
949
    let idx = Instance.idx inst
950
        il' = Container.add idx inst' il
951
        ops = iMoveToJob nl' il' idx Failover
952
    return (nl', il', ops)
953

    
954
nodeEvacInstance nl il ChangeSecondary
955
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
956
                 avail_nodes =
957
  do
958
    let gdx = instancePriGroup nl inst
959
    (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
960
                            eitherToResult $
961
                            foldl' (evacDrbdSecondaryInner nl inst gdx)
962
                            (Left "no nodes available") avail_nodes
963
    let idx = Instance.idx inst
964
        il' = Container.add idx inst' il
965
        ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx)
966
    return (nl', il', ops)
967

    
968
nodeEvacInstance nl il ChangeAll
969
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
970
                 avail_nodes =
971
  do
972
    let primary = Container.find (Instance.pNode inst) nl
973
        idx = Instance.idx inst
974
        gdx = instancePriGroup nl inst
975
        no_nodes = Left "no nodes available"
976
    -- if the primary is offline, then we first failover
977
    (nl1, inst1, ops1) <-
978
        if Node.offline primary
979
        then do
980
          (nl', inst', _, _) <-
981
              annotateResult "Failing over to the secondary" $
982
              opToResult $ applyMove nl inst Failover
983
          return (nl', inst', [Failover])
984
        else return (nl, inst, [])
985
    -- we now need to execute a replace secondary to the future
986
    -- primary node
987
    (nl2, inst2, _, new_pdx) <- annotateResult "Searching for a new primary" $
988
                                eitherToResult $
989
                                foldl' (evacDrbdSecondaryInner nl1 inst1 gdx)
990
                                no_nodes avail_nodes
991
    let ops2 = ReplaceSecondary new_pdx:ops1
992
    -- since we chose the new primary, we remove it from the list of
993
    -- available nodes
994
    let avail_nodes_sec = new_pdx `delete` avail_nodes
995
    -- we now execute another failover, the primary stays fixed now
996
    (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
997
                          opToResult $ applyMove nl2 inst2 Failover
998
    let ops3 = Failover:ops2
999
    -- and finally another replace secondary, to the final secondary
1000
    (nl4, inst4, _, new_sdx) <-
1001
        annotateResult "Searching for a new secondary" $
1002
        eitherToResult $
1003
        foldl' (evacDrbdSecondaryInner nl3 inst3 gdx) no_nodes avail_nodes_sec
1004
    let ops4 = ReplaceSecondary new_sdx:ops3
1005
        il' = Container.add idx inst4 il
1006
        ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1007
    return (nl4, il', ops)
1008

    
1009
-- | Inner fold function for changing secondary of a DRBD instance.
1010
--
1011
-- The "running" solution is either a @Left String@, which means we
1012
-- don't have yet a working solution, or a @Right (...)@, which
1013
-- represents a valid solution; it holds the modified node list, the
1014
-- modified instance (after evacuation), the score of that solution,
1015
-- and the new secondary node index.
1016
evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list
1017
                       -> Instance.Instance -- ^ Instance being evacuated
1018
                       -> Gdx -- ^ The group index of the instance
1019
                       -> Either String ( Node.List
1020
                                        , Instance.Instance
1021
                                        , Score
1022
                                        , Ndx)  -- ^ Current best solution
1023
                       -> Ndx  -- ^ Node we're evaluating as new secondary
1024
                       -> Either String ( Node.List
1025
                                        , Instance.Instance
1026
                                        , Score
1027
                                        , Ndx) -- ^ New best solution
1028
evacDrbdSecondaryInner nl inst gdx accu ndx =
1029
    case applyMove nl inst (ReplaceSecondary ndx) of
1030
      OpFail fm ->
1031
          case accu of
1032
            Right _ -> accu
1033
            Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
1034
                      " failed: " ++ show fm
1035
      OpGood (nl', inst', _, _) ->
1036
          let nodes = Container.elems nl'
1037
              -- The fromJust below is ugly (it can fail nastily), but
1038
              -- at this point we should have any internal mismatches,
1039
              -- and adding a monad here would be quite involved
1040
              grpnodes = fromJust (gdx `lookup` (Node.computeGroups nodes))
1041
              new_cv = compCVNodes grpnodes
1042
              new_accu = Right (nl', inst', new_cv, ndx)
1043
          in case accu of
1044
               Left _ -> new_accu
1045
               Right (_, _, old_cv, _) ->
1046
                   if old_cv < new_cv
1047
                   then accu
1048
                   else new_accu
1049

    
1050
-- | Computes the nodes in a given group which are available for
1051
-- allocation.
1052
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1053
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
1054
                    -> Gdx            -- ^ The group for which we
1055
                                      -- query the nodes
1056
                    -> Result [Ndx]   -- ^ List of available node indices
1057
availableGroupNodes group_nodes excl_ndx gdx = do
1058
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1059
                 Ok (lookup gdx group_nodes)
1060
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1061
  return avail_nodes
1062

    
1063
-- | Updates the evac solution with the results of an instance
1064
-- evacuation.
1065
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1066
                   -> Instance.Instance
1067
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1068
                   -> (Node.List, Instance.List, EvacSolution)
1069
updateEvacSolution (nl, il, es) inst (Bad msg) =
1070
    (nl, il, es { esFailed = (Instance.name inst ++ ": " ++ msg):esFailed es})
1071
updateEvacSolution (_, _, es) inst (Ok (nl, il, opcodes)) =
1072
    (nl, il, es { esMoved = Instance.name inst:esMoved es
1073
                , esOpCodes = [opcodes]:esOpCodes es })
1074

    
1075
-- | Node-evacuation IAllocator mode main function.
1076
tryNodeEvac :: Group.List    -- ^ The cluster groups
1077
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
1078
            -> Instance.List -- ^ Instance list (cluster-wide)
1079
            -> EvacMode      -- ^ The evacuation mode
1080
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
1081
            -> Result EvacSolution
1082
tryNodeEvac _ ini_nl ini_il mode idxs =
1083
    let evac_ndx = nodesToEvacuate ini_il mode idxs
1084
        offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1085
        excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1086
        group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1087
                                             (Container.elems nl))) $
1088
                      splitCluster ini_nl ini_il
1089
        (_, _, esol) =
1090
            foldl' (\state@(nl, il, _) inst ->
1091
                        updateEvacSolution state inst $
1092
                        availableGroupNodes group_ndx
1093
                          excl_ndx (instancePriGroup nl inst) >>=
1094
                        nodeEvacInstance nl il mode inst
1095
                   )
1096
            (ini_nl, ini_il, emptyEvacSolution)
1097
            (map (`Container.find` ini_il) idxs)
1098
    in return $ reverseEvacSolution esol
1099

    
1100
-- | Change-group IAllocator mode main function.
1101
--
1102
-- This is very similar to 'tryNodeEvac', the only difference is that
1103
-- we don't choose as target group the current instance group, but
1104
-- instead:
1105
--
1106
--   1. at the start of the function, we compute which are the target
1107
--   groups; either no groups were passed in, in which case we choose
1108
--   all groups out of which we don't evacuate instance, or there were
1109
--   some groups passed, in which case we use those
1110
--
1111
--   2. for each instance, we use 'findBestAllocGroup' to choose the
1112
--   best group to hold the instance, and then we do what
1113
--   'tryNodeEvac' does, except for this group instead of the current
1114
--   instance group.
1115
--
1116
-- Note that the correct behaviour of this function relies on the
1117
-- function 'nodeEvacInstance' to be able to do correctly both
1118
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1119
tryChangeGroup :: Group.List    -- ^ The cluster groups
1120
               -> Node.List     -- ^ The node list (cluster-wide)
1121
               -> Instance.List -- ^ Instance list (cluster-wide)
1122
               -> [Gdx]         -- ^ Target groups; if empty, any
1123
                                -- groups not being evacuated
1124
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
1125
               -> Result EvacSolution
1126
tryChangeGroup gl ini_nl ini_il gdxs idxs =
1127
    let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1128
                               flip Container.find ini_il) idxs
1129
        target_gdxs = (if null gdxs
1130
                       then Container.keys gl
1131
                       else gdxs) \\ evac_gdxs
1132
        offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1133
        excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1134
        group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1135
                                             (Container.elems nl))) $
1136
                      splitCluster ini_nl ini_il
1137
        (_, _, esol) =
1138
            foldl' (\state@(nl, il, _) inst ->
1139
                        let solution = do
1140
                              let ncnt = Instance.requiredNodes $
1141
                                         Instance.diskTemplate inst
1142
                              (gdx, _, _) <- findBestAllocGroup gl nl il
1143
                                             (Just target_gdxs) inst ncnt
1144
                              av_nodes <- availableGroupNodes group_ndx
1145
                                          excl_ndx gdx
1146
                              nodeEvacInstance nl il ChangeAll inst av_nodes
1147
                        in updateEvacSolution state inst solution
1148
                   )
1149
            (ini_nl, ini_il, emptyEvacSolution)
1150
            (map (`Container.find` ini_il) idxs)
1151
    in return $ reverseEvacSolution esol
1152

    
1153

    
1154
-- | Recursively place instances on the cluster until we're out of space.
1155
iterateAlloc :: Node.List
1156
             -> Instance.List
1157
             -> Maybe Int
1158
             -> Instance.Instance
1159
             -> AllocNodes
1160
             -> [Instance.Instance]
1161
             -> [CStats]
1162
             -> Result AllocResult
1163
iterateAlloc nl il limit newinst allocnodes ixes cstats =
1164
      let depth = length ixes
1165
          newname = printf "new-%d" depth::String
1166
          newidx = length (Container.elems il) + depth
1167
          newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1168
          newlimit = fmap (flip (-) 1) limit
1169
      in case tryAlloc nl il newi2 allocnodes of
1170
           Bad s -> Bad s
1171
           Ok (AllocSolution { asFailures = errs, asSolutions = sols3 }) ->
1172
               let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1173
               case sols3 of
1174
                 [] -> newsol
1175
                 (xnl, xi, _, _):[] ->
1176
                     if limit == Just 0
1177
                     then newsol
1178
                     else iterateAlloc xnl (Container.add newidx xi il)
1179
                          newlimit newinst allocnodes (xi:ixes)
1180
                          (totalResources xnl:cstats)
1181
                 _ -> Bad "Internal error: multiple solutions for single\
1182
                          \ allocation"
1183

    
1184
-- | The core of the tiered allocation mode.
1185
tieredAlloc :: Node.List
1186
            -> Instance.List
1187
            -> Maybe Int
1188
            -> Instance.Instance
1189
            -> AllocNodes
1190
            -> [Instance.Instance]
1191
            -> [CStats]
1192
            -> Result AllocResult
1193
tieredAlloc nl il limit newinst allocnodes ixes cstats =
1194
    case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1195
      Bad s -> Bad s
1196
      Ok (errs, nl', il', ixes', cstats') ->
1197
          let newsol = Ok (errs, nl', il', ixes', cstats')
1198
              ixes_cnt = length ixes'
1199
              (stop, newlimit) = case limit of
1200
                                   Nothing -> (False, Nothing)
1201
                                   Just n -> (n <= ixes_cnt,
1202
                                              Just (n - ixes_cnt)) in
1203
          if stop then newsol else
1204
          case Instance.shrinkByType newinst . fst . last $
1205
               sortBy (comparing snd) errs of
1206
            Bad _ -> newsol
1207
            Ok newinst' -> tieredAlloc nl' il' newlimit
1208
                           newinst' allocnodes ixes' cstats'
1209

    
1210
-- | Compute the tiered spec string description from a list of
1211
-- allocated instances.
1212
tieredSpecMap :: [Instance.Instance]
1213
              -> [String]
1214
tieredSpecMap trl_ixes =
1215
    let fin_trl_ixes = reverse trl_ixes
1216
        ix_byspec = groupBy ((==) `on` Instance.specOf) fin_trl_ixes
1217
        spec_map = map (\ixs -> (Instance.specOf $ head ixs, length ixs))
1218
                   ix_byspec
1219
    in  map (\(spec, cnt) -> printf "%d,%d,%d=%d" (rspecMem spec)
1220
                             (rspecDsk spec) (rspecCpu spec) cnt) spec_map
1221

    
1222
-- * Formatting functions
1223

    
1224
-- | Given the original and final nodes, computes the relocation description.
1225
computeMoves :: Instance.Instance -- ^ The instance to be moved
1226
             -> String -- ^ The instance name
1227
             -> IMove  -- ^ The move being performed
1228
             -> String -- ^ New primary
1229
             -> String -- ^ New secondary
1230
             -> (String, [String])
1231
                -- ^ Tuple of moves and commands list; moves is containing
1232
                -- either @/f/@ for failover or @/r:name/@ for replace
1233
                -- secondary, while the command list holds gnt-instance
1234
                -- commands (without that prefix), e.g \"@failover instance1@\"
1235
computeMoves i inam mv c d =
1236
    case mv of
1237
      Failover -> ("f", [mig])
1238
      FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1239
      ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1240
      ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1241
      ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1242
    where morf = if Instance.running i then "migrate" else "failover"
1243
          mig = printf "%s -f %s" morf inam::String
1244
          rep n = printf "replace-disks -n %s %s" n inam
1245

    
1246
-- | Converts a placement to string format.
1247
printSolutionLine :: Node.List     -- ^ The node list
1248
                  -> Instance.List -- ^ The instance list
1249
                  -> Int           -- ^ Maximum node name length
1250
                  -> Int           -- ^ Maximum instance name length
1251
                  -> Placement     -- ^ The current placement
1252
                  -> Int           -- ^ The index of the placement in
1253
                                   -- the solution
1254
                  -> (String, [String])
1255
printSolutionLine nl il nmlen imlen plc pos =
1256
    let
1257
        pmlen = (2*nmlen + 1)
1258
        (i, p, s, mv, c) = plc
1259
        inst = Container.find i il
1260
        inam = Instance.alias inst
1261
        npri = Node.alias $ Container.find p nl
1262
        nsec = Node.alias $ Container.find s nl
1263
        opri = Node.alias $ Container.find (Instance.pNode inst) nl
1264
        osec = Node.alias $ Container.find (Instance.sNode inst) nl
1265
        (moves, cmds) =  computeMoves inst inam mv npri nsec
1266
        ostr = printf "%s:%s" opri osec::String
1267
        nstr = printf "%s:%s" npri nsec::String
1268
    in
1269
      (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
1270
       pos imlen inam pmlen ostr
1271
       pmlen nstr c moves,
1272
       cmds)
1273

    
1274
-- | Return the instance and involved nodes in an instance move.
1275
involvedNodes :: Instance.List -> Placement -> [Ndx]
1276
involvedNodes il plc =
1277
    let (i, np, ns, _, _) = plc
1278
        inst = Container.find i il
1279
        op = Instance.pNode inst
1280
        os = Instance.sNode inst
1281
    in nub [np, ns, op, os]
1282

    
1283
-- | Inner function for splitJobs, that either appends the next job to
1284
-- the current jobset, or starts a new jobset.
1285
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1286
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1287
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1288
    | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1289
    | otherwise = ([n]:cjs, ndx)
1290

    
1291
-- | Break a list of moves into independent groups. Note that this
1292
-- will reverse the order of jobs.
1293
splitJobs :: [MoveJob] -> [JobSet]
1294
splitJobs = fst . foldl mergeJobs ([], [])
1295

    
1296
-- | Given a list of commands, prefix them with @gnt-instance@ and
1297
-- also beautify the display a little.
1298
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1299
formatJob jsn jsl (sn, (_, _, _, cmds)) =
1300
    let out =
1301
            printf "  echo job %d/%d" jsn sn:
1302
            printf "  check":
1303
            map ("  gnt-instance " ++) cmds
1304
    in if sn == 1
1305
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1306
       else out
1307

    
1308
-- | Given a list of commands, prefix them with @gnt-instance@ and
1309
-- also beautify the display a little.
1310
formatCmds :: [JobSet] -> String
1311
formatCmds =
1312
    unlines .
1313
    concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1314
                             (zip [1..] js)) .
1315
    zip [1..]
1316

    
1317
-- | Print the node list.
1318
printNodes :: Node.List -> [String] -> String
1319
printNodes nl fs =
1320
    let fields = case fs of
1321
          [] -> Node.defaultFields
1322
          "+":rest -> Node.defaultFields ++ rest
1323
          _ -> fs
1324
        snl = sortBy (comparing Node.idx) (Container.elems nl)
1325
        (header, isnum) = unzip $ map Node.showHeader fields
1326
    in unlines . map ((:) ' ' .  intercalate " ") $
1327
       formatTable (header:map (Node.list fields) snl) isnum
1328

    
1329
-- | Print the instance list.
1330
printInsts :: Node.List -> Instance.List -> String
1331
printInsts nl il =
1332
    let sil = sortBy (comparing Instance.idx) (Container.elems il)
1333
        helper inst = [ if Instance.running inst then "R" else " "
1334
                      , Instance.name inst
1335
                      , Container.nameOf nl (Instance.pNode inst)
1336
                      , let sdx = Instance.sNode inst
1337
                        in if sdx == Node.noSecondary
1338
                           then  ""
1339
                           else Container.nameOf nl sdx
1340
                      , if Instance.autoBalance inst then "Y" else "N"
1341
                      , printf "%3d" $ Instance.vcpus inst
1342
                      , printf "%5d" $ Instance.mem inst
1343
                      , printf "%5d" $ Instance.dsk inst `div` 1024
1344
                      , printf "%5.3f" lC
1345
                      , printf "%5.3f" lM
1346
                      , printf "%5.3f" lD
1347
                      , printf "%5.3f" lN
1348
                      ]
1349
            where DynUtil lC lM lD lN = Instance.util inst
1350
        header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1351
                 , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1352
        isnum = False:False:False:False:False:repeat True
1353
    in unlines . map ((:) ' ' . intercalate " ") $
1354
       formatTable (header:map helper sil) isnum
1355

    
1356
-- | Shows statistics for a given node list.
1357
printStats :: Node.List -> String
1358
printStats nl =
1359
    let dcvs = compDetailedCV $ Container.elems nl
1360
        (weights, names) = unzip detailedCVInfo
1361
        hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1362
        formatted = map (\(w, header, val) ->
1363
                             printf "%s=%.8f(x%.2f)" header val w::String) hd
1364
    in intercalate ", " formatted
1365

    
1366
-- | Convert a placement into a list of OpCodes (basically a job).
1367
iMoveToJob :: Node.List -> Instance.List
1368
          -> Idx -> IMove -> [OpCodes.OpCode]
1369
iMoveToJob nl il idx move =
1370
    let inst = Container.find idx il
1371
        iname = Instance.name inst
1372
        lookNode  = Just . Container.nameOf nl
1373
        opF = OpCodes.OpInstanceMigrate iname True False True
1374
        opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1375
                OpCodes.ReplaceNewSecondary [] Nothing
1376
    in case move of
1377
         Failover -> [ opF ]
1378
         ReplacePrimary np -> [ opF, opR np, opF ]
1379
         ReplaceSecondary ns -> [ opR ns ]
1380
         ReplaceAndFailover np -> [ opR np, opF ]
1381
         FailoverAndReplace ns -> [ opF, opR ns ]
1382

    
1383
-- * Node group functions
1384

    
1385
-- | Computes the group of an instance.
1386
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1387
instanceGroup nl i =
1388
  let sidx = Instance.sNode i
1389
      pnode = Container.find (Instance.pNode i) nl
1390
      snode = if sidx == Node.noSecondary
1391
              then pnode
1392
              else Container.find sidx nl
1393
      pgroup = Node.group pnode
1394
      sgroup = Node.group snode
1395
  in if pgroup /= sgroup
1396
     then fail ("Instance placed accross two node groups, primary " ++
1397
                show pgroup ++ ", secondary " ++ show sgroup)
1398
     else return pgroup
1399

    
1400
-- | Computes the group of an instance per the primary node.
1401
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1402
instancePriGroup nl i =
1403
  let pnode = Container.find (Instance.pNode i) nl
1404
  in  Node.group pnode
1405

    
1406
-- | Compute the list of badly allocated instances (split across node
1407
-- groups).
1408
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1409
findSplitInstances nl =
1410
  filter (not . isOk . instanceGroup nl) . Container.elems
1411

    
1412
-- | Splits a cluster into the component node groups.
1413
splitCluster :: Node.List -> Instance.List ->
1414
                [(Gdx, (Node.List, Instance.List))]
1415
splitCluster nl il =
1416
  let ngroups = Node.computeGroups (Container.elems nl)
1417
  in map (\(guuid, nodes) ->
1418
           let nidxs = map Node.idx nodes
1419
               nodes' = zip nidxs nodes
1420
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1421
           in (guuid, (Container.fromList nodes', instances))) ngroups
1422

    
1423
-- | Split a global instance index map into per-group, and associate
1424
-- it with the group/node/instance lists.
1425
associateIdxs :: [Idx] -- ^ Instance indices to be split/associated
1426
              -> [(Gdx, (Node.List, Instance.List))]        -- ^ Input groups
1427
              -> [(Gdx, (Node.List, Instance.List, [Idx]))] -- ^ Result
1428
associateIdxs idxs =
1429
    map (\(gdx, (nl, il)) ->
1430
             (gdx, (nl, il, filter (`Container.member` il) idxs)))
1431

    
1432
-- | Compute the list of nodes that are to be evacuated, given a list
1433
-- of instances and an evacuation mode.
1434
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1435
                -> EvacMode      -- ^ The evacuation mode we're using
1436
                -> [Idx]         -- ^ List of instance indices being evacuated
1437
                -> IntSet.IntSet -- ^ Set of node indices
1438
nodesToEvacuate il mode =
1439
    IntSet.delete Node.noSecondary .
1440
    foldl' (\ns idx ->
1441
                let i = Container.find idx il
1442
                    pdx = Instance.pNode i
1443
                    sdx = Instance.sNode i
1444
                    dt = Instance.diskTemplate i
1445
                    withSecondary = case dt of
1446
                                      DTDrbd8 -> IntSet.insert sdx ns
1447
                                      _ -> ns
1448
                in case mode of
1449
                     ChangePrimary   -> IntSet.insert pdx ns
1450
                     ChangeSecondary -> withSecondary
1451
                     ChangeAll       -> IntSet.insert pdx withSecondary
1452
           ) IntSet.empty