Statistics
| Branch: | Tag: | Revision:

root / htools / Ganeti / HTools / Cluster.hs @ 6804faa0

History | View | Annotate | Download (62.2 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
    (
     -- * Types
      AllocSolution(..)
    , EvacSolution(..)
    , Table(..)
    , CStats(..)
    , AllocStats
    -- * Generic functions
    , totalResources
    , computeAllocationDelta
    -- * First phase functions
    , computeBadItems
    -- * Second phase functions
    , printSolutionLine
    , formatCmds
    , involvedNodes
    , splitJobs
    -- * Display functions
    , printNodes
    , printInsts
    -- * Balancing functions
    , checkMove
    , doNextBalance
    , tryBalance
    , compCV
    , compCVNodes
    , compDetailedCV
    , printStats
    , iMoveToJob
    -- * IAllocator functions
    , genAllocNodes
    , tryAlloc
    , tryMGAlloc
    , tryReloc
    , tryNodeEvac
    , tryChangeGroup
    , collapseFailures
    -- * Allocation functions
    , iterateAlloc
    , tieredAlloc
     -- * Node group functions
    , instanceGroup
    , findSplitInstances
    , splitCluster
    ) where
75

    
76
import qualified Data.IntSet as IntSet
77
import Data.List
78
import Data.Maybe (fromJust)
79
import Data.Ord (comparing)
80
import Text.Printf (printf)
81
import Control.Monad
82

    
83
import qualified Ganeti.HTools.Container as Container
84
import qualified Ganeti.HTools.Instance as Instance
85
import qualified Ganeti.HTools.Node as Node
86
import qualified Ganeti.HTools.Group as Group
87
import Ganeti.HTools.Types
88
import Ganeti.HTools.Utils
89
import Ganeti.HTools.Compat
90
import qualified Ganeti.OpCodes as OpCodes
91

    
92
-- * Types
93

    
94
-- | Allocation\/relocation solution.
--
-- Accumulated over candidate placements: failures are collected per
-- attempt, while 'asSolutions' normally holds only the best placement
-- found so far (see 'concatAllocs').
data AllocSolution = AllocSolution
  { asFailures  :: [FailMode]          -- ^ Failure counts
  , asAllocs    :: Int                 -- ^ Good allocation count
  , asSolutions :: [Node.AllocElement] -- ^ The actual result, length
                                       -- of the list depends on the
                                       -- allocation/relocation mode
  , asLog       :: [String]            -- ^ A list of informational messages
  }
103

    
104
-- | Node evacuation/group change iallocator result type. This result
-- type consists of actual opcodes (a restricted subset) that are
-- transmitted back to Ganeti.
data EvacSolution = EvacSolution
    { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
    , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
                                        -- relocated
    , esOpCodes :: [[[OpCodes.OpCode]]] -- ^ List of lists of jobs
    }
113

    
114
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
type AllocResult = (FailStats, Node.List, Instance.List,
                    [Instance.Instance], [CStats])
117

    
118
-- | A type denoting the valid allocation mode/pairs.
--
-- For a one-node allocation, this will be a @Left ['Node.Node']@,
-- whereas for a two-node allocation, this will be a @Right
-- [('Node.Node', 'Node.Node')]@.
type AllocNodes = Either [Ndx] [(Ndx, Ndx)]
124

    
125
-- | The empty solution we start with when computing allocations.
emptyAllocSolution :: AllocSolution
emptyAllocSolution =
    AllocSolution { asFailures = []
                  , asAllocs = 0
                  , asSolutions = []
                  , asLog = []
                  }
129

    
130
-- | The empty evac solution.
emptyEvacSolution :: EvacSolution
emptyEvacSolution =
    EvacSolution { esMoved = [], esFailed = [], esOpCodes = [] }
136

    
137
-- | The complete state for the balancing solution.
--
-- Carries the current node and instance maps, the cluster score of
-- this state, and the list of placements performed so far.
data Table = Table Node.List Instance.List Score [Placement]
             deriving (Show, Read)
140

    
141
-- | Cluster statistics data type.
data CStats = CStats { csFmem :: Integer -- ^ Cluster free mem
                     , csFdsk :: Integer -- ^ Cluster free disk
                     , csAmem :: Integer -- ^ Cluster allocatable mem
                     , csAdsk :: Integer -- ^ Cluster allocatable disk
                     , csAcpu :: Integer -- ^ Cluster allocatable cpus
                     , csMmem :: Integer -- ^ Max node allocatable mem
                     , csMdsk :: Integer -- ^ Max node allocatable disk
                     , csMcpu :: Integer -- ^ Max node allocatable cpu
                     , csImem :: Integer -- ^ Instance used mem
                     , csIdsk :: Integer -- ^ Instance used disk
                     , csIcpu :: Integer -- ^ Instance used cpu
                     , csTmem :: Double  -- ^ Cluster total mem
                     , csTdsk :: Double  -- ^ Cluster total disk
                     , csTcpu :: Double  -- ^ Cluster total cpus
                     , csVcpu :: Integer -- ^ Cluster virtual cpus (if
                                         -- node pCpu has been set,
                                         -- otherwise -1)
                     , csXmem :: Integer -- ^ Unaccounted for mem
                     , csNmem :: Integer -- ^ Node own memory
                     , csScore :: Score  -- ^ The cluster score
                     , csNinst :: Int    -- ^ The total number of instances
                     }
            deriving (Show, Read)
165

    
166
-- | Currently used, possibly to allocate, unallocable.
--
-- See 'computeAllocationDelta' for how the three components are
-- derived.
type AllocStats = (RSpec, RSpec, RSpec)
168

    
169
-- * Utility functions
170

    
171
-- | Verifies the N+1 status and return the affected nodes.
verifyN1 :: [Node.Node] -> [Node.Node]
verifyN1 nodes = [node | node <- nodes, Node.failN1 node]
174

    
175
{-| Computes the pair of bad nodes and instances.

The bad node list is computed via a simple 'verifyN1' check, and the
bad instance list is the list of primary and secondary instances of
those nodes.

-}
computeBadItems :: Node.List -> Instance.List ->
                   ([Node.Node], [Instance.Instance])
computeBadItems nl il =
  let bad_nodes = verifyN1 $ getOnline nl
      -- deduplicate and sort the instance indices via an IntSet
      -- instead of @sort . nub@, which is quadratic in the number of
      -- indices; the result is the same sorted, unique list
      bad_instances = map (`Container.find` il) .
                      IntSet.toAscList . IntSet.fromList $
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
  in
    (bad_nodes, bad_instances)
191

    
192
-- | Zero-initializer for the CStats type.
--
-- NOTE: positional construction; the number of zeroes must match the
-- field count of 'CStats' exactly.
emptyCStats :: CStats
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
195

    
196
-- | Update stats with data from a new node.
--
-- Folds one node's resources into the running cluster totals: free,
-- allocatable, per-node maxima, instance-used and total resources.
updateCStats :: CStats -> Node.Node -> CStats
updateCStats cs node =
    let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
                 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
                 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
                 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
                 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
                 csVcpu = x_vcpu,
                 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
               }
            = cs
        -- allocatable memory is free memory minus reserved memory,
        -- clamped to zero so over-reserved nodes don't subtract
        inc_amem = Node.fMem node - Node.rMem node
        inc_amem' = if inc_amem > 0 then inc_amem else 0
        inc_adsk = Node.availDisk node
        -- instance-used memory: total minus node-own, unaccounted and
        -- free memory (presumably all in the same MiB unit — the unit
        -- is defined by the Node accessors)
        inc_imem = truncate (Node.tMem node) - Node.nMem node
                   - Node.xMem node - Node.fMem node
        inc_icpu = Node.uCpu node
        inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
        inc_vcpu = Node.hiCpu node
        inc_acpu = Node.availCpu node

    in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
          , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
          , csAmem = x_amem + fromIntegral inc_amem'
          , csAdsk = x_adsk + fromIntegral inc_adsk
          , csAcpu = x_acpu + fromIntegral inc_acpu
          -- per-node maxima use max, not addition
          , csMmem = max x_mmem (fromIntegral inc_amem')
          , csMdsk = max x_mdsk (fromIntegral inc_adsk)
          , csMcpu = max x_mcpu (fromIntegral inc_acpu)
          , csImem = x_imem + fromIntegral inc_imem
          , csIdsk = x_idsk + fromIntegral inc_idsk
          , csIcpu = x_icpu + fromIntegral inc_icpu
          , csTmem = x_tmem + Node.tMem node
          , csTdsk = x_tdsk + Node.tDsk node
          , csTcpu = x_tcpu + Node.tCpu node
          , csVcpu = x_vcpu + fromIntegral inc_vcpu
          , csXmem = x_xmem + fromIntegral (Node.xMem node)
          , csNmem = x_nmem + fromIntegral (Node.nMem node)
          , csNinst = x_ninst + length (Node.pList node)
          }
237

    
238
-- | Compute the total free disk and memory in the cluster.
totalResources :: Node.List -> CStats
totalResources nl =
    let stats = foldl' updateCStats emptyCStats (Container.elems nl)
    in stats { csScore = compCV nl }
243

    
244
-- | Compute the delta between two cluster state.
--
-- This is used when doing allocations, to understand better the
-- available cluster resources. The return value is a triple of the
-- current used values, the delta that was still allocated, and what
-- was left unallocated.
computeAllocationDelta :: CStats -> CStats -> AllocStats
computeAllocationDelta cini cfin =
    let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
        CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
                csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
        -- initial instance-used resources
        rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem)
               (fromIntegral i_idsk)
        -- resources consumed between the two snapshots
        rfin = RSpec (fromIntegral (f_icpu - i_icpu))
               (fromIntegral (f_imem - i_imem))
               (fromIntegral (f_idsk - i_idsk))
        -- what remains unallocated; note v_cpu may be -1 when pCpu
        -- was never set (see 'csVcpu'), making un_cpu meaningless in
        -- that case
        un_cpu = fromIntegral (v_cpu - f_icpu)::Int
        runa = RSpec un_cpu (truncate t_mem - fromIntegral f_imem)
               (truncate t_dsk - fromIntegral f_idsk)
    in (rini, rfin, runa)
264

    
265
-- | The names and weights of the individual elements in the CV list.
--
-- NOTE: the order of entries must match the order of the metrics
-- produced by 'compDetailedCV', since they are zipped together in
-- 'compCVNodes'.
detailedCVInfo :: [(Double, String)]
detailedCVInfo = [ (1,  "free_mem_cv")
                 , (1,  "free_disk_cv")
                 , (1,  "n1_cnt")
                 , (1,  "reserved_mem_cv")
                 , (4,  "offline_all_cnt")
                 , (16, "offline_pri_cnt")
                 , (1,  "vcpu_ratio_cv")
                 , (1,  "cpu_load_cv")
                 , (1,  "mem_load_cv")
                 , (1,  "disk_load_cv")
                 , (1,  "net_load_cv")
                 , (2,  "pri_tags_score")
                 ]
280

    
281
-- | Holds the weights used by 'compCVNodes' for each metric.
detailedCVWeights :: [Double]
detailedCVWeights = [ weight | (weight, _) <- detailedCVInfo ]
284

    
285
-- | Compute the mem and disk covariance.
--
-- Produces the raw (unweighted) list of cluster metrics; the order of
-- the result must stay in sync with 'detailedCVInfo'. Offline nodes
-- are excluded from the covariance metrics but contribute to the
-- offline-instance counts.
compDetailedCV :: [Node.Node] -> [Double]
compDetailedCV all_nodes =
    let
        (offline, nodes) = partition Node.offline all_nodes
        mem_l = map Node.pMem nodes
        dsk_l = map Node.pDsk nodes
        -- metric: memory covariance
        mem_cv = stdDev mem_l
        -- metric: disk covariance
        dsk_cv = stdDev dsk_l
        -- metric: count of instances living on N1 failing nodes
        n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
                                                   length (Node.pList n)) .
                   filter Node.failN1 $ nodes :: Double
        res_l = map Node.pRem nodes
        -- metric: reserved memory covariance
        res_cv = stdDev res_l
        -- offline instances metrics
        offline_ipri = sum . map (length . Node.pList) $ offline
        offline_isec = sum . map (length . Node.sList) $ offline
        -- metric: count of instances on offline nodes
        off_score = fromIntegral (offline_ipri + offline_isec)::Double
        -- metric: count of primary instances on offline nodes (this
        -- helps with evacuation/failover of primary instances on
        -- 2-node clusters with one node offline)
        off_pri_score = fromIntegral offline_ipri::Double
        cpu_l = map Node.pCpu nodes
        -- metric: covariance of vcpu/pcpu ratio
        cpu_cv = stdDev cpu_l
        -- metrics: covariance of cpu, memory, disk and network load
        (c_load, m_load, d_load, n_load) = unzip4 $
            map (\n ->
                     let DynUtil c1 m1 d1 n1 = Node.utilLoad n
                         DynUtil c2 m2 d2 n2 = Node.utilPool n
                     in (c1/c2, m1/m2, d1/d2, n1/n2)
                ) nodes
        -- metric: conflicting instance count
        pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
        pri_tags_score = fromIntegral pri_tags_inst::Double
    in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
       , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
       , pri_tags_score ]
328

    
329
-- | Compute the /total/ variance.
compCVNodes :: [Node.Node] -> Double
compCVNodes nodes =
    sum (zipWith (*) detailedCVWeights (compDetailedCV nodes))
332

    
333
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
compCV :: Node.List -> Double
compCV nl = compCVNodes (Container.elems nl)
336

    
337
-- | Compute online nodes from a 'Node.List'.
getOnline :: Node.List -> [Node.Node]
getOnline nl = [n | n <- Container.elems nl, not (Node.offline n)]
340

    
341
-- * Balancing functions
342

    
343
-- | Compute best table. Note that the ordering of the arguments is
-- important: on equal scores the first table wins.
compareTables :: Table -> Table -> Table
compareTables a@(Table _ _ cv_a _) b@(Table _ _ cv_b _)
    | cv_a > cv_b = b
    | otherwise   = a
347

    
348
-- | Applies an instance move to a given node list and instance.
--
-- Returns, inside 'OpResult', the updated node list, the updated
-- instance, and the (new) primary and secondary node indices. The
-- order of remove/add operations matters: instances are first removed
-- from their current nodes and then re-added, so that capacity checks
-- see the intermediate state.
applyMove :: Node.List -> Instance.Instance
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
-- Failover (f)
applyMove nl inst Failover =
    let old_pdx = Instance.pNode inst
        old_sdx = Instance.sNode inst
        old_p = Container.find old_pdx nl
        old_s = Container.find old_sdx nl
        int_p = Node.removePri old_p inst
        int_s = Node.removeSec old_s inst
        -- if the old primary is offline, force the move (this is an
        -- evacuation, capacity checks on the target are relaxed)
        force_p = Node.offline old_p
        new_nl = do -- Maybe monad
          new_p <- Node.addPriEx force_p int_s inst
          new_s <- Node.addSec int_p inst old_sdx
          let new_inst = Instance.setBoth inst old_sdx old_pdx
          return (Container.addTwo old_pdx new_s old_sdx new_p nl,
                  new_inst, old_sdx, old_pdx)
    in new_nl

-- Replace the primary (f:, r:np, f)
applyMove nl inst (ReplacePrimary new_pdx) =
    let old_pdx = Instance.pNode inst
        old_sdx = Instance.sNode inst
        old_p = Container.find old_pdx nl
        old_s = Container.find old_sdx nl
        tgt_n = Container.find new_pdx nl
        int_p = Node.removePri old_p inst
        int_s = Node.removeSec old_s inst
        force_p = Node.offline old_p
        new_nl = do -- Maybe monad
          -- check that the current secondary can host the instance
          -- during the migration
          tmp_s <- Node.addPriEx force_p int_s inst
          let tmp_s' = Node.removePri tmp_s inst
          new_p <- Node.addPriEx force_p tgt_n inst
          new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
          let new_inst = Instance.setPri inst new_pdx
          return (Container.add new_pdx new_p $
                  Container.addTwo old_pdx int_p old_sdx new_s nl,
                  new_inst, new_pdx, old_sdx)
    in new_nl

-- Replace the secondary (r:ns)
applyMove nl inst (ReplaceSecondary new_sdx) =
    let old_pdx = Instance.pNode inst
        old_sdx = Instance.sNode inst
        old_s = Container.find old_sdx nl
        tgt_n = Container.find new_sdx nl
        int_s = Node.removeSec old_s inst
        -- force only when the *old secondary* is offline here; the
        -- primary is untouched by this move
        force_s = Node.offline old_s
        new_inst = Instance.setSec inst new_sdx
        new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
                 \new_s -> return (Container.addTwo new_sdx
                                   new_s old_sdx int_s nl,
                                   new_inst, old_pdx, new_sdx)
    in new_nl

-- Replace the secondary and failover (r:np, f)
applyMove nl inst (ReplaceAndFailover new_pdx) =
    let old_pdx = Instance.pNode inst
        old_sdx = Instance.sNode inst
        old_p = Container.find old_pdx nl
        old_s = Container.find old_sdx nl
        tgt_n = Container.find new_pdx nl
        int_p = Node.removePri old_p inst
        int_s = Node.removeSec old_s inst
        force_s = Node.offline old_s
        new_nl = do -- Maybe monad
          new_p <- Node.addPri tgt_n inst
          new_s <- Node.addSecEx force_s int_p inst new_pdx
          let new_inst = Instance.setBoth inst new_pdx old_pdx
          return (Container.add new_pdx new_p $
                  Container.addTwo old_pdx new_s old_sdx int_s nl,
                  new_inst, new_pdx, old_pdx)
    in new_nl

-- Failover and replace the secondary (f, r:ns)
applyMove nl inst (FailoverAndReplace new_sdx) =
    let old_pdx = Instance.pNode inst
        old_sdx = Instance.sNode inst
        old_p = Container.find old_pdx nl
        old_s = Container.find old_sdx nl
        tgt_n = Container.find new_sdx nl
        int_p = Node.removePri old_p inst
        int_s = Node.removeSec old_s inst
        force_p = Node.offline old_p
        new_nl = do -- Maybe monad
          new_p <- Node.addPriEx force_p int_s inst
          new_s <- Node.addSecEx force_p tgt_n inst old_sdx
          let new_inst = Instance.setBoth inst old_sdx new_sdx
          return (Container.add new_sdx new_s $
                  Container.addTwo old_sdx new_p old_pdx int_p nl,
                  new_inst, old_sdx, new_sdx)
    in new_nl
443

    
444
-- | Tries to allocate an instance on one given node.
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
                 -> OpResult Node.AllocElement
allocateOnSingle nl inst new_pdx =
    let p = Container.find new_pdx nl
        new_inst = Instance.setBoth inst new_pdx Node.noSecondary
    in  Node.addPri p inst >>= \new_p -> do
      let new_nl = Container.add new_pdx new_p nl
          -- score the *updated* node list; previously this scored the
          -- original list, which made every single-node candidate
          -- look identical (cf. 'allocateOnPair', which scores the
          -- updated list)
          new_score = compCV new_nl
      return (new_nl, new_inst, [new_p], new_score)
454

    
455
-- | Tries to allocate an instance on a given pair of nodes.
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
               -> OpResult Node.AllocElement
allocateOnPair nl inst new_pdx new_sdx = do
    let tgt_p = Container.find new_pdx nl
        tgt_s = Container.find new_sdx nl
    -- place the primary first, then the secondary pointing at it
    new_p <- Node.addPri tgt_p inst
    new_s <- Node.addSec tgt_s inst new_pdx
    let new_inst = Instance.setBoth inst new_pdx new_sdx
        new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
    return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
467

    
468
-- | Tries to perform an instance move and returns the best table
-- between the original one and the new one.
--
-- A failed move simply keeps the current best table; a successful one
-- is compared (by score) against it via 'compareTables'.
checkSingleStep :: Table -- ^ The original table
                -> Instance.Instance -- ^ The instance to move
                -> Table -- ^ The current best table
                -> IMove -- ^ The move to apply
                -> Table -- ^ The final best table
checkSingleStep ini_tbl target cur_tbl move =
    let
        Table ini_nl ini_il _ ini_plc = ini_tbl
        tmp_resu = applyMove ini_nl target move
    in
      case tmp_resu of
        OpFail _ -> cur_tbl
        OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
            let tgt_idx = Instance.idx target
                upd_cvar = compCV upd_nl
                upd_il = Container.add tgt_idx new_inst ini_il
                -- prepend the new placement to the history
                upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
                upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
            in
              compareTables cur_tbl upd_tbl
490

    
491
-- | Given the status of the current secondary as a valid new node and
-- the current candidate target node, generate the possible moves for
-- a instance.
possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
              -> Bool      -- ^ Whether we can change the primary node
              -> Ndx       -- ^ Target node candidate
              -> [IMove]   -- ^ List of valid result moves
possibleMoves use_secondary can_move_primary tdx
    -- primary fixed: only the secondary may be replaced
    | not can_move_primary = [ReplaceSecondary tdx]
    -- both nodes movable and the secondary is a valid new primary
    | use_secondary = [ ReplaceSecondary tdx
                      , ReplaceAndFailover tdx
                      , ReplacePrimary tdx
                      , FailoverAndReplace tdx
                      ]
    -- primary movable, but no failing over onto the secondary
    | otherwise = [ ReplaceSecondary tdx
                  , ReplaceAndFailover tdx
                  ]
511

    
512
-- | Compute the best move for a given instance.
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
                  -> Bool              -- ^ Whether disk moves are allowed
                  -> Bool              -- ^ Whether instance moves are allowed
                  -> Table             -- ^ Original table
                  -> Instance.Instance -- ^ Instance to move
                  -> Table             -- ^ Best new table for this instance
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
    let
        opdx = Instance.pNode target
        osdx = Instance.sNode target
        -- exclude the instance's current nodes from the candidates
        nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
        use_secondary = elem osdx nodes_idx && inst_moves
        aft_failover = if use_secondary -- if allowed to failover
                       then checkSingleStep ini_tbl target ini_tbl Failover
                       else ini_tbl
        all_moves = if disk_moves
                    then concatMap
                         (possibleMoves use_secondary inst_moves) nodes
                    else []
    in
      -- iterate over the possible nodes for this instance
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
535

    
536
-- | Compute the best next move.
--
-- Evaluates, in parallel, the best move for every victim instance and
-- keeps the overall best table; if no placement was added, the
-- original table is returned unchanged so callers can detect
-- convergence.
checkMove :: [Ndx]               -- ^ Allowed target node indices
          -> Bool                -- ^ Whether disk moves are allowed
          -> Bool                -- ^ Whether instance moves are allowed
          -> Table               -- ^ The current solution
          -> [Instance.Instance] -- ^ List of instances still to move
          -> Table               -- ^ The new solution
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
    let Table _ _ _ ini_plc = ini_tbl
        -- we're using rwhnf from the Control.Parallel.Strategies
        -- package; we don't need to use rnf as that would force too
        -- much evaluation in single-threaded cases, and in
        -- multi-threaded case the weak head normal form is enough to
        -- spark the evaluation
        tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
                               inst_moves ini_tbl)
                 victims
        -- iterate over all instances, computing the best move
        best_tbl = foldl' compareTables ini_tbl tables
        Table _ _ _ best_plc = best_tbl
    in if length best_plc == length ini_plc
       then ini_tbl -- no advancement
       else best_tbl
559

    
560
-- | Check if we are allowed to go deeper in the balancing.
doNextBalance :: Table     -- ^ The starting table
              -> Int       -- ^ Remaining length
              -> Score     -- ^ Score at which to stop
              -> Bool      -- ^ Whether balancing should continue
doNextBalance (Table _ _ ini_cv ini_plc) max_rounds min_score =
    let rounds_left = max_rounds < 0 || length ini_plc < max_rounds
        above_target = ini_cv > min_score
    in rounds_left && above_target
569

    
570
-- | Run a balance move.
--
-- Performs one round of 'checkMove' over the movable instances and
-- returns the improved table, or 'Nothing' when no sufficient
-- improvement was found (signalling the balancing loop to stop).
tryBalance :: Table       -- ^ The starting table
           -> Bool        -- ^ Allow disk moves
           -> Bool        -- ^ Allow instance moves
           -> Bool        -- ^ Only evacuate moves
           -> Score       -- ^ Min gain threshold
           -> Score       -- ^ Min gain
           -> Maybe Table -- ^ The resulting table and commands
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
    let Table ini_nl ini_il ini_cv _ = ini_tbl
        all_inst = Container.elems ini_il
        -- in evacuation mode, only consider instances touching an
        -- offline node
        all_inst' = if evac_mode
                    then let bad_nodes = map Node.idx . filter Node.offline $
                                         Container.elems ini_nl
                         in filter (any (`elem` bad_nodes) . Instance.allNodes)
                            all_inst
                    else all_inst
        reloc_inst = filter Instance.movable all_inst'
        node_idx = map Node.idx . filter (not . Node.offline) $
                   Container.elems ini_nl
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
        (Table _ _ fin_cv _) = fin_tbl
    in
      -- accept the round only if it improved the score, and if the
      -- improvement is at least min_gain once below the mg_limit score
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
      then Just fin_tbl -- this round made success, return the new table
      else Nothing
596

    
597
-- * Allocation functions
598

    
599
-- | Build failure stats out of a list of failures.
600
collapseFailures :: [FailMode] -> FailStats
601
collapseFailures flst =
602
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
603
            [minBound..maxBound]
604

    
605
-- | Update current Allocation solution and failure stats with new
606
-- elements.
607
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
608
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
609

    
610
concatAllocs as (OpGood ns@(_, _, _, nscore)) =
611
    let -- Choose the old or new solution, based on the cluster score
612
        cntok = asAllocs as
613
        osols = asSolutions as
614
        nsols = case osols of
615
                  [] -> [ns]
616
                  (_, _, _, oscore):[] ->
617
                      if oscore < nscore
618
                      then osols
619
                      else [ns]
620
                  -- FIXME: here we simply concat to lists with more
621
                  -- than one element; we should instead abort, since
622
                  -- this is not a valid usage of this function
623
                  xs -> ns:xs
624
        nsuc = cntok + 1
625
    -- Note: we force evaluation of nsols here in order to keep the
626
    -- memory profile low - we know that we will need nsols for sure
627
    -- in the next cycle, so we force evaluation of nsols, since the
628
    -- foldl' in the caller will only evaluate the tuple, but not the
629
    -- elements of the tuple
630
    in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolutions = nsols }
631

    
632
-- | Given a solution, generates a reasonable description for it.
--
-- Note: 'head' on @sols@ is safe here since it is guarded by the
-- @null sols@ check above it.
describeSolution :: AllocSolution -> String
describeSolution as =
  let fcnt = asFailures as
      sols = asSolutions as
      freasons =
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
        filter ((> 0) . snd) . collapseFailures $ fcnt
  in if null sols
     then "No valid allocation solutions, failure reasons: " ++
          (if null fcnt
           then "unknown reasons"
           else freasons)
     else let (_, _, nodes, cv) = head sols
          in printf ("score: %.8f, successes %d, failures %d (%s)" ++
                     " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
             (intercalate "/" . map Node.name $ nodes)
649

    
650
-- | Annotates a solution with the appropriate string.
annotateSolution :: AllocSolution -> AllocSolution
annotateSolution solution =
    solution { asLog = describeSolution solution : asLog solution }
653

    
654
-- | Reverses an evacuation solution.
--
-- Rationale: we always concat the results to the top of the lists, so
-- for proper jobset execution, we should reverse all lists.
--
-- Uses record syntax instead of positional matching: the previous
-- version bound the fields to misleading names (@f@ for moved, @m@
-- for failed), inviting accidental field swaps on future edits.
reverseEvacSolution :: EvacSolution -> EvacSolution
reverseEvacSolution es =
    es { esMoved = reverse (esMoved es)
       , esFailed = reverse (esFailed es)
       , esOpCodes = reverse (esOpCodes es)
       }
661

    
662
-- | Generate the valid node allocation singles or pairs for a new instance.
--
-- For two-node allocations, only ordered pairs of distinct nodes
-- within the same group are generated.
genAllocNodes :: Group.List        -- ^ Group list
              -> Node.List         -- ^ The node map
              -> Int               -- ^ The number of nodes required
              -> Bool              -- ^ Whether to drop or not
                                   -- unallocable nodes
              -> Result AllocNodes -- ^ The (monadic) result
genAllocNodes gl nl count drop_unalloc =
    let filter_fn = if drop_unalloc
                    then filter (Group.isAllocable .
                                 flip Container.find gl . Node.group)
                    else id
        all_nodes = filter_fn $ getOnline nl
        all_pairs = liftM2 (,) all_nodes all_nodes
        ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y &&
                                      Node.group x == Node.group y) all_pairs
    in case count of
         1 -> Ok (Left (map Node.idx all_nodes))
         2 -> Ok (Right (map (\(p, s) -> (Node.idx p, Node.idx s)) ok_pairs))
         -- fixed doubled space in the original error message
         _ -> Bad "Unsupported number of nodes, only one or two supported"
682

    
683
-- | Try to allocate an instance on the cluster.
--
-- The fold over the candidate targets keeps only the best solution
-- (see 'concatAllocs'); laziness means @sols@ is never forced when
-- the candidate list is empty and we fail instead.
tryAlloc :: (Monad m) =>
            Node.List         -- ^ The node list
         -> Instance.List     -- ^ The instance list
         -> Instance.Instance -- ^ The instance to allocate
         -> AllocNodes        -- ^ The allocation targets
         -> m AllocSolution   -- ^ Possible solution list
tryAlloc nl _ inst (Right ok_pairs) =
    let sols = foldl' (\cstate (p, s) ->
                           concatAllocs cstate $ allocateOnPair nl inst p s
                      ) emptyAllocSolution ok_pairs

    in if null ok_pairs -- means we have just one node
       then fail "Not enough online nodes"
       else return $ annotateSolution sols

tryAlloc nl _ inst (Left all_nodes) =
    let sols = foldl' (\cstate ->
                           concatAllocs cstate . allocateOnSingle nl inst
                      ) emptyAllocSolution all_nodes
    in if null all_nodes
       then fail "No online nodes"
       else return $ annotateSolution sols
706

    
707
-- | Given a group/result, describe it as a nice (list of) messages.
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
solutionDescription gl (groupId, result) =
  let grp = Container.find groupId gl
      gname = Group.name grp
      pol = apolToString (Group.allocPolicy grp)
  in case result of
       -- one message per log entry of the solution
       Ok solution -> [ printf "Group %s (%s): %s" gname pol m
                      | m <- asLog solution ]
       Bad message -> [printf "Group %s: error %s" gname message]
716

    
717
-- | From a list of possibly bad and possibly empty solutions, filter
-- only the groups with a valid result. Note that the result will be
-- reversed compared to the original list.
filterMGResults :: Group.List
                -> [(Gdx, Result AllocSolution)]
                -> [(Gdx, AllocSolution)]
filterMGResults gl = foldl' pick []
    where groupAllocable = Group.isAllocable . flip Container.find gl
          -- keep only non-empty, successful solutions from allocable groups
          pick accu (gdx, rasol) =
              case rasol of
                Ok sol | not (null (asSolutions sol)) && groupAllocable gdx ->
                           (gdx, sol):accu
                _ -> accu
731

    
732
-- | Sort multigroup results based on policy and score.
--
-- Note: callers must pass only solutions with a non-empty
-- 'asSolutions' list (as produced by 'filterMGResults'), since the
-- score extraction uses 'head'.
sortMGResults :: Group.List
             -> [(Gdx, AllocSolution)]
             -> [(Gdx, AllocSolution)]
sortMGResults gl sols = sortBy (comparing solScore) sols
    where solScore (gdx, sol) =
              let policy = Group.allocPolicy (Container.find gdx gl)
                  (_, _, _, score) = head (asSolutions sol)
              in (policy, score)
741

    
742
-- | Finds the best group for an instance on a multi-group cluster.
--
-- Only solutions in @preferred@ and @last_resort@ groups will be
-- accepted as valid, and additionally if the allowed groups parameter
-- is not null then allocation will only be run for those group
-- indices.
findBestAllocGroup :: Group.List           -- ^ The group list
                   -> Node.List            -- ^ The node list
                   -> Instance.List        -- ^ The instance list
                   -> Maybe [Gdx]          -- ^ The allowed groups
                   -> Instance.Instance    -- ^ The instance to allocate
                   -> Int                  -- ^ Required number of nodes
                   -> Result (Gdx, AllocSolution, [String])
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
  let all_groups = splitCluster mgnl mgil
      -- restrict to the allowed groups, when a restriction was given
      groups' = case allowed_gdxs of
                  Nothing -> all_groups
                  Just gs -> filter ((`elem` gs) . fst) all_groups
      -- run a full allocation attempt inside each candidate group
      tryGroup (gid, (nl, il)) =
          (gid, genAllocNodes mggl nl cnt False >>= tryAlloc nl il inst)
      sols = map tryGroup groups'::[(Gdx, Result AllocSolution)]
      all_msgs = concatMap (solutionDescription mggl) sols
      sortedSols = sortMGResults mggl (filterMGResults mggl sols)
  in case sortedSols of
       [] -> Bad $ intercalate ", " all_msgs
       (final_group, final_sol):_ -> return (final_group, final_sol, all_msgs)
770

    
771
-- | Try to allocate an instance on a multi-group cluster.
tryMGAlloc :: Group.List           -- ^ The group list
           -> Node.List            -- ^ The node list
           -> Instance.List        -- ^ The instance list
           -> Instance.Instance    -- ^ The instance to allocate
           -> Int                  -- ^ Required number of nodes
           -> Result AllocSolution -- ^ Possible solution list
tryMGAlloc mggl mgnl mgil inst cnt =
  findBestAllocGroup mggl mgnl mgil Nothing inst cnt >>=
  \(best_group, solution, all_msgs) ->
      -- prepend the chosen-group message to the accumulated log
      let group_name = Group.name $ Container.find best_group mggl
          selmsg = "Selected group: " ++ group_name
      in return $ solution { asLog = selmsg:all_msgs }
784

    
785
-- | Try to relocate an instance on the cluster.
tryReloc :: (Monad m) =>
            Node.List       -- ^ The node list
         -> Instance.List   -- ^ The instance list
         -> Idx             -- ^ The index of the instance to move
         -> Int             -- ^ The number of nodes required
         -> [Ndx]           -- ^ Nodes which should not be used
         -> m AllocSolution -- ^ Solution list
tryReloc nl il xid 1 ex_idx =
    let inst = Container.find xid il
        -- the current primary can never become the new secondary
        excluded = Instance.pNode inst:ex_idx
        valid_idxes = [ Node.idx n | n <- getOnline nl
                      , Node.idx n `notElem` excluded ]
        -- evaluate a replace-secondary move towards node x and fold
        -- it into the running allocation solution
        evalMove cstate x =
            let em = do
                  (mnl, i, _, _) <-
                      applyMove nl inst (ReplaceSecondary x)
                  return (mnl, i, [Container.find x mnl],
                          compCV mnl)
            in concatAllocs cstate em
    in return $ foldl' evalMove emptyAllocSolution valid_idxes

tryReloc _ _ _ reqn _  = fail $ "Unsupported number of relocation \
                                \destinations required (" ++ show reqn ++
                                                  "), only one supported"
812

    
813
-- | Function which fails if the requested mode is change secondary.
--
-- This is useful since except DRBD, no other disk template can
-- execute change secondary; thus, we can just call this function
-- instead of always checking for secondary mode. After the call to
-- this function, whatever mode we have is just a primary change.
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
failOnSecondaryChange mode dt =
    case mode of
      ChangeSecondary ->
          fail $ "Instances with disk template '" ++ dtToString dt ++
               "' can't execute change secondary"
      _ -> return ()
824

    
825
-- | Run evacuation for a single instance.
--
-- /Note:/ this function should correctly execute both intra-group
-- evacuations (in all modes) and inter-group evacuations (in the
-- 'ChangeAll' mode). Of course, this requires that the correct list
-- of target nodes is passed.
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
                 -> Instance.List     -- ^ Instance list (cluster-wide)
                 -> EvacMode          -- ^ The evacuation mode
                 -> Instance.Instance -- ^ The instance to be evacuated
                 -> Gdx               -- ^ The group we're targetting
                 -> [Ndx]             -- ^ The list of available nodes
                                      -- for allocation
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
-- Diskless: reject secondary changes outright; other modes are not
-- implemented yet.
nodeEvacInstance _ _ mode (Instance.Instance
                           {Instance.diskTemplate = dt@DTDiskless}) _ _ =
                  failOnSecondaryChange mode dt >>
                  fail "Diskless relocations not implemented yet"

-- Plain (lvm-based) instances cannot be relocated at all.
nodeEvacInstance _ _ _ (Instance.Instance
                        {Instance.diskTemplate = DTPlain}) _ _ =
                  fail "Instances of type plain cannot be relocated"

-- File-based instances cannot be relocated at all.
nodeEvacInstance _ _ _ (Instance.Instance
                        {Instance.diskTemplate = DTFile}) _ _ =
                  fail "Instances of type file cannot be relocated"

-- Shared-file: reject secondary changes; other modes not implemented yet.
nodeEvacInstance _ _ mode  (Instance.Instance
                            {Instance.diskTemplate = dt@DTSharedFile}) _ _ =
                  failOnSecondaryChange mode dt >>
                  fail "Shared file relocations not implemented yet"

-- Block-device: reject secondary changes; other modes not implemented yet.
nodeEvacInstance _ _ mode (Instance.Instance
                           {Instance.diskTemplate = dt@DTBlock}) _ _ =
                  failOnSecondaryChange mode dt >>
                  fail "Block device relocations not implemented yet"

-- DRBD8, primary change: a simple failover suffices (the secondary
-- becomes the new primary), so the target node list is unused.
nodeEvacInstance nl il ChangePrimary
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
                 _ _ =
  do
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
    let idx = Instance.idx inst
        il' = Container.add idx inst' il
        ops = iMoveToJob nl' il' idx Failover
    return (nl', il', ops)

-- DRBD8, secondary change: fold over all candidate nodes, keeping the
-- replace-secondary move that yields the lowest (best) group score.
nodeEvacInstance nl il ChangeSecondary
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
                 gdx avail_nodes =
  do
    (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
                            eitherToResult $
                            foldl' (evacDrbdSecondaryInner nl inst gdx)
                            (Left "no nodes available") avail_nodes
    let idx = Instance.idx inst
        il' = Container.add idx inst' il
        ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx)
    return (nl', il', ops)

-- The algorithm for ChangeAll is as follows:
--
-- * generate all (primary, secondary) node pairs for the target groups
-- * for each pair, execute the needed moves (r:s, f, r:s) and compute
--   the final node list state and group score
-- * select the best choice via a foldl that uses the same Either
--   String solution as the ChangeSecondary mode
nodeEvacInstance nl il ChangeAll
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
                 gdx avail_nodes =
  do
    let no_nodes = Left "no nodes available"
        node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
    (nl', il', ops, _) <-
        annotateResult "Can't find any good nodes for relocation" $
        eitherToResult $
        foldl'
        (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
                          Bad msg ->
                              case accu of
                                Right _ -> accu
                                -- we don't need more details (which
                                -- nodes, etc.) as we only selected
                                -- this group if we can allocate on
                                -- it, hence failures will not
                                -- propagate out of this fold loop
                                Left _ -> Left $ "Allocation failed: " ++ msg
                          Ok result@(_, _, _, new_cv) ->
                              let new_accu = Right result in
                              case accu of
                                Left _ -> new_accu
                                -- keep the candidate with the lower score
                                Right (_, _, _, old_cv) ->
                                    if old_cv < new_cv
                                    then accu
                                    else new_accu
        ) no_nodes node_pairs

    return (nl', il', ops)
923

    
924
-- | Inner fold function for changing secondary of a DRBD instance.
--
-- The running solution is either a @Left String@, which means we
-- don't have yet a working solution, or a @Right (...)@, which
-- represents a valid solution; it holds the modified node list, the
-- modified instance (after evacuation), the score of that solution,
-- and the new secondary node index.
evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list
                       -> Instance.Instance -- ^ Instance being evacuated
                       -> Gdx -- ^ The group index of the instance
                       -> Either String ( Node.List
                                        , Instance.Instance
                                        , Score
                                        , Ndx)  -- ^ Current best solution
                       -> Ndx  -- ^ Node we're evaluating as new secondary
                       -> Either String ( Node.List
                                        , Instance.Instance
                                        , Score
                                        , Ndx) -- ^ New best solution
evacDrbdSecondaryInner nl inst gdx accu ndx =
    case applyMove nl inst (ReplaceSecondary ndx) of
      -- move failed: keep any already-found solution, otherwise
      -- record this failure as the (provisional) error message
      OpFail fm ->
          case accu of
            Right _ -> accu
            Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
                      " failed: " ++ show fm
      -- move succeeded: score the instance's group in the new node
      -- list and keep whichever of old/new has the lower score
      OpGood (nl', inst', _, _) ->
          let nodes = Container.elems nl'
              -- The fromJust below is ugly (it can fail nastily), but
              -- at this point we should not have any internal mismatches,
              -- and adding a monad here would be quite involved
              grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
              new_cv = compCVNodes grpnodes
              new_accu = Right (nl', inst', new_cv, ndx)
          in case accu of
               Left _ -> new_accu
               Right (_, _, old_cv, _) ->
                   if old_cv < new_cv
                   then accu
                   else new_accu
964

    
965
-- | Compute result of changing all nodes of a DRBD instance.
--
-- Given the target primary and secondary node (which might be in a
-- different group or not), this function will 'execute' all the
-- required steps and assuming all operations succceed, will return
-- the modified node and instance lists, the opcodes needed for this
-- and the new group score.
evacDrbdAllInner :: Node.List         -- ^ Cluster node list
                 -> Instance.List     -- ^ Cluster instance list
                 -> Instance.Instance -- ^ The instance to be moved
                 -> Gdx               -- ^ The target group index
                                      -- (which can differ from the
                                      -- current group of the
                                      -- instance)
                 -> (Ndx, Ndx)        -- ^ Tuple of new
                                      -- primary\/secondary nodes
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) =
  do
    let primary = Container.find (Instance.pNode inst) nl
        idx = Instance.idx inst
    -- if the primary is offline, then we first failover
    (nl1, inst1, ops1) <-
        if Node.offline primary
        then do
          (nl', inst', _, _) <-
              annotateResult "Failing over to the secondary" $
              opToResult $ applyMove nl inst Failover
          return (nl', inst', [Failover])
        else return (nl, inst, [])
    -- the three-step move sequence: r:t_pdx, f, r:t_sdx
    let (o1, o2, o3) = (ReplaceSecondary t_pdx,
                        Failover,
                        ReplaceSecondary t_sdx)
    -- we now need to execute a replace secondary to the future
    -- primary node
    (nl2, inst2, _, _) <-
        annotateResult "Changing secondary to new primary" $
        opToResult $
        applyMove nl1 inst1 o1
    let ops2 = o1:ops1
    -- we now execute another failover, the primary stays fixed now
    (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
                          opToResult $ applyMove nl2 inst2 o2
    let ops3 = o2:ops2
    -- and finally another replace secondary, to the final secondary
    (nl4, inst4, _, _) <-
        annotateResult "Changing secondary to final secondary" $
        opToResult $
        applyMove nl3 inst3 o3
    let ops4 = o3:ops3
        il' = Container.add idx inst4 il
        -- moves were accumulated in reverse, so restore the execution
        -- order before translating them into opcodes
        ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
    let nodes = Container.elems nl4
        -- The fromJust below is ugly (it can fail nastily), but
        -- at this point we should not have any internal mismatches,
        -- and adding a monad here would be quite involved
        grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
        new_cv = compCVNodes grpnodes
    return (nl4, il', ops, new_cv)
1024

    
1025
-- | Computes the nodes in a given group which are available for
-- allocation.
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
                    -> Gdx            -- ^ The group for which we
                                      -- query the nodes
                    -> Result [Ndx]   -- ^ List of available node indices
availableGroupNodes group_nodes excl_ndx gdx =
  case lookup gdx group_nodes of
    Nothing -> Bad $ "Can't find group with index " ++ show gdx
    Just local_nodes ->
        -- drop all nodes that appear in the exclusion set
        Ok $ filter (not . flip IntSet.member excl_ndx) local_nodes
1037

    
1038
-- | Updates the evac solution with the results of an instance
-- evacuation.
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
                   -> Idx
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
                   -> (Node.List, Instance.List, EvacSolution)
updateEvacSolution (nl, il, es) idx (Bad msg) =
    -- evacuation failed: cluster state is unchanged, only record the failure
    (nl, il, es { esFailed = (idx, msg):esFailed es})
updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
    -- evacuation succeeded: adopt the new cluster state and log the move
    let inst = Container.find idx il
        new_elem = ( idx
                   , instancePriGroup nl inst
                   , Instance.allNodes inst
                   )
    in (nl, il, es { esMoved = new_elem:esMoved es
                   , esOpCodes = [opcodes]:esOpCodes es })
1053

    
1054
-- | Node-evacuation IAllocator mode main function.
tryNodeEvac :: Group.List    -- ^ The cluster groups
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
            -> Instance.List -- ^ Instance list (cluster-wide)
            -> EvacMode      -- ^ The evacuation mode
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
            -> Result (Node.List, Instance.List, EvacSolution)
tryNodeEvac _ ini_nl ini_il mode idxs =
    let -- nodes that must be vacated for the given instances/mode
        evac_ndx = nodesToEvacuate ini_il mode idxs
        offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
        -- exclusion set: nodes being evacuated plus offline nodes
        excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
        -- per-group association list of node indices
        group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
                                             (Container.elems nl))) $
                      splitCluster ini_nl ini_il
        -- fold over the instances, threading the evolving cluster
        -- state (node list, instance list) and the solution log
        (fin_nl, fin_il, esol) =
            foldl' (\state@(nl, il, _) inst ->
                        let gdx = instancePriGroup nl inst
                            pdx = Instance.pNode inst in
                        updateEvacSolution state (Instance.idx inst) $
                        availableGroupNodes group_ndx
                          (IntSet.insert pdx excl_ndx) gdx >>=
                        nodeEvacInstance nl il mode inst gdx
                   )
            (ini_nl, ini_il, emptyEvacSolution)
            (map (`Container.find` ini_il) idxs)
    in return (fin_nl, fin_il, reverseEvacSolution esol)
1080

    
1081
-- | Change-group IAllocator mode main function.
--
-- This is very similar to 'tryNodeEvac', the only difference is that
-- we don't choose as target group the current instance group, but
-- instead:
--
--   1. at the start of the function, we compute which are the target
--   groups; either no groups were passed in, in which case we choose
--   all groups out of which we don't evacuate instance, or there were
--   some groups passed, in which case we use those
--
--   2. for each instance, we use 'findBestAllocGroup' to choose the
--   best group to hold the instance, and then we do what
--   'tryNodeEvac' does, except for this group instead of the current
--   instance group.
--
-- Note that the correct behaviour of this function relies on the
-- function 'nodeEvacInstance' to be able to do correctly both
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
tryChangeGroup :: Group.List    -- ^ The cluster groups
               -> Node.List     -- ^ The node list (cluster-wide)
               -> Instance.List -- ^ Instance list (cluster-wide)
               -> [Gdx]         -- ^ Target groups; if empty, any
                                -- groups not being evacuated
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
               -> Result (Node.List, Instance.List, EvacSolution)
tryChangeGroup gl ini_nl ini_il gdxs idxs =
    let -- groups the selected instances currently live in (by primary)
        evac_gdxs = nub $ map (instancePriGroup ini_nl .
                               flip Container.find ini_il) idxs
        -- candidate target groups, never including the source groups
        target_gdxs = (if null gdxs
                       then Container.keys gl
                       else gdxs) \\ evac_gdxs
        offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
        excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
        group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
                                             (Container.elems nl))) $
                      splitCluster ini_nl ini_il
        -- fold over the instances, threading the evolving cluster state
        (fin_nl, fin_il, esol) =
            foldl' (\state@(nl, il, _) inst ->
                        let solution = do
                              let ncnt = Instance.requiredNodes $
                                         Instance.diskTemplate inst
                              (gdx, _, _) <- findBestAllocGroup gl nl il
                                             (Just target_gdxs) inst ncnt
                              av_nodes <- availableGroupNodes group_ndx
                                          excl_ndx gdx
                              nodeEvacInstance nl il ChangeAll inst
                                       gdx av_nodes
                        in updateEvacSolution state
                               (Instance.idx inst) solution
                   )
            (ini_nl, ini_il, emptyEvacSolution)
            (map (`Container.find` ini_il) idxs)
    in return (fin_nl, fin_il, reverseEvacSolution esol)
1135

    
1136
-- | Recursively place instances on the cluster until we're out of space.
iterateAlloc :: Node.List
             -> Instance.List
             -> Maybe Int
             -> Instance.Instance
             -> AllocNodes
             -> [Instance.Instance]
             -> [CStats]
             -> Result AllocResult
iterateAlloc nl il limit newinst allocnodes ixes cstats =
      let depth = length ixes
          -- synthesise a unique name/index for the next instance
          newname = printf "new-%d" depth::String
          newidx = length (Container.elems il) + depth
          newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
          newlimit = fmap (subtract 1) limit
      in case tryAlloc nl il newi2 allocnodes of
           Bad s -> Bad s
           Ok (AllocSolution { asFailures = errs, asSolutions = sols3 }) ->
               -- the result if we stop at the current depth
               let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
               case sols3 of
                 [] -> newsol
                 [(xnl, xi, _, _)] ->
                     if limit == Just 0
                     then newsol
                     else iterateAlloc xnl (Container.add newidx xi il)
                          newlimit newinst allocnodes (xi:ixes)
                          (totalResources xnl:cstats)
                 _ -> Bad "Internal error: multiple solutions for single\
                          \ allocation"
1165

    
1166
-- | The core of the tiered allocation mode.
--
-- Repeatedly runs 'iterateAlloc', shrinking the instance spec (via
-- 'Instance.shrinkByType') along the most-failed resource dimension
-- each time allocation stops, until the instance can no longer be
-- shrunk or the optional limit is reached.
tieredAlloc :: Node.List
            -> Instance.List
            -> Maybe Int
            -> Instance.Instance
            -> AllocNodes
            -> [Instance.Instance]
            -> [CStats]
            -> Result AllocResult
tieredAlloc nl il limit newinst allocnodes ixes cstats =
    case iterateAlloc nl il limit newinst allocnodes ixes cstats of
      Bad s -> Bad s
      Ok (errs, nl', il', ixes', cstats') ->
          let newsol = Ok (errs, nl', il', ixes', cstats')
              ixes_cnt = length ixes'
              -- stop when the limit has been consumed by the
              -- allocations performed so far
              (stop, newlimit) = case limit of
                                   Nothing -> (False, Nothing)
                                   Just n -> (n <= ixes_cnt,
                                              Just (n - ixes_cnt)) in
          if stop then newsol else
          -- shrink along the dimension with the highest failure count
          -- (last element after sorting by count).
          -- NOTE(review): 'last' is partial; this assumes 'errs' is
          -- non-empty whenever allocation stopped short — confirm.
          case Instance.shrinkByType newinst . fst . last $
               sortBy (comparing snd) errs of
            Bad _ -> newsol
            Ok newinst' -> tieredAlloc nl' il' newlimit
                           newinst' allocnodes ixes' cstats'
1191

    
1192
-- * Formatting functions
1193

    
1194
-- | Given the original and final nodes, computes the relocation description.
computeMoves :: Instance.Instance -- ^ The instance to be moved
             -> String -- ^ The instance name
             -> IMove  -- ^ The move being performed
             -> String -- ^ New primary
             -> String -- ^ New secondary
             -> (String, [String])
                -- ^ Tuple of moves and commands list; moves is containing
                -- either @/f/@ for failover or @/r:name/@ for replace
                -- secondary, while the command list holds gnt-instance
                -- commands (without that prefix), e.g \"@failover instance1@\"
computeMoves i inam mv c d =
    let -- running instances are migrated, stopped ones failed over
        morf = if Instance.running i then "migrate" else "failover"
        mig = printf "%s -f %s" morf inam::String
        rep n = printf "replace-disks -n %s %s" n inam
    in case mv of
         Failover -> ("f", [mig])
         FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
         ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
         ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
         ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])

    
1216
-- | Converts a placement to string format.
printSolutionLine :: Node.List     -- ^ The node list
                  -> Instance.List -- ^ The instance list
                  -> Int           -- ^ Maximum node name length
                  -> Int           -- ^ Maximum instance name length
                  -> Placement     -- ^ The current placement
                  -> Int           -- ^ The index of the placement in
                                   -- the solution
                  -> (String, [String])
printSolutionLine nl il nmlen imlen plc pos =
    let (i, p, s, mv, c) = plc
        -- width of an "old:new" node-pair column
        pmlen = 2*nmlen + 1
        inst = Container.find i il
        inam = Instance.alias inst
        nodeAlias = Node.alias . flip Container.find nl
        npri = nodeAlias p
        nsec = nodeAlias s
        opri = nodeAlias (Instance.pNode inst)
        osec = nodeAlias (Instance.sNode inst)
        (moves, cmds) = computeMoves inst inam mv npri nsec
        ostr = printf "%s:%s" opri osec::String
        nstr = printf "%s:%s" npri nsec::String
        line = printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
               pos imlen inam pmlen ostr
               pmlen nstr c moves
    in (line, cmds)
1243

    
1244
-- | Return the instance and involved nodes in an instance move.
--
-- Note that the output list length can vary, and is not required nor
-- guaranteed to be of any specific length.
involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
                               -- the instance from its index; note
                               -- that this /must/ be the original
                               -- instance list, so that we can
                               -- retrieve the old nodes
              -> Placement     -- ^ The placement we're investigating,
                               -- containing the new nodes and
                               -- instance index
              -> [Ndx]         -- ^ Resulting list of node indices
involvedNodes il plc =
    let (i, np, ns, _, _) = plc
    -- new primary/secondary plus the instance's original nodes,
    -- deduplicated
    in nub $ np : ns : Instance.allNodes (Container.find i il)

    
1262
-- | Inner function for splitJobs, that either appends the next job to
-- the current jobset, or starts a new jobset.
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
mergeJobs ([], _) job@(ndx, _, _, _) = ([[job]], ndx)
mergeJobs (jobsets@(current:rest), nbuf) job@(ndx, _, _, _) =
    if null (ndx `intersect` nbuf)
    -- independent of the current jobset: merge into it
    then ((job:current):rest, ndx ++ nbuf)
    -- shares a node with the current jobset: start a fresh one
    else ([job]:jobsets, ndx)
1269

    
1270
-- | Break a list of moves into independent groups. Note that this
-- will reverse the order of jobs.
splitJobs :: [MoveJob] -> [JobSet]
-- strict left fold: avoids building a thunk chain on long move lists,
-- and matches the foldl' usage elsewhere in this module
splitJobs = fst . foldl' mergeJobs ([], [])
1274

    
1275
-- | Given a list of commands, prefix them with @gnt-instance@ and
-- also beautify the display a little.
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
formatJob jsn jsl (sn, (_, _, _, cmds)) =
    let body = printf "  echo job %d/%d" jsn sn:
               printf "  check":
               map ("  gnt-instance " ++) cmds
        -- jobset banner, emitted only before the first job
        banner = ["", printf "echo jobset %d, %d jobs" jsn jsl]
    in if sn == 1
       then banner ++ body
       else body
1286

    
1287
-- | Given a list of commands, prefix them with @gnt-instance@ and
-- also beautify the display a little.
formatCmds :: [JobSet] -> String
formatCmds jsets =
    -- number the jobsets, then the jobs inside each jobset, both from 1
    unlines $
    concatMap (\(jsn, js) ->
                   concatMap (formatJob jsn (length js)) (zip [1..] js))
              (zip [1..] jsets)
1295

    
1296
-- | Print the node list.
printNodes :: Node.List -> [String] -> String
printNodes nl fs =
    let -- field selection: empty means defaults, a leading "+" means
        -- defaults plus the extra fields, otherwise use as given
        fields = case fs of
          [] -> Node.defaultFields
          "+":rest -> Node.defaultFields ++ rest
          _ -> fs
        sorted_nodes = sortBy (comparing Node.idx) (Container.elems nl)
        (header, isnum) = unzip $ map Node.showHeader fields
        rows = header:map (Node.list fields) sorted_nodes
    in unlines . map ((' ':) . intercalate " ") $ formatTable rows isnum
1307

    
1308
-- | Print the instance list.
printInsts :: Node.List -> Instance.List -> String
printInsts nl il =
    unlines . map ((:) ' ' . intercalate " ") $
    formatTable (header:map showInst sorted) isnum
    where sorted = sortBy (comparing Instance.idx) (Container.elems il)
          header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
                   , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
          -- the first five columns are text, the rest numeric
          isnum = replicate 5 False ++ repeat True
          -- render one instance as a table row
          showInst inst =
              let DynUtil lC lM lD lN = Instance.util inst
                  sdx = Instance.sNode inst
                  sname = if sdx == Node.noSecondary
                          then ""
                          else Container.nameOf nl sdx
              in [ if Instance.running inst then "R" else " "
                 , Instance.name inst
                 , Container.nameOf nl (Instance.pNode inst)
                 , sname
                 , if Instance.autoBalance inst then "Y" else "N"
                 , printf "%3d" $ Instance.vcpus inst
                 , printf "%5d" $ Instance.mem inst
                 , printf "%5d" $ Instance.dsk inst `div` 1024
                 , printf "%5.3f" lC
                 , printf "%5.3f" lM
                 , printf "%5.3f" lD
                 , printf "%5.3f" lN
                 ]
1334

    
1335
-- | Shows statistics for a given node list.
printStats :: Node.List -> String
printStats nl = intercalate ", " (map fmt stats)
    where dcvs = compDetailedCV $ Container.elems nl
          (weights, names) = unzip detailedCVInfo
          -- pad with neutral weight/name in case there are more
          -- metrics than known info entries
          stats = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
          fmt (w, header, val) =
              printf "%s=%.8f(x%.2f)" header val w::String
1344

    
1345
-- | Convert a placement into a list of OpCodes (basically a job).
iMoveToJob :: Node.List        -- ^ The node list; only used for node
                               -- names, so any version is good
                               -- (before or after the operation)
           -> Instance.List    -- ^ The instance list; also used for
                               -- names only
           -> Idx              -- ^ The index of the instance being
                               -- moved
           -> IMove            -- ^ The actual move to be described
           -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
                               -- the given move
iMoveToJob nl il idx move =
    case move of
      Failover              -> [ migrate ]
      ReplacePrimary np     -> [ migrate, replace np, migrate ]
      ReplaceSecondary ns   -> [ replace ns ]
      ReplaceAndFailover np -> [ replace np, migrate ]
      FailoverAndReplace ns -> [ migrate, replace ns ]
    where iname = Instance.name (Container.find idx il)
          lookNode = Just . Container.nameOf nl
          -- failover/migration of the instance to its secondary
          migrate = OpCodes.OpInstanceMigrate iname True False True Nothing
          -- replace-disks with the given node as the new secondary
          replace n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
                      OpCodes.ReplaceNewSecondary [] Nothing
1369

    
1370
-- * Node group functions
1371

    
1372
-- | Computes the group of an instance.
--
-- Fails (with a descriptive message) if the instance is split, i.e.
-- its primary and secondary nodes live in different node groups.
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
instanceGroup nl i =
  let sidx = Instance.sNode i
      pnode = Container.find (Instance.pNode i) nl
      -- instances without a secondary are trivially single-group
      snode = if sidx == Node.noSecondary
              then pnode
              else Container.find sidx nl
      pgroup = Node.group pnode
      sgroup = Node.group snode
  in if pgroup /= sgroup
     -- fixed typo in the user-facing message: "accross" -> "across"
     then fail ("Instance placed across two node groups, primary " ++
                show pgroup ++ ", secondary " ++ show sgroup)
     else return pgroup
1386

    
1387
-- | Computes the group of an instance per the primary node.
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
instancePriGroup nl i = Node.group (Container.find (Instance.pNode i) nl)
1392

    
1393
-- | Compute the list of badly allocated instances (split across node
-- groups).
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
findSplitInstances nl il =
  [ inst | inst <- Container.elems il
         , not . isOk $ instanceGroup nl inst ]
1398

    
1399
-- | Splits a cluster into the component node groups.
--
-- For each group we return its node container and the instances
-- whose primary node belongs to that group.
splitCluster :: Node.List -> Instance.List ->
                [(Gdx, (Node.List, Instance.List))]
splitCluster nl il =
  let ngroups = Node.computeGroups (Container.elems nl)
  in map (\(guuid, nodes) ->
           let nidxs = map Node.idx nodes
               nodes' = zip nidxs nodes
               -- use an IntSet for membership instead of a list
               -- `elem`, avoiding an O(nodes * instances) scan
               nset = IntSet.fromList nidxs
               instances = Container.filter ((`IntSet.member` nset) .
                                             Instance.pNode) il
           in (guuid, (Container.fromList nodes', instances))) ngroups
1409

    
1410
-- | Compute the list of nodes that are to be evacuated, given a list
-- of instances and an evacuation mode.
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
                -> EvacMode      -- ^ The evacuation mode we're using
                -> [Idx]         -- ^ List of instance indices being evacuated
                -> IntSet.IntSet -- ^ Set of node indices
nodesToEvacuate il mode =
    -- noSecondary is a sentinel index, never a real node
    IntSet.delete Node.noSecondary . foldl' addInstance IntSet.empty
    where addInstance ns idx =
              let i = Container.find idx il
                  pdx = Instance.pNode i
                  sdx = Instance.sNode i
                  -- only DRBD instances carry a real secondary node
                  withSecondary = case Instance.diskTemplate i of
                                    DTDrbd8 -> IntSet.insert sdx ns
                                    _ -> ns
              in case mode of
                   ChangePrimary   -> IntSet.insert pdx ns
                   ChangeSecondary -> withSecondary
                   ChangeAll       -> IntSet.insert pdx withSecondary