Statistics
| Branch: | Tag: | Revision:

root / htools / Ganeti / HTools / Cluster.hs @ 8f48f67d

History | View | Annotate | Download (56.7 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
    (
31
     -- * Types
32
      AllocSolution(..)
33
    , EvacSolution(..)
34
    , Table(..)
35
    , CStats(..)
36
    , AllocStats
37
    -- * Generic functions
38
    , totalResources
39
    , computeAllocationDelta
40
    -- * First phase functions
41
    , computeBadItems
42
    -- * Second phase functions
43
    , printSolutionLine
44
    , formatCmds
45
    , involvedNodes
46
    , splitJobs
47
    -- * Display functions
48
    , printNodes
49
    , printInsts
50
    -- * Balacing functions
51
    , checkMove
52
    , doNextBalance
53
    , tryBalance
54
    , compCV
55
    , compCVNodes
56
    , compDetailedCV
57
    , printStats
58
    , iMoveToJob
59
    -- * IAllocator functions
60
    , genAllocNodes
61
    , tryAlloc
62
    , tryMGAlloc
63
    , tryReloc
64
    , tryMGReloc
65
    , tryEvac
66
    , tryMGEvac
67
    , tryNodeEvac
68
    , collapseFailures
69
    -- * Allocation functions
70
    , iterateAlloc
71
    , tieredAlloc
72
    , tieredSpecMap
73
     -- * Node group functions
74
    , instanceGroup
75
    , findSplitInstances
76
    , splitCluster
77
    ) where
78

    
79
import Data.Function (on)
80
import qualified Data.IntSet as IntSet
81
import Data.List
82
import Data.Maybe (fromJust)
83
import Data.Ord (comparing)
84
import Text.Printf (printf)
85
import Control.Monad
86
import Control.Parallel.Strategies
87

    
88
import qualified Ganeti.HTools.Container as Container
89
import qualified Ganeti.HTools.Instance as Instance
90
import qualified Ganeti.HTools.Node as Node
91
import qualified Ganeti.HTools.Group as Group
92
import Ganeti.HTools.Types
93
import Ganeti.HTools.Utils
94
import qualified Ganeti.OpCodes as OpCodes
95

    
96
-- * Types
97

    
98
-- | Allocation\/relocation solution.
99
data AllocSolution = AllocSolution
100
  { asFailures  :: [FailMode]          -- ^ Failure counts
101
  , asAllocs    :: Int                 -- ^ Good allocation count
102
  , asSolutions :: [Node.AllocElement] -- ^ The actual result, length
103
                                       -- of the list depends on the
104
                                       -- allocation/relocation mode
105
  , asLog       :: [String]            -- ^ A list of informational messages
106
  }
107

    
108
-- | Node evacuation/group change iallocator result type. This result
109
-- type consists of actual opcodes (a restricted subset) that are
110
-- transmitted back to Ganeti.
111
data EvacSolution = EvacSolution
112
    { esMoved   :: [String]             -- ^ Instance moved successfully
113
    , esFailed  :: [String]             -- ^ Instance which were not
114
                                        -- relocated
115
    , esOpCodes :: [[[OpCodes.OpCode]]] -- ^ List of lists of jobs
116
    }
117

    
118
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
119
type AllocResult = (FailStats, Node.List, Instance.List,
120
                    [Instance.Instance], [CStats])
121

    
122
-- | A type denoting the valid allocation mode/pairs.
123
--
124
-- For a one-node allocation, this will be a @Left ['Node.Node']@,
125
-- whereas for a two-node allocation, this will be a @Right
126
-- [('Node.Node', 'Node.Node')]@.
127
type AllocNodes = Either [Ndx] [(Ndx, Ndx)]
128

    
129
-- | The empty solution we start with when computing allocations.
130
emptyAllocSolution :: AllocSolution
131
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
132
                                   , asSolutions = [], asLog = [] }
133

    
134
-- | The empty evac solution.
135
emptyEvacSolution :: EvacSolution
136
emptyEvacSolution = EvacSolution { esMoved = []
137
                                 , esFailed = []
138
                                 , esOpCodes = []
139
                                 }
140

    
141
-- | The complete state for the balancing solution.
142
data Table = Table Node.List Instance.List Score [Placement]
143
             deriving (Show, Read)
144

    
145
data CStats = CStats { csFmem :: Integer -- ^ Cluster free mem
146
                     , csFdsk :: Integer -- ^ Cluster free disk
147
                     , csAmem :: Integer -- ^ Cluster allocatable mem
148
                     , csAdsk :: Integer -- ^ Cluster allocatable disk
149
                     , csAcpu :: Integer -- ^ Cluster allocatable cpus
150
                     , csMmem :: Integer -- ^ Max node allocatable mem
151
                     , csMdsk :: Integer -- ^ Max node allocatable disk
152
                     , csMcpu :: Integer -- ^ Max node allocatable cpu
153
                     , csImem :: Integer -- ^ Instance used mem
154
                     , csIdsk :: Integer -- ^ Instance used disk
155
                     , csIcpu :: Integer -- ^ Instance used cpu
156
                     , csTmem :: Double  -- ^ Cluster total mem
157
                     , csTdsk :: Double  -- ^ Cluster total disk
158
                     , csTcpu :: Double  -- ^ Cluster total cpus
159
                     , csVcpu :: Integer -- ^ Cluster virtual cpus (if
160
                                         -- node pCpu has been set,
161
                                         -- otherwise -1)
162
                     , csXmem :: Integer -- ^ Unnacounted for mem
163
                     , csNmem :: Integer -- ^ Node own memory
164
                     , csScore :: Score  -- ^ The cluster score
165
                     , csNinst :: Int    -- ^ The total number of instances
166
                     }
167
            deriving (Show, Read)
168

    
169
-- | Currently used, possibly to allocate, unallocable.
170
type AllocStats = (RSpec, RSpec, RSpec)
171

    
172
-- * Utility functions
173

    
174
-- | Verifies the N+1 status and return the affected nodes.
175
verifyN1 :: [Node.Node] -> [Node.Node]
176
verifyN1 = filter Node.failN1
177

    
178
{-| Computes the pair of bad nodes and instances.
179

    
180
The bad node list is computed via a simple 'verifyN1' check, and the
181
bad instance list is the list of primary and secondary instances of
182
those nodes.
183

    
184
-}
185
computeBadItems :: Node.List -> Instance.List ->
186
                   ([Node.Node], [Instance.Instance])
187
computeBadItems nl il =
188
  let bad_nodes = verifyN1 $ getOnline nl
189
      bad_instances = map (`Container.find` il) .
190
                      sort . nub $
191
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
192
  in
193
    (bad_nodes, bad_instances)
194

    
195
-- | Zero-initializer for the CStats type.
196
emptyCStats :: CStats
197
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
198

    
199
-- | Update stats with data from a new node.
200
updateCStats :: CStats -> Node.Node -> CStats
201
updateCStats cs node =
202
    let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
203
                 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
204
                 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
205
                 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
206
                 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
207
                 csVcpu = x_vcpu,
208
                 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
209
               }
210
            = cs
211
        inc_amem = Node.fMem node - Node.rMem node
212
        inc_amem' = if inc_amem > 0 then inc_amem else 0
213
        inc_adsk = Node.availDisk node
214
        inc_imem = truncate (Node.tMem node) - Node.nMem node
215
                   - Node.xMem node - Node.fMem node
216
        inc_icpu = Node.uCpu node
217
        inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
218
        inc_vcpu = Node.hiCpu node
219
        inc_acpu = Node.availCpu node
220

    
221
    in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
222
          , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
223
          , csAmem = x_amem + fromIntegral inc_amem'
224
          , csAdsk = x_adsk + fromIntegral inc_adsk
225
          , csAcpu = x_acpu + fromIntegral inc_acpu
226
          , csMmem = max x_mmem (fromIntegral inc_amem')
227
          , csMdsk = max x_mdsk (fromIntegral inc_adsk)
228
          , csMcpu = max x_mcpu (fromIntegral inc_acpu)
229
          , csImem = x_imem + fromIntegral inc_imem
230
          , csIdsk = x_idsk + fromIntegral inc_idsk
231
          , csIcpu = x_icpu + fromIntegral inc_icpu
232
          , csTmem = x_tmem + Node.tMem node
233
          , csTdsk = x_tdsk + Node.tDsk node
234
          , csTcpu = x_tcpu + Node.tCpu node
235
          , csVcpu = x_vcpu + fromIntegral inc_vcpu
236
          , csXmem = x_xmem + fromIntegral (Node.xMem node)
237
          , csNmem = x_nmem + fromIntegral (Node.nMem node)
238
          , csNinst = x_ninst + length (Node.pList node)
239
          }
240

    
241
-- | Compute the total free disk and memory in the cluster.
242
totalResources :: Node.List -> CStats
243
totalResources nl =
244
    let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
245
    in cs { csScore = compCV nl }
246

    
247
-- | Compute the delta between two cluster state.
248
--
249
-- This is used when doing allocations, to understand better the
250
-- available cluster resources. The return value is a triple of the
251
-- current used values, the delta that was still allocated, and what
252
-- was left unallocated.
253
computeAllocationDelta :: CStats -> CStats -> AllocStats
254
computeAllocationDelta cini cfin =
255
    let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
256
        CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
257
                csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
258
        rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem)
259
               (fromIntegral i_idsk)
260
        rfin = RSpec (fromIntegral (f_icpu - i_icpu))
261
               (fromIntegral (f_imem - i_imem))
262
               (fromIntegral (f_idsk - i_idsk))
263
        un_cpu = fromIntegral (v_cpu - f_icpu)::Int
264
        runa = RSpec un_cpu (truncate t_mem - fromIntegral f_imem)
265
               (truncate t_dsk - fromIntegral f_idsk)
266
    in (rini, rfin, runa)
267

    
268
-- | The names and weights of the individual elements in the CV list.
269
detailedCVInfo :: [(Double, String)]
270
detailedCVInfo = [ (1,  "free_mem_cv")
271
                 , (1,  "free_disk_cv")
272
                 , (1,  "n1_cnt")
273
                 , (1,  "reserved_mem_cv")
274
                 , (4,  "offline_all_cnt")
275
                 , (16, "offline_pri_cnt")
276
                 , (1,  "vcpu_ratio_cv")
277
                 , (1,  "cpu_load_cv")
278
                 , (1,  "mem_load_cv")
279
                 , (1,  "disk_load_cv")
280
                 , (1,  "net_load_cv")
281
                 , (2,  "pri_tags_score")
282
                 ]
283

    
284
detailedCVWeights :: [Double]
285
detailedCVWeights = map fst detailedCVInfo
286

    
287
-- | Compute the mem and disk covariance.
288
compDetailedCV :: [Node.Node] -> [Double]
289
compDetailedCV all_nodes =
290
    let
291
        (offline, nodes) = partition Node.offline all_nodes
292
        mem_l = map Node.pMem nodes
293
        dsk_l = map Node.pDsk nodes
294
        -- metric: memory covariance
295
        mem_cv = stdDev mem_l
296
        -- metric: disk covariance
297
        dsk_cv = stdDev dsk_l
298
        -- metric: count of instances living on N1 failing nodes
299
        n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
300
                                                   length (Node.pList n)) .
301
                   filter Node.failN1 $ nodes :: Double
302
        res_l = map Node.pRem nodes
303
        -- metric: reserved memory covariance
304
        res_cv = stdDev res_l
305
        -- offline instances metrics
306
        offline_ipri = sum . map (length . Node.pList) $ offline
307
        offline_isec = sum . map (length . Node.sList) $ offline
308
        -- metric: count of instances on offline nodes
309
        off_score = fromIntegral (offline_ipri + offline_isec)::Double
310
        -- metric: count of primary instances on offline nodes (this
311
        -- helps with evacuation/failover of primary instances on
312
        -- 2-node clusters with one node offline)
313
        off_pri_score = fromIntegral offline_ipri::Double
314
        cpu_l = map Node.pCpu nodes
315
        -- metric: covariance of vcpu/pcpu ratio
316
        cpu_cv = stdDev cpu_l
317
        -- metrics: covariance of cpu, memory, disk and network load
318
        (c_load, m_load, d_load, n_load) = unzip4 $
319
            map (\n ->
320
                     let DynUtil c1 m1 d1 n1 = Node.utilLoad n
321
                         DynUtil c2 m2 d2 n2 = Node.utilPool n
322
                     in (c1/c2, m1/m2, d1/d2, n1/n2)
323
                ) nodes
324
        -- metric: conflicting instance count
325
        pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
326
        pri_tags_score = fromIntegral pri_tags_inst::Double
327
    in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
328
       , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
329
       , pri_tags_score ]
330

    
331
-- | Compute the /total/ variance.
332
compCVNodes :: [Node.Node] -> Double
333
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
334

    
335
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
336
compCV :: Node.List -> Double
337
compCV = compCVNodes . Container.elems
338

    
339

    
340
-- | Compute online nodes from a 'Node.List'.
341
getOnline :: Node.List -> [Node.Node]
342
getOnline = filter (not . Node.offline) . Container.elems
343

    
344
-- * Balancing functions
345

    
346
-- | Compute best table. Note that the ordering of the arguments is important.
347
compareTables :: Table -> Table -> Table
348
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
349
    if a_cv > b_cv then b else a
350

    
351
-- | Applies an instance move to a given node list and instance.
352
applyMove :: Node.List -> Instance.Instance
353
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
354
-- Failover (f)
355
applyMove nl inst Failover =
356
    let old_pdx = Instance.pNode inst
357
        old_sdx = Instance.sNode inst
358
        old_p = Container.find old_pdx nl
359
        old_s = Container.find old_sdx nl
360
        int_p = Node.removePri old_p inst
361
        int_s = Node.removeSec old_s inst
362
        force_p = Node.offline old_p
363
        new_nl = do -- Maybe monad
364
          new_p <- Node.addPriEx force_p int_s inst
365
          new_s <- Node.addSec int_p inst old_sdx
366
          let new_inst = Instance.setBoth inst old_sdx old_pdx
367
          return (Container.addTwo old_pdx new_s old_sdx new_p nl,
368
                  new_inst, old_sdx, old_pdx)
369
    in new_nl
370

    
371
-- Replace the primary (f:, r:np, f)
372
applyMove nl inst (ReplacePrimary new_pdx) =
373
    let old_pdx = Instance.pNode inst
374
        old_sdx = Instance.sNode inst
375
        old_p = Container.find old_pdx nl
376
        old_s = Container.find old_sdx nl
377
        tgt_n = Container.find new_pdx nl
378
        int_p = Node.removePri old_p inst
379
        int_s = Node.removeSec old_s inst
380
        force_p = Node.offline old_p
381
        new_nl = do -- Maybe monad
382
          -- check that the current secondary can host the instance
383
          -- during the migration
384
          tmp_s <- Node.addPriEx force_p int_s inst
385
          let tmp_s' = Node.removePri tmp_s inst
386
          new_p <- Node.addPriEx force_p tgt_n inst
387
          new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
388
          let new_inst = Instance.setPri inst new_pdx
389
          return (Container.add new_pdx new_p $
390
                  Container.addTwo old_pdx int_p old_sdx new_s nl,
391
                  new_inst, new_pdx, old_sdx)
392
    in new_nl
393

    
394
-- Replace the secondary (r:ns)
395
applyMove nl inst (ReplaceSecondary new_sdx) =
396
    let old_pdx = Instance.pNode inst
397
        old_sdx = Instance.sNode inst
398
        old_s = Container.find old_sdx nl
399
        tgt_n = Container.find new_sdx nl
400
        int_s = Node.removeSec old_s inst
401
        force_s = Node.offline old_s
402
        new_inst = Instance.setSec inst new_sdx
403
        new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
404
                 \new_s -> return (Container.addTwo new_sdx
405
                                   new_s old_sdx int_s nl,
406
                                   new_inst, old_pdx, new_sdx)
407
    in new_nl
408

    
409
-- Replace the secondary and failover (r:np, f)
410
applyMove nl inst (ReplaceAndFailover new_pdx) =
411
    let old_pdx = Instance.pNode inst
412
        old_sdx = Instance.sNode inst
413
        old_p = Container.find old_pdx nl
414
        old_s = Container.find old_sdx nl
415
        tgt_n = Container.find new_pdx nl
416
        int_p = Node.removePri old_p inst
417
        int_s = Node.removeSec old_s inst
418
        force_s = Node.offline old_s
419
        new_nl = do -- Maybe monad
420
          new_p <- Node.addPri tgt_n inst
421
          new_s <- Node.addSecEx force_s int_p inst new_pdx
422
          let new_inst = Instance.setBoth inst new_pdx old_pdx
423
          return (Container.add new_pdx new_p $
424
                  Container.addTwo old_pdx new_s old_sdx int_s nl,
425
                  new_inst, new_pdx, old_pdx)
426
    in new_nl
427

    
428
-- Failver and replace the secondary (f, r:ns)
429
applyMove nl inst (FailoverAndReplace new_sdx) =
430
    let old_pdx = Instance.pNode inst
431
        old_sdx = Instance.sNode inst
432
        old_p = Container.find old_pdx nl
433
        old_s = Container.find old_sdx nl
434
        tgt_n = Container.find new_sdx nl
435
        int_p = Node.removePri old_p inst
436
        int_s = Node.removeSec old_s inst
437
        force_p = Node.offline old_p
438
        new_nl = do -- Maybe monad
439
          new_p <- Node.addPriEx force_p int_s inst
440
          new_s <- Node.addSecEx force_p tgt_n inst old_sdx
441
          let new_inst = Instance.setBoth inst old_sdx new_sdx
442
          return (Container.add new_sdx new_s $
443
                  Container.addTwo old_sdx new_p old_pdx int_p nl,
444
                  new_inst, old_sdx, new_sdx)
445
    in new_nl
446

    
447
-- | Tries to allocate an instance on one given node.
448
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
449
                 -> OpResult Node.AllocElement
450
allocateOnSingle nl inst new_pdx =
451
    let p = Container.find new_pdx nl
452
        new_inst = Instance.setBoth inst new_pdx Node.noSecondary
453
    in  Node.addPri p inst >>= \new_p -> do
454
      let new_nl = Container.add new_pdx new_p nl
455
          new_score = compCV nl
456
      return (new_nl, new_inst, [new_p], new_score)
457

    
458
-- | Tries to allocate an instance on a given pair of nodes.
459
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
460
               -> OpResult Node.AllocElement
461
allocateOnPair nl inst new_pdx new_sdx =
462
    let tgt_p = Container.find new_pdx nl
463
        tgt_s = Container.find new_sdx nl
464
    in do
465
      new_p <- Node.addPri tgt_p inst
466
      new_s <- Node.addSec tgt_s inst new_pdx
467
      let new_inst = Instance.setBoth inst new_pdx new_sdx
468
          new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
469
      return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
470

    
471
-- | Tries to perform an instance move and returns the best table
472
-- between the original one and the new one.
473
checkSingleStep :: Table -- ^ The original table
474
                -> Instance.Instance -- ^ The instance to move
475
                -> Table -- ^ The current best table
476
                -> IMove -- ^ The move to apply
477
                -> Table -- ^ The final best table
478
checkSingleStep ini_tbl target cur_tbl move =
479
    let
480
        Table ini_nl ini_il _ ini_plc = ini_tbl
481
        tmp_resu = applyMove ini_nl target move
482
    in
483
      case tmp_resu of
484
        OpFail _ -> cur_tbl
485
        OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
486
            let tgt_idx = Instance.idx target
487
                upd_cvar = compCV upd_nl
488
                upd_il = Container.add tgt_idx new_inst ini_il
489
                upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
490
                upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
491
            in
492
              compareTables cur_tbl upd_tbl
493

    
494
-- | Given the status of the current secondary as a valid new node and
495
-- the current candidate target node, generate the possible moves for
496
-- a instance.
497
possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
498
              -> Bool      -- ^ Whether we can change the primary node
499
              -> Ndx       -- ^ Target node candidate
500
              -> [IMove]   -- ^ List of valid result moves
501

    
502
possibleMoves _ False tdx =
503
    [ReplaceSecondary tdx]
504

    
505
possibleMoves True True tdx =
506
    [ReplaceSecondary tdx,
507
     ReplaceAndFailover tdx,
508
     ReplacePrimary tdx,
509
     FailoverAndReplace tdx]
510

    
511
possibleMoves False True tdx =
512
    [ReplaceSecondary tdx,
513
     ReplaceAndFailover tdx]
514

    
515
-- | Compute the best move for a given instance.
516
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
517
                  -> Bool              -- ^ Whether disk moves are allowed
518
                  -> Bool              -- ^ Whether instance moves are allowed
519
                  -> Table             -- ^ Original table
520
                  -> Instance.Instance -- ^ Instance to move
521
                  -> Table             -- ^ Best new table for this instance
522
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
523
    let
524
        opdx = Instance.pNode target
525
        osdx = Instance.sNode target
526
        nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
527
        use_secondary = elem osdx nodes_idx && inst_moves
528
        aft_failover = if use_secondary -- if allowed to failover
529
                       then checkSingleStep ini_tbl target ini_tbl Failover
530
                       else ini_tbl
531
        all_moves = if disk_moves
532
                    then concatMap
533
                         (possibleMoves use_secondary inst_moves) nodes
534
                    else []
535
    in
536
      -- iterate over the possible nodes for this instance
537
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
538

    
539
-- | Compute the best next move.
540
checkMove :: [Ndx]               -- ^ Allowed target node indices
541
          -> Bool                -- ^ Whether disk moves are allowed
542
          -> Bool                -- ^ Whether instance moves are allowed
543
          -> Table               -- ^ The current solution
544
          -> [Instance.Instance] -- ^ List of instances still to move
545
          -> Table               -- ^ The new solution
546
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
547
    let Table _ _ _ ini_plc = ini_tbl
548
        -- we're using rwhnf from the Control.Parallel.Strategies
549
        -- package; we don't need to use rnf as that would force too
550
        -- much evaluation in single-threaded cases, and in
551
        -- multi-threaded case the weak head normal form is enough to
552
        -- spark the evaluation
553
        tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
554
                               inst_moves ini_tbl)
555
                 victims
556
        -- iterate over all instances, computing the best move
557
        best_tbl = foldl' compareTables ini_tbl tables
558
        Table _ _ _ best_plc = best_tbl
559
    in if length best_plc == length ini_plc
560
       then ini_tbl -- no advancement
561
       else best_tbl
562

    
563
-- | Check if we are allowed to go deeper in the balancing.
564
doNextBalance :: Table     -- ^ The starting table
565
              -> Int       -- ^ Remaining length
566
              -> Score     -- ^ Score at which to stop
567
              -> Bool      -- ^ The resulting table and commands
568
doNextBalance ini_tbl max_rounds min_score =
569
    let Table _ _ ini_cv ini_plc = ini_tbl
570
        ini_plc_len = length ini_plc
571
    in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
572

    
573
-- | Run a balance move.
574
tryBalance :: Table       -- ^ The starting table
575
           -> Bool        -- ^ Allow disk moves
576
           -> Bool        -- ^ Allow instance moves
577
           -> Bool        -- ^ Only evacuate moves
578
           -> Score       -- ^ Min gain threshold
579
           -> Score       -- ^ Min gain
580
           -> Maybe Table -- ^ The resulting table and commands
581
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
582
    let Table ini_nl ini_il ini_cv _ = ini_tbl
583
        all_inst = Container.elems ini_il
584
        all_inst' = if evac_mode
585
                    then let bad_nodes = map Node.idx . filter Node.offline $
586
                                         Container.elems ini_nl
587
                         in filter (\e -> Instance.sNode e `elem` bad_nodes ||
588
                                          Instance.pNode e `elem` bad_nodes)
589
                            all_inst
590
                    else all_inst
591
        reloc_inst = filter Instance.movable all_inst'
592
        node_idx = map Node.idx . filter (not . Node.offline) $
593
                   Container.elems ini_nl
594
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
595
        (Table _ _ fin_cv _) = fin_tbl
596
    in
597
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
598
      then Just fin_tbl -- this round made success, return the new table
599
      else Nothing
600

    
601
-- * Allocation functions
602

    
603
-- | Build failure stats out of a list of failures.
604
collapseFailures :: [FailMode] -> FailStats
605
collapseFailures flst =
606
    map (\k -> (k, length $ filter (k ==) flst)) [minBound..maxBound]
607

    
608
-- | Update current Allocation solution and failure stats with new
609
-- elements.
610
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
611
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
612

    
613
concatAllocs as (OpGood ns@(_, _, _, nscore)) =
614
    let -- Choose the old or new solution, based on the cluster score
615
        cntok = asAllocs as
616
        osols = asSolutions as
617
        nsols = case osols of
618
                  [] -> [ns]
619
                  (_, _, _, oscore):[] ->
620
                      if oscore < nscore
621
                      then osols
622
                      else [ns]
623
                  -- FIXME: here we simply concat to lists with more
624
                  -- than one element; we should instead abort, since
625
                  -- this is not a valid usage of this function
626
                  xs -> ns:xs
627
        nsuc = cntok + 1
628
    -- Note: we force evaluation of nsols here in order to keep the
629
    -- memory profile low - we know that we will need nsols for sure
630
    -- in the next cycle, so we force evaluation of nsols, since the
631
    -- foldl' in the caller will only evaluate the tuple, but not the
632
    -- elements of the tuple
633
    in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolutions = nsols }
634

    
635
-- | Sums two allocation solutions (e.g. for two separate node groups).
636
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
637
sumAllocs (AllocSolution af aa as al) (AllocSolution bf ba bs bl) =
638
    AllocSolution (af ++ bf) (aa + ba) (as ++ bs) (al ++ bl)
639

    
640
-- | Given a solution, generates a reasonable description for it.
641
describeSolution :: AllocSolution -> String
642
describeSolution as =
643
  let fcnt = asFailures as
644
      sols = asSolutions as
645
      freasons =
646
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
647
        filter ((> 0) . snd) . collapseFailures $ fcnt
648
  in if null sols
649
     then "No valid allocation solutions, failure reasons: " ++
650
          (if null fcnt
651
           then "unknown reasons"
652
           else freasons)
653
     else let (_, _, nodes, cv) = head sols
654
          in printf ("score: %.8f, successes %d, failures %d (%s)" ++
655
                     " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
656
             (intercalate "/" . map Node.name $ nodes)
657

    
658
-- | Annotates a solution with the appropriate string.
659
annotateSolution :: AllocSolution -> AllocSolution
660
annotateSolution as = as { asLog = describeSolution as : asLog as }
661

    
662
-- | Reverses an evacuation solution.
663
--
664
-- Rationale: we always concat the results to the top of the lists, so
665
-- for proper jobset execution, we should reverse all lists.
666
reverseEvacSolution :: EvacSolution -> EvacSolution
667
reverseEvacSolution (EvacSolution f m o) =
668
    EvacSolution (reverse f) (reverse m) (reverse o)
669

    
670
-- | Generate the valid node allocation singles or pairs for a new instance.
671
genAllocNodes :: Group.List        -- ^ Group list
672
              -> Node.List         -- ^ The node map
673
              -> Int               -- ^ The number of nodes required
674
              -> Bool              -- ^ Whether to drop or not
675
                                   -- unallocable nodes
676
              -> Result AllocNodes -- ^ The (monadic) result
677
genAllocNodes gl nl count drop_unalloc =
678
    let filter_fn = if drop_unalloc
679
                    then filter (Group.isAllocable .
680
                                 flip Container.find gl . Node.group)
681
                    else id
682
        all_nodes = filter_fn $ getOnline nl
683
        all_pairs = liftM2 (,) all_nodes all_nodes
684
        ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y &&
685
                                      Node.group x == Node.group y) all_pairs
686
    in case count of
687
         1 -> Ok (Left (map Node.idx all_nodes))
688
         2 -> Ok (Right (map (\(p, s) -> (Node.idx p, Node.idx s)) ok_pairs))
689
         _ -> Bad "Unsupported number of nodes, only one or two  supported"
690

    
691
-- | Try to allocate an instance on the cluster.
692
tryAlloc :: (Monad m) =>
693
            Node.List         -- ^ The node list
694
         -> Instance.List     -- ^ The instance list
695
         -> Instance.Instance -- ^ The instance to allocate
696
         -> AllocNodes        -- ^ The allocation targets
697
         -> m AllocSolution   -- ^ Possible solution list
698
tryAlloc nl _ inst (Right ok_pairs) =
699
    let sols = foldl' (\cstate (p, s) ->
700
                           concatAllocs cstate $ allocateOnPair nl inst p s
701
                      ) emptyAllocSolution ok_pairs
702

    
703
    in if null ok_pairs -- means we have just one node
704
       then fail "Not enough online nodes"
705
       else return $ annotateSolution sols
706

    
707
tryAlloc nl _ inst (Left all_nodes) =
708
    let sols = foldl' (\cstate ->
709
                           concatAllocs cstate . allocateOnSingle nl inst
710
                      ) emptyAllocSolution all_nodes
711
    in if null all_nodes
712
       then fail "No online nodes"
713
       else return $ annotateSolution sols
714

    
715
-- | Given a group/result, describe it as a nice (list of) messages.
716
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
717
solutionDescription gl (groupId, result) =
718
  case result of
719
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
720
    Bad message -> [printf "Group %s: error %s" gname message]
721
  where grp = Container.find groupId gl
722
        gname = Group.name grp
723
        pol = apolToString (Group.allocPolicy grp)
724

    
725
-- | From a list of possibly bad and possibly empty solutions, filter
726
-- only the groups with a valid result. Note that the result will be
727
-- reversed compared to the original list.
728
filterMGResults :: Group.List
729
                -> [(Gdx, Result AllocSolution)]
730
                -> [(Gdx, AllocSolution)]
731
filterMGResults gl = foldl' fn []
732
    where unallocable = not . Group.isAllocable . flip Container.find gl
733
          fn accu (gdx, rasol) =
734
              case rasol of
735
                Bad _ -> accu
736
                Ok sol | null (asSolutions sol) -> accu
737
                       | unallocable gdx -> accu
738
                       | otherwise -> (gdx, sol):accu
739

    
740
-- | Sort multigroup results based on policy and score.
741
sortMGResults :: Group.List
742
             -> [(Gdx, AllocSolution)]
743
             -> [(Gdx, AllocSolution)]
744
sortMGResults gl sols =
745
    let extractScore (_, _, _, x) = x
746
        solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
747
                               (extractScore . head . asSolutions) sol)
748
    in sortBy (comparing solScore) sols
749

    
750
-- | Try to allocate an instance on a multi-group cluster.
751
tryMGAlloc :: Group.List           -- ^ The group list
752
           -> Node.List            -- ^ The node list
753
           -> Instance.List        -- ^ The instance list
754
           -> Instance.Instance    -- ^ The instance to allocate
755
           -> Int                  -- ^ Required number of nodes
756
           -> Result AllocSolution -- ^ Possible solution list
757
tryMGAlloc mggl mgnl mgil inst cnt =
758
  let groups = splitCluster mgnl mgil
759
      sols = map (\(gid, (nl, il)) ->
760
                   (gid, genAllocNodes mggl nl cnt False >>=
761
                       tryAlloc nl il inst))
762
             groups::[(Gdx, Result AllocSolution)]
763
      all_msgs = concatMap (solutionDescription mggl) sols
764
      goodSols = filterMGResults mggl sols
765
      sortedSols = sortMGResults mggl goodSols
766
  in if null sortedSols
767
     then Bad $ intercalate ", " all_msgs
768
     else let (final_group, final_sol) = head sortedSols
769
              final_name = Group.name $ Container.find final_group mggl
770
              selmsg = "Selected group: " ++  final_name
771
          in Ok $ final_sol { asLog = selmsg:all_msgs }
772

    
773
-- | Try to relocate an instance on the cluster.
774
tryReloc :: (Monad m) =>
775
            Node.List       -- ^ The node list
776
         -> Instance.List   -- ^ The instance list
777
         -> Idx             -- ^ The index of the instance to move
778
         -> Int             -- ^ The number of nodes required
779
         -> [Ndx]           -- ^ Nodes which should not be used
780
         -> m AllocSolution -- ^ Solution list
781
tryReloc nl il xid 1 ex_idx =
782
    let all_nodes = getOnline nl
783
        inst = Container.find xid il
784
        ex_idx' = Instance.pNode inst:ex_idx
785
        valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
786
        valid_idxes = map Node.idx valid_nodes
787
        sols1 = foldl' (\cstate x ->
788
                            let em = do
789
                                  (mnl, i, _, _) <-
790
                                      applyMove nl inst (ReplaceSecondary x)
791
                                  return (mnl, i, [Container.find x mnl],
792
                                          compCV mnl)
793
                            in concatAllocs cstate em
794
                       ) emptyAllocSolution valid_idxes
795
    in return sols1
796

    
797
tryReloc _ _ _ reqn _  = fail $ "Unsupported number of relocation \
798
                                \destinations required (" ++ show reqn ++
799
                                                  "), only one supported"
800

    
801
tryMGReloc :: (Monad m) =>
802
              Group.List      -- ^ The group list
803
           -> Node.List       -- ^ The node list
804
           -> Instance.List   -- ^ The instance list
805
           -> Idx             -- ^ The index of the instance to move
806
           -> Int             -- ^ The number of nodes required
807
           -> [Ndx]           -- ^ Nodes which should not be used
808
           -> m AllocSolution -- ^ Solution list
809
tryMGReloc _ mgnl mgil xid ncount ex_ndx = do
810
  let groups = splitCluster mgnl mgil
811
      -- TODO: we only relocate inside the group for now
812
      inst = Container.find xid mgil
813
  (nl, il) <- case lookup (instancePriGroup mgnl inst) groups of
814
                Nothing -> fail $ "Cannot find group for instance " ++
815
                           Instance.name inst
816
                Just v -> return v
817
  tryReloc nl il xid ncount ex_ndx
818

    
819
-- | Change an instance's secondary node.
820
evacInstance :: (Monad m) =>
821
                [Ndx]                      -- ^ Excluded nodes
822
             -> Instance.List              -- ^ The current instance list
823
             -> (Node.List, AllocSolution) -- ^ The current state
824
             -> Idx                        -- ^ The instance to evacuate
825
             -> m (Node.List, AllocSolution)
826
evacInstance ex_ndx il (nl, old_as) idx = do
827
  -- FIXME: hardcoded one node here
828

    
829
  -- Longer explanation: evacuation is currently hardcoded to DRBD
830
  -- instances (which have one secondary); hence, even if the
831
  -- IAllocator protocol can request N nodes for an instance, and all
832
  -- the message parsing/loading pass this, this implementation only
833
  -- supports one; this situation needs to be revisited if we ever
834
  -- support more than one secondary, or if we change the storage
835
  -- model
836
  new_as <- tryReloc nl il idx 1 ex_ndx
837
  case asSolutions new_as of
838
    -- an individual relocation succeeded, we kind of compose the data
839
    -- from the two solutions
840
    csol@(nl', _, _, _):_ ->
841
        return (nl', new_as { asSolutions = csol:asSolutions old_as })
842
    -- this relocation failed, so we fail the entire evac
843
    _ -> fail $ "Can't evacuate instance " ++
844
         Instance.name (Container.find idx il) ++
845
             ": " ++ describeSolution new_as
846

    
847
-- | Try to evacuate a list of nodes.
848
tryEvac :: (Monad m) =>
849
            Node.List       -- ^ The node list
850
         -> Instance.List   -- ^ The instance list
851
         -> [Idx]           -- ^ Instances to be evacuated
852
         -> [Ndx]           -- ^ Restricted nodes (the ones being evacuated)
853
         -> m AllocSolution -- ^ Solution list
854
tryEvac nl il idxs ex_ndx = do
855
  (_, sol) <- foldM (evacInstance ex_ndx il) (nl, emptyAllocSolution) idxs
856
  return sol
857

    
858
-- | Multi-group evacuation of a list of nodes.
859
tryMGEvac :: (Monad m) =>
860
             Group.List -- ^ The group list
861
          -> Node.List       -- ^ The node list
862
          -> Instance.List   -- ^ The instance list
863
          -> [Ndx]           -- ^ Nodes to be evacuated
864
          -> m AllocSolution -- ^ Solution list
865
tryMGEvac _ nl il ex_ndx =
866
    let ex_nodes = map (`Container.find` nl) ex_ndx
867
        all_insts = nub . concatMap Node.sList $ ex_nodes
868
        all_insts' = associateIdxs all_insts $ splitCluster nl il
869
    in do
870
      results <- mapM (\(_, (gnl, gil, idxs)) -> tryEvac gnl gil idxs ex_ndx)
871
                 all_insts'
872
      let sol = foldl' sumAllocs emptyAllocSolution results
873
      return $ annotateSolution sol
874

    
875
-- | Function which fails if the requested mode is change secondary.
876
--
877
-- This is useful since except DRBD, no other disk template can
878
-- execute change secondary; thus, we can just call this function
879
-- instead of always checking for secondary mode. After the call to
880
-- this function, whatever mode we have is just a primary change.
881
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
882
failOnSecondaryChange ChangeSecondary dt =
883
    fail $ "Instances with disk template '" ++ dtToString dt ++
884
         "' can't execute change secondary"
885
failOnSecondaryChange _ _ = return ()
886

    
887
-- | Run evacuation for a single instance.
888
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
889
                 -> Instance.List     -- ^ Instance list (cluster-wide)
890
                 -> EvacMode          -- ^ The evacuation mode
891
                 -> Instance.Instance -- ^ The instance to be evacuated
892
                 -> [Ndx]             -- ^ The list of available nodes
893
                                      -- for allocation
894
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
895
nodeEvacInstance _ _ mode (Instance.Instance
896
                           {Instance.diskTemplate = dt@DTDiskless}) _ =
897
                  failOnSecondaryChange mode dt >>
898
                  fail "Diskless relocations not implemented yet"
899

    
900
nodeEvacInstance _ _ _ (Instance.Instance
901
                        {Instance.diskTemplate = DTPlain}) _ =
902
                  fail "Instances of type plain cannot be relocated"
903

    
904
nodeEvacInstance _ _ _ (Instance.Instance
905
                        {Instance.diskTemplate = DTFile}) _ =
906
                  fail "Instances of type file cannot be relocated"
907

    
908
nodeEvacInstance _ _ mode  (Instance.Instance
909
                            {Instance.diskTemplate = dt@DTSharedFile}) _ =
910
                  failOnSecondaryChange mode dt >>
911
                  fail "Shared file relocations not implemented yet"
912

    
913
nodeEvacInstance _ _ mode (Instance.Instance
914
                           {Instance.diskTemplate = dt@DTBlock}) _ =
915
                  failOnSecondaryChange mode dt >>
916
                  fail "Block device relocations not implemented yet"
917

    
918
nodeEvacInstance nl il ChangePrimary
919
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8}) _ =
920
  do
921
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
922
    let idx = Instance.idx inst
923
        il' = Container.add idx inst' il
924
        ops = iMoveToJob nl' il' idx Failover
925
    return (nl', il', ops)
926

    
927
nodeEvacInstance nl il ChangeSecondary
928
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
929
                 avail_nodes =
930
  do
931
    let gdx = instancePriGroup nl inst
932
    (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
933
                            eitherToResult $
934
                            foldl' (evacDrbdSecondaryInner nl inst gdx)
935
                            (Left "no nodes available") avail_nodes
936
    let idx = Instance.idx inst
937
        il' = Container.add idx inst' il
938
        ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx)
939
    return (nl', il', ops)
940

    
941
nodeEvacInstance _ _ _ (Instance.Instance
942
                        {Instance.diskTemplate = DTDrbd8}) _ =
943
                  fail "DRBD relocations not implemented yet"
944

    
945
-- | Inner fold function for changing secondary of a DRBD instance.
946
--
947
-- The "running" solution is either a @Left String@, which means we
948
-- don't have yet a working solution, or a @Right (...)@, which
949
-- represents a valid solution; it holds the modified node list, the
950
-- modified instance (after evacuation), the score of that solution,
951
-- and the new secondary node index.
952
evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list
953
                       -> Instance.Instance -- ^ Instance being evacuated
954
                       -> Gdx -- ^ The group index of the instance
955
                       -> Either String ( Node.List
956
                                        , Instance.Instance
957
                                        , Score
958
                                        , Ndx)  -- ^ Current best solution
959
                       -> Ndx  -- ^ Node we're evaluating as new secondary
960
                       -> Either String ( Node.List
961
                                        , Instance.Instance
962
                                        , Score
963
                                        , Ndx) -- ^ New best solution
964
evacDrbdSecondaryInner nl inst gdx accu ndx =
965
    case applyMove nl inst (ReplaceSecondary ndx) of
966
      OpFail fm ->
967
          case accu of
968
            Right _ -> accu
969
            Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
970
                      " failed: " ++ show fm
971
      OpGood (nl', inst', _, _) ->
972
          let nodes = Container.elems nl'
973
              -- The fromJust below is ugly (it can fail nastily), but
974
              -- at this point we should have any internal mismatches,
975
              -- and adding a monad here would be quite involved
976
              grpnodes = fromJust (gdx `lookup` (Node.computeGroups nodes))
977
              new_cv = compCVNodes grpnodes
978
              new_accu = Right (nl', inst', new_cv, ndx)
979
          in case accu of
980
               Left _ -> new_accu
981
               Right (_, _, old_cv, _) ->
982
                   if old_cv < new_cv
983
                   then accu
984
                   else new_accu
985

    
986
-- | Computes the local nodes of a given instance which are available
987
-- for allocation.
988
availableLocalNodes :: Node.List
989
                    -> [(Gdx, [Ndx])]
990
                    -> IntSet.IntSet
991
                    -> Instance.Instance
992
                    -> Result [Ndx]
993
availableLocalNodes nl group_nodes excl_ndx inst = do
994
  let gdx = instancePriGroup nl inst
995
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
996
                 Ok (lookup gdx group_nodes)
997
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
998
  return avail_nodes
999

    
1000
-- | Updates the evac solution with the results of an instance
1001
-- evacuation.
1002
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1003
                   -> Instance.Instance
1004
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1005
                   -> (Node.List, Instance.List, EvacSolution)
1006
updateEvacSolution (nl, il, es) inst (Bad msg) =
1007
    (nl, il, es { esFailed = (Instance.name inst ++ ": " ++ msg):esFailed es})
1008
updateEvacSolution (_, _, es) inst (Ok (nl, il, opcodes)) =
1009
    (nl, il, es { esMoved = Instance.name inst:esMoved es
1010
                , esOpCodes = [opcodes]:esOpCodes es })
1011

    
1012
-- | Node-evacuation IAllocator mode main function.
1013
tryNodeEvac :: Group.List    -- ^ The cluster groups
1014
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
1015
            -> Instance.List -- ^ Instance list (cluster-wide)
1016
            -> EvacMode      -- ^ The evacuation mode
1017
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
1018
            -> Result EvacSolution
1019
tryNodeEvac _ ini_nl ini_il mode idxs =
1020
    let evac_ndx = nodesToEvacuate ini_il mode idxs
1021
        offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1022
        excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1023
        group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1024
                                             (Container.elems nl))) $
1025
                      splitCluster ini_nl ini_il
1026
        (_, _, esol) =
1027
            foldl' (\state@(nl, il, _) inst ->
1028
                        updateEvacSolution state inst $
1029
                        availableLocalNodes nl group_ndx excl_ndx inst >>=
1030
                        nodeEvacInstance nl il mode inst
1031
                   )
1032
            (ini_nl, ini_il, emptyEvacSolution)
1033
            (map (`Container.find` ini_il) idxs)
1034
    in return $ reverseEvacSolution esol
1035

    
1036
-- | Recursively place instances on the cluster until we're out of space.
1037
iterateAlloc :: Node.List
1038
             -> Instance.List
1039
             -> Maybe Int
1040
             -> Instance.Instance
1041
             -> AllocNodes
1042
             -> [Instance.Instance]
1043
             -> [CStats]
1044
             -> Result AllocResult
1045
iterateAlloc nl il limit newinst allocnodes ixes cstats =
1046
      let depth = length ixes
1047
          newname = printf "new-%d" depth::String
1048
          newidx = length (Container.elems il) + depth
1049
          newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1050
          newlimit = fmap (flip (-) 1) limit
1051
      in case tryAlloc nl il newi2 allocnodes of
1052
           Bad s -> Bad s
1053
           Ok (AllocSolution { asFailures = errs, asSolutions = sols3 }) ->
1054
               let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1055
               case sols3 of
1056
                 [] -> newsol
1057
                 (xnl, xi, _, _):[] ->
1058
                     if limit == Just 0
1059
                     then newsol
1060
                     else iterateAlloc xnl (Container.add newidx xi il)
1061
                          newlimit newinst allocnodes (xi:ixes)
1062
                          (totalResources xnl:cstats)
1063
                 _ -> Bad "Internal error: multiple solutions for single\
1064
                          \ allocation"
1065

    
1066
-- | The core of the tiered allocation mode.
1067
tieredAlloc :: Node.List
1068
            -> Instance.List
1069
            -> Maybe Int
1070
            -> Instance.Instance
1071
            -> AllocNodes
1072
            -> [Instance.Instance]
1073
            -> [CStats]
1074
            -> Result AllocResult
1075
tieredAlloc nl il limit newinst allocnodes ixes cstats =
1076
    case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1077
      Bad s -> Bad s
1078
      Ok (errs, nl', il', ixes', cstats') ->
1079
          let newsol = Ok (errs, nl', il', ixes', cstats')
1080
              ixes_cnt = length ixes'
1081
              (stop, newlimit) = case limit of
1082
                                   Nothing -> (False, Nothing)
1083
                                   Just n -> (n <= ixes_cnt,
1084
                                              Just (n - ixes_cnt)) in
1085
          if stop then newsol else
1086
          case Instance.shrinkByType newinst . fst . last $
1087
               sortBy (comparing snd) errs of
1088
            Bad _ -> newsol
1089
            Ok newinst' -> tieredAlloc nl' il' newlimit
1090
                           newinst' allocnodes ixes' cstats'
1091

    
1092
-- | Compute the tiered spec string description from a list of
1093
-- allocated instances.
1094
tieredSpecMap :: [Instance.Instance]
1095
              -> [String]
1096
tieredSpecMap trl_ixes =
1097
    let fin_trl_ixes = reverse trl_ixes
1098
        ix_byspec = groupBy ((==) `on` Instance.specOf) fin_trl_ixes
1099
        spec_map = map (\ixs -> (Instance.specOf $ head ixs, length ixs))
1100
                   ix_byspec
1101
    in  map (\(spec, cnt) -> printf "%d,%d,%d=%d" (rspecMem spec)
1102
                             (rspecDsk spec) (rspecCpu spec) cnt) spec_map
1103

    
1104
-- * Formatting functions
1105

    
1106
-- | Given the original and final nodes, computes the relocation description.
1107
computeMoves :: Instance.Instance -- ^ The instance to be moved
1108
             -> String -- ^ The instance name
1109
             -> IMove  -- ^ The move being performed
1110
             -> String -- ^ New primary
1111
             -> String -- ^ New secondary
1112
             -> (String, [String])
1113
                -- ^ Tuple of moves and commands list; moves is containing
1114
                -- either @/f/@ for failover or @/r:name/@ for replace
1115
                -- secondary, while the command list holds gnt-instance
1116
                -- commands (without that prefix), e.g \"@failover instance1@\"
1117
computeMoves i inam mv c d =
1118
    case mv of
1119
      Failover -> ("f", [mig])
1120
      FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1121
      ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1122
      ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1123
      ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1124
    where morf = if Instance.running i then "migrate" else "failover"
1125
          mig = printf "%s -f %s" morf inam::String
1126
          rep n = printf "replace-disks -n %s %s" n inam
1127

    
1128
-- | Converts a placement to string format.
1129
printSolutionLine :: Node.List     -- ^ The node list
1130
                  -> Instance.List -- ^ The instance list
1131
                  -> Int           -- ^ Maximum node name length
1132
                  -> Int           -- ^ Maximum instance name length
1133
                  -> Placement     -- ^ The current placement
1134
                  -> Int           -- ^ The index of the placement in
1135
                                   -- the solution
1136
                  -> (String, [String])
1137
printSolutionLine nl il nmlen imlen plc pos =
1138
    let
1139
        pmlen = (2*nmlen + 1)
1140
        (i, p, s, mv, c) = plc
1141
        inst = Container.find i il
1142
        inam = Instance.alias inst
1143
        npri = Node.alias $ Container.find p nl
1144
        nsec = Node.alias $ Container.find s nl
1145
        opri = Node.alias $ Container.find (Instance.pNode inst) nl
1146
        osec = Node.alias $ Container.find (Instance.sNode inst) nl
1147
        (moves, cmds) =  computeMoves inst inam mv npri nsec
1148
        ostr = printf "%s:%s" opri osec::String
1149
        nstr = printf "%s:%s" npri nsec::String
1150
    in
1151
      (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
1152
       pos imlen inam pmlen ostr
1153
       pmlen nstr c moves,
1154
       cmds)
1155

    
1156
-- | Return the instance and involved nodes in an instance move.
1157
involvedNodes :: Instance.List -> Placement -> [Ndx]
1158
involvedNodes il plc =
1159
    let (i, np, ns, _, _) = plc
1160
        inst = Container.find i il
1161
        op = Instance.pNode inst
1162
        os = Instance.sNode inst
1163
    in nub [np, ns, op, os]
1164

    
1165
-- | Inner function for splitJobs, that either appends the next job to
1166
-- the current jobset, or starts a new jobset.
1167
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1168
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1169
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1170
    | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1171
    | otherwise = ([n]:cjs, ndx)
1172

    
1173
-- | Break a list of moves into independent groups. Note that this
1174
-- will reverse the order of jobs.
1175
splitJobs :: [MoveJob] -> [JobSet]
1176
splitJobs = fst . foldl mergeJobs ([], [])
1177

    
1178
-- | Given a list of commands, prefix them with @gnt-instance@ and
1179
-- also beautify the display a little.
1180
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1181
formatJob jsn jsl (sn, (_, _, _, cmds)) =
1182
    let out =
1183
            printf "  echo job %d/%d" jsn sn:
1184
            printf "  check":
1185
            map ("  gnt-instance " ++) cmds
1186
    in if sn == 1
1187
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1188
       else out
1189

    
1190
-- | Given a list of commands, prefix them with @gnt-instance@ and
1191
-- also beautify the display a little.
1192
formatCmds :: [JobSet] -> String
1193
formatCmds =
1194
    unlines .
1195
    concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1196
                             (zip [1..] js)) .
1197
    zip [1..]
1198

    
1199
-- | Print the node list.
1200
printNodes :: Node.List -> [String] -> String
1201
printNodes nl fs =
1202
    let fields = case fs of
1203
          [] -> Node.defaultFields
1204
          "+":rest -> Node.defaultFields ++ rest
1205
          _ -> fs
1206
        snl = sortBy (comparing Node.idx) (Container.elems nl)
1207
        (header, isnum) = unzip $ map Node.showHeader fields
1208
    in unlines . map ((:) ' ' .  intercalate " ") $
1209
       formatTable (header:map (Node.list fields) snl) isnum
1210

    
1211
-- | Print the instance list.
1212
printInsts :: Node.List -> Instance.List -> String
1213
printInsts nl il =
1214
    let sil = sortBy (comparing Instance.idx) (Container.elems il)
1215
        helper inst = [ if Instance.running inst then "R" else " "
1216
                      , Instance.name inst
1217
                      , Container.nameOf nl (Instance.pNode inst)
1218
                      , let sdx = Instance.sNode inst
1219
                        in if sdx == Node.noSecondary
1220
                           then  ""
1221
                           else Container.nameOf nl sdx
1222
                      , if Instance.autoBalance inst then "Y" else "N"
1223
                      , printf "%3d" $ Instance.vcpus inst
1224
                      , printf "%5d" $ Instance.mem inst
1225
                      , printf "%5d" $ Instance.dsk inst `div` 1024
1226
                      , printf "%5.3f" lC
1227
                      , printf "%5.3f" lM
1228
                      , printf "%5.3f" lD
1229
                      , printf "%5.3f" lN
1230
                      ]
1231
            where DynUtil lC lM lD lN = Instance.util inst
1232
        header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1233
                 , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1234
        isnum = False:False:False:False:False:repeat True
1235
    in unlines . map ((:) ' ' . intercalate " ") $
1236
       formatTable (header:map helper sil) isnum
1237

    
1238
-- | Shows statistics for a given node list.
1239
printStats :: Node.List -> String
1240
printStats nl =
1241
    let dcvs = compDetailedCV $ Container.elems nl
1242
        (weights, names) = unzip detailedCVInfo
1243
        hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1244
        formatted = map (\(w, header, val) ->
1245
                             printf "%s=%.8f(x%.2f)" header val w::String) hd
1246
    in intercalate ", " formatted
1247

    
1248
-- | Convert a placement into a list of OpCodes (basically a job).
1249
iMoveToJob :: Node.List -> Instance.List
1250
          -> Idx -> IMove -> [OpCodes.OpCode]
1251
iMoveToJob nl il idx move =
1252
    let inst = Container.find idx il
1253
        iname = Instance.name inst
1254
        lookNode  = Just . Container.nameOf nl
1255
        opF = OpCodes.OpInstanceMigrate iname True False True
1256
        opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1257
                OpCodes.ReplaceNewSecondary [] Nothing
1258
    in case move of
1259
         Failover -> [ opF ]
1260
         ReplacePrimary np -> [ opF, opR np, opF ]
1261
         ReplaceSecondary ns -> [ opR ns ]
1262
         ReplaceAndFailover np -> [ opR np, opF ]
1263
         FailoverAndReplace ns -> [ opF, opR ns ]
1264

    
1265
-- * Node group functions
1266

    
1267
-- | Computes the group of an instance.
1268
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1269
instanceGroup nl i =
1270
  let sidx = Instance.sNode i
1271
      pnode = Container.find (Instance.pNode i) nl
1272
      snode = if sidx == Node.noSecondary
1273
              then pnode
1274
              else Container.find sidx nl
1275
      pgroup = Node.group pnode
1276
      sgroup = Node.group snode
1277
  in if pgroup /= sgroup
1278
     then fail ("Instance placed accross two node groups, primary " ++
1279
                show pgroup ++ ", secondary " ++ show sgroup)
1280
     else return pgroup
1281

    
1282
-- | Computes the group of an instance per the primary node.
1283
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1284
instancePriGroup nl i =
1285
  let pnode = Container.find (Instance.pNode i) nl
1286
  in  Node.group pnode
1287

    
1288
-- | Compute the list of badly allocated instances (split across node
1289
-- groups).
1290
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1291
findSplitInstances nl =
1292
  filter (not . isOk . instanceGroup nl) . Container.elems
1293

    
1294
-- | Splits a cluster into the component node groups.
1295
splitCluster :: Node.List -> Instance.List ->
1296
                [(Gdx, (Node.List, Instance.List))]
1297
splitCluster nl il =
1298
  let ngroups = Node.computeGroups (Container.elems nl)
1299
  in map (\(guuid, nodes) ->
1300
           let nidxs = map Node.idx nodes
1301
               nodes' = zip nidxs nodes
1302
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1303
           in (guuid, (Container.fromList nodes', instances))) ngroups
1304

    
1305
-- | Split a global instance index map into per-group, and associate
1306
-- it with the group/node/instance lists.
1307
associateIdxs :: [Idx] -- ^ Instance indices to be split/associated
1308
              -> [(Gdx, (Node.List, Instance.List))]        -- ^ Input groups
1309
              -> [(Gdx, (Node.List, Instance.List, [Idx]))] -- ^ Result
1310
associateIdxs idxs =
1311
    map (\(gdx, (nl, il)) ->
1312
             (gdx, (nl, il, filter (`Container.member` il) idxs)))
1313

    
1314
-- | Compute the list of nodes that are to be evacuated, given a list
1315
-- of instances and an evacuation mode.
1316
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1317
                -> EvacMode      -- ^ The evacuation mode we're using
1318
                -> [Idx]         -- ^ List of instance indices being evacuated
1319
                -> IntSet.IntSet -- ^ Set of node indices
1320
nodesToEvacuate il mode =
1321
    IntSet.delete Node.noSecondary .
1322
    foldl' (\ns idx ->
1323
                let i = Container.find idx il
1324
                    pdx = Instance.pNode i
1325
                    sdx = Instance.sNode i
1326
                    dt = Instance.diskTemplate i
1327
                    withSecondary = case dt of
1328
                                      DTDrbd8 -> IntSet.insert sdx ns
1329
                                      _ -> ns
1330
                in case mode of
1331
                     ChangePrimary   -> IntSet.insert pdx ns
1332
                     ChangeSecondary -> withSecondary
1333
                     ChangeAll       -> IntSet.insert pdx withSecondary
1334
           ) IntSet.empty