Statistics
| Branch: | Tag: | Revision:

root / htools / Ganeti / HTools / Cluster.hs @ bef83fd1

History | View | Annotate | Download (53.3 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
    (
31
     -- * Types
32
      AllocSolution(..)
33
    , EvacSolution(..)
34
    , Table(..)
35
    , CStats(..)
36
    , AllocStats
37
    -- * Generic functions
38
    , totalResources
39
    , computeAllocationDelta
40
    -- * First phase functions
41
    , computeBadItems
42
    -- * Second phase functions
43
    , printSolutionLine
44
    , formatCmds
45
    , involvedNodes
46
    , splitJobs
47
    -- * Display functions
48
    , printNodes
49
    , printInsts
50
    -- * Balacing functions
51
    , checkMove
52
    , doNextBalance
53
    , tryBalance
54
    , compCV
55
    , compDetailedCV
56
    , printStats
57
    , iMoveToJob
58
    -- * IAllocator functions
59
    , genAllocNodes
60
    , tryAlloc
61
    , tryMGAlloc
62
    , tryReloc
63
    , tryMGReloc
64
    , tryEvac
65
    , tryMGEvac
66
    , tryNodeEvac
67
    , collapseFailures
68
    -- * Allocation functions
69
    , iterateAlloc
70
    , tieredAlloc
71
    , tieredSpecMap
72
     -- * Node group functions
73
    , instanceGroup
74
    , findSplitInstances
75
    , splitCluster
76
    ) where
77

    
78
import Data.Function (on)
79
import qualified Data.IntSet as IntSet
80
import Data.List
81
import Data.Ord (comparing)
82
import Text.Printf (printf)
83
import Control.Monad
84
import Control.Parallel.Strategies
85

    
86
import qualified Ganeti.HTools.Container as Container
87
import qualified Ganeti.HTools.Instance as Instance
88
import qualified Ganeti.HTools.Node as Node
89
import qualified Ganeti.HTools.Group as Group
90
import Ganeti.HTools.Types
91
import Ganeti.HTools.Utils
92
import qualified Ganeti.OpCodes as OpCodes
93

    
94
-- * Types
95

    
96
-- | Allocation\/relocation solution.
97
data AllocSolution = AllocSolution
98
  { asFailures  :: [FailMode]          -- ^ Failure counts
99
  , asAllocs    :: Int                 -- ^ Good allocation count
100
  , asSolutions :: [Node.AllocElement] -- ^ The actual result, length
101
                                       -- of the list depends on the
102
                                       -- allocation/relocation mode
103
  , asLog       :: [String]            -- ^ A list of informational messages
104
  }
105

    
106
-- | Node evacuation/group change iallocator result type. This result
107
-- type consists of actual opcodes (a restricted subset) that are
108
-- transmitted back to Ganeti.
109
data EvacSolution = EvacSolution
110
    { esMoved   :: [String]             -- ^ Instance moved successfully
111
    , esFailed  :: [String]             -- ^ Instance which were not
112
                                        -- relocated
113
    , esOpCodes :: [[[OpCodes.OpCode]]] -- ^ List of lists of jobs
114
    }
115

    
116
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
117
type AllocResult = (FailStats, Node.List, Instance.List,
118
                    [Instance.Instance], [CStats])
119

    
120
-- | A type denoting the valid allocation mode/pairs.
121
--
122
-- For a one-node allocation, this will be a @Left ['Node.Node']@,
123
-- whereas for a two-node allocation, this will be a @Right
124
-- [('Node.Node', 'Node.Node')]@.
125
type AllocNodes = Either [Ndx] [(Ndx, Ndx)]
126

    
127
-- | The empty solution we start with when computing allocations.
128
emptyAllocSolution :: AllocSolution
129
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
130
                                   , asSolutions = [], asLog = [] }
131

    
132
-- | The empty evac solution.
133
emptyEvacSolution :: EvacSolution
134
emptyEvacSolution = EvacSolution { esMoved = []
135
                                 , esFailed = []
136
                                 , esOpCodes = []
137
                                 }
138

    
139
-- | The complete state for the balancing solution.
140
data Table = Table Node.List Instance.List Score [Placement]
141
             deriving (Show, Read)
142

    
143
data CStats = CStats { csFmem :: Integer -- ^ Cluster free mem
144
                     , csFdsk :: Integer -- ^ Cluster free disk
145
                     , csAmem :: Integer -- ^ Cluster allocatable mem
146
                     , csAdsk :: Integer -- ^ Cluster allocatable disk
147
                     , csAcpu :: Integer -- ^ Cluster allocatable cpus
148
                     , csMmem :: Integer -- ^ Max node allocatable mem
149
                     , csMdsk :: Integer -- ^ Max node allocatable disk
150
                     , csMcpu :: Integer -- ^ Max node allocatable cpu
151
                     , csImem :: Integer -- ^ Instance used mem
152
                     , csIdsk :: Integer -- ^ Instance used disk
153
                     , csIcpu :: Integer -- ^ Instance used cpu
154
                     , csTmem :: Double  -- ^ Cluster total mem
155
                     , csTdsk :: Double  -- ^ Cluster total disk
156
                     , csTcpu :: Double  -- ^ Cluster total cpus
157
                     , csVcpu :: Integer -- ^ Cluster virtual cpus (if
158
                                         -- node pCpu has been set,
159
                                         -- otherwise -1)
160
                     , csXmem :: Integer -- ^ Unnacounted for mem
161
                     , csNmem :: Integer -- ^ Node own memory
162
                     , csScore :: Score  -- ^ The cluster score
163
                     , csNinst :: Int    -- ^ The total number of instances
164
                     }
165
            deriving (Show, Read)
166

    
167
-- | Currently used, possibly to allocate, unallocable.
168
type AllocStats = (RSpec, RSpec, RSpec)
169

    
170
-- * Utility functions
171

    
172
-- | Verifies the N+1 status and return the affected nodes.
173
verifyN1 :: [Node.Node] -> [Node.Node]
174
verifyN1 = filter Node.failN1
175

    
176
{-| Computes the pair of bad nodes and instances.
177

    
178
The bad node list is computed via a simple 'verifyN1' check, and the
179
bad instance list is the list of primary and secondary instances of
180
those nodes.
181

    
182
-}
183
computeBadItems :: Node.List -> Instance.List ->
184
                   ([Node.Node], [Instance.Instance])
185
computeBadItems nl il =
186
  let bad_nodes = verifyN1 $ getOnline nl
187
      bad_instances = map (`Container.find` il) .
188
                      sort . nub $
189
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
190
  in
191
    (bad_nodes, bad_instances)
192

    
193
-- | Zero-initializer for the CStats type.
194
emptyCStats :: CStats
195
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
196

    
197
-- | Update stats with data from a new node.
198
updateCStats :: CStats -> Node.Node -> CStats
199
updateCStats cs node =
200
    let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
201
                 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
202
                 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
203
                 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
204
                 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
205
                 csVcpu = x_vcpu,
206
                 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
207
               }
208
            = cs
209
        inc_amem = Node.fMem node - Node.rMem node
210
        inc_amem' = if inc_amem > 0 then inc_amem else 0
211
        inc_adsk = Node.availDisk node
212
        inc_imem = truncate (Node.tMem node) - Node.nMem node
213
                   - Node.xMem node - Node.fMem node
214
        inc_icpu = Node.uCpu node
215
        inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
216
        inc_vcpu = Node.hiCpu node
217
        inc_acpu = Node.availCpu node
218

    
219
    in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
220
          , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
221
          , csAmem = x_amem + fromIntegral inc_amem'
222
          , csAdsk = x_adsk + fromIntegral inc_adsk
223
          , csAcpu = x_acpu + fromIntegral inc_acpu
224
          , csMmem = max x_mmem (fromIntegral inc_amem')
225
          , csMdsk = max x_mdsk (fromIntegral inc_adsk)
226
          , csMcpu = max x_mcpu (fromIntegral inc_acpu)
227
          , csImem = x_imem + fromIntegral inc_imem
228
          , csIdsk = x_idsk + fromIntegral inc_idsk
229
          , csIcpu = x_icpu + fromIntegral inc_icpu
230
          , csTmem = x_tmem + Node.tMem node
231
          , csTdsk = x_tdsk + Node.tDsk node
232
          , csTcpu = x_tcpu + Node.tCpu node
233
          , csVcpu = x_vcpu + fromIntegral inc_vcpu
234
          , csXmem = x_xmem + fromIntegral (Node.xMem node)
235
          , csNmem = x_nmem + fromIntegral (Node.nMem node)
236
          , csNinst = x_ninst + length (Node.pList node)
237
          }
238

    
239
-- | Compute the total free disk and memory in the cluster.
240
totalResources :: Node.List -> CStats
241
totalResources nl =
242
    let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
243
    in cs { csScore = compCV nl }
244

    
245
-- | Compute the delta between two cluster state.
246
--
247
-- This is used when doing allocations, to understand better the
248
-- available cluster resources. The return value is a triple of the
249
-- current used values, the delta that was still allocated, and what
250
-- was left unallocated.
251
computeAllocationDelta :: CStats -> CStats -> AllocStats
252
computeAllocationDelta cini cfin =
253
    let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
254
        CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
255
                csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
256
        rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem)
257
               (fromIntegral i_idsk)
258
        rfin = RSpec (fromIntegral (f_icpu - i_icpu))
259
               (fromIntegral (f_imem - i_imem))
260
               (fromIntegral (f_idsk - i_idsk))
261
        un_cpu = fromIntegral (v_cpu - f_icpu)::Int
262
        runa = RSpec un_cpu (truncate t_mem - fromIntegral f_imem)
263
               (truncate t_dsk - fromIntegral f_idsk)
264
    in (rini, rfin, runa)
265

    
266
-- | The names and weights of the individual elements in the CV list.
267
detailedCVInfo :: [(Double, String)]
268
detailedCVInfo = [ (1,  "free_mem_cv")
269
                 , (1,  "free_disk_cv")
270
                 , (1,  "n1_cnt")
271
                 , (1,  "reserved_mem_cv")
272
                 , (4,  "offline_all_cnt")
273
                 , (16, "offline_pri_cnt")
274
                 , (1,  "vcpu_ratio_cv")
275
                 , (1,  "cpu_load_cv")
276
                 , (1,  "mem_load_cv")
277
                 , (1,  "disk_load_cv")
278
                 , (1,  "net_load_cv")
279
                 , (2,  "pri_tags_score")
280
                 ]
281

    
282
detailedCVWeights :: [Double]
283
detailedCVWeights = map fst detailedCVInfo
284

    
285
-- | Compute the mem and disk covariance.
286
compDetailedCV :: Node.List -> [Double]
287
compDetailedCV nl =
288
    let
289
        all_nodes = Container.elems nl
290
        (offline, nodes) = partition Node.offline all_nodes
291
        mem_l = map Node.pMem nodes
292
        dsk_l = map Node.pDsk nodes
293
        -- metric: memory covariance
294
        mem_cv = stdDev mem_l
295
        -- metric: disk covariance
296
        dsk_cv = stdDev dsk_l
297
        -- metric: count of instances living on N1 failing nodes
298
        n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
299
                                                   length (Node.pList n)) .
300
                   filter Node.failN1 $ nodes :: Double
301
        res_l = map Node.pRem nodes
302
        -- metric: reserved memory covariance
303
        res_cv = stdDev res_l
304
        -- offline instances metrics
305
        offline_ipri = sum . map (length . Node.pList) $ offline
306
        offline_isec = sum . map (length . Node.sList) $ offline
307
        -- metric: count of instances on offline nodes
308
        off_score = fromIntegral (offline_ipri + offline_isec)::Double
309
        -- metric: count of primary instances on offline nodes (this
310
        -- helps with evacuation/failover of primary instances on
311
        -- 2-node clusters with one node offline)
312
        off_pri_score = fromIntegral offline_ipri::Double
313
        cpu_l = map Node.pCpu nodes
314
        -- metric: covariance of vcpu/pcpu ratio
315
        cpu_cv = stdDev cpu_l
316
        -- metrics: covariance of cpu, memory, disk and network load
317
        (c_load, m_load, d_load, n_load) = unzip4 $
318
            map (\n ->
319
                     let DynUtil c1 m1 d1 n1 = Node.utilLoad n
320
                         DynUtil c2 m2 d2 n2 = Node.utilPool n
321
                     in (c1/c2, m1/m2, d1/d2, n1/n2)
322
                ) nodes
323
        -- metric: conflicting instance count
324
        pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
325
        pri_tags_score = fromIntegral pri_tags_inst::Double
326
    in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
327
       , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
328
       , pri_tags_score ]
329

    
330
-- | Compute the /total/ variance.
331
compCV :: Node.List -> Double
332
compCV = sum . zipWith (*) detailedCVWeights . compDetailedCV
333

    
334
-- | Compute online nodes from a 'Node.List'.
335
getOnline :: Node.List -> [Node.Node]
336
getOnline = filter (not . Node.offline) . Container.elems
337

    
338
-- * Balancing functions
339

    
340
-- | Compute best table. Note that the ordering of the arguments is important.
341
compareTables :: Table -> Table -> Table
342
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
343
    if a_cv > b_cv then b else a
344

    
345
-- | Applies an instance move to a given node list and instance.
346
applyMove :: Node.List -> Instance.Instance
347
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
348
-- Failover (f)
349
applyMove nl inst Failover =
350
    let old_pdx = Instance.pNode inst
351
        old_sdx = Instance.sNode inst
352
        old_p = Container.find old_pdx nl
353
        old_s = Container.find old_sdx nl
354
        int_p = Node.removePri old_p inst
355
        int_s = Node.removeSec old_s inst
356
        force_p = Node.offline old_p
357
        new_nl = do -- Maybe monad
358
          new_p <- Node.addPriEx force_p int_s inst
359
          new_s <- Node.addSec int_p inst old_sdx
360
          let new_inst = Instance.setBoth inst old_sdx old_pdx
361
          return (Container.addTwo old_pdx new_s old_sdx new_p nl,
362
                  new_inst, old_sdx, old_pdx)
363
    in new_nl
364

    
365
-- Replace the primary (f:, r:np, f)
366
applyMove nl inst (ReplacePrimary new_pdx) =
367
    let old_pdx = Instance.pNode inst
368
        old_sdx = Instance.sNode inst
369
        old_p = Container.find old_pdx nl
370
        old_s = Container.find old_sdx nl
371
        tgt_n = Container.find new_pdx nl
372
        int_p = Node.removePri old_p inst
373
        int_s = Node.removeSec old_s inst
374
        force_p = Node.offline old_p
375
        new_nl = do -- Maybe monad
376
          -- check that the current secondary can host the instance
377
          -- during the migration
378
          tmp_s <- Node.addPriEx force_p int_s inst
379
          let tmp_s' = Node.removePri tmp_s inst
380
          new_p <- Node.addPriEx force_p tgt_n inst
381
          new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
382
          let new_inst = Instance.setPri inst new_pdx
383
          return (Container.add new_pdx new_p $
384
                  Container.addTwo old_pdx int_p old_sdx new_s nl,
385
                  new_inst, new_pdx, old_sdx)
386
    in new_nl
387

    
388
-- Replace the secondary (r:ns)
389
applyMove nl inst (ReplaceSecondary new_sdx) =
390
    let old_pdx = Instance.pNode inst
391
        old_sdx = Instance.sNode inst
392
        old_s = Container.find old_sdx nl
393
        tgt_n = Container.find new_sdx nl
394
        int_s = Node.removeSec old_s inst
395
        force_s = Node.offline old_s
396
        new_inst = Instance.setSec inst new_sdx
397
        new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
398
                 \new_s -> return (Container.addTwo new_sdx
399
                                   new_s old_sdx int_s nl,
400
                                   new_inst, old_pdx, new_sdx)
401
    in new_nl
402

    
403
-- Replace the secondary and failover (r:np, f)
404
applyMove nl inst (ReplaceAndFailover new_pdx) =
405
    let old_pdx = Instance.pNode inst
406
        old_sdx = Instance.sNode inst
407
        old_p = Container.find old_pdx nl
408
        old_s = Container.find old_sdx nl
409
        tgt_n = Container.find new_pdx nl
410
        int_p = Node.removePri old_p inst
411
        int_s = Node.removeSec old_s inst
412
        force_s = Node.offline old_s
413
        new_nl = do -- Maybe monad
414
          new_p <- Node.addPri tgt_n inst
415
          new_s <- Node.addSecEx force_s int_p inst new_pdx
416
          let new_inst = Instance.setBoth inst new_pdx old_pdx
417
          return (Container.add new_pdx new_p $
418
                  Container.addTwo old_pdx new_s old_sdx int_s nl,
419
                  new_inst, new_pdx, old_pdx)
420
    in new_nl
421

    
422
-- Failver and replace the secondary (f, r:ns)
423
applyMove nl inst (FailoverAndReplace new_sdx) =
424
    let old_pdx = Instance.pNode inst
425
        old_sdx = Instance.sNode inst
426
        old_p = Container.find old_pdx nl
427
        old_s = Container.find old_sdx nl
428
        tgt_n = Container.find new_sdx nl
429
        int_p = Node.removePri old_p inst
430
        int_s = Node.removeSec old_s inst
431
        force_p = Node.offline old_p
432
        new_nl = do -- Maybe monad
433
          new_p <- Node.addPriEx force_p int_s inst
434
          new_s <- Node.addSecEx force_p tgt_n inst old_sdx
435
          let new_inst = Instance.setBoth inst old_sdx new_sdx
436
          return (Container.add new_sdx new_s $
437
                  Container.addTwo old_sdx new_p old_pdx int_p nl,
438
                  new_inst, old_sdx, new_sdx)
439
    in new_nl
440

    
441
-- | Tries to allocate an instance on one given node.
442
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
443
                 -> OpResult Node.AllocElement
444
allocateOnSingle nl inst new_pdx =
445
    let p = Container.find new_pdx nl
446
        new_inst = Instance.setBoth inst new_pdx Node.noSecondary
447
    in  Node.addPri p inst >>= \new_p -> do
448
      let new_nl = Container.add new_pdx new_p nl
449
          new_score = compCV nl
450
      return (new_nl, new_inst, [new_p], new_score)
451

    
452
-- | Tries to allocate an instance on a given pair of nodes.
453
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
454
               -> OpResult Node.AllocElement
455
allocateOnPair nl inst new_pdx new_sdx =
456
    let tgt_p = Container.find new_pdx nl
457
        tgt_s = Container.find new_sdx nl
458
    in do
459
      new_p <- Node.addPri tgt_p inst
460
      new_s <- Node.addSec tgt_s inst new_pdx
461
      let new_inst = Instance.setBoth inst new_pdx new_sdx
462
          new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
463
      return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
464

    
465
-- | Tries to perform an instance move and returns the best table
466
-- between the original one and the new one.
467
checkSingleStep :: Table -- ^ The original table
468
                -> Instance.Instance -- ^ The instance to move
469
                -> Table -- ^ The current best table
470
                -> IMove -- ^ The move to apply
471
                -> Table -- ^ The final best table
472
checkSingleStep ini_tbl target cur_tbl move =
473
    let
474
        Table ini_nl ini_il _ ini_plc = ini_tbl
475
        tmp_resu = applyMove ini_nl target move
476
    in
477
      case tmp_resu of
478
        OpFail _ -> cur_tbl
479
        OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
480
            let tgt_idx = Instance.idx target
481
                upd_cvar = compCV upd_nl
482
                upd_il = Container.add tgt_idx new_inst ini_il
483
                upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
484
                upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
485
            in
486
              compareTables cur_tbl upd_tbl
487

    
488
-- | Given the status of the current secondary as a valid new node and
489
-- the current candidate target node, generate the possible moves for
490
-- a instance.
491
possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
492
              -> Bool      -- ^ Whether we can change the primary node
493
              -> Ndx       -- ^ Target node candidate
494
              -> [IMove]   -- ^ List of valid result moves
495

    
496
possibleMoves _ False tdx =
497
    [ReplaceSecondary tdx]
498

    
499
possibleMoves True True tdx =
500
    [ReplaceSecondary tdx,
501
     ReplaceAndFailover tdx,
502
     ReplacePrimary tdx,
503
     FailoverAndReplace tdx]
504

    
505
possibleMoves False True tdx =
506
    [ReplaceSecondary tdx,
507
     ReplaceAndFailover tdx]
508

    
509
-- | Compute the best move for a given instance.
510
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
511
                  -> Bool              -- ^ Whether disk moves are allowed
512
                  -> Bool              -- ^ Whether instance moves are allowed
513
                  -> Table             -- ^ Original table
514
                  -> Instance.Instance -- ^ Instance to move
515
                  -> Table             -- ^ Best new table for this instance
516
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
517
    let
518
        opdx = Instance.pNode target
519
        osdx = Instance.sNode target
520
        nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
521
        use_secondary = elem osdx nodes_idx && inst_moves
522
        aft_failover = if use_secondary -- if allowed to failover
523
                       then checkSingleStep ini_tbl target ini_tbl Failover
524
                       else ini_tbl
525
        all_moves = if disk_moves
526
                    then concatMap
527
                         (possibleMoves use_secondary inst_moves) nodes
528
                    else []
529
    in
530
      -- iterate over the possible nodes for this instance
531
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
532

    
533
-- | Compute the best next move.
534
checkMove :: [Ndx]               -- ^ Allowed target node indices
535
          -> Bool                -- ^ Whether disk moves are allowed
536
          -> Bool                -- ^ Whether instance moves are allowed
537
          -> Table               -- ^ The current solution
538
          -> [Instance.Instance] -- ^ List of instances still to move
539
          -> Table               -- ^ The new solution
540
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
541
    let Table _ _ _ ini_plc = ini_tbl
542
        -- we're using rwhnf from the Control.Parallel.Strategies
543
        -- package; we don't need to use rnf as that would force too
544
        -- much evaluation in single-threaded cases, and in
545
        -- multi-threaded case the weak head normal form is enough to
546
        -- spark the evaluation
547
        tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
548
                               inst_moves ini_tbl)
549
                 victims
550
        -- iterate over all instances, computing the best move
551
        best_tbl = foldl' compareTables ini_tbl tables
552
        Table _ _ _ best_plc = best_tbl
553
    in if length best_plc == length ini_plc
554
       then ini_tbl -- no advancement
555
       else best_tbl
556

    
557
-- | Check if we are allowed to go deeper in the balancing.
558
doNextBalance :: Table     -- ^ The starting table
559
              -> Int       -- ^ Remaining length
560
              -> Score     -- ^ Score at which to stop
561
              -> Bool      -- ^ The resulting table and commands
562
doNextBalance ini_tbl max_rounds min_score =
563
    let Table _ _ ini_cv ini_plc = ini_tbl
564
        ini_plc_len = length ini_plc
565
    in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
566

    
567
-- | Run a balance move.
568
tryBalance :: Table       -- ^ The starting table
569
           -> Bool        -- ^ Allow disk moves
570
           -> Bool        -- ^ Allow instance moves
571
           -> Bool        -- ^ Only evacuate moves
572
           -> Score       -- ^ Min gain threshold
573
           -> Score       -- ^ Min gain
574
           -> Maybe Table -- ^ The resulting table and commands
575
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
576
    let Table ini_nl ini_il ini_cv _ = ini_tbl
577
        all_inst = Container.elems ini_il
578
        all_inst' = if evac_mode
579
                    then let bad_nodes = map Node.idx . filter Node.offline $
580
                                         Container.elems ini_nl
581
                         in filter (\e -> Instance.sNode e `elem` bad_nodes ||
582
                                          Instance.pNode e `elem` bad_nodes)
583
                            all_inst
584
                    else all_inst
585
        reloc_inst = filter Instance.movable all_inst'
586
        node_idx = map Node.idx . filter (not . Node.offline) $
587
                   Container.elems ini_nl
588
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
589
        (Table _ _ fin_cv _) = fin_tbl
590
    in
591
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
592
      then Just fin_tbl -- this round made success, return the new table
593
      else Nothing
594

    
595
-- * Allocation functions
596

    
597
-- | Build failure stats out of a list of failures.
598
collapseFailures :: [FailMode] -> FailStats
599
collapseFailures flst =
600
    map (\k -> (k, length $ filter (k ==) flst)) [minBound..maxBound]
601

    
602
-- | Update current Allocation solution and failure stats with new
603
-- elements.
604
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
605
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
606

    
607
concatAllocs as (OpGood ns@(_, _, _, nscore)) =
608
    let -- Choose the old or new solution, based on the cluster score
609
        cntok = asAllocs as
610
        osols = asSolutions as
611
        nsols = case osols of
612
                  [] -> [ns]
613
                  (_, _, _, oscore):[] ->
614
                      if oscore < nscore
615
                      then osols
616
                      else [ns]
617
                  -- FIXME: here we simply concat to lists with more
618
                  -- than one element; we should instead abort, since
619
                  -- this is not a valid usage of this function
620
                  xs -> ns:xs
621
        nsuc = cntok + 1
622
    -- Note: we force evaluation of nsols here in order to keep the
623
    -- memory profile low - we know that we will need nsols for sure
624
    -- in the next cycle, so we force evaluation of nsols, since the
625
    -- foldl' in the caller will only evaluate the tuple, but not the
626
    -- elements of the tuple
627
    in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolutions = nsols }
628

    
629
-- | Sums two allocation solutions (e.g. for two separate node groups).
630
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
631
sumAllocs (AllocSolution af aa as al) (AllocSolution bf ba bs bl) =
632
    AllocSolution (af ++ bf) (aa + ba) (as ++ bs) (al ++ bl)
633

    
634
-- | Given a solution, generates a reasonable description for it.
635
describeSolution :: AllocSolution -> String
636
describeSolution as =
637
  let fcnt = asFailures as
638
      sols = asSolutions as
639
      freasons =
640
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
641
        filter ((> 0) . snd) . collapseFailures $ fcnt
642
  in if null sols
643
     then "No valid allocation solutions, failure reasons: " ++
644
          (if null fcnt
645
           then "unknown reasons"
646
           else freasons)
647
     else let (_, _, nodes, cv) = head sols
648
          in printf ("score: %.8f, successes %d, failures %d (%s)" ++
649
                     " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
650
             (intercalate "/" . map Node.name $ nodes)
651

    
652
-- | Annotates a solution with the appropriate string.
653
annotateSolution :: AllocSolution -> AllocSolution
654
annotateSolution as = as { asLog = describeSolution as : asLog as }
655

    
656
-- | Reverses an evacuation solution.
657
--
658
-- Rationale: we always concat the results to the top of the lists, so
659
-- for proper jobset execution, we should reverse all lists.
660
reverseEvacSolution :: EvacSolution -> EvacSolution
661
reverseEvacSolution (EvacSolution f m o) =
662
    EvacSolution (reverse f) (reverse m) (reverse o)
663

    
664
-- | Generate the valid node allocation singles or pairs for a new instance.
665
genAllocNodes :: Group.List        -- ^ Group list
666
              -> Node.List         -- ^ The node map
667
              -> Int               -- ^ The number of nodes required
668
              -> Bool              -- ^ Whether to drop or not
669
                                   -- unallocable nodes
670
              -> Result AllocNodes -- ^ The (monadic) result
671
genAllocNodes gl nl count drop_unalloc =
672
    let filter_fn = if drop_unalloc
673
                    then filter (Group.isAllocable .
674
                                 flip Container.find gl . Node.group)
675
                    else id
676
        all_nodes = filter_fn $ getOnline nl
677
        all_pairs = liftM2 (,) all_nodes all_nodes
678
        ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y &&
679
                                      Node.group x == Node.group y) all_pairs
680
    in case count of
681
         1 -> Ok (Left (map Node.idx all_nodes))
682
         2 -> Ok (Right (map (\(p, s) -> (Node.idx p, Node.idx s)) ok_pairs))
683
         _ -> Bad "Unsupported number of nodes, only one or two  supported"
684

    
685
-- | Try to allocate an instance on the cluster.
686
tryAlloc :: (Monad m) =>
687
            Node.List         -- ^ The node list
688
         -> Instance.List     -- ^ The instance list
689
         -> Instance.Instance -- ^ The instance to allocate
690
         -> AllocNodes        -- ^ The allocation targets
691
         -> m AllocSolution   -- ^ Possible solution list
692
tryAlloc nl _ inst (Right ok_pairs) =
693
    let sols = foldl' (\cstate (p, s) ->
694
                           concatAllocs cstate $ allocateOnPair nl inst p s
695
                      ) emptyAllocSolution ok_pairs
696

    
697
    in if null ok_pairs -- means we have just one node
698
       then fail "Not enough online nodes"
699
       else return $ annotateSolution sols
700

    
701
tryAlloc nl _ inst (Left all_nodes) =
702
    let sols = foldl' (\cstate ->
703
                           concatAllocs cstate . allocateOnSingle nl inst
704
                      ) emptyAllocSolution all_nodes
705
    in if null all_nodes
706
       then fail "No online nodes"
707
       else return $ annotateSolution sols
708

    
709
-- | Given a group/result, describe it as a nice (list of) messages.
710
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
711
solutionDescription gl (groupId, result) =
712
  case result of
713
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
714
    Bad message -> [printf "Group %s: error %s" gname message]
715
  where grp = Container.find groupId gl
716
        gname = Group.name grp
717
        pol = apolToString (Group.allocPolicy grp)
718

    
719
-- | From a list of possibly bad and possibly empty solutions, filter
720
-- only the groups with a valid result. Note that the result will be
721
-- reversed compared to the original list.
722
filterMGResults :: Group.List
723
                -> [(Gdx, Result AllocSolution)]
724
                -> [(Gdx, AllocSolution)]
725
filterMGResults gl = foldl' fn []
726
    where unallocable = not . Group.isAllocable . flip Container.find gl
727
          fn accu (gdx, rasol) =
728
              case rasol of
729
                Bad _ -> accu
730
                Ok sol | null (asSolutions sol) -> accu
731
                       | unallocable gdx -> accu
732
                       | otherwise -> (gdx, sol):accu
733

    
734
-- | Sort multigroup results based on policy and score.
735
sortMGResults :: Group.List
736
             -> [(Gdx, AllocSolution)]
737
             -> [(Gdx, AllocSolution)]
738
sortMGResults gl sols =
739
    let extractScore (_, _, _, x) = x
740
        solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
741
                               (extractScore . head . asSolutions) sol)
742
    in sortBy (comparing solScore) sols
743

    
744
-- | Try to allocate an instance on a multi-group cluster.
745
tryMGAlloc :: Group.List           -- ^ The group list
746
           -> Node.List            -- ^ The node list
747
           -> Instance.List        -- ^ The instance list
748
           -> Instance.Instance    -- ^ The instance to allocate
749
           -> Int                  -- ^ Required number of nodes
750
           -> Result AllocSolution -- ^ Possible solution list
751
tryMGAlloc mggl mgnl mgil inst cnt =
752
  let groups = splitCluster mgnl mgil
753
      sols = map (\(gid, (nl, il)) ->
754
                   (gid, genAllocNodes mggl nl cnt False >>=
755
                       tryAlloc nl il inst))
756
             groups::[(Gdx, Result AllocSolution)]
757
      all_msgs = concatMap (solutionDescription mggl) sols
758
      goodSols = filterMGResults mggl sols
759
      sortedSols = sortMGResults mggl goodSols
760
  in if null sortedSols
761
     then Bad $ intercalate ", " all_msgs
762
     else let (final_group, final_sol) = head sortedSols
763
              final_name = Group.name $ Container.find final_group mggl
764
              selmsg = "Selected group: " ++  final_name
765
          in Ok $ final_sol { asLog = selmsg:all_msgs }
766

    
767
-- | Try to relocate an instance on the cluster.
768
tryReloc :: (Monad m) =>
769
            Node.List       -- ^ The node list
770
         -> Instance.List   -- ^ The instance list
771
         -> Idx             -- ^ The index of the instance to move
772
         -> Int             -- ^ The number of nodes required
773
         -> [Ndx]           -- ^ Nodes which should not be used
774
         -> m AllocSolution -- ^ Solution list
775
tryReloc nl il xid 1 ex_idx =
776
    let all_nodes = getOnline nl
777
        inst = Container.find xid il
778
        ex_idx' = Instance.pNode inst:ex_idx
779
        valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
780
        valid_idxes = map Node.idx valid_nodes
781
        sols1 = foldl' (\cstate x ->
782
                            let em = do
783
                                  (mnl, i, _, _) <-
784
                                      applyMove nl inst (ReplaceSecondary x)
785
                                  return (mnl, i, [Container.find x mnl],
786
                                          compCV mnl)
787
                            in concatAllocs cstate em
788
                       ) emptyAllocSolution valid_idxes
789
    in return sols1
790

    
791
tryReloc _ _ _ reqn _  = fail $ "Unsupported number of relocation \
792
                                \destinations required (" ++ show reqn ++
793
                                                  "), only one supported"
794

    
795
tryMGReloc :: (Monad m) =>
796
              Group.List      -- ^ The group list
797
           -> Node.List       -- ^ The node list
798
           -> Instance.List   -- ^ The instance list
799
           -> Idx             -- ^ The index of the instance to move
800
           -> Int             -- ^ The number of nodes required
801
           -> [Ndx]           -- ^ Nodes which should not be used
802
           -> m AllocSolution -- ^ Solution list
803
tryMGReloc _ mgnl mgil xid ncount ex_ndx = do
804
  let groups = splitCluster mgnl mgil
805
      -- TODO: we only relocate inside the group for now
806
      inst = Container.find xid mgil
807
  (nl, il) <- case lookup (instancePriGroup mgnl inst) groups of
808
                Nothing -> fail $ "Cannot find group for instance " ++
809
                           Instance.name inst
810
                Just v -> return v
811
  tryReloc nl il xid ncount ex_ndx
812

    
813
-- | Change an instance's secondary node.
814
evacInstance :: (Monad m) =>
815
                [Ndx]                      -- ^ Excluded nodes
816
             -> Instance.List              -- ^ The current instance list
817
             -> (Node.List, AllocSolution) -- ^ The current state
818
             -> Idx                        -- ^ The instance to evacuate
819
             -> m (Node.List, AllocSolution)
820
evacInstance ex_ndx il (nl, old_as) idx = do
821
  -- FIXME: hardcoded one node here
822

    
823
  -- Longer explanation: evacuation is currently hardcoded to DRBD
824
  -- instances (which have one secondary); hence, even if the
825
  -- IAllocator protocol can request N nodes for an instance, and all
826
  -- the message parsing/loading pass this, this implementation only
827
  -- supports one; this situation needs to be revisited if we ever
828
  -- support more than one secondary, or if we change the storage
829
  -- model
830
  new_as <- tryReloc nl il idx 1 ex_ndx
831
  case asSolutions new_as of
832
    -- an individual relocation succeeded, we kind of compose the data
833
    -- from the two solutions
834
    csol@(nl', _, _, _):_ ->
835
        return (nl', new_as { asSolutions = csol:asSolutions old_as })
836
    -- this relocation failed, so we fail the entire evac
837
    _ -> fail $ "Can't evacuate instance " ++
838
         Instance.name (Container.find idx il) ++
839
             ": " ++ describeSolution new_as
840

    
841
-- | Try to evacuate a list of nodes.
842
tryEvac :: (Monad m) =>
843
            Node.List       -- ^ The node list
844
         -> Instance.List   -- ^ The instance list
845
         -> [Idx]           -- ^ Instances to be evacuated
846
         -> [Ndx]           -- ^ Restricted nodes (the ones being evacuated)
847
         -> m AllocSolution -- ^ Solution list
848
tryEvac nl il idxs ex_ndx = do
849
  (_, sol) <- foldM (evacInstance ex_ndx il) (nl, emptyAllocSolution) idxs
850
  return sol
851

    
852
-- | Multi-group evacuation of a list of nodes.
853
tryMGEvac :: (Monad m) =>
854
             Group.List -- ^ The group list
855
          -> Node.List       -- ^ The node list
856
          -> Instance.List   -- ^ The instance list
857
          -> [Ndx]           -- ^ Nodes to be evacuated
858
          -> m AllocSolution -- ^ Solution list
859
tryMGEvac _ nl il ex_ndx =
860
    let ex_nodes = map (`Container.find` nl) ex_ndx
861
        all_insts = nub . concatMap Node.sList $ ex_nodes
862
        all_insts' = associateIdxs all_insts $ splitCluster nl il
863
    in do
864
      results <- mapM (\(_, (gnl, gil, idxs)) -> tryEvac gnl gil idxs ex_ndx)
865
                 all_insts'
866
      let sol = foldl' sumAllocs emptyAllocSolution results
867
      return $ annotateSolution sol
868

    
869
-- | Function which fails if the requested mode is change secondary.
870
--
871
-- This is useful since except DRBD, no other disk template can
872
-- execute change secondary; thus, we can just call this function
873
-- instead of always checking for secondary mode. After the call to
874
-- this function, whatever mode we have is just a primary change.
875
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
876
failOnSecondaryChange ChangeSecondary dt =
877
    fail $ "Instances with disk template '" ++ dtToString dt ++
878
         "' can't execute change secondary"
879
failOnSecondaryChange _ _ = return ()
880

    
881
-- | Run evacuation for a single instance.
882
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
883
                 -> Instance.List     -- ^ Instance list (cluster-wide)
884
                 -> EvacMode          -- ^ The evacuation mode
885
                 -> Instance.Instance -- ^ The instance to be evacuated
886
                 -> [Ndx]             -- ^ The list of available nodes
887
                                      -- for allocation
888
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
889
nodeEvacInstance _ _ mode (Instance.Instance
890
                           {Instance.diskTemplate = dt@DTDiskless}) _ =
891
                  failOnSecondaryChange mode dt >>
892
                  fail "Diskless relocations not implemented yet"
893

    
894
nodeEvacInstance _ _ _ (Instance.Instance
895
                        {Instance.diskTemplate = DTPlain}) _ =
896
                  fail "Instances of type plain cannot be relocated"
897

    
898
nodeEvacInstance _ _ _ (Instance.Instance
899
                        {Instance.diskTemplate = DTFile}) _ =
900
                  fail "Instances of type file cannot be relocated"
901

    
902
nodeEvacInstance _ _ mode  (Instance.Instance
903
                            {Instance.diskTemplate = dt@DTSharedFile}) _ =
904
                  failOnSecondaryChange mode dt >>
905
                  fail "Shared file relocations not implemented yet"
906

    
907
nodeEvacInstance _ _ mode (Instance.Instance
908
                           {Instance.diskTemplate = dt@DTBlock}) _ =
909
                  failOnSecondaryChange mode dt >>
910
                  fail "Block device relocations not implemented yet"
911

    
912
nodeEvacInstance nl il ChangePrimary
913
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8}) _ =
914
  do
915
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
916
    let idx = Instance.idx inst
917
        il' = Container.add idx inst' il
918
        ops = iMoveToJob nl' il' idx Failover
919
    return (nl', il', ops)
920

    
921
nodeEvacInstance _ _ _ (Instance.Instance
922
                        {Instance.diskTemplate = DTDrbd8}) _ =
923
                  fail "DRBD relocations not implemented yet"
924

    
925
-- | Computes the local nodes of a given instance which are available
926
-- for allocation.
927
availableLocalNodes :: Node.List
928
                    -> [(Gdx, [Ndx])]
929
                    -> IntSet.IntSet
930
                    -> Instance.Instance
931
                    -> Result [Ndx]
932
availableLocalNodes nl group_nodes excl_ndx inst = do
933
  let gdx = instancePriGroup nl inst
934
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
935
                 Ok (lookup gdx group_nodes)
936
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
937
  return avail_nodes
938

    
939
-- | Updates the evac solution with the results of an instance
940
-- evacuation.
941
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
942
                   -> Instance.Instance
943
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
944
                   -> (Node.List, Instance.List, EvacSolution)
945
updateEvacSolution (nl, il, es) inst (Bad msg) =
946
    (nl, il, es { esFailed = (Instance.name inst ++ ": " ++ msg):esFailed es})
947
updateEvacSolution (_, _, es) inst (Ok (nl, il, opcodes)) =
948
    (nl, il, es { esMoved = Instance.name inst:esMoved es
949
                , esOpCodes = [opcodes]:esOpCodes es })
950

    
951
-- | Node-evacuation IAllocator mode main function.
952
tryNodeEvac :: Group.List    -- ^ The cluster groups
953
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
954
            -> Instance.List -- ^ Instance list (cluster-wide)
955
            -> EvacMode      -- ^ The evacuation mode
956
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
957
            -> Result EvacSolution
958
tryNodeEvac _ ini_nl ini_il mode idxs =
959
    let evac_ndx = nodesToEvacuate ini_il mode idxs
960
        offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
961
        excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
962
        group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
963
                                             (Container.elems nl))) $
964
                      splitCluster ini_nl ini_il
965
        (_, _, esol) =
966
            foldl' (\state@(nl, il, _) inst ->
967
                        updateEvacSolution state inst $
968
                        availableLocalNodes nl group_ndx excl_ndx inst >>=
969
                        nodeEvacInstance nl il mode inst
970
                   )
971
            (ini_nl, ini_il, emptyEvacSolution)
972
            (map (`Container.find` ini_il) idxs)
973
    in return $ reverseEvacSolution esol
974

    
975
-- | Recursively place instances on the cluster until we're out of space.
976
iterateAlloc :: Node.List
977
             -> Instance.List
978
             -> Instance.Instance
979
             -> AllocNodes
980
             -> [Instance.Instance]
981
             -> [CStats]
982
             -> Result AllocResult
983
iterateAlloc nl il newinst allocnodes ixes cstats =
984
      let depth = length ixes
985
          newname = printf "new-%d" depth::String
986
          newidx = length (Container.elems il) + depth
987
          newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
988
      in case tryAlloc nl il newi2 allocnodes of
989
           Bad s -> Bad s
990
           Ok (AllocSolution { asFailures = errs, asSolutions = sols3 }) ->
991
               case sols3 of
992
                 [] -> Ok (collapseFailures errs, nl, il, ixes, cstats)
993
                 (xnl, xi, _, _):[] ->
994
                     iterateAlloc xnl (Container.add newidx xi il)
995
                                  newinst allocnodes (xi:ixes)
996
                                  (totalResources xnl:cstats)
997
                 _ -> Bad "Internal error: multiple solutions for single\
998
                          \ allocation"
999

    
1000
-- | The core of the tiered allocation mode.
1001
tieredAlloc :: Node.List
1002
            -> Instance.List
1003
            -> Instance.Instance
1004
            -> AllocNodes
1005
            -> [Instance.Instance]
1006
            -> [CStats]
1007
            -> Result AllocResult
1008
tieredAlloc nl il newinst allocnodes ixes cstats =
1009
    case iterateAlloc nl il newinst allocnodes ixes cstats of
1010
      Bad s -> Bad s
1011
      Ok (errs, nl', il', ixes', cstats') ->
1012
          case Instance.shrinkByType newinst . fst . last $
1013
               sortBy (comparing snd) errs of
1014
            Bad _ -> Ok (errs, nl', il', ixes', cstats')
1015
            Ok newinst' ->
1016
                tieredAlloc nl' il' newinst' allocnodes ixes' cstats'
1017

    
1018
-- | Compute the tiered spec string description from a list of
1019
-- allocated instances.
1020
tieredSpecMap :: [Instance.Instance]
1021
              -> [String]
1022
tieredSpecMap trl_ixes =
1023
    let fin_trl_ixes = reverse trl_ixes
1024
        ix_byspec = groupBy ((==) `on` Instance.specOf) fin_trl_ixes
1025
        spec_map = map (\ixs -> (Instance.specOf $ head ixs, length ixs))
1026
                   ix_byspec
1027
    in  map (\(spec, cnt) -> printf "%d,%d,%d=%d" (rspecMem spec)
1028
                             (rspecDsk spec) (rspecCpu spec) cnt) spec_map
1029

    
1030
-- * Formatting functions
1031

    
1032
-- | Given the original and final nodes, computes the relocation description.
1033
computeMoves :: Instance.Instance -- ^ The instance to be moved
1034
             -> String -- ^ The instance name
1035
             -> IMove  -- ^ The move being performed
1036
             -> String -- ^ New primary
1037
             -> String -- ^ New secondary
1038
             -> (String, [String])
1039
                -- ^ Tuple of moves and commands list; moves is containing
1040
                -- either @/f/@ for failover or @/r:name/@ for replace
1041
                -- secondary, while the command list holds gnt-instance
1042
                -- commands (without that prefix), e.g \"@failover instance1@\"
1043
computeMoves i inam mv c d =
1044
    case mv of
1045
      Failover -> ("f", [mig])
1046
      FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1047
      ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1048
      ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1049
      ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1050
    where morf = if Instance.running i then "migrate" else "failover"
1051
          mig = printf "%s -f %s" morf inam::String
1052
          rep n = printf "replace-disks -n %s %s" n inam
1053

    
1054
-- | Converts a placement to string format.
1055
printSolutionLine :: Node.List     -- ^ The node list
1056
                  -> Instance.List -- ^ The instance list
1057
                  -> Int           -- ^ Maximum node name length
1058
                  -> Int           -- ^ Maximum instance name length
1059
                  -> Placement     -- ^ The current placement
1060
                  -> Int           -- ^ The index of the placement in
1061
                                   -- the solution
1062
                  -> (String, [String])
1063
printSolutionLine nl il nmlen imlen plc pos =
1064
    let
1065
        pmlen = (2*nmlen + 1)
1066
        (i, p, s, mv, c) = plc
1067
        inst = Container.find i il
1068
        inam = Instance.alias inst
1069
        npri = Node.alias $ Container.find p nl
1070
        nsec = Node.alias $ Container.find s nl
1071
        opri = Node.alias $ Container.find (Instance.pNode inst) nl
1072
        osec = Node.alias $ Container.find (Instance.sNode inst) nl
1073
        (moves, cmds) =  computeMoves inst inam mv npri nsec
1074
        ostr = printf "%s:%s" opri osec::String
1075
        nstr = printf "%s:%s" npri nsec::String
1076
    in
1077
      (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
1078
       pos imlen inam pmlen ostr
1079
       pmlen nstr c moves,
1080
       cmds)
1081

    
1082
-- | Return the instance and involved nodes in an instance move.
1083
involvedNodes :: Instance.List -> Placement -> [Ndx]
1084
involvedNodes il plc =
1085
    let (i, np, ns, _, _) = plc
1086
        inst = Container.find i il
1087
        op = Instance.pNode inst
1088
        os = Instance.sNode inst
1089
    in nub [np, ns, op, os]
1090

    
1091
-- | Inner function for splitJobs, that either appends the next job to
1092
-- the current jobset, or starts a new jobset.
1093
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1094
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1095
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1096
    | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1097
    | otherwise = ([n]:cjs, ndx)
1098

    
1099
-- | Break a list of moves into independent groups. Note that this
1100
-- will reverse the order of jobs.
1101
splitJobs :: [MoveJob] -> [JobSet]
1102
splitJobs = fst . foldl mergeJobs ([], [])
1103

    
1104
-- | Given a list of commands, prefix them with @gnt-instance@ and
1105
-- also beautify the display a little.
1106
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1107
formatJob jsn jsl (sn, (_, _, _, cmds)) =
1108
    let out =
1109
            printf "  echo job %d/%d" jsn sn:
1110
            printf "  check":
1111
            map ("  gnt-instance " ++) cmds
1112
    in if sn == 1
1113
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1114
       else out
1115

    
1116
-- | Given a list of commands, prefix them with @gnt-instance@ and
1117
-- also beautify the display a little.
1118
formatCmds :: [JobSet] -> String
1119
formatCmds =
1120
    unlines .
1121
    concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1122
                             (zip [1..] js)) .
1123
    zip [1..]
1124

    
1125
-- | Print the node list.
1126
printNodes :: Node.List -> [String] -> String
1127
printNodes nl fs =
1128
    let fields = case fs of
1129
          [] -> Node.defaultFields
1130
          "+":rest -> Node.defaultFields ++ rest
1131
          _ -> fs
1132
        snl = sortBy (comparing Node.idx) (Container.elems nl)
1133
        (header, isnum) = unzip $ map Node.showHeader fields
1134
    in unlines . map ((:) ' ' .  intercalate " ") $
1135
       formatTable (header:map (Node.list fields) snl) isnum
1136

    
1137
-- | Print the instance list.
1138
printInsts :: Node.List -> Instance.List -> String
1139
printInsts nl il =
1140
    let sil = sortBy (comparing Instance.idx) (Container.elems il)
1141
        helper inst = [ if Instance.running inst then "R" else " "
1142
                      , Instance.name inst
1143
                      , Container.nameOf nl (Instance.pNode inst)
1144
                      , let sdx = Instance.sNode inst
1145
                        in if sdx == Node.noSecondary
1146
                           then  ""
1147
                           else Container.nameOf nl sdx
1148
                      , if Instance.autoBalance inst then "Y" else "N"
1149
                      , printf "%3d" $ Instance.vcpus inst
1150
                      , printf "%5d" $ Instance.mem inst
1151
                      , printf "%5d" $ Instance.dsk inst `div` 1024
1152
                      , printf "%5.3f" lC
1153
                      , printf "%5.3f" lM
1154
                      , printf "%5.3f" lD
1155
                      , printf "%5.3f" lN
1156
                      ]
1157
            where DynUtil lC lM lD lN = Instance.util inst
1158
        header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1159
                 , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1160
        isnum = False:False:False:False:False:repeat True
1161
    in unlines . map ((:) ' ' . intercalate " ") $
1162
       formatTable (header:map helper sil) isnum
1163

    
1164
-- | Shows statistics for a given node list.
1165
printStats :: Node.List -> String
1166
printStats nl =
1167
    let dcvs = compDetailedCV nl
1168
        (weights, names) = unzip detailedCVInfo
1169
        hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1170
        formatted = map (\(w, header, val) ->
1171
                             printf "%s=%.8f(x%.2f)" header val w::String) hd
1172
    in intercalate ", " formatted
1173

    
1174
-- | Convert a placement into a list of OpCodes (basically a job).
1175
iMoveToJob :: Node.List -> Instance.List
1176
          -> Idx -> IMove -> [OpCodes.OpCode]
1177
iMoveToJob nl il idx move =
1178
    let inst = Container.find idx il
1179
        iname = Instance.name inst
1180
        lookNode  = Just . Container.nameOf nl
1181
        opF = OpCodes.OpInstanceMigrate iname True False True
1182
        opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1183
                OpCodes.ReplaceNewSecondary [] Nothing
1184
    in case move of
1185
         Failover -> [ opF ]
1186
         ReplacePrimary np -> [ opF, opR np, opF ]
1187
         ReplaceSecondary ns -> [ opR ns ]
1188
         ReplaceAndFailover np -> [ opR np, opF ]
1189
         FailoverAndReplace ns -> [ opF, opR ns ]
1190

    
1191
-- * Node group functions
1192

    
1193
-- | Computes the group of an instance.
1194
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1195
instanceGroup nl i =
1196
  let sidx = Instance.sNode i
1197
      pnode = Container.find (Instance.pNode i) nl
1198
      snode = if sidx == Node.noSecondary
1199
              then pnode
1200
              else Container.find sidx nl
1201
      pgroup = Node.group pnode
1202
      sgroup = Node.group snode
1203
  in if pgroup /= sgroup
1204
     then fail ("Instance placed accross two node groups, primary " ++
1205
                show pgroup ++ ", secondary " ++ show sgroup)
1206
     else return pgroup
1207

    
1208
-- | Computes the group of an instance per the primary node.
1209
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1210
instancePriGroup nl i =
1211
  let pnode = Container.find (Instance.pNode i) nl
1212
  in  Node.group pnode
1213

    
1214
-- | Compute the list of badly allocated instances (split across node
1215
-- groups).
1216
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1217
findSplitInstances nl =
1218
  filter (not . isOk . instanceGroup nl) . Container.elems
1219

    
1220
-- | Splits a cluster into the component node groups.
1221
splitCluster :: Node.List -> Instance.List ->
1222
                [(Gdx, (Node.List, Instance.List))]
1223
splitCluster nl il =
1224
  let ngroups = Node.computeGroups (Container.elems nl)
1225
  in map (\(guuid, nodes) ->
1226
           let nidxs = map Node.idx nodes
1227
               nodes' = zip nidxs nodes
1228
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1229
           in (guuid, (Container.fromList nodes', instances))) ngroups
1230

    
1231
-- | Split a global instance index map into per-group, and associate
1232
-- it with the group/node/instance lists.
1233
associateIdxs :: [Idx] -- ^ Instance indices to be split/associated
1234
              -> [(Gdx, (Node.List, Instance.List))]        -- ^ Input groups
1235
              -> [(Gdx, (Node.List, Instance.List, [Idx]))] -- ^ Result
1236
associateIdxs idxs =
1237
    map (\(gdx, (nl, il)) ->
1238
             (gdx, (nl, il, filter (`Container.member` il) idxs)))
1239

    
1240
-- | Compute the list of nodes that are to be evacuated, given a list
1241
-- of instances and an evacuation mode.
1242
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1243
                -> EvacMode      -- ^ The evacuation mode we're using
1244
                -> [Idx]         -- ^ List of instance indices being evacuated
1245
                -> IntSet.IntSet -- ^ Set of node indices
1246
nodesToEvacuate il mode =
1247
    IntSet.delete Node.noSecondary .
1248
    foldl' (\ns idx ->
1249
                let i = Container.find idx il
1250
                    pdx = Instance.pNode i
1251
                    sdx = Instance.sNode i
1252
                    dt = Instance.diskTemplate i
1253
                    withSecondary = case dt of
1254
                                      DTDrbd8 -> IntSet.insert sdx ns
1255
                                      _ -> ns
1256
                in case mode of
1257
                     ChangePrimary   -> IntSet.insert pdx ns
1258
                     ChangeSecondary -> withSecondary
1259
                     ChangeAll       -> IntSet.insert pdx withSecondary
1260
           ) IntSet.empty