Statistics
| Branch: | Tag: | Revision:

root / htools / Ganeti / HTools / Cluster.hs @ 9bb5721c

History | View | Annotate | Download (53.5 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
    (
31
     -- * Types
32
      AllocSolution(..)
33
    , EvacSolution(..)
34
    , Table(..)
35
    , CStats(..)
36
    , AllocStats
37
    -- * Generic functions
38
    , totalResources
39
    , computeAllocationDelta
40
    -- * First phase functions
41
    , computeBadItems
42
    -- * Second phase functions
43
    , printSolutionLine
44
    , formatCmds
45
    , involvedNodes
46
    , splitJobs
47
    -- * Display functions
48
    , printNodes
49
    , printInsts
50
    -- * Balacing functions
51
    , checkMove
52
    , doNextBalance
53
    , tryBalance
54
    , compCV
55
    , compCVNodes
56
    , compDetailedCV
57
    , printStats
58
    , iMoveToJob
59
    -- * IAllocator functions
60
    , genAllocNodes
61
    , tryAlloc
62
    , tryMGAlloc
63
    , tryReloc
64
    , tryMGReloc
65
    , tryEvac
66
    , tryMGEvac
67
    , tryNodeEvac
68
    , collapseFailures
69
    -- * Allocation functions
70
    , iterateAlloc
71
    , tieredAlloc
72
    , tieredSpecMap
73
     -- * Node group functions
74
    , instanceGroup
75
    , findSplitInstances
76
    , splitCluster
77
    ) where
78

    
79
import Data.Function (on)
80
import qualified Data.IntSet as IntSet
81
import Data.List
82
import Data.Ord (comparing)
83
import Text.Printf (printf)
84
import Control.Monad
85
import Control.Parallel.Strategies
86

    
87
import qualified Ganeti.HTools.Container as Container
88
import qualified Ganeti.HTools.Instance as Instance
89
import qualified Ganeti.HTools.Node as Node
90
import qualified Ganeti.HTools.Group as Group
91
import Ganeti.HTools.Types
92
import Ganeti.HTools.Utils
93
import qualified Ganeti.OpCodes as OpCodes
94

    
95
-- * Types
96

    
97
-- | Allocation\/relocation solution.
98
data AllocSolution = AllocSolution
99
  { asFailures  :: [FailMode]          -- ^ Failure counts
100
  , asAllocs    :: Int                 -- ^ Good allocation count
101
  , asSolutions :: [Node.AllocElement] -- ^ The actual result, length
102
                                       -- of the list depends on the
103
                                       -- allocation/relocation mode
104
  , asLog       :: [String]            -- ^ A list of informational messages
105
  }
106

    
107
-- | Node evacuation/group change iallocator result type. This result
108
-- type consists of actual opcodes (a restricted subset) that are
109
-- transmitted back to Ganeti.
110
data EvacSolution = EvacSolution
111
    { esMoved   :: [String]             -- ^ Instance moved successfully
112
    , esFailed  :: [String]             -- ^ Instance which were not
113
                                        -- relocated
114
    , esOpCodes :: [[[OpCodes.OpCode]]] -- ^ List of lists of jobs
115
    }
116

    
117
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
118
type AllocResult = (FailStats, Node.List, Instance.List,
119
                    [Instance.Instance], [CStats])
120

    
121
-- | A type denoting the valid allocation mode/pairs.
122
--
123
-- For a one-node allocation, this will be a @Left ['Node.Node']@,
124
-- whereas for a two-node allocation, this will be a @Right
125
-- [('Node.Node', 'Node.Node')]@.
126
type AllocNodes = Either [Ndx] [(Ndx, Ndx)]
127

    
128
-- | The empty solution we start with when computing allocations.
129
emptyAllocSolution :: AllocSolution
130
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
131
                                   , asSolutions = [], asLog = [] }
132

    
133
-- | The empty evac solution.
134
emptyEvacSolution :: EvacSolution
135
emptyEvacSolution = EvacSolution { esMoved = []
136
                                 , esFailed = []
137
                                 , esOpCodes = []
138
                                 }
139

    
140
-- | The complete state for the balancing solution.
141
data Table = Table Node.List Instance.List Score [Placement]
142
             deriving (Show, Read)
143

    
144
data CStats = CStats { csFmem :: Integer -- ^ Cluster free mem
145
                     , csFdsk :: Integer -- ^ Cluster free disk
146
                     , csAmem :: Integer -- ^ Cluster allocatable mem
147
                     , csAdsk :: Integer -- ^ Cluster allocatable disk
148
                     , csAcpu :: Integer -- ^ Cluster allocatable cpus
149
                     , csMmem :: Integer -- ^ Max node allocatable mem
150
                     , csMdsk :: Integer -- ^ Max node allocatable disk
151
                     , csMcpu :: Integer -- ^ Max node allocatable cpu
152
                     , csImem :: Integer -- ^ Instance used mem
153
                     , csIdsk :: Integer -- ^ Instance used disk
154
                     , csIcpu :: Integer -- ^ Instance used cpu
155
                     , csTmem :: Double  -- ^ Cluster total mem
156
                     , csTdsk :: Double  -- ^ Cluster total disk
157
                     , csTcpu :: Double  -- ^ Cluster total cpus
158
                     , csVcpu :: Integer -- ^ Cluster virtual cpus (if
159
                                         -- node pCpu has been set,
160
                                         -- otherwise -1)
161
                     , csXmem :: Integer -- ^ Unnacounted for mem
162
                     , csNmem :: Integer -- ^ Node own memory
163
                     , csScore :: Score  -- ^ The cluster score
164
                     , csNinst :: Int    -- ^ The total number of instances
165
                     }
166
            deriving (Show, Read)
167

    
168
-- | Currently used, possibly to allocate, unallocable.
169
type AllocStats = (RSpec, RSpec, RSpec)
170

    
171
-- * Utility functions
172

    
173
-- | Verifies the N+1 status and return the affected nodes.
174
verifyN1 :: [Node.Node] -> [Node.Node]
175
verifyN1 = filter Node.failN1
176

    
177
{-| Computes the pair of bad nodes and instances.
178

    
179
The bad node list is computed via a simple 'verifyN1' check, and the
180
bad instance list is the list of primary and secondary instances of
181
those nodes.
182

    
183
-}
184
computeBadItems :: Node.List -> Instance.List ->
185
                   ([Node.Node], [Instance.Instance])
186
computeBadItems nl il =
187
  let bad_nodes = verifyN1 $ getOnline nl
188
      bad_instances = map (`Container.find` il) .
189
                      sort . nub $
190
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
191
  in
192
    (bad_nodes, bad_instances)
193

    
194
-- | Zero-initializer for the CStats type.
195
emptyCStats :: CStats
196
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
197

    
198
-- | Update stats with data from a new node.
199
updateCStats :: CStats -> Node.Node -> CStats
200
updateCStats cs node =
201
    let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
202
                 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
203
                 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
204
                 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
205
                 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
206
                 csVcpu = x_vcpu,
207
                 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
208
               }
209
            = cs
210
        inc_amem = Node.fMem node - Node.rMem node
211
        inc_amem' = if inc_amem > 0 then inc_amem else 0
212
        inc_adsk = Node.availDisk node
213
        inc_imem = truncate (Node.tMem node) - Node.nMem node
214
                   - Node.xMem node - Node.fMem node
215
        inc_icpu = Node.uCpu node
216
        inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
217
        inc_vcpu = Node.hiCpu node
218
        inc_acpu = Node.availCpu node
219

    
220
    in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
221
          , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
222
          , csAmem = x_amem + fromIntegral inc_amem'
223
          , csAdsk = x_adsk + fromIntegral inc_adsk
224
          , csAcpu = x_acpu + fromIntegral inc_acpu
225
          , csMmem = max x_mmem (fromIntegral inc_amem')
226
          , csMdsk = max x_mdsk (fromIntegral inc_adsk)
227
          , csMcpu = max x_mcpu (fromIntegral inc_acpu)
228
          , csImem = x_imem + fromIntegral inc_imem
229
          , csIdsk = x_idsk + fromIntegral inc_idsk
230
          , csIcpu = x_icpu + fromIntegral inc_icpu
231
          , csTmem = x_tmem + Node.tMem node
232
          , csTdsk = x_tdsk + Node.tDsk node
233
          , csTcpu = x_tcpu + Node.tCpu node
234
          , csVcpu = x_vcpu + fromIntegral inc_vcpu
235
          , csXmem = x_xmem + fromIntegral (Node.xMem node)
236
          , csNmem = x_nmem + fromIntegral (Node.nMem node)
237
          , csNinst = x_ninst + length (Node.pList node)
238
          }
239

    
240
-- | Compute the total free disk and memory in the cluster.
241
totalResources :: Node.List -> CStats
242
totalResources nl =
243
    let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
244
    in cs { csScore = compCV nl }
245

    
246
-- | Compute the delta between two cluster state.
247
--
248
-- This is used when doing allocations, to understand better the
249
-- available cluster resources. The return value is a triple of the
250
-- current used values, the delta that was still allocated, and what
251
-- was left unallocated.
252
computeAllocationDelta :: CStats -> CStats -> AllocStats
253
computeAllocationDelta cini cfin =
254
    let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
255
        CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
256
                csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
257
        rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem)
258
               (fromIntegral i_idsk)
259
        rfin = RSpec (fromIntegral (f_icpu - i_icpu))
260
               (fromIntegral (f_imem - i_imem))
261
               (fromIntegral (f_idsk - i_idsk))
262
        un_cpu = fromIntegral (v_cpu - f_icpu)::Int
263
        runa = RSpec un_cpu (truncate t_mem - fromIntegral f_imem)
264
               (truncate t_dsk - fromIntegral f_idsk)
265
    in (rini, rfin, runa)
266

    
267
-- | The names and weights of the individual elements in the CV list.
268
detailedCVInfo :: [(Double, String)]
269
detailedCVInfo = [ (1,  "free_mem_cv")
270
                 , (1,  "free_disk_cv")
271
                 , (1,  "n1_cnt")
272
                 , (1,  "reserved_mem_cv")
273
                 , (4,  "offline_all_cnt")
274
                 , (16, "offline_pri_cnt")
275
                 , (1,  "vcpu_ratio_cv")
276
                 , (1,  "cpu_load_cv")
277
                 , (1,  "mem_load_cv")
278
                 , (1,  "disk_load_cv")
279
                 , (1,  "net_load_cv")
280
                 , (2,  "pri_tags_score")
281
                 ]
282

    
283
detailedCVWeights :: [Double]
284
detailedCVWeights = map fst detailedCVInfo
285

    
286
-- | Compute the mem and disk covariance.
287
compDetailedCV :: [Node.Node] -> [Double]
288
compDetailedCV all_nodes =
289
    let
290
        (offline, nodes) = partition Node.offline all_nodes
291
        mem_l = map Node.pMem nodes
292
        dsk_l = map Node.pDsk nodes
293
        -- metric: memory covariance
294
        mem_cv = stdDev mem_l
295
        -- metric: disk covariance
296
        dsk_cv = stdDev dsk_l
297
        -- metric: count of instances living on N1 failing nodes
298
        n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
299
                                                   length (Node.pList n)) .
300
                   filter Node.failN1 $ nodes :: Double
301
        res_l = map Node.pRem nodes
302
        -- metric: reserved memory covariance
303
        res_cv = stdDev res_l
304
        -- offline instances metrics
305
        offline_ipri = sum . map (length . Node.pList) $ offline
306
        offline_isec = sum . map (length . Node.sList) $ offline
307
        -- metric: count of instances on offline nodes
308
        off_score = fromIntegral (offline_ipri + offline_isec)::Double
309
        -- metric: count of primary instances on offline nodes (this
310
        -- helps with evacuation/failover of primary instances on
311
        -- 2-node clusters with one node offline)
312
        off_pri_score = fromIntegral offline_ipri::Double
313
        cpu_l = map Node.pCpu nodes
314
        -- metric: covariance of vcpu/pcpu ratio
315
        cpu_cv = stdDev cpu_l
316
        -- metrics: covariance of cpu, memory, disk and network load
317
        (c_load, m_load, d_load, n_load) = unzip4 $
318
            map (\n ->
319
                     let DynUtil c1 m1 d1 n1 = Node.utilLoad n
320
                         DynUtil c2 m2 d2 n2 = Node.utilPool n
321
                     in (c1/c2, m1/m2, d1/d2, n1/n2)
322
                ) nodes
323
        -- metric: conflicting instance count
324
        pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
325
        pri_tags_score = fromIntegral pri_tags_inst::Double
326
    in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
327
       , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
328
       , pri_tags_score ]
329

    
330
-- | Compute the /total/ variance.
331
compCVNodes :: [Node.Node] -> Double
332
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
333

    
334
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
335
compCV :: Node.List -> Double
336
compCV = compCVNodes . Container.elems
337

    
338

    
339
-- | Compute online nodes from a 'Node.List'.
340
getOnline :: Node.List -> [Node.Node]
341
getOnline = filter (not . Node.offline) . Container.elems
342

    
343
-- * Balancing functions
344

    
345
-- | Compute best table. Note that the ordering of the arguments is important.
346
compareTables :: Table -> Table -> Table
347
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
348
    if a_cv > b_cv then b else a
349

    
350
-- | Applies an instance move to a given node list and instance.
351
applyMove :: Node.List -> Instance.Instance
352
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
353
-- Failover (f)
354
applyMove nl inst Failover =
355
    let old_pdx = Instance.pNode inst
356
        old_sdx = Instance.sNode inst
357
        old_p = Container.find old_pdx nl
358
        old_s = Container.find old_sdx nl
359
        int_p = Node.removePri old_p inst
360
        int_s = Node.removeSec old_s inst
361
        force_p = Node.offline old_p
362
        new_nl = do -- Maybe monad
363
          new_p <- Node.addPriEx force_p int_s inst
364
          new_s <- Node.addSec int_p inst old_sdx
365
          let new_inst = Instance.setBoth inst old_sdx old_pdx
366
          return (Container.addTwo old_pdx new_s old_sdx new_p nl,
367
                  new_inst, old_sdx, old_pdx)
368
    in new_nl
369

    
370
-- Replace the primary (f:, r:np, f)
371
applyMove nl inst (ReplacePrimary new_pdx) =
372
    let old_pdx = Instance.pNode inst
373
        old_sdx = Instance.sNode inst
374
        old_p = Container.find old_pdx nl
375
        old_s = Container.find old_sdx nl
376
        tgt_n = Container.find new_pdx nl
377
        int_p = Node.removePri old_p inst
378
        int_s = Node.removeSec old_s inst
379
        force_p = Node.offline old_p
380
        new_nl = do -- Maybe monad
381
          -- check that the current secondary can host the instance
382
          -- during the migration
383
          tmp_s <- Node.addPriEx force_p int_s inst
384
          let tmp_s' = Node.removePri tmp_s inst
385
          new_p <- Node.addPriEx force_p tgt_n inst
386
          new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
387
          let new_inst = Instance.setPri inst new_pdx
388
          return (Container.add new_pdx new_p $
389
                  Container.addTwo old_pdx int_p old_sdx new_s nl,
390
                  new_inst, new_pdx, old_sdx)
391
    in new_nl
392

    
393
-- Replace the secondary (r:ns)
394
applyMove nl inst (ReplaceSecondary new_sdx) =
395
    let old_pdx = Instance.pNode inst
396
        old_sdx = Instance.sNode inst
397
        old_s = Container.find old_sdx nl
398
        tgt_n = Container.find new_sdx nl
399
        int_s = Node.removeSec old_s inst
400
        force_s = Node.offline old_s
401
        new_inst = Instance.setSec inst new_sdx
402
        new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
403
                 \new_s -> return (Container.addTwo new_sdx
404
                                   new_s old_sdx int_s nl,
405
                                   new_inst, old_pdx, new_sdx)
406
    in new_nl
407

    
408
-- Replace the secondary and failover (r:np, f)
409
applyMove nl inst (ReplaceAndFailover new_pdx) =
410
    let old_pdx = Instance.pNode inst
411
        old_sdx = Instance.sNode inst
412
        old_p = Container.find old_pdx nl
413
        old_s = Container.find old_sdx nl
414
        tgt_n = Container.find new_pdx nl
415
        int_p = Node.removePri old_p inst
416
        int_s = Node.removeSec old_s inst
417
        force_s = Node.offline old_s
418
        new_nl = do -- Maybe monad
419
          new_p <- Node.addPri tgt_n inst
420
          new_s <- Node.addSecEx force_s int_p inst new_pdx
421
          let new_inst = Instance.setBoth inst new_pdx old_pdx
422
          return (Container.add new_pdx new_p $
423
                  Container.addTwo old_pdx new_s old_sdx int_s nl,
424
                  new_inst, new_pdx, old_pdx)
425
    in new_nl
426

    
427
-- Failver and replace the secondary (f, r:ns)
428
applyMove nl inst (FailoverAndReplace new_sdx) =
429
    let old_pdx = Instance.pNode inst
430
        old_sdx = Instance.sNode inst
431
        old_p = Container.find old_pdx nl
432
        old_s = Container.find old_sdx nl
433
        tgt_n = Container.find new_sdx nl
434
        int_p = Node.removePri old_p inst
435
        int_s = Node.removeSec old_s inst
436
        force_p = Node.offline old_p
437
        new_nl = do -- Maybe monad
438
          new_p <- Node.addPriEx force_p int_s inst
439
          new_s <- Node.addSecEx force_p tgt_n inst old_sdx
440
          let new_inst = Instance.setBoth inst old_sdx new_sdx
441
          return (Container.add new_sdx new_s $
442
                  Container.addTwo old_sdx new_p old_pdx int_p nl,
443
                  new_inst, old_sdx, new_sdx)
444
    in new_nl
445

    
446
-- | Tries to allocate an instance on one given node.
447
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
448
                 -> OpResult Node.AllocElement
449
allocateOnSingle nl inst new_pdx =
450
    let p = Container.find new_pdx nl
451
        new_inst = Instance.setBoth inst new_pdx Node.noSecondary
452
    in  Node.addPri p inst >>= \new_p -> do
453
      let new_nl = Container.add new_pdx new_p nl
454
          new_score = compCV nl
455
      return (new_nl, new_inst, [new_p], new_score)
456

    
457
-- | Tries to allocate an instance on a given pair of nodes.
458
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
459
               -> OpResult Node.AllocElement
460
allocateOnPair nl inst new_pdx new_sdx =
461
    let tgt_p = Container.find new_pdx nl
462
        tgt_s = Container.find new_sdx nl
463
    in do
464
      new_p <- Node.addPri tgt_p inst
465
      new_s <- Node.addSec tgt_s inst new_pdx
466
      let new_inst = Instance.setBoth inst new_pdx new_sdx
467
          new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
468
      return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
469

    
470
-- | Tries to perform an instance move and returns the best table
471
-- between the original one and the new one.
472
checkSingleStep :: Table -- ^ The original table
473
                -> Instance.Instance -- ^ The instance to move
474
                -> Table -- ^ The current best table
475
                -> IMove -- ^ The move to apply
476
                -> Table -- ^ The final best table
477
checkSingleStep ini_tbl target cur_tbl move =
478
    let
479
        Table ini_nl ini_il _ ini_plc = ini_tbl
480
        tmp_resu = applyMove ini_nl target move
481
    in
482
      case tmp_resu of
483
        OpFail _ -> cur_tbl
484
        OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
485
            let tgt_idx = Instance.idx target
486
                upd_cvar = compCV upd_nl
487
                upd_il = Container.add tgt_idx new_inst ini_il
488
                upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
489
                upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
490
            in
491
              compareTables cur_tbl upd_tbl
492

    
493
-- | Given the status of the current secondary as a valid new node and
494
-- the current candidate target node, generate the possible moves for
495
-- a instance.
496
possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
497
              -> Bool      -- ^ Whether we can change the primary node
498
              -> Ndx       -- ^ Target node candidate
499
              -> [IMove]   -- ^ List of valid result moves
500

    
501
possibleMoves _ False tdx =
502
    [ReplaceSecondary tdx]
503

    
504
possibleMoves True True tdx =
505
    [ReplaceSecondary tdx,
506
     ReplaceAndFailover tdx,
507
     ReplacePrimary tdx,
508
     FailoverAndReplace tdx]
509

    
510
possibleMoves False True tdx =
511
    [ReplaceSecondary tdx,
512
     ReplaceAndFailover tdx]
513

    
514
-- | Compute the best move for a given instance.
515
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
516
                  -> Bool              -- ^ Whether disk moves are allowed
517
                  -> Bool              -- ^ Whether instance moves are allowed
518
                  -> Table             -- ^ Original table
519
                  -> Instance.Instance -- ^ Instance to move
520
                  -> Table             -- ^ Best new table for this instance
521
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
522
    let
523
        opdx = Instance.pNode target
524
        osdx = Instance.sNode target
525
        nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
526
        use_secondary = elem osdx nodes_idx && inst_moves
527
        aft_failover = if use_secondary -- if allowed to failover
528
                       then checkSingleStep ini_tbl target ini_tbl Failover
529
                       else ini_tbl
530
        all_moves = if disk_moves
531
                    then concatMap
532
                         (possibleMoves use_secondary inst_moves) nodes
533
                    else []
534
    in
535
      -- iterate over the possible nodes for this instance
536
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
537

    
538
-- | Compute the best next move.
539
checkMove :: [Ndx]               -- ^ Allowed target node indices
540
          -> Bool                -- ^ Whether disk moves are allowed
541
          -> Bool                -- ^ Whether instance moves are allowed
542
          -> Table               -- ^ The current solution
543
          -> [Instance.Instance] -- ^ List of instances still to move
544
          -> Table               -- ^ The new solution
545
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
546
    let Table _ _ _ ini_plc = ini_tbl
547
        -- we're using rwhnf from the Control.Parallel.Strategies
548
        -- package; we don't need to use rnf as that would force too
549
        -- much evaluation in single-threaded cases, and in
550
        -- multi-threaded case the weak head normal form is enough to
551
        -- spark the evaluation
552
        tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
553
                               inst_moves ini_tbl)
554
                 victims
555
        -- iterate over all instances, computing the best move
556
        best_tbl = foldl' compareTables ini_tbl tables
557
        Table _ _ _ best_plc = best_tbl
558
    in if length best_plc == length ini_plc
559
       then ini_tbl -- no advancement
560
       else best_tbl
561

    
562
-- | Check if we are allowed to go deeper in the balancing.
563
doNextBalance :: Table     -- ^ The starting table
564
              -> Int       -- ^ Remaining length
565
              -> Score     -- ^ Score at which to stop
566
              -> Bool      -- ^ The resulting table and commands
567
doNextBalance ini_tbl max_rounds min_score =
568
    let Table _ _ ini_cv ini_plc = ini_tbl
569
        ini_plc_len = length ini_plc
570
    in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
571

    
572
-- | Run a balance move.
573
tryBalance :: Table       -- ^ The starting table
574
           -> Bool        -- ^ Allow disk moves
575
           -> Bool        -- ^ Allow instance moves
576
           -> Bool        -- ^ Only evacuate moves
577
           -> Score       -- ^ Min gain threshold
578
           -> Score       -- ^ Min gain
579
           -> Maybe Table -- ^ The resulting table and commands
580
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
581
    let Table ini_nl ini_il ini_cv _ = ini_tbl
582
        all_inst = Container.elems ini_il
583
        all_inst' = if evac_mode
584
                    then let bad_nodes = map Node.idx . filter Node.offline $
585
                                         Container.elems ini_nl
586
                         in filter (\e -> Instance.sNode e `elem` bad_nodes ||
587
                                          Instance.pNode e `elem` bad_nodes)
588
                            all_inst
589
                    else all_inst
590
        reloc_inst = filter Instance.movable all_inst'
591
        node_idx = map Node.idx . filter (not . Node.offline) $
592
                   Container.elems ini_nl
593
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
594
        (Table _ _ fin_cv _) = fin_tbl
595
    in
596
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
597
      then Just fin_tbl -- this round made success, return the new table
598
      else Nothing
599

    
600
-- * Allocation functions
601

    
602
-- | Build failure stats out of a list of failures.
603
collapseFailures :: [FailMode] -> FailStats
604
collapseFailures flst =
605
    map (\k -> (k, length $ filter (k ==) flst)) [minBound..maxBound]
606

    
607
-- | Update current Allocation solution and failure stats with new
608
-- elements.
609
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
610
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
611

    
612
concatAllocs as (OpGood ns@(_, _, _, nscore)) =
613
    let -- Choose the old or new solution, based on the cluster score
614
        cntok = asAllocs as
615
        osols = asSolutions as
616
        nsols = case osols of
617
                  [] -> [ns]
618
                  (_, _, _, oscore):[] ->
619
                      if oscore < nscore
620
                      then osols
621
                      else [ns]
622
                  -- FIXME: here we simply concat to lists with more
623
                  -- than one element; we should instead abort, since
624
                  -- this is not a valid usage of this function
625
                  xs -> ns:xs
626
        nsuc = cntok + 1
627
    -- Note: we force evaluation of nsols here in order to keep the
628
    -- memory profile low - we know that we will need nsols for sure
629
    -- in the next cycle, so we force evaluation of nsols, since the
630
    -- foldl' in the caller will only evaluate the tuple, but not the
631
    -- elements of the tuple
632
    in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolutions = nsols }
633

    
634
-- | Sums two allocation solutions (e.g. for two separate node groups).
635
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
636
sumAllocs (AllocSolution af aa as al) (AllocSolution bf ba bs bl) =
637
    AllocSolution (af ++ bf) (aa + ba) (as ++ bs) (al ++ bl)
638

    
639
-- | Given a solution, generates a reasonable description for it.
640
describeSolution :: AllocSolution -> String
641
describeSolution as =
642
  let fcnt = asFailures as
643
      sols = asSolutions as
644
      freasons =
645
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
646
        filter ((> 0) . snd) . collapseFailures $ fcnt
647
  in if null sols
648
     then "No valid allocation solutions, failure reasons: " ++
649
          (if null fcnt
650
           then "unknown reasons"
651
           else freasons)
652
     else let (_, _, nodes, cv) = head sols
653
          in printf ("score: %.8f, successes %d, failures %d (%s)" ++
654
                     " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
655
             (intercalate "/" . map Node.name $ nodes)
656

    
657
-- | Annotates a solution with the appropriate string.
658
annotateSolution :: AllocSolution -> AllocSolution
659
annotateSolution as = as { asLog = describeSolution as : asLog as }
660

    
661
-- | Reverses an evacuation solution.
662
--
663
-- Rationale: we always concat the results to the top of the lists, so
664
-- for proper jobset execution, we should reverse all lists.
665
reverseEvacSolution :: EvacSolution -> EvacSolution
666
reverseEvacSolution (EvacSolution f m o) =
667
    EvacSolution (reverse f) (reverse m) (reverse o)
668

    
669
-- | Generate the valid node allocation singles or pairs for a new instance.
670
genAllocNodes :: Group.List        -- ^ Group list
671
              -> Node.List         -- ^ The node map
672
              -> Int               -- ^ The number of nodes required
673
              -> Bool              -- ^ Whether to drop or not
674
                                   -- unallocable nodes
675
              -> Result AllocNodes -- ^ The (monadic) result
676
genAllocNodes gl nl count drop_unalloc =
677
    let filter_fn = if drop_unalloc
678
                    then filter (Group.isAllocable .
679
                                 flip Container.find gl . Node.group)
680
                    else id
681
        all_nodes = filter_fn $ getOnline nl
682
        all_pairs = liftM2 (,) all_nodes all_nodes
683
        ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y &&
684
                                      Node.group x == Node.group y) all_pairs
685
    in case count of
686
         1 -> Ok (Left (map Node.idx all_nodes))
687
         2 -> Ok (Right (map (\(p, s) -> (Node.idx p, Node.idx s)) ok_pairs))
688
         _ -> Bad "Unsupported number of nodes, only one or two  supported"
689

    
690
-- | Try to allocate an instance on the cluster.
691
tryAlloc :: (Monad m) =>
692
            Node.List         -- ^ The node list
693
         -> Instance.List     -- ^ The instance list
694
         -> Instance.Instance -- ^ The instance to allocate
695
         -> AllocNodes        -- ^ The allocation targets
696
         -> m AllocSolution   -- ^ Possible solution list
697
tryAlloc nl _ inst (Right ok_pairs) =
698
    let sols = foldl' (\cstate (p, s) ->
699
                           concatAllocs cstate $ allocateOnPair nl inst p s
700
                      ) emptyAllocSolution ok_pairs
701

    
702
    in if null ok_pairs -- means we have just one node
703
       then fail "Not enough online nodes"
704
       else return $ annotateSolution sols
705

    
706
tryAlloc nl _ inst (Left all_nodes) =
707
    let sols = foldl' (\cstate ->
708
                           concatAllocs cstate . allocateOnSingle nl inst
709
                      ) emptyAllocSolution all_nodes
710
    in if null all_nodes
711
       then fail "No online nodes"
712
       else return $ annotateSolution sols
713

    
714
-- | Given a group/result, describe it as a nice (list of) messages.
715
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
716
solutionDescription gl (groupId, result) =
717
  case result of
718
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
719
    Bad message -> [printf "Group %s: error %s" gname message]
720
  where grp = Container.find groupId gl
721
        gname = Group.name grp
722
        pol = apolToString (Group.allocPolicy grp)
723

    
724
-- | From a list of possibly bad and possibly empty solutions, filter
725
-- only the groups with a valid result. Note that the result will be
726
-- reversed compared to the original list.
727
filterMGResults :: Group.List
728
                -> [(Gdx, Result AllocSolution)]
729
                -> [(Gdx, AllocSolution)]
730
filterMGResults gl = foldl' fn []
731
    where unallocable = not . Group.isAllocable . flip Container.find gl
732
          fn accu (gdx, rasol) =
733
              case rasol of
734
                Bad _ -> accu
735
                Ok sol | null (asSolutions sol) -> accu
736
                       | unallocable gdx -> accu
737
                       | otherwise -> (gdx, sol):accu
738

    
739
-- | Sort multigroup results based on policy and score.
740
sortMGResults :: Group.List
741
             -> [(Gdx, AllocSolution)]
742
             -> [(Gdx, AllocSolution)]
743
sortMGResults gl sols =
744
    let extractScore (_, _, _, x) = x
745
        solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
746
                               (extractScore . head . asSolutions) sol)
747
    in sortBy (comparing solScore) sols
748

    
749
-- | Try to allocate an instance on a multi-group cluster.
750
tryMGAlloc :: Group.List           -- ^ The group list
751
           -> Node.List            -- ^ The node list
752
           -> Instance.List        -- ^ The instance list
753
           -> Instance.Instance    -- ^ The instance to allocate
754
           -> Int                  -- ^ Required number of nodes
755
           -> Result AllocSolution -- ^ Possible solution list
756
tryMGAlloc mggl mgnl mgil inst cnt =
757
  let groups = splitCluster mgnl mgil
758
      sols = map (\(gid, (nl, il)) ->
759
                   (gid, genAllocNodes mggl nl cnt False >>=
760
                       tryAlloc nl il inst))
761
             groups::[(Gdx, Result AllocSolution)]
762
      all_msgs = concatMap (solutionDescription mggl) sols
763
      goodSols = filterMGResults mggl sols
764
      sortedSols = sortMGResults mggl goodSols
765
  in if null sortedSols
766
     then Bad $ intercalate ", " all_msgs
767
     else let (final_group, final_sol) = head sortedSols
768
              final_name = Group.name $ Container.find final_group mggl
769
              selmsg = "Selected group: " ++  final_name
770
          in Ok $ final_sol { asLog = selmsg:all_msgs }
771

    
772
-- | Try to relocate an instance on the cluster.
773
tryReloc :: (Monad m) =>
774
            Node.List       -- ^ The node list
775
         -> Instance.List   -- ^ The instance list
776
         -> Idx             -- ^ The index of the instance to move
777
         -> Int             -- ^ The number of nodes required
778
         -> [Ndx]           -- ^ Nodes which should not be used
779
         -> m AllocSolution -- ^ Solution list
780
tryReloc nl il xid 1 ex_idx =
781
    let all_nodes = getOnline nl
782
        inst = Container.find xid il
783
        ex_idx' = Instance.pNode inst:ex_idx
784
        valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
785
        valid_idxes = map Node.idx valid_nodes
786
        sols1 = foldl' (\cstate x ->
787
                            let em = do
788
                                  (mnl, i, _, _) <-
789
                                      applyMove nl inst (ReplaceSecondary x)
790
                                  return (mnl, i, [Container.find x mnl],
791
                                          compCV mnl)
792
                            in concatAllocs cstate em
793
                       ) emptyAllocSolution valid_idxes
794
    in return sols1
795

    
796
tryReloc _ _ _ reqn _  = fail $ "Unsupported number of relocation \
797
                                \destinations required (" ++ show reqn ++
798
                                                  "), only one supported"
799

    
800
tryMGReloc :: (Monad m) =>
801
              Group.List      -- ^ The group list
802
           -> Node.List       -- ^ The node list
803
           -> Instance.List   -- ^ The instance list
804
           -> Idx             -- ^ The index of the instance to move
805
           -> Int             -- ^ The number of nodes required
806
           -> [Ndx]           -- ^ Nodes which should not be used
807
           -> m AllocSolution -- ^ Solution list
808
tryMGReloc _ mgnl mgil xid ncount ex_ndx = do
809
  let groups = splitCluster mgnl mgil
810
      -- TODO: we only relocate inside the group for now
811
      inst = Container.find xid mgil
812
  (nl, il) <- case lookup (instancePriGroup mgnl inst) groups of
813
                Nothing -> fail $ "Cannot find group for instance " ++
814
                           Instance.name inst
815
                Just v -> return v
816
  tryReloc nl il xid ncount ex_ndx
817

    
818
-- | Change an instance's secondary node.
819
evacInstance :: (Monad m) =>
820
                [Ndx]                      -- ^ Excluded nodes
821
             -> Instance.List              -- ^ The current instance list
822
             -> (Node.List, AllocSolution) -- ^ The current state
823
             -> Idx                        -- ^ The instance to evacuate
824
             -> m (Node.List, AllocSolution)
825
evacInstance ex_ndx il (nl, old_as) idx = do
826
  -- FIXME: hardcoded one node here
827

    
828
  -- Longer explanation: evacuation is currently hardcoded to DRBD
829
  -- instances (which have one secondary); hence, even if the
830
  -- IAllocator protocol can request N nodes for an instance, and all
831
  -- the message parsing/loading pass this, this implementation only
832
  -- supports one; this situation needs to be revisited if we ever
833
  -- support more than one secondary, or if we change the storage
834
  -- model
835
  new_as <- tryReloc nl il idx 1 ex_ndx
836
  case asSolutions new_as of
837
    -- an individual relocation succeeded, we kind of compose the data
838
    -- from the two solutions
839
    csol@(nl', _, _, _):_ ->
840
        return (nl', new_as { asSolutions = csol:asSolutions old_as })
841
    -- this relocation failed, so we fail the entire evac
842
    _ -> fail $ "Can't evacuate instance " ++
843
         Instance.name (Container.find idx il) ++
844
             ": " ++ describeSolution new_as
845

    
846
-- | Try to evacuate a list of nodes.
847
tryEvac :: (Monad m) =>
848
            Node.List       -- ^ The node list
849
         -> Instance.List   -- ^ The instance list
850
         -> [Idx]           -- ^ Instances to be evacuated
851
         -> [Ndx]           -- ^ Restricted nodes (the ones being evacuated)
852
         -> m AllocSolution -- ^ Solution list
853
tryEvac nl il idxs ex_ndx = do
854
  (_, sol) <- foldM (evacInstance ex_ndx il) (nl, emptyAllocSolution) idxs
855
  return sol
856

    
857
-- | Multi-group evacuation of a list of nodes.
858
tryMGEvac :: (Monad m) =>
859
             Group.List -- ^ The group list
860
          -> Node.List       -- ^ The node list
861
          -> Instance.List   -- ^ The instance list
862
          -> [Ndx]           -- ^ Nodes to be evacuated
863
          -> m AllocSolution -- ^ Solution list
864
tryMGEvac _ nl il ex_ndx =
865
    let ex_nodes = map (`Container.find` nl) ex_ndx
866
        all_insts = nub . concatMap Node.sList $ ex_nodes
867
        all_insts' = associateIdxs all_insts $ splitCluster nl il
868
    in do
869
      results <- mapM (\(_, (gnl, gil, idxs)) -> tryEvac gnl gil idxs ex_ndx)
870
                 all_insts'
871
      let sol = foldl' sumAllocs emptyAllocSolution results
872
      return $ annotateSolution sol
873

    
874
-- | Function which fails if the requested mode is change secondary.
875
--
876
-- This is useful since except DRBD, no other disk template can
877
-- execute change secondary; thus, we can just call this function
878
-- instead of always checking for secondary mode. After the call to
879
-- this function, whatever mode we have is just a primary change.
880
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
881
failOnSecondaryChange ChangeSecondary dt =
882
    fail $ "Instances with disk template '" ++ dtToString dt ++
883
         "' can't execute change secondary"
884
failOnSecondaryChange _ _ = return ()
885

    
886
-- | Run evacuation for a single instance.
887
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
888
                 -> Instance.List     -- ^ Instance list (cluster-wide)
889
                 -> EvacMode          -- ^ The evacuation mode
890
                 -> Instance.Instance -- ^ The instance to be evacuated
891
                 -> [Ndx]             -- ^ The list of available nodes
892
                                      -- for allocation
893
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
894
nodeEvacInstance _ _ mode (Instance.Instance
895
                           {Instance.diskTemplate = dt@DTDiskless}) _ =
896
                  failOnSecondaryChange mode dt >>
897
                  fail "Diskless relocations not implemented yet"
898

    
899
nodeEvacInstance _ _ _ (Instance.Instance
900
                        {Instance.diskTemplate = DTPlain}) _ =
901
                  fail "Instances of type plain cannot be relocated"
902

    
903
nodeEvacInstance _ _ _ (Instance.Instance
904
                        {Instance.diskTemplate = DTFile}) _ =
905
                  fail "Instances of type file cannot be relocated"
906

    
907
nodeEvacInstance _ _ mode  (Instance.Instance
908
                            {Instance.diskTemplate = dt@DTSharedFile}) _ =
909
                  failOnSecondaryChange mode dt >>
910
                  fail "Shared file relocations not implemented yet"
911

    
912
nodeEvacInstance _ _ mode (Instance.Instance
913
                           {Instance.diskTemplate = dt@DTBlock}) _ =
914
                  failOnSecondaryChange mode dt >>
915
                  fail "Block device relocations not implemented yet"
916

    
917
nodeEvacInstance nl il ChangePrimary
918
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8}) _ =
919
  do
920
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
921
    let idx = Instance.idx inst
922
        il' = Container.add idx inst' il
923
        ops = iMoveToJob nl' il' idx Failover
924
    return (nl', il', ops)
925

    
926
nodeEvacInstance _ _ _ (Instance.Instance
927
                        {Instance.diskTemplate = DTDrbd8}) _ =
928
                  fail "DRBD relocations not implemented yet"
929

    
930
-- | Computes the local nodes of a given instance which are available
931
-- for allocation.
932
availableLocalNodes :: Node.List
933
                    -> [(Gdx, [Ndx])]
934
                    -> IntSet.IntSet
935
                    -> Instance.Instance
936
                    -> Result [Ndx]
937
availableLocalNodes nl group_nodes excl_ndx inst = do
938
  let gdx = instancePriGroup nl inst
939
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
940
                 Ok (lookup gdx group_nodes)
941
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
942
  return avail_nodes
943

    
944
-- | Updates the evac solution with the results of an instance
945
-- evacuation.
946
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
947
                   -> Instance.Instance
948
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
949
                   -> (Node.List, Instance.List, EvacSolution)
950
updateEvacSolution (nl, il, es) inst (Bad msg) =
951
    (nl, il, es { esFailed = (Instance.name inst ++ ": " ++ msg):esFailed es})
952
updateEvacSolution (_, _, es) inst (Ok (nl, il, opcodes)) =
953
    (nl, il, es { esMoved = Instance.name inst:esMoved es
954
                , esOpCodes = [opcodes]:esOpCodes es })
955

    
956
-- | Node-evacuation IAllocator mode main function.
957
tryNodeEvac :: Group.List    -- ^ The cluster groups
958
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
959
            -> Instance.List -- ^ Instance list (cluster-wide)
960
            -> EvacMode      -- ^ The evacuation mode
961
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
962
            -> Result EvacSolution
963
tryNodeEvac _ ini_nl ini_il mode idxs =
964
    let evac_ndx = nodesToEvacuate ini_il mode idxs
965
        offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
966
        excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
967
        group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
968
                                             (Container.elems nl))) $
969
                      splitCluster ini_nl ini_il
970
        (_, _, esol) =
971
            foldl' (\state@(nl, il, _) inst ->
972
                        updateEvacSolution state inst $
973
                        availableLocalNodes nl group_ndx excl_ndx inst >>=
974
                        nodeEvacInstance nl il mode inst
975
                   )
976
            (ini_nl, ini_il, emptyEvacSolution)
977
            (map (`Container.find` ini_il) idxs)
978
    in return $ reverseEvacSolution esol
979

    
980
-- | Recursively place instances on the cluster until we're out of space.
981
iterateAlloc :: Node.List
982
             -> Instance.List
983
             -> Instance.Instance
984
             -> AllocNodes
985
             -> [Instance.Instance]
986
             -> [CStats]
987
             -> Result AllocResult
988
iterateAlloc nl il newinst allocnodes ixes cstats =
989
      let depth = length ixes
990
          newname = printf "new-%d" depth::String
991
          newidx = length (Container.elems il) + depth
992
          newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
993
      in case tryAlloc nl il newi2 allocnodes of
994
           Bad s -> Bad s
995
           Ok (AllocSolution { asFailures = errs, asSolutions = sols3 }) ->
996
               case sols3 of
997
                 [] -> Ok (collapseFailures errs, nl, il, ixes, cstats)
998
                 (xnl, xi, _, _):[] ->
999
                     iterateAlloc xnl (Container.add newidx xi il)
1000
                                  newinst allocnodes (xi:ixes)
1001
                                  (totalResources xnl:cstats)
1002
                 _ -> Bad "Internal error: multiple solutions for single\
1003
                          \ allocation"
1004

    
1005
-- | The core of the tiered allocation mode.
1006
tieredAlloc :: Node.List
1007
            -> Instance.List
1008
            -> Instance.Instance
1009
            -> AllocNodes
1010
            -> [Instance.Instance]
1011
            -> [CStats]
1012
            -> Result AllocResult
1013
tieredAlloc nl il newinst allocnodes ixes cstats =
1014
    case iterateAlloc nl il newinst allocnodes ixes cstats of
1015
      Bad s -> Bad s
1016
      Ok (errs, nl', il', ixes', cstats') ->
1017
          case Instance.shrinkByType newinst . fst . last $
1018
               sortBy (comparing snd) errs of
1019
            Bad _ -> Ok (errs, nl', il', ixes', cstats')
1020
            Ok newinst' ->
1021
                tieredAlloc nl' il' newinst' allocnodes ixes' cstats'
1022

    
1023
-- | Compute the tiered spec string description from a list of
1024
-- allocated instances.
1025
tieredSpecMap :: [Instance.Instance]
1026
              -> [String]
1027
tieredSpecMap trl_ixes =
1028
    let fin_trl_ixes = reverse trl_ixes
1029
        ix_byspec = groupBy ((==) `on` Instance.specOf) fin_trl_ixes
1030
        spec_map = map (\ixs -> (Instance.specOf $ head ixs, length ixs))
1031
                   ix_byspec
1032
    in  map (\(spec, cnt) -> printf "%d,%d,%d=%d" (rspecMem spec)
1033
                             (rspecDsk spec) (rspecCpu spec) cnt) spec_map
1034

    
1035
-- * Formatting functions
1036

    
1037
-- | Given the original and final nodes, computes the relocation description.
1038
computeMoves :: Instance.Instance -- ^ The instance to be moved
1039
             -> String -- ^ The instance name
1040
             -> IMove  -- ^ The move being performed
1041
             -> String -- ^ New primary
1042
             -> String -- ^ New secondary
1043
             -> (String, [String])
1044
                -- ^ Tuple of moves and commands list; moves is containing
1045
                -- either @/f/@ for failover or @/r:name/@ for replace
1046
                -- secondary, while the command list holds gnt-instance
1047
                -- commands (without that prefix), e.g \"@failover instance1@\"
1048
computeMoves i inam mv c d =
1049
    case mv of
1050
      Failover -> ("f", [mig])
1051
      FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1052
      ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1053
      ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1054
      ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1055
    where morf = if Instance.running i then "migrate" else "failover"
1056
          mig = printf "%s -f %s" morf inam::String
1057
          rep n = printf "replace-disks -n %s %s" n inam
1058

    
1059
-- | Converts a placement to string format.
1060
printSolutionLine :: Node.List     -- ^ The node list
1061
                  -> Instance.List -- ^ The instance list
1062
                  -> Int           -- ^ Maximum node name length
1063
                  -> Int           -- ^ Maximum instance name length
1064
                  -> Placement     -- ^ The current placement
1065
                  -> Int           -- ^ The index of the placement in
1066
                                   -- the solution
1067
                  -> (String, [String])
1068
printSolutionLine nl il nmlen imlen plc pos =
1069
    let
1070
        pmlen = (2*nmlen + 1)
1071
        (i, p, s, mv, c) = plc
1072
        inst = Container.find i il
1073
        inam = Instance.alias inst
1074
        npri = Node.alias $ Container.find p nl
1075
        nsec = Node.alias $ Container.find s nl
1076
        opri = Node.alias $ Container.find (Instance.pNode inst) nl
1077
        osec = Node.alias $ Container.find (Instance.sNode inst) nl
1078
        (moves, cmds) =  computeMoves inst inam mv npri nsec
1079
        ostr = printf "%s:%s" opri osec::String
1080
        nstr = printf "%s:%s" npri nsec::String
1081
    in
1082
      (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
1083
       pos imlen inam pmlen ostr
1084
       pmlen nstr c moves,
1085
       cmds)
1086

    
1087
-- | Return the instance and involved nodes in an instance move.
1088
involvedNodes :: Instance.List -> Placement -> [Ndx]
1089
involvedNodes il plc =
1090
    let (i, np, ns, _, _) = plc
1091
        inst = Container.find i il
1092
        op = Instance.pNode inst
1093
        os = Instance.sNode inst
1094
    in nub [np, ns, op, os]
1095

    
1096
-- | Inner function for splitJobs, that either appends the next job to
1097
-- the current jobset, or starts a new jobset.
1098
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1099
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1100
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1101
    | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1102
    | otherwise = ([n]:cjs, ndx)
1103

    
1104
-- | Break a list of moves into independent groups. Note that this
1105
-- will reverse the order of jobs.
1106
splitJobs :: [MoveJob] -> [JobSet]
1107
splitJobs = fst . foldl mergeJobs ([], [])
1108

    
1109
-- | Given a list of commands, prefix them with @gnt-instance@ and
1110
-- also beautify the display a little.
1111
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1112
formatJob jsn jsl (sn, (_, _, _, cmds)) =
1113
    let out =
1114
            printf "  echo job %d/%d" jsn sn:
1115
            printf "  check":
1116
            map ("  gnt-instance " ++) cmds
1117
    in if sn == 1
1118
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1119
       else out
1120

    
1121
-- | Given a list of commands, prefix them with @gnt-instance@ and
1122
-- also beautify the display a little.
1123
formatCmds :: [JobSet] -> String
1124
formatCmds =
1125
    unlines .
1126
    concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1127
                             (zip [1..] js)) .
1128
    zip [1..]
1129

    
1130
-- | Print the node list.
1131
printNodes :: Node.List -> [String] -> String
1132
printNodes nl fs =
1133
    let fields = case fs of
1134
          [] -> Node.defaultFields
1135
          "+":rest -> Node.defaultFields ++ rest
1136
          _ -> fs
1137
        snl = sortBy (comparing Node.idx) (Container.elems nl)
1138
        (header, isnum) = unzip $ map Node.showHeader fields
1139
    in unlines . map ((:) ' ' .  intercalate " ") $
1140
       formatTable (header:map (Node.list fields) snl) isnum
1141

    
1142
-- | Print the instance list.
1143
printInsts :: Node.List -> Instance.List -> String
1144
printInsts nl il =
1145
    let sil = sortBy (comparing Instance.idx) (Container.elems il)
1146
        helper inst = [ if Instance.running inst then "R" else " "
1147
                      , Instance.name inst
1148
                      , Container.nameOf nl (Instance.pNode inst)
1149
                      , let sdx = Instance.sNode inst
1150
                        in if sdx == Node.noSecondary
1151
                           then  ""
1152
                           else Container.nameOf nl sdx
1153
                      , if Instance.autoBalance inst then "Y" else "N"
1154
                      , printf "%3d" $ Instance.vcpus inst
1155
                      , printf "%5d" $ Instance.mem inst
1156
                      , printf "%5d" $ Instance.dsk inst `div` 1024
1157
                      , printf "%5.3f" lC
1158
                      , printf "%5.3f" lM
1159
                      , printf "%5.3f" lD
1160
                      , printf "%5.3f" lN
1161
                      ]
1162
            where DynUtil lC lM lD lN = Instance.util inst
1163
        header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1164
                 , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1165
        isnum = False:False:False:False:False:repeat True
1166
    in unlines . map ((:) ' ' . intercalate " ") $
1167
       formatTable (header:map helper sil) isnum
1168

    
1169
-- | Shows statistics for a given node list.
1170
printStats :: Node.List -> String
1171
printStats nl =
1172
    let dcvs = compDetailedCV $ Container.elems nl
1173
        (weights, names) = unzip detailedCVInfo
1174
        hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1175
        formatted = map (\(w, header, val) ->
1176
                             printf "%s=%.8f(x%.2f)" header val w::String) hd
1177
    in intercalate ", " formatted
1178

    
1179
-- | Convert a placement into a list of OpCodes (basically a job).
1180
iMoveToJob :: Node.List -> Instance.List
1181
          -> Idx -> IMove -> [OpCodes.OpCode]
1182
iMoveToJob nl il idx move =
1183
    let inst = Container.find idx il
1184
        iname = Instance.name inst
1185
        lookNode  = Just . Container.nameOf nl
1186
        opF = OpCodes.OpInstanceMigrate iname True False True
1187
        opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1188
                OpCodes.ReplaceNewSecondary [] Nothing
1189
    in case move of
1190
         Failover -> [ opF ]
1191
         ReplacePrimary np -> [ opF, opR np, opF ]
1192
         ReplaceSecondary ns -> [ opR ns ]
1193
         ReplaceAndFailover np -> [ opR np, opF ]
1194
         FailoverAndReplace ns -> [ opF, opR ns ]
1195

    
1196
-- * Node group functions
1197

    
1198
-- | Computes the group of an instance.
1199
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1200
instanceGroup nl i =
1201
  let sidx = Instance.sNode i
1202
      pnode = Container.find (Instance.pNode i) nl
1203
      snode = if sidx == Node.noSecondary
1204
              then pnode
1205
              else Container.find sidx nl
1206
      pgroup = Node.group pnode
1207
      sgroup = Node.group snode
1208
  in if pgroup /= sgroup
1209
     then fail ("Instance placed accross two node groups, primary " ++
1210
                show pgroup ++ ", secondary " ++ show sgroup)
1211
     else return pgroup
1212

    
1213
-- | Computes the group of an instance per the primary node.
1214
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1215
instancePriGroup nl i =
1216
  let pnode = Container.find (Instance.pNode i) nl
1217
  in  Node.group pnode
1218

    
1219
-- | Compute the list of badly allocated instances (split across node
1220
-- groups).
1221
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1222
findSplitInstances nl =
1223
  filter (not . isOk . instanceGroup nl) . Container.elems
1224

    
1225
-- | Splits a cluster into the component node groups.
1226
splitCluster :: Node.List -> Instance.List ->
1227
                [(Gdx, (Node.List, Instance.List))]
1228
splitCluster nl il =
1229
  let ngroups = Node.computeGroups (Container.elems nl)
1230
  in map (\(guuid, nodes) ->
1231
           let nidxs = map Node.idx nodes
1232
               nodes' = zip nidxs nodes
1233
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1234
           in (guuid, (Container.fromList nodes', instances))) ngroups
1235

    
1236
-- | Split a global instance index map into per-group, and associate
1237
-- it with the group/node/instance lists.
1238
associateIdxs :: [Idx] -- ^ Instance indices to be split/associated
1239
              -> [(Gdx, (Node.List, Instance.List))]        -- ^ Input groups
1240
              -> [(Gdx, (Node.List, Instance.List, [Idx]))] -- ^ Result
1241
associateIdxs idxs =
1242
    map (\(gdx, (nl, il)) ->
1243
             (gdx, (nl, il, filter (`Container.member` il) idxs)))
1244

    
1245
-- | Compute the list of nodes that are to be evacuated, given a list
1246
-- of instances and an evacuation mode.
1247
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1248
                -> EvacMode      -- ^ The evacuation mode we're using
1249
                -> [Idx]         -- ^ List of instance indices being evacuated
1250
                -> IntSet.IntSet -- ^ Set of node indices
1251
nodesToEvacuate il mode =
1252
    IntSet.delete Node.noSecondary .
1253
    foldl' (\ns idx ->
1254
                let i = Container.find idx il
1255
                    pdx = Instance.pNode i
1256
                    sdx = Instance.sNode i
1257
                    dt = Instance.diskTemplate i
1258
                    withSecondary = case dt of
1259
                                      DTDrbd8 -> IntSet.insert sdx ns
1260
                                      _ -> ns
1261
                in case mode of
1262
                     ChangePrimary   -> IntSet.insert pdx ns
1263
                     ChangeSecondary -> withSecondary
1264
                     ChangeAll       -> IntSet.insert pdx withSecondary
1265
           ) IntSet.empty