htools/Ganeti/HTools/Cluster.hs @ d52d41de

{-| Implementation of cluster-wide logic.

This module holds all pure cluster-logic; I\/O related functionality
goes into the /Main/ module for the individual binaries.

-}

{-

Copyright (C) 2009, 2010, 2011 Google Inc.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.

-}

module Ganeti.HTools.Cluster
    (
     -- * Types
      AllocSolution(..)
    , EvacSolution(..)
    , Table(..)
    , CStats(..)
    , AllocStats
    -- * Generic functions
    , totalResources
    , computeAllocationDelta
    -- * First phase functions
    , computeBadItems
    -- * Second phase functions
    , printSolutionLine
    , formatCmds
    , involvedNodes
    , splitJobs
    -- * Display functions
    , printNodes
    , printInsts
    -- * Balancing functions
    , checkMove
    , doNextBalance
    , tryBalance
    , compCV
    , compCVNodes
    , compDetailedCV
    , printStats
    , iMoveToJob
    -- * IAllocator functions
    , genAllocNodes
    , tryAlloc
    , tryMGAlloc
    , tryReloc
    , tryMGReloc
    , tryEvac
    , tryMGEvac
    , tryNodeEvac
    , collapseFailures
    -- * Allocation functions
    , iterateAlloc
    , tieredAlloc
    , tieredSpecMap
    -- * Node group functions
    , instanceGroup
    , findSplitInstances
    , splitCluster
    ) where

import Data.Function (on)
import qualified Data.IntSet as IntSet
import Data.List
import Data.Maybe (fromJust)
import Data.Ord (comparing)
import Text.Printf (printf)
import Control.Monad
import Control.Parallel.Strategies

import qualified Ganeti.HTools.Container as Container
import qualified Ganeti.HTools.Instance as Instance
import qualified Ganeti.HTools.Node as Node
import qualified Ganeti.HTools.Group as Group
import Ganeti.HTools.Types
import Ganeti.HTools.Utils
import qualified Ganeti.OpCodes as OpCodes

-- * Types

-- | Allocation\/relocation solution.
data AllocSolution = AllocSolution
  { asFailures  :: [FailMode]          -- ^ Failure counts
  , asAllocs    :: Int                 -- ^ Good allocation count
  , asSolutions :: [Node.AllocElement] -- ^ The actual result, length
                                       -- of the list depends on the
                                       -- allocation/relocation mode
  , asLog       :: [String]            -- ^ A list of informational messages
  }

-- | Node evacuation/group change iallocator result type. This result
-- type consists of actual opcodes (a restricted subset) that are
-- transmitted back to Ganeti.
data EvacSolution = EvacSolution
    { esMoved   :: [String]             -- ^ Instances moved successfully
    , esFailed  :: [String]             -- ^ Instances which were not
                                        -- relocated
    , esOpCodes :: [[[OpCodes.OpCode]]] -- ^ List of lists of jobs
    }

-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
type AllocResult = (FailStats, Node.List, Instance.List,
                    [Instance.Instance], [CStats])

-- | A type denoting the valid allocation mode/pairs.
--
-- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
-- for a two-node allocation, this will be a @Right [('Ndx', 'Ndx')]@.
type AllocNodes = Either [Ndx] [(Ndx, Ndx)]

-- | The empty solution we start with when computing allocations.
emptyAllocSolution :: AllocSolution
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
                                   , asSolutions = [], asLog = [] }

-- | The empty evac solution.
emptyEvacSolution :: EvacSolution
emptyEvacSolution = EvacSolution { esMoved = []
                                 , esFailed = []
                                 , esOpCodes = []
                                 }

-- | The complete state for the balancing solution.
data Table = Table Node.List Instance.List Score [Placement]
             deriving (Show, Read)

-- | Cluster-wide statistics, aggregated over all nodes.
data CStats = CStats { csFmem :: Integer -- ^ Cluster free mem
                     , csFdsk :: Integer -- ^ Cluster free disk
                     , csAmem :: Integer -- ^ Cluster allocatable mem
                     , csAdsk :: Integer -- ^ Cluster allocatable disk
                     , csAcpu :: Integer -- ^ Cluster allocatable cpus
                     , csMmem :: Integer -- ^ Max node allocatable mem
                     , csMdsk :: Integer -- ^ Max node allocatable disk
                     , csMcpu :: Integer -- ^ Max node allocatable cpu
                     , csImem :: Integer -- ^ Instance used mem
                     , csIdsk :: Integer -- ^ Instance used disk
                     , csIcpu :: Integer -- ^ Instance used cpu
                     , csTmem :: Double  -- ^ Cluster total mem
                     , csTdsk :: Double  -- ^ Cluster total disk
                     , csTcpu :: Double  -- ^ Cluster total cpus
                     , csVcpu :: Integer -- ^ Cluster virtual cpus (if
                                         -- node pCpu has been set,
                                         -- otherwise -1)
                     , csXmem :: Integer -- ^ Unaccounted for mem
                     , csNmem :: Integer -- ^ Node own memory
                     , csScore :: Score  -- ^ The cluster score
                     , csNinst :: Int    -- ^ The total number of instances
                     }
            deriving (Show, Read)

-- | Currently used resources, resources still available for allocation,
-- and unallocable resources.
type AllocStats = (RSpec, RSpec, RSpec)

-- * Utility functions

-- | Verifies the N+1 status and returns the affected nodes.
verifyN1 :: [Node.Node] -> [Node.Node]
verifyN1 = filter Node.failN1

{-| Computes the pair of bad nodes and instances.

The bad node list is computed via a simple 'verifyN1' check, and the
bad instance list is the list of primary and secondary instances of
those nodes.

-}
computeBadItems :: Node.List -> Instance.List ->
                   ([Node.Node], [Instance.Instance])
computeBadItems nl il =
  let bad_nodes = verifyN1 $ getOnline nl
      bad_instances = map (`Container.find` il) .
                      sort . nub $
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
  in
    (bad_nodes, bad_instances)

-- | Zero-initializer for the CStats type.
emptyCStats :: CStats
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

-- | Update stats with data from a new node.
updateCStats :: CStats -> Node.Node -> CStats
updateCStats cs node =
    let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
                 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
                 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
                 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
                 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
                 csVcpu = x_vcpu,
                 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
               }
            = cs
        inc_amem = Node.fMem node - Node.rMem node
        inc_amem' = if inc_amem > 0 then inc_amem else 0
        inc_adsk = Node.availDisk node
        inc_imem = truncate (Node.tMem node) - Node.nMem node
                   - Node.xMem node - Node.fMem node
        inc_icpu = Node.uCpu node
        inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
        inc_vcpu = Node.hiCpu node
        inc_acpu = Node.availCpu node

    in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
          , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
          , csAmem = x_amem + fromIntegral inc_amem'
          , csAdsk = x_adsk + fromIntegral inc_adsk
          , csAcpu = x_acpu + fromIntegral inc_acpu
          , csMmem = max x_mmem (fromIntegral inc_amem')
          , csMdsk = max x_mdsk (fromIntegral inc_adsk)
          , csMcpu = max x_mcpu (fromIntegral inc_acpu)
          , csImem = x_imem + fromIntegral inc_imem
          , csIdsk = x_idsk + fromIntegral inc_idsk
          , csIcpu = x_icpu + fromIntegral inc_icpu
          , csTmem = x_tmem + Node.tMem node
          , csTdsk = x_tdsk + Node.tDsk node
          , csTcpu = x_tcpu + Node.tCpu node
          , csVcpu = x_vcpu + fromIntegral inc_vcpu
          , csXmem = x_xmem + fromIntegral (Node.xMem node)
          , csNmem = x_nmem + fromIntegral (Node.nMem node)
          , csNinst = x_ninst + length (Node.pList node)
          }

-- | Compute the total free disk and memory in the cluster.
totalResources :: Node.List -> CStats
totalResources nl =
    let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
    in cs { csScore = compCV nl }

-- | Compute the delta between two cluster states.
--
-- This is used when doing allocations, to better understand the
-- available cluster resources. The return value is a triple of the
-- currently used values, the delta that was allocated, and what was
-- left unallocated.
computeAllocationDelta :: CStats -> CStats -> AllocStats
computeAllocationDelta cini cfin =
    let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
        CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
                csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
        rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem)
               (fromIntegral i_idsk)
        rfin = RSpec (fromIntegral (f_icpu - i_icpu))
               (fromIntegral (f_imem - i_imem))
               (fromIntegral (f_idsk - i_idsk))
        un_cpu = fromIntegral (v_cpu - f_icpu)::Int
        runa = RSpec un_cpu (truncate t_mem - fromIntegral f_imem)
               (truncate t_dsk - fromIntegral f_idsk)
    in (rini, rfin, runa)

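-- A worked example with invented numbers (not from the original source):
-- if instances initially used 4 CPUs, 8192 memory units and 100 disk
-- units, and after the allocation run they use 10 CPUs, 20480 memory and
-- 250 disk, on a cluster with 65536 total memory, 1000 total disk and 32
-- virtual CPUs, the resulting triple is
--
-- > ( RSpec 4  8192  100   -- rini: already used before the run
-- > , RSpec 6  12288 150   -- rfin: the delta that got allocated
-- > , RSpec 22 45056 750 ) -- runa: what was left unallocated
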
-- | The names and weights of the individual elements in the CV list.
detailedCVInfo :: [(Double, String)]
detailedCVInfo = [ (1,  "free_mem_cv")
                 , (1,  "free_disk_cv")
                 , (1,  "n1_cnt")
                 , (1,  "reserved_mem_cv")
                 , (4,  "offline_all_cnt")
                 , (16, "offline_pri_cnt")
                 , (1,  "vcpu_ratio_cv")
                 , (1,  "cpu_load_cv")
                 , (1,  "mem_load_cv")
                 , (1,  "disk_load_cv")
                 , (1,  "net_load_cv")
                 , (2,  "pri_tags_score")
                 ]

-- | The weights of the individual elements in the CV list.
detailedCVWeights :: [Double]
detailedCVWeights = map fst detailedCVInfo

-- | Compute the individual cluster metrics (the components of the
-- cluster score).
compDetailedCV :: [Node.Node] -> [Double]
compDetailedCV all_nodes =
    let
        (offline, nodes) = partition Node.offline all_nodes
        mem_l = map Node.pMem nodes
        dsk_l = map Node.pDsk nodes
        -- metric: memory covariance
        mem_cv = stdDev mem_l
        -- metric: disk covariance
        dsk_cv = stdDev dsk_l
        -- metric: count of instances living on N1 failing nodes
        n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
                                                   length (Node.pList n)) .
                   filter Node.failN1 $ nodes :: Double
        res_l = map Node.pRem nodes
        -- metric: reserved memory covariance
        res_cv = stdDev res_l
        -- offline instances metrics
        offline_ipri = sum . map (length . Node.pList) $ offline
        offline_isec = sum . map (length . Node.sList) $ offline
        -- metric: count of instances on offline nodes
        off_score = fromIntegral (offline_ipri + offline_isec)::Double
        -- metric: count of primary instances on offline nodes (this
        -- helps with evacuation/failover of primary instances on
        -- 2-node clusters with one node offline)
        off_pri_score = fromIntegral offline_ipri::Double
        cpu_l = map Node.pCpu nodes
        -- metric: covariance of vcpu/pcpu ratio
        cpu_cv = stdDev cpu_l
        -- metrics: covariance of cpu, memory, disk and network load
        (c_load, m_load, d_load, n_load) = unzip4 $
            map (\n ->
                     let DynUtil c1 m1 d1 n1 = Node.utilLoad n
                         DynUtil c2 m2 d2 n2 = Node.utilPool n
                     in (c1/c2, m1/m2, d1/d2, n1/n2)
                ) nodes
        -- metric: conflicting instance count
        pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
        pri_tags_score = fromIntegral pri_tags_inst::Double
    in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
       , stdDev c_load, stdDev m_load, stdDev d_load, stdDev n_load
       , pri_tags_score ]

-- | Compute the /total/ variance.
compCVNodes :: [Node.Node] -> Double
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV

-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
compCV :: Node.List -> Double
compCV = compCVNodes . Container.elems

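-- A worked example of the scoring, with invented metric values: suppose
-- 'compDetailedCV' returns the twelve metrics
-- [0.02, 0.10, 0, 0.01, 0, 0, 0.05, 0.1, 0.1, 0, 0, 0]. With the weights
-- [1, 1, 1, 1, 4, 16, 1, 1, 1, 1, 1, 2] from 'detailedCVWeights', the
-- cluster score computed by 'compCVNodes'/'compCV' is the weighted sum
--
-- > 1*0.02 + 1*0.10 + 1*0 + 1*0.01 + 4*0 + 16*0
-- >   + 1*0.05 + 1*0.1 + 1*0.1 + 1*0 + 1*0 + 2*0 == 0.38
--
-- Offline-instance counts and N+1 failures carry the largest weights, so
-- they dominate the score when present.
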
-- | Compute online nodes from a 'Node.List'.
getOnline :: Node.List -> [Node.Node]
getOnline = filter (not . Node.offline) . Container.elems

-- * Balancing functions

-- | Compute best table. Note that the ordering of the arguments is important.
compareTables :: Table -> Table -> Table
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
    if a_cv > b_cv then b else a

-- | Applies an instance move to a given node list and instance.
applyMove :: Node.List -> Instance.Instance
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
-- Failover (f)
applyMove nl inst Failover =
    let old_pdx = Instance.pNode inst
        old_sdx = Instance.sNode inst
        old_p = Container.find old_pdx nl
        old_s = Container.find old_sdx nl
        int_p = Node.removePri old_p inst
        int_s = Node.removeSec old_s inst
        force_p = Node.offline old_p
        new_nl = do -- Maybe monad
          new_p <- Node.addPriEx force_p int_s inst
          new_s <- Node.addSec int_p inst old_sdx
          let new_inst = Instance.setBoth inst old_sdx old_pdx
          return (Container.addTwo old_pdx new_s old_sdx new_p nl,
                  new_inst, old_sdx, old_pdx)
    in new_nl

-- Replace the primary (f, r:np, f)
applyMove nl inst (ReplacePrimary new_pdx) =
    let old_pdx = Instance.pNode inst
        old_sdx = Instance.sNode inst
        old_p = Container.find old_pdx nl
        old_s = Container.find old_sdx nl
        tgt_n = Container.find new_pdx nl
        int_p = Node.removePri old_p inst
        int_s = Node.removeSec old_s inst
        force_p = Node.offline old_p
        new_nl = do -- Maybe monad
          -- check that the current secondary can host the instance
          -- during the migration
          tmp_s <- Node.addPriEx force_p int_s inst
          let tmp_s' = Node.removePri tmp_s inst
          new_p <- Node.addPriEx force_p tgt_n inst
          new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
          let new_inst = Instance.setPri inst new_pdx
          return (Container.add new_pdx new_p $
                  Container.addTwo old_pdx int_p old_sdx new_s nl,
                  new_inst, new_pdx, old_sdx)
    in new_nl

-- Replace the secondary (r:ns)
applyMove nl inst (ReplaceSecondary new_sdx) =
    let old_pdx = Instance.pNode inst
        old_sdx = Instance.sNode inst
        old_s = Container.find old_sdx nl
        tgt_n = Container.find new_sdx nl
        int_s = Node.removeSec old_s inst
        force_s = Node.offline old_s
        new_inst = Instance.setSec inst new_sdx
        new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
                 \new_s -> return (Container.addTwo new_sdx
                                   new_s old_sdx int_s nl,
                                   new_inst, old_pdx, new_sdx)
    in new_nl

-- Replace the secondary and failover (r:np, f)
applyMove nl inst (ReplaceAndFailover new_pdx) =
    let old_pdx = Instance.pNode inst
        old_sdx = Instance.sNode inst
        old_p = Container.find old_pdx nl
        old_s = Container.find old_sdx nl
        tgt_n = Container.find new_pdx nl
        int_p = Node.removePri old_p inst
        int_s = Node.removeSec old_s inst
        force_s = Node.offline old_s
        new_nl = do -- Maybe monad
          new_p <- Node.addPri tgt_n inst
          new_s <- Node.addSecEx force_s int_p inst new_pdx
          let new_inst = Instance.setBoth inst new_pdx old_pdx
          return (Container.add new_pdx new_p $
                  Container.addTwo old_pdx new_s old_sdx int_s nl,
                  new_inst, new_pdx, old_pdx)
    in new_nl

-- Failover and replace the secondary (f, r:ns)
applyMove nl inst (FailoverAndReplace new_sdx) =
    let old_pdx = Instance.pNode inst
        old_sdx = Instance.sNode inst
        old_p = Container.find old_pdx nl
        old_s = Container.find old_sdx nl
        tgt_n = Container.find new_sdx nl
        int_p = Node.removePri old_p inst
        int_s = Node.removeSec old_s inst
        force_p = Node.offline old_p
        new_nl = do -- Maybe monad
          new_p <- Node.addPriEx force_p int_s inst
          new_s <- Node.addSecEx force_p tgt_n inst old_sdx
          let new_inst = Instance.setBoth inst old_sdx new_sdx
          return (Container.add new_sdx new_s $
                  Container.addTwo old_sdx new_p old_pdx int_p nl,
                  new_inst, old_sdx, new_sdx)
    in new_nl

-- | Tries to allocate an instance on one given node.
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
                 -> OpResult Node.AllocElement
allocateOnSingle nl inst new_pdx =
    let p = Container.find new_pdx nl
        new_inst = Instance.setBoth inst new_pdx Node.noSecondary
    in  Node.addPri p inst >>= \new_p -> do
      let new_nl = Container.add new_pdx new_p nl
          new_score = compCV nl
      return (new_nl, new_inst, [new_p], new_score)

-- | Tries to allocate an instance on a given pair of nodes.
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
               -> OpResult Node.AllocElement
allocateOnPair nl inst new_pdx new_sdx =
    let tgt_p = Container.find new_pdx nl
        tgt_s = Container.find new_sdx nl
    in do
      new_p <- Node.addPri tgt_p inst
      new_s <- Node.addSec tgt_s inst new_pdx
      let new_inst = Instance.setBoth inst new_pdx new_sdx
          new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
      return (new_nl, new_inst, [new_p, new_s], compCV new_nl)

-- | Tries to perform an instance move and returns the best table
-- between the original one and the new one.
checkSingleStep :: Table -- ^ The original table
                -> Instance.Instance -- ^ The instance to move
                -> Table -- ^ The current best table
                -> IMove -- ^ The move to apply
                -> Table -- ^ The final best table
checkSingleStep ini_tbl target cur_tbl move =
    let
        Table ini_nl ini_il _ ini_plc = ini_tbl
        tmp_resu = applyMove ini_nl target move
    in
      case tmp_resu of
        OpFail _ -> cur_tbl
        OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
            let tgt_idx = Instance.idx target
                upd_cvar = compCV upd_nl
                upd_il = Container.add tgt_idx new_inst ini_il
                upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
                upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
            in
              compareTables cur_tbl upd_tbl

-- | Given the status of the current secondary as a valid new node and
-- the current candidate target node, generate the possible moves for
-- an instance.
possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
              -> Bool      -- ^ Whether we can change the primary node
              -> Ndx       -- ^ Target node candidate
              -> [IMove]   -- ^ List of valid result moves

possibleMoves _ False tdx =
    [ReplaceSecondary tdx]

possibleMoves True True tdx =
    [ReplaceSecondary tdx,
     ReplaceAndFailover tdx,
     ReplacePrimary tdx,
     FailoverAndReplace tdx]

possibleMoves False True tdx =
    [ReplaceSecondary tdx,
     ReplaceAndFailover tdx]

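-- An illustrative enumeration (not part of the original module): for a
-- candidate target node with index 5,
--
-- > possibleMoves True  True  5 == [ ReplaceSecondary 5
-- >                                 , ReplaceAndFailover 5
-- >                                 , ReplacePrimary 5
-- >                                 , FailoverAndReplace 5 ]
-- > possibleMoves False True  5 == [ ReplaceSecondary 5
-- >                                 , ReplaceAndFailover 5 ]
-- > possibleMoves False False 5 == [ ReplaceSecondary 5 ]
--
-- i.e. the more permissive the settings (secondary usable as a new
-- primary, instance moves allowed), the more moves are tried per node.
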
-- | Compute the best move for a given instance.
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
                  -> Bool              -- ^ Whether disk moves are allowed
                  -> Bool              -- ^ Whether instance moves are allowed
                  -> Table             -- ^ Original table
                  -> Instance.Instance -- ^ Instance to move
                  -> Table             -- ^ Best new table for this instance
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
    let
        opdx = Instance.pNode target
        osdx = Instance.sNode target
        nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
        use_secondary = elem osdx nodes_idx && inst_moves
        aft_failover = if use_secondary -- if allowed to failover
                       then checkSingleStep ini_tbl target ini_tbl Failover
                       else ini_tbl
        all_moves = if disk_moves
                    then concatMap
                         (possibleMoves use_secondary inst_moves) nodes
                    else []
    in
      -- iterate over the possible nodes for this instance
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves

-- | Compute the best next move.
checkMove :: [Ndx]               -- ^ Allowed target node indices
          -> Bool                -- ^ Whether disk moves are allowed
          -> Bool                -- ^ Whether instance moves are allowed
          -> Table               -- ^ The current solution
          -> [Instance.Instance] -- ^ List of instances still to move
          -> Table               -- ^ The new solution
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
    let Table _ _ _ ini_plc = ini_tbl
        -- we're using rwhnf from the Control.Parallel.Strategies
        -- package; we don't need to use rnf as that would force too
        -- much evaluation in single-threaded cases, and in the
        -- multi-threaded case the weak head normal form is enough to
        -- spark the evaluation
        tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
                               inst_moves ini_tbl)
                 victims
        -- iterate over all instances, computing the best move
        best_tbl = foldl' compareTables ini_tbl tables
        Table _ _ _ best_plc = best_tbl
    in if length best_plc == length ini_plc
       then ini_tbl -- no advancement
       else best_tbl

-- | Check if we are allowed to go deeper in the balancing.
doNextBalance :: Table     -- ^ The starting table
              -> Int       -- ^ Remaining length
              -> Score     -- ^ Score at which to stop
              -> Bool      -- ^ Whether we can continue balancing
doNextBalance ini_tbl max_rounds min_score =
    let Table _ _ ini_cv ini_plc = ini_tbl
        ini_plc_len = length ini_plc
    in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score

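-- A quick illustration with hypothetical numbers: with a table @tbl@
-- whose cluster score is 0.25 and which already holds 3 placements,
--
-- > doNextBalance tbl 5    1e-9 == True   -- 3 < 5 rounds, 0.25 > 1e-9
-- > doNextBalance tbl 3    1e-9 == False  -- round limit reached
-- > doNextBalance tbl (-1) 0.5  == False  -- score already below target
--
-- A negative maximum number of rounds means "no round limit".
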
-- | Run a balance move.
tryBalance :: Table       -- ^ The starting table
           -> Bool        -- ^ Allow disk moves
           -> Bool        -- ^ Allow instance moves
           -> Bool        -- ^ Only evacuate moves
           -> Score       -- ^ Min gain threshold
           -> Score       -- ^ Min gain
           -> Maybe Table -- ^ The resulting table, if a useful move was found
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
    let Table ini_nl ini_il ini_cv _ = ini_tbl
        all_inst = Container.elems ini_il
        all_inst' = if evac_mode
                    then let bad_nodes = map Node.idx . filter Node.offline $
                                         Container.elems ini_nl
                         in filter (\e -> Instance.sNode e `elem` bad_nodes ||
                                          Instance.pNode e `elem` bad_nodes)
                            all_inst
                    else all_inst
        reloc_inst = filter Instance.movable all_inst'
        node_idx = map Node.idx . filter (not . Node.offline) $
                   Container.elems ini_nl
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
        (Table _ _ fin_cv _) = fin_tbl
    in
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
      then Just fin_tbl -- this round improved the cluster, return the new table
      else Nothing

-- * Allocation functions

-- | Build failure stats out of a list of failures.
collapseFailures :: [FailMode] -> FailStats
collapseFailures flst =
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
            [minBound..maxBound]

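-- An illustrative call (FailMem, FailDisk etc. are assumed here to be
-- 'FailMode' constructors from Ganeti.HTools.Types; the exact set is
-- defined in that module):
--
-- > collapseFailures [FailMem, FailDisk, FailMem]
--
-- counts every constructor in [minBound..maxBound], yielding 2 for
-- FailMem, 1 for FailDisk and 0 for every other failure mode.
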
-- | Update the current allocation solution and failure stats with new
-- elements.
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }

concatAllocs as (OpGood ns@(_, _, _, nscore)) =
    let -- Choose the old or new solution, based on the cluster score
        cntok = asAllocs as
        osols = asSolutions as
        nsols = case osols of
                  [] -> [ns]
                  (_, _, _, oscore):[] ->
                      if oscore < nscore
                      then osols
                      else [ns]
                  -- FIXME: here we simply concat to lists with more
                  -- than one element; we should instead abort, since
                  -- this is not a valid usage of this function
                  xs -> ns:xs
        nsuc = cntok + 1
    -- Note: we force evaluation of nsols here in order to keep the
    -- memory profile low - we know that we will need nsols for sure
    -- in the next cycle, so we force evaluation of nsols, since the
    -- foldl' in the caller will only evaluate the tuple, but not the
    -- elements of the tuple
    in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolutions = nsols }

-- | Sums two allocation solutions (e.g. for two separate node groups).
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
sumAllocs (AllocSolution af aa as al) (AllocSolution bf ba bs bl) =
    AllocSolution (af ++ bf) (aa + ba) (as ++ bs) (al ++ bl)

-- | Given a solution, generates a reasonable description for it.
describeSolution :: AllocSolution -> String
describeSolution as =
  let fcnt = asFailures as
      sols = asSolutions as
      freasons =
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
        filter ((> 0) . snd) . collapseFailures $ fcnt
  in if null sols
     then "No valid allocation solutions, failure reasons: " ++
          (if null fcnt
           then "unknown reasons"
           else freasons)
     else let (_, _, nodes, cv) = head sols
          in printf ("score: %.8f, successes %d, failures %d (%s)" ++
                     " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
             (intercalate "/" . map Node.name $ nodes)

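-- For reference, an invented example of the message this produces when
-- there is at least one solution:
--
-- > "score: 0.00412345, successes 10, failures 2 (FailMem: 2) for node(s) node1/node2"
--
-- and when every candidate failed:
--
-- > "No valid allocation solutions, failure reasons: FailMem: 3, FailDisk: 1"
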
-- | Annotates a solution with the appropriate string.
annotateSolution :: AllocSolution -> AllocSolution
annotateSolution as = as { asLog = describeSolution as : asLog as }

-- | Reverses an evacuation solution.
--
-- Rationale: we always concat the results to the top of the lists, so
-- for proper jobset execution, we should reverse all lists.
reverseEvacSolution :: EvacSolution -> EvacSolution
reverseEvacSolution (EvacSolution f m o) =
    EvacSolution (reverse f) (reverse m) (reverse o)

-- | Generate the valid node allocation singles or pairs for a new instance.
genAllocNodes :: Group.List        -- ^ Group list
              -> Node.List         -- ^ The node map
              -> Int               -- ^ The number of nodes required
              -> Bool              -- ^ Whether to drop unallocable nodes
              -> Result AllocNodes -- ^ The (monadic) result
genAllocNodes gl nl count drop_unalloc =
    let filter_fn = if drop_unalloc
                    then filter (Group.isAllocable .
                                 flip Container.find gl . Node.group)
                    else id
        all_nodes = filter_fn $ getOnline nl
        all_pairs = liftM2 (,) all_nodes all_nodes
        ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y &&
                                      Node.group x == Node.group y) all_pairs
    in case count of
         1 -> Ok (Left (map Node.idx all_nodes))
         2 -> Ok (Right (map (\(p, s) -> (Node.idx p, Node.idx s)) ok_pairs))
         _ -> Bad "Unsupported number of nodes, only one or two supported"

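-- A usage sketch (values invented): for a single-node allocation the
-- result is a @Left@ list of candidate node indices, while for a
-- two-node (DRBD-style) allocation it is a @Right@ list of ordered
-- (primary, secondary) pairs restricted to nodes of the same group:
--
-- > genAllocNodes gl nl 1 True  -- e.g. Ok (Left [0, 1, 2])
-- > genAllocNodes gl nl 2 True  -- e.g. Ok (Right [(0,1), (1,0), (0,2), ...])
--
-- These values are what 'tryAlloc' consumes to drive the actual
-- allocation attempts.
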
-- | Try to allocate an instance on the cluster.
tryAlloc :: (Monad m) =>
            Node.List         -- ^ The node list
         -> Instance.List     -- ^ The instance list
         -> Instance.Instance -- ^ The instance to allocate
         -> AllocNodes        -- ^ The allocation targets
         -> m AllocSolution   -- ^ Possible solution list
tryAlloc nl _ inst (Right ok_pairs) =
    let sols = foldl' (\cstate (p, s) ->
                           concatAllocs cstate $ allocateOnPair nl inst p s
                      ) emptyAllocSolution ok_pairs

    in if null ok_pairs -- means we have just one node
       then fail "Not enough online nodes"
       else return $ annotateSolution sols

tryAlloc nl _ inst (Left all_nodes) =
    let sols = foldl' (\cstate ->
                           concatAllocs cstate . allocateOnSingle nl inst
                      ) emptyAllocSolution all_nodes
    in if null all_nodes
       then fail "No online nodes"
       else return $ annotateSolution sols

-- | Given a group/result, describe it as a nice (list of) messages.
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
solutionDescription gl (groupId, result) =
  case result of
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
    Bad message -> [printf "Group %s: error %s" gname message]
  where grp = Container.find groupId gl
        gname = Group.name grp
        pol = apolToString (Group.allocPolicy grp)

-- | From a list of possibly bad and possibly empty solutions, filter
-- only the groups with a valid result. Note that the result will be
-- reversed compared to the original list.
filterMGResults :: Group.List
                -> [(Gdx, Result AllocSolution)]
                -> [(Gdx, AllocSolution)]
filterMGResults gl = foldl' fn []
    where unallocable = not . Group.isAllocable . flip Container.find gl
          fn accu (gdx, rasol) =
              case rasol of
                Bad _ -> accu
                Ok sol | null (asSolutions sol) -> accu
                       | unallocable gdx -> accu
                       | otherwise -> (gdx, sol):accu

-- | Sort multigroup results based on policy and score.
sortMGResults :: Group.List
             -> [(Gdx, AllocSolution)]
             -> [(Gdx, AllocSolution)]
sortMGResults gl sols =
    let extractScore (_, _, _, x) = x
        solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
                               (extractScore . head . asSolutions) sol)
    in sortBy (comparing solScore) sols

-- | Try to allocate an instance on a multi-group cluster.
tryMGAlloc :: Group.List           -- ^ The group list
           -> Node.List            -- ^ The node list
           -> Instance.List        -- ^ The instance list
           -> Instance.Instance    -- ^ The instance to allocate
           -> Int                  -- ^ Required number of nodes
           -> Result AllocSolution -- ^ Possible solution list
tryMGAlloc mggl mgnl mgil inst cnt =
  let groups = splitCluster mgnl mgil
      sols = map (\(gid, (nl, il)) ->
                   (gid, genAllocNodes mggl nl cnt False >>=
                       tryAlloc nl il inst))
             groups::[(Gdx, Result AllocSolution)]
      all_msgs = concatMap (solutionDescription mggl) sols
      goodSols = filterMGResults mggl sols
      sortedSols = sortMGResults mggl goodSols
  in if null sortedSols
     then Bad $ intercalate ", " all_msgs
     else let (final_group, final_sol) = head sortedSols
              final_name = Group.name $ Container.find final_group mggl
              selmsg = "Selected group: " ++ final_name
          in Ok $ final_sol { asLog = selmsg:all_msgs }

-- | Try to relocate an instance on the cluster.
tryReloc :: (Monad m) =>
            Node.List       -- ^ The node list
         -> Instance.List   -- ^ The instance list
         -> Idx             -- ^ The index of the instance to move
         -> Int             -- ^ The number of nodes required
         -> [Ndx]           -- ^ Nodes which should not be used
         -> m AllocSolution -- ^ Solution list
tryReloc nl il xid 1 ex_idx =
    let all_nodes = getOnline nl
        inst = Container.find xid il
        ex_idx' = Instance.pNode inst:ex_idx
        valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
        valid_idxes = map Node.idx valid_nodes
        sols1 = foldl' (\cstate x ->
                            let em = do
                                  (mnl, i, _, _) <-
                                      applyMove nl inst (ReplaceSecondary x)
                                  return (mnl, i, [Container.find x mnl],
                                          compCV mnl)
                            in concatAllocs cstate em
                       ) emptyAllocSolution valid_idxes
    in return sols1

tryReloc _ _ _ reqn _  = fail $ "Unsupported number of relocation \
                                \destinations required (" ++ show reqn ++
                                                  "), only one supported"

-- | Try to relocate an instance inside a multi-group cluster.
tryMGReloc :: (Monad m) =>
              Group.List      -- ^ The group list
           -> Node.List       -- ^ The node list
           -> Instance.List   -- ^ The instance list
           -> Idx             -- ^ The index of the instance to move
           -> Int             -- ^ The number of nodes required
           -> [Ndx]           -- ^ Nodes which should not be used
           -> m AllocSolution -- ^ Solution list
tryMGReloc _ mgnl mgil xid ncount ex_ndx = do
  let groups = splitCluster mgnl mgil
      -- TODO: we only relocate inside the group for now
      inst = Container.find xid mgil
  (nl, il) <- case lookup (instancePriGroup mgnl inst) groups of
                Nothing -> fail $ "Cannot find group for instance " ++
                           Instance.name inst
                Just v -> return v
  tryReloc nl il xid ncount ex_ndx

-- | Change an instance's secondary node.
evacInstance :: (Monad m) =>
                [Ndx]                      -- ^ Excluded nodes
             -> Instance.List              -- ^ The current instance list
             -> (Node.List, AllocSolution) -- ^ The current state
             -> Idx                        -- ^ The instance to evacuate
             -> m (Node.List, AllocSolution)
evacInstance ex_ndx il (nl, old_as) idx = do
  -- FIXME: hardcoded one node here

  -- Longer explanation: evacuation is currently hardcoded to DRBD
  -- instances (which have one secondary); hence, even if the
  -- IAllocator protocol can request N nodes for an instance, and all
  -- the message parsing/loading passes this, this implementation only
  -- supports one; this situation needs to be revisited if we ever
  -- support more than one secondary, or if we change the storage
  -- model
  new_as <- tryReloc nl il idx 1 ex_ndx
  case asSolutions new_as of
    -- an individual relocation succeeded, we kind of compose the data
    -- from the two solutions
    csol@(nl', _, _, _):_ ->
        return (nl', new_as { asSolutions = csol:asSolutions old_as })
    -- this relocation failed, so we fail the entire evac
    _ -> fail $ "Can't evacuate instance " ++
         Instance.name (Container.find idx il) ++
             ": " ++ describeSolution new_as

-- | Try to evacuate a list of nodes.
tryEvac :: (Monad m) =>
            Node.List       -- ^ The node list
         -> Instance.List   -- ^ The instance list
         -> [Idx]           -- ^ Instances to be evacuated
         -> [Ndx]           -- ^ Restricted nodes (the ones being evacuated)
         -> m AllocSolution -- ^ Solution list
tryEvac nl il idxs ex_ndx = do
  (_, sol) <- foldM (evacInstance ex_ndx il) (nl, emptyAllocSolution) idxs
  return sol

-- | Multi-group evacuation of a list of nodes.
tryMGEvac :: (Monad m) =>
             Group.List -- ^ The group list
          -> Node.List       -- ^ The node list
          -> Instance.List   -- ^ The instance list
          -> [Ndx]           -- ^ Nodes to be evacuated
          -> m AllocSolution -- ^ Solution list
tryMGEvac _ nl il ex_ndx =
    let ex_nodes = map (`Container.find` nl) ex_ndx
        all_insts = nub . concatMap Node.sList $ ex_nodes
        all_insts' = associateIdxs all_insts $ splitCluster nl il
    in do
      results <- mapM (\(_, (gnl, gil, idxs)) -> tryEvac gnl gil idxs ex_ndx)
                 all_insts'
      let sol = foldl' sumAllocs emptyAllocSolution results
      return $ annotateSolution sol

-- | Function which fails if the requested mode is change secondary.
--
-- This is useful since, apart from DRBD, no other disk template can
-- execute a change of secondary; thus, we can just call this function
-- instead of always checking for secondary mode. After the call to
-- this function, whatever mode we have is just a primary change.
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
failOnSecondaryChange ChangeSecondary dt =
    fail $ "Instances with disk template '" ++ dtToString dt ++
         "' can't execute change secondary"
failOnSecondaryChange _ _ = return ()

-- | Run evacuation for a single instance.
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
                 -> Instance.List     -- ^ Instance list (cluster-wide)
                 -> EvacMode          -- ^ The evacuation mode
                 -> Instance.Instance -- ^ The instance to be evacuated
                 -> [Ndx]             -- ^ The list of available nodes
                                      -- for allocation
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
nodeEvacInstance _ _ mode (Instance.Instance
                           {Instance.diskTemplate = dt@DTDiskless}) _ =
                  failOnSecondaryChange mode dt >>
                  fail "Diskless relocations not implemented yet"

nodeEvacInstance _ _ _ (Instance.Instance
                        {Instance.diskTemplate = DTPlain}) _ =
                  fail "Instances of type plain cannot be relocated"

nodeEvacInstance _ _ _ (Instance.Instance
                        {Instance.diskTemplate = DTFile}) _ =
                  fail "Instances of type file cannot be relocated"

nodeEvacInstance _ _ mode (Instance.Instance
                           {Instance.diskTemplate = dt@DTSharedFile}) _ =
                  failOnSecondaryChange mode dt >>
                  fail "Shared file relocations not implemented yet"

nodeEvacInstance _ _ mode (Instance.Instance
                           {Instance.diskTemplate = dt@DTBlock}) _ =
                  failOnSecondaryChange mode dt >>
                  fail "Block device relocations not implemented yet"

nodeEvacInstance nl il ChangePrimary
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8}) _ =
  do
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
    let idx = Instance.idx inst
        il' = Container.add idx inst' il
        ops = iMoveToJob nl' il' idx Failover
    return (nl', il', ops)

nodeEvacInstance nl il ChangeSecondary
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
                 avail_nodes =
  do
    let gdx = instancePriGroup nl inst
    (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
                            eitherToResult $
                            foldl' (evacDrbdSecondaryInner nl inst gdx)
                            (Left "no nodes available") avail_nodes
    let idx = Instance.idx inst
        il' = Container.add idx inst' il
        ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx)
    return (nl', il', ops)

nodeEvacInstance nl il ChangeAll
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
                 avail_nodes =
  do
    let primary = Container.find (Instance.pNode inst) nl
        idx = Instance.idx inst
        gdx = instancePriGroup nl inst
        no_nodes = Left "no nodes available"
    -- if the primary is offline, then we first failover
    (nl1, inst1, ops1) <-
        if Node.offline primary
        then do
          (nl', inst', _, _) <-
              annotateResult "Failing over to the secondary" $
              opToResult $ applyMove nl inst Failover
          return (nl', inst', [Failover])
        else return (nl, inst, [])
    -- we now need to execute a replace secondary to the future
    -- primary node
    (nl2, inst2, _, new_pdx) <- annotateResult "Searching for a new primary" $
                                eitherToResult $
                                foldl' (evacDrbdSecondaryInner nl1 inst1 gdx)
                                no_nodes avail_nodes
    let ops2 = ReplaceSecondary new_pdx:ops1
    -- since we chose the new primary, we remove it from the list of
    -- available nodes
    let avail_nodes_sec = new_pdx `delete` avail_nodes
    -- we now execute another failover, the primary stays fixed now
    (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
                          opToResult $ applyMove nl2 inst2 Failover
    let ops3 = Failover:ops2
    -- and finally another replace secondary, to the final secondary
    (nl4, inst4, _, new_sdx) <-
        annotateResult "Searching for a new secondary" $
        eitherToResult $
        foldl' (evacDrbdSecondaryInner nl3 inst3 gdx) no_nodes avail_nodes_sec
    let ops4 = ReplaceSecondary new_sdx:ops3
        il' = Container.add idx inst4 il
        ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
    return (nl4, il', ops)

-- | Inner fold function for changing secondary of a DRBD instance.
--
-- The "running" solution is either a @Left String@, which means we
-- don't yet have a working solution, or a @Right (...)@, which
-- represents a valid solution; it holds the modified node list, the
-- modified instance (after evacuation), the score of that solution,
-- and the new secondary node index.
evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list
                       -> Instance.Instance -- ^ Instance being evacuated
                       -> Gdx -- ^ The group index of the instance
                       -> Either String ( Node.List
                                        , Instance.Instance
                                        , Score
                                        , Ndx)  -- ^ Current best solution
                       -> Ndx  -- ^ Node we're evaluating as new secondary
                       -> Either String ( Node.List
                                        , Instance.Instance
                                        , Score
                                        , Ndx) -- ^ New best solution
evacDrbdSecondaryInner nl inst gdx accu ndx =
    case applyMove nl inst (ReplaceSecondary ndx) of
      OpFail fm ->
          case accu of
            Right _ -> accu
            Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
                      " failed: " ++ show fm
      OpGood (nl', inst', _, _) ->
          let nodes = Container.elems nl'
              -- The fromJust below is ugly (it can fail nastily), but
              -- at this point we should not have any internal
              -- mismatches, and adding a monad here would be quite
              -- involved
              grpnodes = fromJust (gdx `lookup` (Node.computeGroups nodes))
              new_cv = compCVNodes grpnodes
              new_accu = Right (nl', inst', new_cv, ndx)
          in case accu of
               Left _ -> new_accu
               Right (_, _, old_cv, _) ->
                   if old_cv < new_cv
                   then accu
                   else new_accu

-- | Computes the local nodes of a given instance which are available
-- for allocation.
availableLocalNodes :: Node.List
                    -> [(Gdx, [Ndx])]
                    -> IntSet.IntSet
                    -> Instance.Instance
                    -> Result [Ndx]
availableLocalNodes nl group_nodes excl_ndx inst = do
  let gdx = instancePriGroup nl inst
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
                 Ok (lookup gdx group_nodes)
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
  return avail_nodes

-- | Updates the evac solution with the results of an instance
-- evacuation.
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
                   -> Instance.Instance
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
                   -> (Node.List, Instance.List, EvacSolution)
updateEvacSolution (nl, il, es) inst (Bad msg) =
    (nl, il, es { esFailed = (Instance.name inst ++ ": " ++ msg):esFailed es})
updateEvacSolution (_, _, es) inst (Ok (nl, il, opcodes)) =
    (nl, il, es { esMoved = Instance.name inst:esMoved es
                , esOpCodes = [opcodes]:esOpCodes es })

-- | Node-evacuation IAllocator mode main function.
tryNodeEvac :: Group.List    -- ^ The cluster groups
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
            -> Instance.List -- ^ Instance list (cluster-wide)
            -> EvacMode      -- ^ The evacuation mode
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
            -> Result EvacSolution
tryNodeEvac _ ini_nl ini_il mode idxs =
    let evac_ndx = nodesToEvacuate ini_il mode idxs
        offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
        excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
        group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
                                             (Container.elems nl))) $
                      splitCluster ini_nl ini_il
        (_, _, esol) =
            foldl' (\state@(nl, il, _) inst ->
                        updateEvacSolution state inst $
                        availableLocalNodes nl group_ndx excl_ndx inst >>=
                        nodeEvacInstance nl il mode inst
                   )
            (ini_nl, ini_il, emptyEvacSolution)
            (map (`Container.find` ini_il) idxs)
    in return $ reverseEvacSolution esol

-- | Recursively place instances on the cluster until we're out of space.
iterateAlloc :: Node.List
             -> Instance.List
             -> Maybe Int
             -> Instance.Instance
             -> AllocNodes
             -> [Instance.Instance]
             -> [CStats]
             -> Result AllocResult
iterateAlloc nl il limit newinst allocnodes ixes cstats =
      let depth = length ixes
          newname = printf "new-%d" depth::String
          newidx = length (Container.elems il) + depth
          newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
          newlimit = fmap (flip (-) 1) limit
      in case tryAlloc nl il newi2 allocnodes of
           Bad s -> Bad s
           Ok (AllocSolution { asFailures = errs, asSolutions = sols3 }) ->
               let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
               case sols3 of
                 [] -> newsol
                 (xnl, xi, _, _):[] ->
                     if limit == Just 0
                     then newsol
                     else iterateAlloc xnl (Container.add newidx xi il)
                          newlimit newinst allocnodes (xi:ixes)
                          (totalResources xnl:cstats)
                 _ -> Bad "Internal error: multiple solutions for single\
                          \ allocation"

-- | The core of the tiered allocation mode.
tieredAlloc :: Node.List
            -> Instance.List
            -> Maybe Int
            -> Instance.Instance
            -> AllocNodes
            -> [Instance.Instance]
            -> [CStats]
            -> Result AllocResult
tieredAlloc nl il limit newinst allocnodes ixes cstats =
    case iterateAlloc nl il limit newinst allocnodes ixes cstats of
      Bad s -> Bad s
      Ok (errs, nl', il', ixes', cstats') ->
          let newsol = Ok (errs, nl', il', ixes', cstats')
              ixes_cnt = length ixes'
              (stop, newlimit) = case limit of
                                   Nothing -> (False, Nothing)
                                   Just n -> (n <= ixes_cnt,
                                              Just (n - ixes_cnt)) in
          if stop then newsol else
          case Instance.shrinkByType newinst . fst . last $
               sortBy (comparing snd) errs of
            Bad _ -> newsol
            Ok newinst' -> tieredAlloc nl' il' newlimit
                           newinst' allocnodes ixes' cstats'

-- | Compute the tiered spec string description from a list of
-- allocated instances.
tieredSpecMap :: [Instance.Instance]
              -> [String]
tieredSpecMap trl_ixes =
    let fin_trl_ixes = reverse trl_ixes
        ix_byspec = groupBy ((==) `on` Instance.specOf) fin_trl_ixes
        spec_map = map (\ixs -> (Instance.specOf $ head ixs, length ixs))
                   ix_byspec
    in  map (\(spec, cnt) -> printf "%d,%d,%d=%d" (rspecMem spec)
                             (rspecDsk spec) (rspecCpu spec) cnt) spec_map

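-- The resulting strings have the form @mem,disk,cpu=count@. An invented
-- example: a tiered run that placed six 4096/102400/2 instances and then
-- two 2048/51200/1 ones would be rendered as
--
-- > ["4096,102400,2=6", "2048,51200,1=2"]
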
-- * Formatting functions

-- | Given the original and final nodes, computes the relocation description.
computeMoves :: Instance.Instance -- ^ The instance to be moved
             -> String -- ^ The instance name
             -> IMove  -- ^ The move being performed
             -> String -- ^ New primary
             -> String -- ^ New secondary
             -> (String, [String])
                -- ^ Tuple of moves and commands list; moves contains
                -- either @/f/@ for failover or @/r:name/@ for replace
                -- secondary, while the command list holds gnt-instance
                -- commands (without that prefix), e.g. \"@failover instance1@\"
computeMoves i inam mv c d =
    case mv of
      Failover -> ("f", [mig])
      FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
      ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
      ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
      ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
    where morf = if Instance.running i then "migrate" else "failover"
          mig = printf "%s -f %s" morf inam::String
          rep n = printf "replace-disks -n %s %s" n inam

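-- A concrete rendering (instance and node names invented): for a running
-- instance @inst1@ being moved via @ReplacePrimary@ to node @node3@ the
-- result is
--
-- > ("f r:node3 f", [ "migrate -f inst1"
-- >                 , "replace-disks -n node3 inst1"
-- >                 , "migrate -f inst1" ])
--
-- i.e. fail over to the secondary, replace the (now) secondary's disks
-- on the new node, and fail over again onto it.
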
1166
-- | Converts a placement to string format.
printSolutionLine :: Node.List     -- ^ The node list
                  -> Instance.List -- ^ The instance list
                  -> Int           -- ^ Maximum node name length
                  -> Int           -- ^ Maximum instance name length
                  -> Placement     -- ^ The current placement
                  -> Int           -- ^ The index of the placement in
                                   -- the solution
                  -> (String, [String])
printSolutionLine nl il nmlen imlen plc pos =
    let
        pmlen = (2*nmlen + 1)
        (i, p, s, mv, c) = plc
        inst = Container.find i il
        inam = Instance.alias inst
        npri = Node.alias $ Container.find p nl
        nsec = Node.alias $ Container.find s nl
        opri = Node.alias $ Container.find (Instance.pNode inst) nl
        osec = Node.alias $ Container.find (Instance.sNode inst) nl
        (moves, cmds) = computeMoves inst inam mv npri nsec
        ostr = printf "%s:%s" opri osec::String
        nstr = printf "%s:%s" npri nsec::String
    in
      (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
       pos imlen inam pmlen ostr
       pmlen nstr c moves,
       cmds)

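-- A sample of the line produced above (editor's note, hypothetical
-- values): placement number 3, moving @instance1@ from @node1:node2@
-- to @node1:node3@ via a secondary replacement, with a cluster score
-- of 0.00241355 after the move, prints roughly as
--
-- >    3. instance1  node1:node2 => node1:node3 0.00241355 a=r:node3
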
-- | Return the list of nodes involved in an instance move (new and
-- old primary and secondary).
involvedNodes :: Instance.List -> Placement -> [Ndx]
involvedNodes il plc =
    let (i, np, ns, _, _) = plc
        inst = Container.find i il
        op = Instance.pNode inst
        os = Instance.sNode inst
    in nub [np, ns, op, os]

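-- An illustrative case for the above (editor's note, hypothetical
-- node indices): a placement that keeps primary node 1, moves the
-- secondary from node 2 to node 3, gives @nub [1, 3, 1, 2]@, i.e.
-- @[1, 3, 2]@ with the duplicate primary removed.
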
-- | Inner function for splitJobs, that either appends the next job to
-- the current jobset, or starts a new jobset.
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
    | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
    | otherwise = ([n]:cjs, ndx)

-- | Break a list of moves into independent groups. Note that this
-- will reverse the order of jobs.
splitJobs :: [MoveJob] -> [JobSet]
splitJobs = fst . foldl mergeJobs ([], [])

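-- An illustrative grouping (editor's note, hypothetical node
-- indices): three moves touching nodes @[1,2]@, @[3,4]@ and @[2,5]@
-- respectively are split so that the first two, which share no node,
-- land in the same jobset, while the third, which reuses node 2,
-- starts a new jobset; jobs within one jobset touch disjoint node
-- sets and can therefore run independently.
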
-- | Formats the commands of a single job within a jobset, prefixing
-- them with @gnt-instance@ and adding the job/jobset echo markers.
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
formatJob jsn jsl (sn, (_, _, _, cmds)) =
    let out =
            printf "  echo job %d/%d" jsn sn:
            printf "  check":
            map ("  gnt-instance " ++) cmds
    in if sn == 1
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
       else out

-- | Given a list of commands, prefix them with @gnt-instance@ and
-- also beautify the display a little.
formatCmds :: [JobSet] -> String
formatCmds =
    unlines .
    concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
                             (zip [1..] js)) .
    zip [1..]

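-- A minimal usage sketch (editor's note, not part of the original
-- API): rendering a list of move jobs as a shell-script body by first
-- grouping them into independent jobsets. The resulting text carries
-- the @echo jobset@/@echo job@ markers and the prefixed
-- @gnt-instance@ commands produced by 'formatJob'.
formatMoveScript :: [MoveJob] -> String
formatMoveScript = formatCmds . splitJobs
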
-- | Print the node list.
printNodes :: Node.List -> [String] -> String
printNodes nl fs =
    let fields = case fs of
          [] -> Node.defaultFields
          "+":rest -> Node.defaultFields ++ rest
          _ -> fs
        snl = sortBy (comparing Node.idx) (Container.elems nl)
        (header, isnum) = unzip $ map Node.showHeader fields
    in unlines . map ((:) ' ' . intercalate " ") $
       formatTable (header:map (Node.list fields) snl) isnum

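-- Field selection above (editor's note): an empty field list falls
-- back to 'Node.defaultFields', a list starting with @"+"@ appends
-- the remaining names to the defaults, and any other list is used
-- verbatim as the set of columns to display.
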
-- | Print the instance list.
printInsts :: Node.List -> Instance.List -> String
printInsts nl il =
    let sil = sortBy (comparing Instance.idx) (Container.elems il)
        helper inst = [ if Instance.running inst then "R" else " "
                      , Instance.name inst
                      , Container.nameOf nl (Instance.pNode inst)
                      , let sdx = Instance.sNode inst
                        in if sdx == Node.noSecondary
                           then ""
                           else Container.nameOf nl sdx
                      , if Instance.autoBalance inst then "Y" else "N"
                      , printf "%3d" $ Instance.vcpus inst
                      , printf "%5d" $ Instance.mem inst
                      , printf "%5d" $ Instance.dsk inst `div` 1024
                      , printf "%5.3f" lC
                      , printf "%5.3f" lM
                      , printf "%5.3f" lD
                      , printf "%5.3f" lN
                      ]
            where DynUtil lC lM lD lN = Instance.util inst
        header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
                 , "vcpu", "mem", "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
        isnum = False:False:False:False:False:repeat True
    in unlines . map ((:) ' ' . intercalate " ") $
       formatTable (header:map helper sil) isnum

-- | Shows statistics for a given node list.
printStats :: Node.List -> String
printStats nl =
    let dcvs = compDetailedCV $ Container.elems nl
        (weights, names) = unzip detailedCVInfo
        hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
        formatted = map (\(w, header, val) ->
                             printf "%s=%.8f(x%.2f)" header val w::String) hd
    in intercalate ", " formatted

-- | Convert a placement into a list of OpCodes (basically a job).
iMoveToJob :: Node.List -> Instance.List
           -> Idx -> IMove -> [OpCodes.OpCode]
iMoveToJob nl il idx move =
    let inst = Container.find idx il
        iname = Instance.name inst
        lookNode = Just . Container.nameOf nl
        opF = OpCodes.OpInstanceMigrate iname True False True
        opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
                OpCodes.ReplaceNewSecondary [] Nothing
    in case move of
         Failover -> [ opF ]
         ReplacePrimary np -> [ opF, opR np, opF ]
         ReplaceSecondary ns -> [ opR ns ]
         ReplaceAndFailover np -> [ opR np, opF ]
         FailoverAndReplace ns -> [ opF, opR ns ]

1304

    
1305
-- | Computes the group of an instance.
1306
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1307
instanceGroup nl i =
1308
  let sidx = Instance.sNode i
1309
      pnode = Container.find (Instance.pNode i) nl
1310
      snode = if sidx == Node.noSecondary
1311
              then pnode
1312
              else Container.find sidx nl
1313
      pgroup = Node.group pnode
1314
      sgroup = Node.group snode
1315
  in if pgroup /= sgroup
1316
     then fail ("Instance placed accross two node groups, primary " ++
1317
                show pgroup ++ ", secondary " ++ show sgroup)
1318
     else return pgroup
1319

    
1320
-- | Computes the group of an instance, based on its primary node.
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
instancePriGroup nl i =
  let pnode = Container.find (Instance.pNode i) nl
  in  Node.group pnode

-- | Compute the list of badly allocated instances (split across node
-- groups).
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
findSplitInstances nl =
  filter (not . isOk . instanceGroup nl) . Container.elems

-- | Splits a cluster into the component node groups.
splitCluster :: Node.List -> Instance.List ->
                [(Gdx, (Node.List, Instance.List))]
splitCluster nl il =
  let ngroups = Node.computeGroups (Container.elems nl)
  in map (\(guuid, nodes) ->
           let nidxs = map Node.idx nodes
               nodes' = zip nidxs nodes
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
           in (guuid, (Container.fromList nodes', instances))) ngroups

-- | Split a global list of instance indices per node group, and
-- associate it with the group/node/instance lists.
associateIdxs :: [Idx] -- ^ Instance indices to be split/associated
              -> [(Gdx, (Node.List, Instance.List))]        -- ^ Input groups
              -> [(Gdx, (Node.List, Instance.List, [Idx]))] -- ^ Result
associateIdxs idxs =
    map (\(gdx, (nl, il)) ->
             (gdx, (nl, il, filter (`Container.member` il) idxs)))

-- | Compute the list of nodes that are to be evacuated, given a list
-- of instances and an evacuation mode.
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
                -> EvacMode      -- ^ The evacuation mode we're using
                -> [Idx]         -- ^ List of instance indices being evacuated
                -> IntSet.IntSet -- ^ Set of node indices
nodesToEvacuate il mode =
    IntSet.delete Node.noSecondary .
    foldl' (\ns idx ->
                let i = Container.find idx il
                    pdx = Instance.pNode i
                    sdx = Instance.sNode i
                    dt = Instance.diskTemplate i
                    withSecondary = case dt of
                                      DTDrbd8 -> IntSet.insert sdx ns
                                      _ -> ns
                in case mode of
                     ChangePrimary   -> IntSet.insert pdx ns
                     ChangeSecondary -> withSecondary
                     ChangeAll       -> IntSet.insert pdx withSecondary
           ) IntSet.empty
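
-- Reading the fold above (editor's note, hypothetical node indices):
-- for a DRBD8 instance with primary node 2 and secondary node 5,
-- 'ChangePrimary' contributes only node 2, 'ChangeSecondary' only
-- node 5, and 'ChangeAll' both; instances with a non-DRBD disk
-- template never contribute a secondary, and 'Node.noSecondary' is
-- stripped from the final set.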