Statistics
| Branch: | Tag: | Revision:

root / htools / Ganeti / HTools / Cluster.hs @ 1b0a6356

History | View | Annotate | Download (60.7 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
    (
31
     -- * Types
32
      AllocSolution(..)
33
    , EvacSolution(..)
34
    , Table(..)
35
    , CStats(..)
36
    , AllocStats
37
    -- * Generic functions
38
    , totalResources
39
    , computeAllocationDelta
40
    -- * First phase functions
41
    , computeBadItems
42
    -- * Second phase functions
43
    , printSolutionLine
44
    , formatCmds
45
    , involvedNodes
46
    , splitJobs
47
    -- * Display functions
48
    , printNodes
49
    , printInsts
50
    -- * Balacing functions
51
    , checkMove
52
    , doNextBalance
53
    , tryBalance
54
    , compCV
55
    , compCVNodes
56
    , compDetailedCV
57
    , printStats
58
    , iMoveToJob
59
    -- * IAllocator functions
60
    , genAllocNodes
61
    , tryAlloc
62
    , tryMGAlloc
63
    , tryReloc
64
    , tryEvac
65
    , tryNodeEvac
66
    , tryChangeGroup
67
    , collapseFailures
68
    -- * Allocation functions
69
    , iterateAlloc
70
    , tieredAlloc
71
     -- * Node group functions
72
    , instanceGroup
73
    , findSplitInstances
74
    , splitCluster
75
    ) where
76

    
77
import qualified Data.IntSet as IntSet
78
import Data.List
79
import Data.Maybe (fromJust)
80
import Data.Ord (comparing)
81
import Text.Printf (printf)
82
import Control.Monad
83

    
84
import qualified Ganeti.HTools.Container as Container
85
import qualified Ganeti.HTools.Instance as Instance
86
import qualified Ganeti.HTools.Node as Node
87
import qualified Ganeti.HTools.Group as Group
88
import Ganeti.HTools.Types
89
import Ganeti.HTools.Utils
90
import Ganeti.HTools.Compat
91
import qualified Ganeti.OpCodes as OpCodes
92

    
93
-- * Types
94

    
95
-- | Allocation\/relocation solution.
96
data AllocSolution = AllocSolution
97
  { asFailures  :: [FailMode]          -- ^ Failure counts
98
  , asAllocs    :: Int                 -- ^ Good allocation count
99
  , asSolutions :: [Node.AllocElement] -- ^ The actual result, length
100
                                       -- of the list depends on the
101
                                       -- allocation/relocation mode
102
  , asLog       :: [String]            -- ^ A list of informational messages
103
  }
104

    
105
-- | Node evacuation/group change iallocator result type. This result
106
-- type consists of actual opcodes (a restricted subset) that are
107
-- transmitted back to Ganeti.
108
data EvacSolution = EvacSolution
109
    { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
110
    , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
111
                                        -- relocated
112
    , esOpCodes :: [[[OpCodes.OpCode]]] -- ^ List of lists of jobs
113
    }
114

    
115
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
116
type AllocResult = (FailStats, Node.List, Instance.List,
117
                    [Instance.Instance], [CStats])
118

    
119
-- | A type denoting the valid allocation mode/pairs.
120
--
121
-- For a one-node allocation, this will be a @Left ['Node.Node']@,
122
-- whereas for a two-node allocation, this will be a @Right
123
-- [('Node.Node', 'Node.Node')]@.
124
type AllocNodes = Either [Ndx] [(Ndx, Ndx)]
125

    
126
-- | The empty solution we start with when computing allocations.
127
emptyAllocSolution :: AllocSolution
128
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
129
                                   , asSolutions = [], asLog = [] }
130

    
131
-- | The empty evac solution.
132
emptyEvacSolution :: EvacSolution
133
emptyEvacSolution = EvacSolution { esMoved = []
134
                                 , esFailed = []
135
                                 , esOpCodes = []
136
                                 }
137

    
138
-- | The complete state for the balancing solution.
139
data Table = Table Node.List Instance.List Score [Placement]
140
             deriving (Show, Read)
141

    
142
-- | Cluster statistics data type.
143
data CStats = CStats { csFmem :: Integer -- ^ Cluster free mem
144
                     , csFdsk :: Integer -- ^ Cluster free disk
145
                     , csAmem :: Integer -- ^ Cluster allocatable mem
146
                     , csAdsk :: Integer -- ^ Cluster allocatable disk
147
                     , csAcpu :: Integer -- ^ Cluster allocatable cpus
148
                     , csMmem :: Integer -- ^ Max node allocatable mem
149
                     , csMdsk :: Integer -- ^ Max node allocatable disk
150
                     , csMcpu :: Integer -- ^ Max node allocatable cpu
151
                     , csImem :: Integer -- ^ Instance used mem
152
                     , csIdsk :: Integer -- ^ Instance used disk
153
                     , csIcpu :: Integer -- ^ Instance used cpu
154
                     , csTmem :: Double  -- ^ Cluster total mem
155
                     , csTdsk :: Double  -- ^ Cluster total disk
156
                     , csTcpu :: Double  -- ^ Cluster total cpus
157
                     , csVcpu :: Integer -- ^ Cluster virtual cpus (if
158
                                         -- node pCpu has been set,
159
                                         -- otherwise -1)
160
                     , csXmem :: Integer -- ^ Unnacounted for mem
161
                     , csNmem :: Integer -- ^ Node own memory
162
                     , csScore :: Score  -- ^ The cluster score
163
                     , csNinst :: Int    -- ^ The total number of instances
164
                     }
165
            deriving (Show, Read)
166

    
167
-- | Currently used, possibly to allocate, unallocable.
168
type AllocStats = (RSpec, RSpec, RSpec)
169

    
170
-- * Utility functions
171

    
172
-- | Verifies the N+1 status and return the affected nodes.
173
verifyN1 :: [Node.Node] -> [Node.Node]
174
verifyN1 = filter Node.failN1
175

    
176
{-| Computes the pair of bad nodes and instances.
177

    
178
The bad node list is computed via a simple 'verifyN1' check, and the
179
bad instance list is the list of primary and secondary instances of
180
those nodes.
181

    
182
-}
183
computeBadItems :: Node.List -> Instance.List ->
184
                   ([Node.Node], [Instance.Instance])
185
computeBadItems nl il =
186
  let bad_nodes = verifyN1 $ getOnline nl
187
      bad_instances = map (`Container.find` il) .
188
                      sort . nub $
189
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
190
  in
191
    (bad_nodes, bad_instances)
192

    
193
-- | Zero-initializer for the CStats type.
194
emptyCStats :: CStats
195
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
196

    
197
-- | Update stats with data from a new node.
198
updateCStats :: CStats -> Node.Node -> CStats
199
updateCStats cs node =
200
    let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
201
                 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
202
                 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
203
                 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
204
                 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
205
                 csVcpu = x_vcpu,
206
                 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
207
               }
208
            = cs
209
        inc_amem = Node.fMem node - Node.rMem node
210
        inc_amem' = if inc_amem > 0 then inc_amem else 0
211
        inc_adsk = Node.availDisk node
212
        inc_imem = truncate (Node.tMem node) - Node.nMem node
213
                   - Node.xMem node - Node.fMem node
214
        inc_icpu = Node.uCpu node
215
        inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
216
        inc_vcpu = Node.hiCpu node
217
        inc_acpu = Node.availCpu node
218

    
219
    in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
220
          , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
221
          , csAmem = x_amem + fromIntegral inc_amem'
222
          , csAdsk = x_adsk + fromIntegral inc_adsk
223
          , csAcpu = x_acpu + fromIntegral inc_acpu
224
          , csMmem = max x_mmem (fromIntegral inc_amem')
225
          , csMdsk = max x_mdsk (fromIntegral inc_adsk)
226
          , csMcpu = max x_mcpu (fromIntegral inc_acpu)
227
          , csImem = x_imem + fromIntegral inc_imem
228
          , csIdsk = x_idsk + fromIntegral inc_idsk
229
          , csIcpu = x_icpu + fromIntegral inc_icpu
230
          , csTmem = x_tmem + Node.tMem node
231
          , csTdsk = x_tdsk + Node.tDsk node
232
          , csTcpu = x_tcpu + Node.tCpu node
233
          , csVcpu = x_vcpu + fromIntegral inc_vcpu
234
          , csXmem = x_xmem + fromIntegral (Node.xMem node)
235
          , csNmem = x_nmem + fromIntegral (Node.nMem node)
236
          , csNinst = x_ninst + length (Node.pList node)
237
          }
238

    
239
-- | Compute the total free disk and memory in the cluster.
240
totalResources :: Node.List -> CStats
241
totalResources nl =
242
    let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
243
    in cs { csScore = compCV nl }
244

    
245
-- | Compute the delta between two cluster state.
246
--
247
-- This is used when doing allocations, to understand better the
248
-- available cluster resources. The return value is a triple of the
249
-- current used values, the delta that was still allocated, and what
250
-- was left unallocated.
251
computeAllocationDelta :: CStats -> CStats -> AllocStats
252
computeAllocationDelta cini cfin =
253
    let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
254
        CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
255
                csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
256
        rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem)
257
               (fromIntegral i_idsk)
258
        rfin = RSpec (fromIntegral (f_icpu - i_icpu))
259
               (fromIntegral (f_imem - i_imem))
260
               (fromIntegral (f_idsk - i_idsk))
261
        un_cpu = fromIntegral (v_cpu - f_icpu)::Int
262
        runa = RSpec un_cpu (truncate t_mem - fromIntegral f_imem)
263
               (truncate t_dsk - fromIntegral f_idsk)
264
    in (rini, rfin, runa)
265

    
266
-- | The names and weights of the individual elements in the CV list.
267
detailedCVInfo :: [(Double, String)]
268
detailedCVInfo = [ (1,  "free_mem_cv")
269
                 , (1,  "free_disk_cv")
270
                 , (1,  "n1_cnt")
271
                 , (1,  "reserved_mem_cv")
272
                 , (4,  "offline_all_cnt")
273
                 , (16, "offline_pri_cnt")
274
                 , (1,  "vcpu_ratio_cv")
275
                 , (1,  "cpu_load_cv")
276
                 , (1,  "mem_load_cv")
277
                 , (1,  "disk_load_cv")
278
                 , (1,  "net_load_cv")
279
                 , (2,  "pri_tags_score")
280
                 ]
281

    
282
-- | Holds the weights used by 'compCVNodes' for each metric.
283
detailedCVWeights :: [Double]
284
detailedCVWeights = map fst detailedCVInfo
285

    
286
-- | Compute the mem and disk covariance.
287
compDetailedCV :: [Node.Node] -> [Double]
288
compDetailedCV all_nodes =
289
    let
290
        (offline, nodes) = partition Node.offline all_nodes
291
        mem_l = map Node.pMem nodes
292
        dsk_l = map Node.pDsk nodes
293
        -- metric: memory covariance
294
        mem_cv = stdDev mem_l
295
        -- metric: disk covariance
296
        dsk_cv = stdDev dsk_l
297
        -- metric: count of instances living on N1 failing nodes
298
        n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
299
                                                   length (Node.pList n)) .
300
                   filter Node.failN1 $ nodes :: Double
301
        res_l = map Node.pRem nodes
302
        -- metric: reserved memory covariance
303
        res_cv = stdDev res_l
304
        -- offline instances metrics
305
        offline_ipri = sum . map (length . Node.pList) $ offline
306
        offline_isec = sum . map (length . Node.sList) $ offline
307
        -- metric: count of instances on offline nodes
308
        off_score = fromIntegral (offline_ipri + offline_isec)::Double
309
        -- metric: count of primary instances on offline nodes (this
310
        -- helps with evacuation/failover of primary instances on
311
        -- 2-node clusters with one node offline)
312
        off_pri_score = fromIntegral offline_ipri::Double
313
        cpu_l = map Node.pCpu nodes
314
        -- metric: covariance of vcpu/pcpu ratio
315
        cpu_cv = stdDev cpu_l
316
        -- metrics: covariance of cpu, memory, disk and network load
317
        (c_load, m_load, d_load, n_load) = unzip4 $
318
            map (\n ->
319
                     let DynUtil c1 m1 d1 n1 = Node.utilLoad n
320
                         DynUtil c2 m2 d2 n2 = Node.utilPool n
321
                     in (c1/c2, m1/m2, d1/d2, n1/n2)
322
                ) nodes
323
        -- metric: conflicting instance count
324
        pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
325
        pri_tags_score = fromIntegral pri_tags_inst::Double
326
    in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
327
       , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
328
       , pri_tags_score ]
329

    
330
-- | Compute the /total/ variance.
331
compCVNodes :: [Node.Node] -> Double
332
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
333

    
334
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
335
compCV :: Node.List -> Double
336
compCV = compCVNodes . Container.elems
337

    
338
-- | Compute online nodes from a 'Node.List'.
339
getOnline :: Node.List -> [Node.Node]
340
getOnline = filter (not . Node.offline) . Container.elems
341

    
342
-- * Balancing functions
343

    
344
-- | Compute best table. Note that the ordering of the arguments is important.
345
compareTables :: Table -> Table -> Table
346
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
347
    if a_cv > b_cv then b else a
348

    
349
-- | Applies an instance move to a given node list and instance.
350
applyMove :: Node.List -> Instance.Instance
351
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
352
-- Failover (f)
353
applyMove nl inst Failover =
354
    let old_pdx = Instance.pNode inst
355
        old_sdx = Instance.sNode inst
356
        old_p = Container.find old_pdx nl
357
        old_s = Container.find old_sdx nl
358
        int_p = Node.removePri old_p inst
359
        int_s = Node.removeSec old_s inst
360
        force_p = Node.offline old_p
361
        new_nl = do -- Maybe monad
362
          new_p <- Node.addPriEx force_p int_s inst
363
          new_s <- Node.addSec int_p inst old_sdx
364
          let new_inst = Instance.setBoth inst old_sdx old_pdx
365
          return (Container.addTwo old_pdx new_s old_sdx new_p nl,
366
                  new_inst, old_sdx, old_pdx)
367
    in new_nl
368

    
369
-- Replace the primary (f:, r:np, f)
370
applyMove nl inst (ReplacePrimary new_pdx) =
371
    let old_pdx = Instance.pNode inst
372
        old_sdx = Instance.sNode inst
373
        old_p = Container.find old_pdx nl
374
        old_s = Container.find old_sdx nl
375
        tgt_n = Container.find new_pdx nl
376
        int_p = Node.removePri old_p inst
377
        int_s = Node.removeSec old_s inst
378
        force_p = Node.offline old_p
379
        new_nl = do -- Maybe monad
380
          -- check that the current secondary can host the instance
381
          -- during the migration
382
          tmp_s <- Node.addPriEx force_p int_s inst
383
          let tmp_s' = Node.removePri tmp_s inst
384
          new_p <- Node.addPriEx force_p tgt_n inst
385
          new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
386
          let new_inst = Instance.setPri inst new_pdx
387
          return (Container.add new_pdx new_p $
388
                  Container.addTwo old_pdx int_p old_sdx new_s nl,
389
                  new_inst, new_pdx, old_sdx)
390
    in new_nl
391

    
392
-- Replace the secondary (r:ns)
393
applyMove nl inst (ReplaceSecondary new_sdx) =
394
    let old_pdx = Instance.pNode inst
395
        old_sdx = Instance.sNode inst
396
        old_s = Container.find old_sdx nl
397
        tgt_n = Container.find new_sdx nl
398
        int_s = Node.removeSec old_s inst
399
        force_s = Node.offline old_s
400
        new_inst = Instance.setSec inst new_sdx
401
        new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
402
                 \new_s -> return (Container.addTwo new_sdx
403
                                   new_s old_sdx int_s nl,
404
                                   new_inst, old_pdx, new_sdx)
405
    in new_nl
406

    
407
-- Replace the secondary and failover (r:np, f)
408
applyMove nl inst (ReplaceAndFailover new_pdx) =
409
    let old_pdx = Instance.pNode inst
410
        old_sdx = Instance.sNode inst
411
        old_p = Container.find old_pdx nl
412
        old_s = Container.find old_sdx nl
413
        tgt_n = Container.find new_pdx nl
414
        int_p = Node.removePri old_p inst
415
        int_s = Node.removeSec old_s inst
416
        force_s = Node.offline old_s
417
        new_nl = do -- Maybe monad
418
          new_p <- Node.addPri tgt_n inst
419
          new_s <- Node.addSecEx force_s int_p inst new_pdx
420
          let new_inst = Instance.setBoth inst new_pdx old_pdx
421
          return (Container.add new_pdx new_p $
422
                  Container.addTwo old_pdx new_s old_sdx int_s nl,
423
                  new_inst, new_pdx, old_pdx)
424
    in new_nl
425

    
426
-- Failver and replace the secondary (f, r:ns)
427
applyMove nl inst (FailoverAndReplace new_sdx) =
428
    let old_pdx = Instance.pNode inst
429
        old_sdx = Instance.sNode inst
430
        old_p = Container.find old_pdx nl
431
        old_s = Container.find old_sdx nl
432
        tgt_n = Container.find new_sdx nl
433
        int_p = Node.removePri old_p inst
434
        int_s = Node.removeSec old_s inst
435
        force_p = Node.offline old_p
436
        new_nl = do -- Maybe monad
437
          new_p <- Node.addPriEx force_p int_s inst
438
          new_s <- Node.addSecEx force_p tgt_n inst old_sdx
439
          let new_inst = Instance.setBoth inst old_sdx new_sdx
440
          return (Container.add new_sdx new_s $
441
                  Container.addTwo old_sdx new_p old_pdx int_p nl,
442
                  new_inst, old_sdx, new_sdx)
443
    in new_nl
444

    
445
-- | Tries to allocate an instance on one given node.
446
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
447
                 -> OpResult Node.AllocElement
448
allocateOnSingle nl inst new_pdx =
449
    let p = Container.find new_pdx nl
450
        new_inst = Instance.setBoth inst new_pdx Node.noSecondary
451
    in  Node.addPri p inst >>= \new_p -> do
452
      let new_nl = Container.add new_pdx new_p nl
453
          new_score = compCV nl
454
      return (new_nl, new_inst, [new_p], new_score)
455

    
456
-- | Tries to allocate an instance on a given pair of nodes.
457
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
458
               -> OpResult Node.AllocElement
459
allocateOnPair nl inst new_pdx new_sdx =
460
    let tgt_p = Container.find new_pdx nl
461
        tgt_s = Container.find new_sdx nl
462
    in do
463
      new_p <- Node.addPri tgt_p inst
464
      new_s <- Node.addSec tgt_s inst new_pdx
465
      let new_inst = Instance.setBoth inst new_pdx new_sdx
466
          new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
467
      return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
468

    
469
-- | Tries to perform an instance move and returns the best table
470
-- between the original one and the new one.
471
checkSingleStep :: Table -- ^ The original table
472
                -> Instance.Instance -- ^ The instance to move
473
                -> Table -- ^ The current best table
474
                -> IMove -- ^ The move to apply
475
                -> Table -- ^ The final best table
476
checkSingleStep ini_tbl target cur_tbl move =
477
    let
478
        Table ini_nl ini_il _ ini_plc = ini_tbl
479
        tmp_resu = applyMove ini_nl target move
480
    in
481
      case tmp_resu of
482
        OpFail _ -> cur_tbl
483
        OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
484
            let tgt_idx = Instance.idx target
485
                upd_cvar = compCV upd_nl
486
                upd_il = Container.add tgt_idx new_inst ini_il
487
                upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
488
                upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
489
            in
490
              compareTables cur_tbl upd_tbl
491

    
492
-- | Given the status of the current secondary as a valid new node and
493
-- the current candidate target node, generate the possible moves for
494
-- a instance.
495
possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
496
              -> Bool      -- ^ Whether we can change the primary node
497
              -> Ndx       -- ^ Target node candidate
498
              -> [IMove]   -- ^ List of valid result moves
499

    
500
possibleMoves _ False tdx =
501
    [ReplaceSecondary tdx]
502

    
503
possibleMoves True True tdx =
504
    [ReplaceSecondary tdx,
505
     ReplaceAndFailover tdx,
506
     ReplacePrimary tdx,
507
     FailoverAndReplace tdx]
508

    
509
possibleMoves False True tdx =
510
    [ReplaceSecondary tdx,
511
     ReplaceAndFailover tdx]
512

    
513
-- | Compute the best move for a given instance.
514
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
515
                  -> Bool              -- ^ Whether disk moves are allowed
516
                  -> Bool              -- ^ Whether instance moves are allowed
517
                  -> Table             -- ^ Original table
518
                  -> Instance.Instance -- ^ Instance to move
519
                  -> Table             -- ^ Best new table for this instance
520
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
521
    let
522
        opdx = Instance.pNode target
523
        osdx = Instance.sNode target
524
        nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
525
        use_secondary = elem osdx nodes_idx && inst_moves
526
        aft_failover = if use_secondary -- if allowed to failover
527
                       then checkSingleStep ini_tbl target ini_tbl Failover
528
                       else ini_tbl
529
        all_moves = if disk_moves
530
                    then concatMap
531
                         (possibleMoves use_secondary inst_moves) nodes
532
                    else []
533
    in
534
      -- iterate over the possible nodes for this instance
535
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
536

    
537
-- | Compute the best next move.
538
checkMove :: [Ndx]               -- ^ Allowed target node indices
539
          -> Bool                -- ^ Whether disk moves are allowed
540
          -> Bool                -- ^ Whether instance moves are allowed
541
          -> Table               -- ^ The current solution
542
          -> [Instance.Instance] -- ^ List of instances still to move
543
          -> Table               -- ^ The new solution
544
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
545
    let Table _ _ _ ini_plc = ini_tbl
546
        -- we're using rwhnf from the Control.Parallel.Strategies
547
        -- package; we don't need to use rnf as that would force too
548
        -- much evaluation in single-threaded cases, and in
549
        -- multi-threaded case the weak head normal form is enough to
550
        -- spark the evaluation
551
        tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
552
                               inst_moves ini_tbl)
553
                 victims
554
        -- iterate over all instances, computing the best move
555
        best_tbl = foldl' compareTables ini_tbl tables
556
        Table _ _ _ best_plc = best_tbl
557
    in if length best_plc == length ini_plc
558
       then ini_tbl -- no advancement
559
       else best_tbl
560

    
561
-- | Check if we are allowed to go deeper in the balancing.
562
doNextBalance :: Table     -- ^ The starting table
563
              -> Int       -- ^ Remaining length
564
              -> Score     -- ^ Score at which to stop
565
              -> Bool      -- ^ The resulting table and commands
566
doNextBalance ini_tbl max_rounds min_score =
567
    let Table _ _ ini_cv ini_plc = ini_tbl
568
        ini_plc_len = length ini_plc
569
    in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
570

    
571
-- | Run a balance move.
572
tryBalance :: Table       -- ^ The starting table
573
           -> Bool        -- ^ Allow disk moves
574
           -> Bool        -- ^ Allow instance moves
575
           -> Bool        -- ^ Only evacuate moves
576
           -> Score       -- ^ Min gain threshold
577
           -> Score       -- ^ Min gain
578
           -> Maybe Table -- ^ The resulting table and commands
579
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
580
    let Table ini_nl ini_il ini_cv _ = ini_tbl
581
        all_inst = Container.elems ini_il
582
        all_inst' = if evac_mode
583
                    then let bad_nodes = map Node.idx . filter Node.offline $
584
                                         Container.elems ini_nl
585
                         in filter (\e -> Instance.sNode e `elem` bad_nodes ||
586
                                          Instance.pNode e `elem` bad_nodes)
587
                            all_inst
588
                    else all_inst
589
        reloc_inst = filter Instance.movable all_inst'
590
        node_idx = map Node.idx . filter (not . Node.offline) $
591
                   Container.elems ini_nl
592
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
593
        (Table _ _ fin_cv _) = fin_tbl
594
    in
595
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
596
      then Just fin_tbl -- this round made success, return the new table
597
      else Nothing
598

    
599
-- * Allocation functions
600

    
601
-- | Build failure stats out of a list of failures.
602
collapseFailures :: [FailMode] -> FailStats
603
collapseFailures flst =
604
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
605
            [minBound..maxBound]
606

    
607
-- | Update current Allocation solution and failure stats with new
608
-- elements.
609
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
610
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
611

    
612
concatAllocs as (OpGood ns@(_, _, _, nscore)) =
613
    let -- Choose the old or new solution, based on the cluster score
614
        cntok = asAllocs as
615
        osols = asSolutions as
616
        nsols = case osols of
617
                  [] -> [ns]
618
                  (_, _, _, oscore):[] ->
619
                      if oscore < nscore
620
                      then osols
621
                      else [ns]
622
                  -- FIXME: here we simply concat to lists with more
623
                  -- than one element; we should instead abort, since
624
                  -- this is not a valid usage of this function
625
                  xs -> ns:xs
626
        nsuc = cntok + 1
627
    -- Note: we force evaluation of nsols here in order to keep the
628
    -- memory profile low - we know that we will need nsols for sure
629
    -- in the next cycle, so we force evaluation of nsols, since the
630
    -- foldl' in the caller will only evaluate the tuple, but not the
631
    -- elements of the tuple
632
    in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolutions = nsols }
633

    
634
-- | Given a solution, generates a reasonable description for it.
635
describeSolution :: AllocSolution -> String
636
describeSolution as =
637
  let fcnt = asFailures as
638
      sols = asSolutions as
639
      freasons =
640
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
641
        filter ((> 0) . snd) . collapseFailures $ fcnt
642
  in if null sols
643
     then "No valid allocation solutions, failure reasons: " ++
644
          (if null fcnt
645
           then "unknown reasons"
646
           else freasons)
647
     else let (_, _, nodes, cv) = head sols
648
          in printf ("score: %.8f, successes %d, failures %d (%s)" ++
649
                     " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
650
             (intercalate "/" . map Node.name $ nodes)
651

    
652
-- | Annotates a solution with the appropriate string.
653
annotateSolution :: AllocSolution -> AllocSolution
654
annotateSolution as = as { asLog = describeSolution as : asLog as }
655

    
656
-- | Reverses an evacuation solution.
657
--
658
-- Rationale: we always concat the results to the top of the lists, so
659
-- for proper jobset execution, we should reverse all lists.
660
reverseEvacSolution :: EvacSolution -> EvacSolution
661
reverseEvacSolution (EvacSolution f m o) =
662
    EvacSolution (reverse f) (reverse m) (reverse o)
663

    
664
-- | Generate the valid node allocation singles or pairs for a new instance.
665
genAllocNodes :: Group.List        -- ^ Group list
666
              -> Node.List         -- ^ The node map
667
              -> Int               -- ^ The number of nodes required
668
              -> Bool              -- ^ Whether to drop or not
669
                                   -- unallocable nodes
670
              -> Result AllocNodes -- ^ The (monadic) result
671
genAllocNodes gl nl count drop_unalloc =
672
    let filter_fn = if drop_unalloc
673
                    then filter (Group.isAllocable .
674
                                 flip Container.find gl . Node.group)
675
                    else id
676
        all_nodes = filter_fn $ getOnline nl
677
        all_pairs = liftM2 (,) all_nodes all_nodes
678
        ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y &&
679
                                      Node.group x == Node.group y) all_pairs
680
    in case count of
681
         1 -> Ok (Left (map Node.idx all_nodes))
682
         2 -> Ok (Right (map (\(p, s) -> (Node.idx p, Node.idx s)) ok_pairs))
683
         _ -> Bad "Unsupported number of nodes, only one or two  supported"
684

    
685
-- | Try to allocate an instance on the cluster.
686
tryAlloc :: (Monad m) =>
687
            Node.List         -- ^ The node list
688
         -> Instance.List     -- ^ The instance list
689
         -> Instance.Instance -- ^ The instance to allocate
690
         -> AllocNodes        -- ^ The allocation targets
691
         -> m AllocSolution   -- ^ Possible solution list
692
tryAlloc nl _ inst (Right ok_pairs) =
693
    let sols = foldl' (\cstate (p, s) ->
694
                           concatAllocs cstate $ allocateOnPair nl inst p s
695
                      ) emptyAllocSolution ok_pairs
696

    
697
    in if null ok_pairs -- means we have just one node
698
       then fail "Not enough online nodes"
699
       else return $ annotateSolution sols
700

    
701
tryAlloc nl _ inst (Left all_nodes) =
702
    let sols = foldl' (\cstate ->
703
                           concatAllocs cstate . allocateOnSingle nl inst
704
                      ) emptyAllocSolution all_nodes
705
    in if null all_nodes
706
       then fail "No online nodes"
707
       else return $ annotateSolution sols
708

    
709
-- | Given a group/result, describe it as a nice (list of) messages.
710
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
711
solutionDescription gl (groupId, result) =
712
  case result of
713
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
714
    Bad message -> [printf "Group %s: error %s" gname message]
715
  where grp = Container.find groupId gl
716
        gname = Group.name grp
717
        pol = apolToString (Group.allocPolicy grp)
718

    
719
-- | From a list of possibly bad and possibly empty solutions, filter
720
-- only the groups with a valid result. Note that the result will be
721
-- reversed compared to the original list.
722
filterMGResults :: Group.List
723
                -> [(Gdx, Result AllocSolution)]
724
                -> [(Gdx, AllocSolution)]
725
filterMGResults gl = foldl' fn []
726
    where unallocable = not . Group.isAllocable . flip Container.find gl
727
          fn accu (gdx, rasol) =
728
              case rasol of
729
                Bad _ -> accu
730
                Ok sol | null (asSolutions sol) -> accu
731
                       | unallocable gdx -> accu
732
                       | otherwise -> (gdx, sol):accu
733

    
734
-- | Sort multigroup results based on policy and score.
735
sortMGResults :: Group.List
736
             -> [(Gdx, AllocSolution)]
737
             -> [(Gdx, AllocSolution)]
738
sortMGResults gl sols =
739
    let extractScore (_, _, _, x) = x
740
        solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
741
                               (extractScore . head . asSolutions) sol)
742
    in sortBy (comparing solScore) sols
743

    
744
-- | Finds the best group for an instance on a multi-group cluster.
745
--
746
-- Only solutions in @preferred@ and @last_resort@ groups will be
747
-- accepted as valid, and additionally if the allowed groups parameter
748
-- is not null then allocation will only be run for those group
749
-- indices.
750
findBestAllocGroup :: Group.List           -- ^ The group list
751
                   -> Node.List            -- ^ The node list
752
                   -> Instance.List        -- ^ The instance list
753
                   -> Maybe [Gdx]          -- ^ The allowed groups
754
                   -> Instance.Instance    -- ^ The instance to allocate
755
                   -> Int                  -- ^ Required number of nodes
756
                   -> Result (Gdx, AllocSolution, [String])
757
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
758
  let groups = splitCluster mgnl mgil
759
      groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
760
                allowed_gdxs
761
      sols = map (\(gid, (nl, il)) ->
762
                   (gid, genAllocNodes mggl nl cnt False >>=
763
                       tryAlloc nl il inst))
764
             groups'::[(Gdx, Result AllocSolution)]
765
      all_msgs = concatMap (solutionDescription mggl) sols
766
      goodSols = filterMGResults mggl sols
767
      sortedSols = sortMGResults mggl goodSols
768
  in if null sortedSols
769
     then Bad $ intercalate ", " all_msgs
770
     else let (final_group, final_sol) = head sortedSols
771
          in return (final_group, final_sol, all_msgs)
772

    
773
-- | Try to allocate an instance on a multi-group cluster.
774
tryMGAlloc :: Group.List           -- ^ The group list
775
           -> Node.List            -- ^ The node list
776
           -> Instance.List        -- ^ The instance list
777
           -> Instance.Instance    -- ^ The instance to allocate
778
           -> Int                  -- ^ Required number of nodes
779
           -> Result AllocSolution -- ^ Possible solution list
780
tryMGAlloc mggl mgnl mgil inst cnt = do
781
  (best_group, solution, all_msgs) <-
782
      findBestAllocGroup mggl mgnl mgil Nothing inst cnt
783
  let group_name = Group.name $ Container.find best_group mggl
784
      selmsg = "Selected group: " ++ group_name
785
  return $ solution { asLog = selmsg:all_msgs }
786

    
787
-- | Try to relocate an instance on the cluster.
788
tryReloc :: (Monad m) =>
789
            Node.List       -- ^ The node list
790
         -> Instance.List   -- ^ The instance list
791
         -> Idx             -- ^ The index of the instance to move
792
         -> Int             -- ^ The number of nodes required
793
         -> [Ndx]           -- ^ Nodes which should not be used
794
         -> m AllocSolution -- ^ Solution list
795
tryReloc nl il xid 1 ex_idx =
796
    let all_nodes = getOnline nl
797
        inst = Container.find xid il
798
        ex_idx' = Instance.pNode inst:ex_idx
799
        valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
800
        valid_idxes = map Node.idx valid_nodes
801
        sols1 = foldl' (\cstate x ->
802
                            let em = do
803
                                  (mnl, i, _, _) <-
804
                                      applyMove nl inst (ReplaceSecondary x)
805
                                  return (mnl, i, [Container.find x mnl],
806
                                          compCV mnl)
807
                            in concatAllocs cstate em
808
                       ) emptyAllocSolution valid_idxes
809
    in return sols1
810

    
811
tryReloc _ _ _ reqn _  = fail $ "Unsupported number of relocation \
812
                                \destinations required (" ++ show reqn ++
813
                                                  "), only one supported"
814

    
815
-- | Change an instance's secondary node.
816
evacInstance :: (Monad m) =>
817
                [Ndx]                      -- ^ Excluded nodes
818
             -> Instance.List              -- ^ The current instance list
819
             -> (Node.List, AllocSolution) -- ^ The current state
820
             -> Idx                        -- ^ The instance to evacuate
821
             -> m (Node.List, AllocSolution)
822
evacInstance ex_ndx il (nl, old_as) idx = do
823
  -- FIXME: hardcoded one node here
824

    
825
  -- Longer explanation: evacuation is currently hardcoded to DRBD
826
  -- instances (which have one secondary); hence, even if the
827
  -- IAllocator protocol can request N nodes for an instance, and all
828
  -- the message parsing/loading pass this, this implementation only
829
  -- supports one; this situation needs to be revisited if we ever
830
  -- support more than one secondary, or if we change the storage
831
  -- model
832
  new_as <- tryReloc nl il idx 1 ex_ndx
833
  case asSolutions new_as of
834
    -- an individual relocation succeeded, we kind of compose the data
835
    -- from the two solutions
836
    csol@(nl', _, _, _):_ ->
837
        return (nl', new_as { asSolutions = csol:asSolutions old_as })
838
    -- this relocation failed, so we fail the entire evac
839
    _ -> fail $ "Can't evacuate instance " ++
840
         Instance.name (Container.find idx il) ++
841
             ": " ++ describeSolution new_as
842

    
843
-- | Try to evacuate a list of nodes.
844
tryEvac :: (Monad m) =>
845
            Node.List       -- ^ The node list
846
         -> Instance.List   -- ^ The instance list
847
         -> [Idx]           -- ^ Instances to be evacuated
848
         -> [Ndx]           -- ^ Restricted nodes (the ones being evacuated)
849
         -> m AllocSolution -- ^ Solution list
850
tryEvac nl il idxs ex_ndx = do
851
  (_, sol) <- foldM (evacInstance ex_ndx il) (nl, emptyAllocSolution) idxs
852
  return sol
853

    
854
-- | Function which fails if the requested mode is change secondary.
855
--
856
-- This is useful since except DRBD, no other disk template can
857
-- execute change secondary; thus, we can just call this function
858
-- instead of always checking for secondary mode. After the call to
859
-- this function, whatever mode we have is just a primary change.
860
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
861
failOnSecondaryChange ChangeSecondary dt =
862
    fail $ "Instances with disk template '" ++ dtToString dt ++
863
         "' can't execute change secondary"
864
failOnSecondaryChange _ _ = return ()
865

    
866
-- | Run evacuation for a single instance.
867
--
868
-- /Note:/ this function should correctly execute both intra-group
869
-- evacuations (in all modes) and inter-group evacuations (in the
870
-- 'ChangeAll' mode). Of course, this requires that the correct list
871
-- of target nodes is passed.
872
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
873
                 -> Instance.List     -- ^ Instance list (cluster-wide)
874
                 -> EvacMode          -- ^ The evacuation mode
875
                 -> Instance.Instance -- ^ The instance to be evacuated
876
                 -> Gdx               -- ^ The group we're targetting
877
                 -> [Ndx]             -- ^ The list of available nodes
878
                                      -- for allocation
879
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
880
nodeEvacInstance _ _ mode (Instance.Instance
881
                           {Instance.diskTemplate = dt@DTDiskless}) _ _ =
882
                  failOnSecondaryChange mode dt >>
883
                  fail "Diskless relocations not implemented yet"
884

    
885
nodeEvacInstance _ _ _ (Instance.Instance
886
                        {Instance.diskTemplate = DTPlain}) _ _ =
887
                  fail "Instances of type plain cannot be relocated"
888

    
889
nodeEvacInstance _ _ _ (Instance.Instance
890
                        {Instance.diskTemplate = DTFile}) _ _ =
891
                  fail "Instances of type file cannot be relocated"
892

    
893
nodeEvacInstance _ _ mode  (Instance.Instance
894
                            {Instance.diskTemplate = dt@DTSharedFile}) _ _ =
895
                  failOnSecondaryChange mode dt >>
896
                  fail "Shared file relocations not implemented yet"
897

    
898
nodeEvacInstance _ _ mode (Instance.Instance
899
                           {Instance.diskTemplate = dt@DTBlock}) _ _ =
900
                  failOnSecondaryChange mode dt >>
901
                  fail "Block device relocations not implemented yet"
902

    
903
nodeEvacInstance nl il ChangePrimary
904
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
905
                 _ _ =
906
  do
907
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
908
    let idx = Instance.idx inst
909
        il' = Container.add idx inst' il
910
        ops = iMoveToJob nl' il' idx Failover
911
    return (nl', il', ops)
912

    
913
nodeEvacInstance nl il ChangeSecondary
914
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
915
                 gdx avail_nodes =
916
  do
917
    (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
918
                            eitherToResult $
919
                            foldl' (evacDrbdSecondaryInner nl inst gdx)
920
                            (Left "no nodes available") avail_nodes
921
    let idx = Instance.idx inst
922
        il' = Container.add idx inst' il
923
        ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx)
924
    return (nl', il', ops)
925

    
926
nodeEvacInstance nl il ChangeAll
927
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
928
                 gdx avail_nodes =
929
  do
930
    let primary = Container.find (Instance.pNode inst) nl
931
        idx = Instance.idx inst
932
        no_nodes = Left "no nodes available"
933
    -- if the primary is offline, then we first failover
934
    (nl1, inst1, ops1) <-
935
        if Node.offline primary
936
        then do
937
          (nl', inst', _, _) <-
938
              annotateResult "Failing over to the secondary" $
939
              opToResult $ applyMove nl inst Failover
940
          return (nl', inst', [Failover])
941
        else return (nl, inst, [])
942
    -- we now need to execute a replace secondary to the future
943
    -- primary node
944
    (nl2, inst2, _, new_pdx) <- annotateResult "Searching for a new primary" $
945
                                eitherToResult $
946
                                foldl' (evacDrbdSecondaryInner nl1 inst1 gdx)
947
                                no_nodes avail_nodes
948
    let ops2 = ReplaceSecondary new_pdx:ops1
949
    -- since we chose the new primary, we remove it from the list of
950
    -- available nodes
951
    let avail_nodes_sec = new_pdx `delete` avail_nodes
952
    -- we now execute another failover, the primary stays fixed now
953
    (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
954
                          opToResult $ applyMove nl2 inst2 Failover
955
    let ops3 = Failover:ops2
956
    -- and finally another replace secondary, to the final secondary
957
    (nl4, inst4, _, new_sdx) <-
958
        annotateResult "Searching for a new secondary" $
959
        eitherToResult $
960
        foldl' (evacDrbdSecondaryInner nl3 inst3 gdx) no_nodes avail_nodes_sec
961
    let ops4 = ReplaceSecondary new_sdx:ops3
962
        il' = Container.add idx inst4 il
963
        ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
964
    return (nl4, il', ops)
965

    
966
-- | Inner fold function for changing secondary of a DRBD instance.
967
--
968
-- The "running" solution is either a @Left String@, which means we
969
-- don't have yet a working solution, or a @Right (...)@, which
970
-- represents a valid solution; it holds the modified node list, the
971
-- modified instance (after evacuation), the score of that solution,
972
-- and the new secondary node index.
973
evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list
974
                       -> Instance.Instance -- ^ Instance being evacuated
975
                       -> Gdx -- ^ The group index of the instance
976
                       -> Either String ( Node.List
977
                                        , Instance.Instance
978
                                        , Score
979
                                        , Ndx)  -- ^ Current best solution
980
                       -> Ndx  -- ^ Node we're evaluating as new secondary
981
                       -> Either String ( Node.List
982
                                        , Instance.Instance
983
                                        , Score
984
                                        , Ndx) -- ^ New best solution
985
evacDrbdSecondaryInner nl inst gdx accu ndx =
986
    case applyMove nl inst (ReplaceSecondary ndx) of
987
      OpFail fm ->
988
          case accu of
989
            Right _ -> accu
990
            Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
991
                      " failed: " ++ show fm
992
      OpGood (nl', inst', _, _) ->
993
          let nodes = Container.elems nl'
994
              -- The fromJust below is ugly (it can fail nastily), but
995
              -- at this point we should have any internal mismatches,
996
              -- and adding a monad here would be quite involved
997
              grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
998
              new_cv = compCVNodes grpnodes
999
              new_accu = Right (nl', inst', new_cv, ndx)
1000
          in case accu of
1001
               Left _ -> new_accu
1002
               Right (_, _, old_cv, _) ->
1003
                   if old_cv < new_cv
1004
                   then accu
1005
                   else new_accu
1006

    
1007
-- | Computes the nodes in a given group which are available for
1008
-- allocation.
1009
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1010
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
1011
                    -> Gdx            -- ^ The group for which we
1012
                                      -- query the nodes
1013
                    -> Result [Ndx]   -- ^ List of available node indices
1014
availableGroupNodes group_nodes excl_ndx gdx = do
1015
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1016
                 Ok (lookup gdx group_nodes)
1017
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1018
  return avail_nodes
1019

    
1020
-- | Updates the evac solution with the results of an instance
1021
-- evacuation.
1022
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1023
                   -> Idx
1024
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1025
                   -> (Node.List, Instance.List, EvacSolution)
1026
updateEvacSolution (nl, il, es) idx (Bad msg) =
1027
    (nl, il, es { esFailed = (idx, msg):esFailed es})
1028
updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1029
    (nl, il, es { esMoved = new_elem:esMoved es
1030
                , esOpCodes = [opcodes]:esOpCodes es })
1031
     where inst = Container.find idx il
1032
           new_elem = (idx,
1033
                       instancePriGroup nl inst,
1034
                       Instance.allNodes inst)
1035

    
1036
-- | Node-evacuation IAllocator mode main function.
1037
tryNodeEvac :: Group.List    -- ^ The cluster groups
1038
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
1039
            -> Instance.List -- ^ Instance list (cluster-wide)
1040
            -> EvacMode      -- ^ The evacuation mode
1041
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
1042
            -> Result (Node.List, Instance.List, EvacSolution)
1043
tryNodeEvac _ ini_nl ini_il mode idxs =
1044
    let evac_ndx = nodesToEvacuate ini_il mode idxs
1045
        offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1046
        excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1047
        group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1048
                                             (Container.elems nl))) $
1049
                      splitCluster ini_nl ini_il
1050
        (fin_nl, fin_il, esol) =
1051
            foldl' (\state@(nl, il, _) inst ->
1052
                        let gdx = instancePriGroup nl inst in
1053
                        updateEvacSolution state (Instance.idx inst) $
1054
                        availableGroupNodes group_ndx
1055
                          excl_ndx gdx >>=
1056
                        nodeEvacInstance nl il mode inst gdx
1057
                   )
1058
            (ini_nl, ini_il, emptyEvacSolution)
1059
            (map (`Container.find` ini_il) idxs)
1060
    in return (fin_nl, fin_il, reverseEvacSolution esol)
1061

    
1062
-- | Change-group IAllocator mode main function.
1063
--
1064
-- This is very similar to 'tryNodeEvac', the only difference is that
1065
-- we don't choose as target group the current instance group, but
1066
-- instead:
1067
--
1068
--   1. at the start of the function, we compute which are the target
1069
--   groups; either no groups were passed in, in which case we choose
1070
--   all groups out of which we don't evacuate instance, or there were
1071
--   some groups passed, in which case we use those
1072
--
1073
--   2. for each instance, we use 'findBestAllocGroup' to choose the
1074
--   best group to hold the instance, and then we do what
1075
--   'tryNodeEvac' does, except for this group instead of the current
1076
--   instance group.
1077
--
1078
-- Note that the correct behaviour of this function relies on the
1079
-- function 'nodeEvacInstance' to be able to do correctly both
1080
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1081
tryChangeGroup :: Group.List    -- ^ The cluster groups
1082
               -> Node.List     -- ^ The node list (cluster-wide)
1083
               -> Instance.List -- ^ Instance list (cluster-wide)
1084
               -> [Gdx]         -- ^ Target groups; if empty, any
1085
                                -- groups not being evacuated
1086
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
1087
               -> Result (Node.List, Instance.List, EvacSolution)
1088
tryChangeGroup gl ini_nl ini_il gdxs idxs =
1089
    let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1090
                               flip Container.find ini_il) idxs
1091
        target_gdxs = (if null gdxs
1092
                       then Container.keys gl
1093
                       else gdxs) \\ evac_gdxs
1094
        offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1095
        excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1096
        group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1097
                                             (Container.elems nl))) $
1098
                      splitCluster ini_nl ini_il
1099
        (fin_nl, fin_il, esol) =
1100
            foldl' (\state@(nl, il, _) inst ->
1101
                        let solution = do
1102
                              let ncnt = Instance.requiredNodes $
1103
                                         Instance.diskTemplate inst
1104
                              (gdx, _, _) <- findBestAllocGroup gl nl il
1105
                                             (Just target_gdxs) inst ncnt
1106
                              av_nodes <- availableGroupNodes group_ndx
1107
                                          excl_ndx gdx
1108
                              nodeEvacInstance nl il ChangeAll inst
1109
                                       gdx av_nodes
1110
                        in updateEvacSolution state
1111
                               (Instance.idx inst) solution
1112
                   )
1113
            (ini_nl, ini_il, emptyEvacSolution)
1114
            (map (`Container.find` ini_il) idxs)
1115
    in return (fin_nl, fin_il, reverseEvacSolution esol)
1116

    
1117
-- | Recursively place instances on the cluster until we're out of space.
1118
iterateAlloc :: Node.List
1119
             -> Instance.List
1120
             -> Maybe Int
1121
             -> Instance.Instance
1122
             -> AllocNodes
1123
             -> [Instance.Instance]
1124
             -> [CStats]
1125
             -> Result AllocResult
1126
iterateAlloc nl il limit newinst allocnodes ixes cstats =
1127
      let depth = length ixes
1128
          newname = printf "new-%d" depth::String
1129
          newidx = length (Container.elems il) + depth
1130
          newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1131
          newlimit = fmap (flip (-) 1) limit
1132
      in case tryAlloc nl il newi2 allocnodes of
1133
           Bad s -> Bad s
1134
           Ok (AllocSolution { asFailures = errs, asSolutions = sols3 }) ->
1135
               let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1136
               case sols3 of
1137
                 [] -> newsol
1138
                 (xnl, xi, _, _):[] ->
1139
                     if limit == Just 0
1140
                     then newsol
1141
                     else iterateAlloc xnl (Container.add newidx xi il)
1142
                          newlimit newinst allocnodes (xi:ixes)
1143
                          (totalResources xnl:cstats)
1144
                 _ -> Bad "Internal error: multiple solutions for single\
1145
                          \ allocation"
1146

    
1147
-- | The core of the tiered allocation mode.
1148
tieredAlloc :: Node.List
1149
            -> Instance.List
1150
            -> Maybe Int
1151
            -> Instance.Instance
1152
            -> AllocNodes
1153
            -> [Instance.Instance]
1154
            -> [CStats]
1155
            -> Result AllocResult
1156
tieredAlloc nl il limit newinst allocnodes ixes cstats =
1157
    case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1158
      Bad s -> Bad s
1159
      Ok (errs, nl', il', ixes', cstats') ->
1160
          let newsol = Ok (errs, nl', il', ixes', cstats')
1161
              ixes_cnt = length ixes'
1162
              (stop, newlimit) = case limit of
1163
                                   Nothing -> (False, Nothing)
1164
                                   Just n -> (n <= ixes_cnt,
1165
                                              Just (n - ixes_cnt)) in
1166
          if stop then newsol else
1167
          case Instance.shrinkByType newinst . fst . last $
1168
               sortBy (comparing snd) errs of
1169
            Bad _ -> newsol
1170
            Ok newinst' -> tieredAlloc nl' il' newlimit
1171
                           newinst' allocnodes ixes' cstats'
1172

    
1173
-- * Formatting functions
1174

    
1175
-- | Given the original and final nodes, computes the relocation description.
1176
computeMoves :: Instance.Instance -- ^ The instance to be moved
1177
             -> String -- ^ The instance name
1178
             -> IMove  -- ^ The move being performed
1179
             -> String -- ^ New primary
1180
             -> String -- ^ New secondary
1181
             -> (String, [String])
1182
                -- ^ Tuple of moves and commands list; moves is containing
1183
                -- either @/f/@ for failover or @/r:name/@ for replace
1184
                -- secondary, while the command list holds gnt-instance
1185
                -- commands (without that prefix), e.g \"@failover instance1@\"
1186
computeMoves i inam mv c d =
1187
    case mv of
1188
      Failover -> ("f", [mig])
1189
      FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1190
      ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1191
      ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1192
      ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1193
    where morf = if Instance.running i then "migrate" else "failover"
1194
          mig = printf "%s -f %s" morf inam::String
1195
          rep n = printf "replace-disks -n %s %s" n inam
1196

    
1197
-- | Converts a placement to string format.
1198
printSolutionLine :: Node.List     -- ^ The node list
1199
                  -> Instance.List -- ^ The instance list
1200
                  -> Int           -- ^ Maximum node name length
1201
                  -> Int           -- ^ Maximum instance name length
1202
                  -> Placement     -- ^ The current placement
1203
                  -> Int           -- ^ The index of the placement in
1204
                                   -- the solution
1205
                  -> (String, [String])
1206
printSolutionLine nl il nmlen imlen plc pos =
1207
    let
1208
        pmlen = (2*nmlen + 1)
1209
        (i, p, s, mv, c) = plc
1210
        inst = Container.find i il
1211
        inam = Instance.alias inst
1212
        npri = Node.alias $ Container.find p nl
1213
        nsec = Node.alias $ Container.find s nl
1214
        opri = Node.alias $ Container.find (Instance.pNode inst) nl
1215
        osec = Node.alias $ Container.find (Instance.sNode inst) nl
1216
        (moves, cmds) =  computeMoves inst inam mv npri nsec
1217
        ostr = printf "%s:%s" opri osec::String
1218
        nstr = printf "%s:%s" npri nsec::String
1219
    in
1220
      (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
1221
       pos imlen inam pmlen ostr
1222
       pmlen nstr c moves,
1223
       cmds)
1224

    
1225
-- | Return the instance and involved nodes in an instance move.
1226
involvedNodes :: Instance.List -> Placement -> [Ndx]
1227
involvedNodes il plc =
1228
    let (i, np, ns, _, _) = plc
1229
        inst = Container.find i il
1230
        op = Instance.pNode inst
1231
        os = Instance.sNode inst
1232
    in nub [np, ns, op, os]
1233

    
1234
-- | Inner function for splitJobs, that either appends the next job to
1235
-- the current jobset, or starts a new jobset.
1236
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1237
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1238
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1239
    | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1240
    | otherwise = ([n]:cjs, ndx)
1241

    
1242
-- | Break a list of moves into independent groups. Note that this
1243
-- will reverse the order of jobs.
1244
splitJobs :: [MoveJob] -> [JobSet]
1245
splitJobs = fst . foldl mergeJobs ([], [])
1246

    
1247
-- | Given a list of commands, prefix them with @gnt-instance@ and
1248
-- also beautify the display a little.
1249
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1250
formatJob jsn jsl (sn, (_, _, _, cmds)) =
1251
    let out =
1252
            printf "  echo job %d/%d" jsn sn:
1253
            printf "  check":
1254
            map ("  gnt-instance " ++) cmds
1255
    in if sn == 1
1256
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1257
       else out
1258

    
1259
-- | Given a list of commands, prefix them with @gnt-instance@ and
1260
-- also beautify the display a little.
1261
formatCmds :: [JobSet] -> String
1262
formatCmds =
1263
    unlines .
1264
    concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1265
                             (zip [1..] js)) .
1266
    zip [1..]
1267

    
1268
-- | Print the node list.
1269
printNodes :: Node.List -> [String] -> String
1270
printNodes nl fs =
1271
    let fields = case fs of
1272
          [] -> Node.defaultFields
1273
          "+":rest -> Node.defaultFields ++ rest
1274
          _ -> fs
1275
        snl = sortBy (comparing Node.idx) (Container.elems nl)
1276
        (header, isnum) = unzip $ map Node.showHeader fields
1277
    in unlines . map ((:) ' ' .  intercalate " ") $
1278
       formatTable (header:map (Node.list fields) snl) isnum
1279

    
1280
-- | Print the instance list.
1281
printInsts :: Node.List -> Instance.List -> String
1282
printInsts nl il =
1283
    let sil = sortBy (comparing Instance.idx) (Container.elems il)
1284
        helper inst = [ if Instance.running inst then "R" else " "
1285
                      , Instance.name inst
1286
                      , Container.nameOf nl (Instance.pNode inst)
1287
                      , let sdx = Instance.sNode inst
1288
                        in if sdx == Node.noSecondary
1289
                           then  ""
1290
                           else Container.nameOf nl sdx
1291
                      , if Instance.autoBalance inst then "Y" else "N"
1292
                      , printf "%3d" $ Instance.vcpus inst
1293
                      , printf "%5d" $ Instance.mem inst
1294
                      , printf "%5d" $ Instance.dsk inst `div` 1024
1295
                      , printf "%5.3f" lC
1296
                      , printf "%5.3f" lM
1297
                      , printf "%5.3f" lD
1298
                      , printf "%5.3f" lN
1299
                      ]
1300
            where DynUtil lC lM lD lN = Instance.util inst
1301
        header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1302
                 , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1303
        isnum = False:False:False:False:False:repeat True
1304
    in unlines . map ((:) ' ' . intercalate " ") $
1305
       formatTable (header:map helper sil) isnum
1306

    
1307
-- | Shows statistics for a given node list.
1308
printStats :: Node.List -> String
1309
printStats nl =
1310
    let dcvs = compDetailedCV $ Container.elems nl
1311
        (weights, names) = unzip detailedCVInfo
1312
        hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1313
        formatted = map (\(w, header, val) ->
1314
                             printf "%s=%.8f(x%.2f)" header val w::String) hd
1315
    in intercalate ", " formatted
1316

    
1317
-- | Convert a placement into a list of OpCodes (basically a job).
1318
iMoveToJob :: Node.List        -- ^ The node list; only used for node
1319
                               -- names, so any version is good
1320
                               -- (before or after the operation)
1321
           -> Instance.List    -- ^ The instance list; also used for
1322
                               -- names only
1323
           -> Idx              -- ^ The index of the instance being
1324
                               -- moved
1325
           -> IMove            -- ^ The actual move to be described
1326
           -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
1327
                               -- the given move
1328
iMoveToJob nl il idx move =
1329
    let inst = Container.find idx il
1330
        iname = Instance.name inst
1331
        lookNode  = Just . Container.nameOf nl
1332
        opF = OpCodes.OpInstanceMigrate iname True False True
1333
        opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1334
                OpCodes.ReplaceNewSecondary [] Nothing
1335
    in case move of
1336
         Failover -> [ opF ]
1337
         ReplacePrimary np -> [ opF, opR np, opF ]
1338
         ReplaceSecondary ns -> [ opR ns ]
1339
         ReplaceAndFailover np -> [ opR np, opF ]
1340
         FailoverAndReplace ns -> [ opF, opR ns ]
1341

    
1342
-- * Node group functions
1343

    
1344
-- | Computes the group of an instance.
1345
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1346
instanceGroup nl i =
1347
  let sidx = Instance.sNode i
1348
      pnode = Container.find (Instance.pNode i) nl
1349
      snode = if sidx == Node.noSecondary
1350
              then pnode
1351
              else Container.find sidx nl
1352
      pgroup = Node.group pnode
1353
      sgroup = Node.group snode
1354
  in if pgroup /= sgroup
1355
     then fail ("Instance placed accross two node groups, primary " ++
1356
                show pgroup ++ ", secondary " ++ show sgroup)
1357
     else return pgroup
1358

    
1359
-- | Computes the group of an instance per the primary node.
1360
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1361
instancePriGroup nl i =
1362
  let pnode = Container.find (Instance.pNode i) nl
1363
  in  Node.group pnode
1364

    
1365
-- | Compute the list of badly allocated instances (split across node
1366
-- groups).
1367
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1368
findSplitInstances nl =
1369
  filter (not . isOk . instanceGroup nl) . Container.elems
1370

    
1371
-- | Splits a cluster into the component node groups.
1372
splitCluster :: Node.List -> Instance.List ->
1373
                [(Gdx, (Node.List, Instance.List))]
1374
splitCluster nl il =
1375
  let ngroups = Node.computeGroups (Container.elems nl)
1376
  in map (\(guuid, nodes) ->
1377
           let nidxs = map Node.idx nodes
1378
               nodes' = zip nidxs nodes
1379
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1380
           in (guuid, (Container.fromList nodes', instances))) ngroups
1381

    
1382
-- | Compute the list of nodes that are to be evacuated, given a list
1383
-- of instances and an evacuation mode.
1384
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1385
                -> EvacMode      -- ^ The evacuation mode we're using
1386
                -> [Idx]         -- ^ List of instance indices being evacuated
1387
                -> IntSet.IntSet -- ^ Set of node indices
1388
nodesToEvacuate il mode =
1389
    IntSet.delete Node.noSecondary .
1390
    foldl' (\ns idx ->
1391
                let i = Container.find idx il
1392
                    pdx = Instance.pNode i
1393
                    sdx = Instance.sNode i
1394
                    dt = Instance.diskTemplate i
1395
                    withSecondary = case dt of
1396
                                      DTDrbd8 -> IntSet.insert sdx ns
1397
                                      _ -> ns
1398
                in case mode of
1399
                     ChangePrimary   -> IntSet.insert pdx ns
1400
                     ChangeSecondary -> withSecondary
1401
                     ChangeAll       -> IntSet.insert pdx withSecondary
1402
           ) IntSet.empty