Statistics
| Branch: | Tag: | Revision:

root / htools / Ganeti / HTools / Cluster.hs @ a5f248ac

History | View | Annotate | Download (60.6 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
    (
31
     -- * Types
32
      AllocSolution(..)
33
    , EvacSolution(..)
34
    , Table(..)
35
    , CStats(..)
36
    , AllocStats
37
    -- * Generic functions
38
    , totalResources
39
    , computeAllocationDelta
40
    -- * First phase functions
41
    , computeBadItems
42
    -- * Second phase functions
43
    , printSolutionLine
44
    , formatCmds
45
    , involvedNodes
46
    , splitJobs
47
    -- * Display functions
48
    , printNodes
49
    , printInsts
50
    -- * Balacing functions
51
    , checkMove
52
    , doNextBalance
53
    , tryBalance
54
    , compCV
55
    , compCVNodes
56
    , compDetailedCV
57
    , printStats
58
    , iMoveToJob
59
    -- * IAllocator functions
60
    , genAllocNodes
61
    , tryAlloc
62
    , tryMGAlloc
63
    , tryReloc
64
    , tryEvac
65
    , tryNodeEvac
66
    , tryChangeGroup
67
    , collapseFailures
68
    -- * Allocation functions
69
    , iterateAlloc
70
    , tieredAlloc
71
    , tieredSpecMap
72
     -- * Node group functions
73
    , instanceGroup
74
    , findSplitInstances
75
    , splitCluster
76
    ) where
77

    
78
import Data.Function (on)
79
import qualified Data.IntSet as IntSet
80
import Data.List
81
import Data.Maybe (fromJust)
82
import Data.Ord (comparing)
83
import Text.Printf (printf)
84
import Control.Monad
85
import Control.Parallel.Strategies
86

    
87
import qualified Ganeti.HTools.Container as Container
88
import qualified Ganeti.HTools.Instance as Instance
89
import qualified Ganeti.HTools.Node as Node
90
import qualified Ganeti.HTools.Group as Group
91
import Ganeti.HTools.Types
92
import Ganeti.HTools.Utils
93
import qualified Ganeti.OpCodes as OpCodes
94

    
95
-- * Types
96

    
97
-- | Allocation\/relocation solution.
98
data AllocSolution = AllocSolution
99
  { asFailures  :: [FailMode]          -- ^ Failure counts
100
  , asAllocs    :: Int                 -- ^ Good allocation count
101
  , asSolutions :: [Node.AllocElement] -- ^ The actual result, length
102
                                       -- of the list depends on the
103
                                       -- allocation/relocation mode
104
  , asLog       :: [String]            -- ^ A list of informational messages
105
  }
106

    
107
-- | Node evacuation/group change iallocator result type. This result
108
-- type consists of actual opcodes (a restricted subset) that are
109
-- transmitted back to Ganeti.
110
data EvacSolution = EvacSolution
111
    { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
112
    , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
113
                                        -- relocated
114
    , esOpCodes :: [[[OpCodes.OpCode]]] -- ^ List of lists of jobs
115
    }
116

    
117
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
118
type AllocResult = (FailStats, Node.List, Instance.List,
119
                    [Instance.Instance], [CStats])
120

    
121
-- | A type denoting the valid allocation mode/pairs.
122
--
123
-- For a one-node allocation, this will be a @Left ['Node.Node']@,
124
-- whereas for a two-node allocation, this will be a @Right
125
-- [('Node.Node', 'Node.Node')]@.
126
type AllocNodes = Either [Ndx] [(Ndx, Ndx)]
127

    
128
-- | The empty solution we start with when computing allocations.
129
emptyAllocSolution :: AllocSolution
130
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
131
                                   , asSolutions = [], asLog = [] }
132

    
133
-- | The empty evac solution.
134
emptyEvacSolution :: EvacSolution
135
emptyEvacSolution = EvacSolution { esMoved = []
136
                                 , esFailed = []
137
                                 , esOpCodes = []
138
                                 }
139

    
140
-- | The complete state for the balancing solution.
141
data Table = Table Node.List Instance.List Score [Placement]
142
             deriving (Show, Read)
143

    
144
data CStats = CStats { csFmem :: Integer -- ^ Cluster free mem
145
                     , csFdsk :: Integer -- ^ Cluster free disk
146
                     , csAmem :: Integer -- ^ Cluster allocatable mem
147
                     , csAdsk :: Integer -- ^ Cluster allocatable disk
148
                     , csAcpu :: Integer -- ^ Cluster allocatable cpus
149
                     , csMmem :: Integer -- ^ Max node allocatable mem
150
                     , csMdsk :: Integer -- ^ Max node allocatable disk
151
                     , csMcpu :: Integer -- ^ Max node allocatable cpu
152
                     , csImem :: Integer -- ^ Instance used mem
153
                     , csIdsk :: Integer -- ^ Instance used disk
154
                     , csIcpu :: Integer -- ^ Instance used cpu
155
                     , csTmem :: Double  -- ^ Cluster total mem
156
                     , csTdsk :: Double  -- ^ Cluster total disk
157
                     , csTcpu :: Double  -- ^ Cluster total cpus
158
                     , csVcpu :: Integer -- ^ Cluster virtual cpus (if
159
                                         -- node pCpu has been set,
160
                                         -- otherwise -1)
161
                     , csXmem :: Integer -- ^ Unnacounted for mem
162
                     , csNmem :: Integer -- ^ Node own memory
163
                     , csScore :: Score  -- ^ The cluster score
164
                     , csNinst :: Int    -- ^ The total number of instances
165
                     }
166
            deriving (Show, Read)
167

    
168
-- | Currently used, possibly to allocate, unallocable.
169
type AllocStats = (RSpec, RSpec, RSpec)
170

    
171
-- * Utility functions
172

    
173
-- | Verifies the N+1 status and return the affected nodes.
174
verifyN1 :: [Node.Node] -> [Node.Node]
175
verifyN1 = filter Node.failN1
176

    
177
{-| Computes the pair of bad nodes and instances.
178

    
179
The bad node list is computed via a simple 'verifyN1' check, and the
180
bad instance list is the list of primary and secondary instances of
181
those nodes.
182

    
183
-}
184
computeBadItems :: Node.List -> Instance.List ->
185
                   ([Node.Node], [Instance.Instance])
186
computeBadItems nl il =
187
  let bad_nodes = verifyN1 $ getOnline nl
188
      bad_instances = map (`Container.find` il) .
189
                      sort . nub $
190
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
191
  in
192
    (bad_nodes, bad_instances)
193

    
194
-- | Zero-initializer for the CStats type.
195
emptyCStats :: CStats
196
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
197

    
198
-- | Update stats with data from a new node.
199
updateCStats :: CStats -> Node.Node -> CStats
200
updateCStats cs node =
201
    let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
202
                 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
203
                 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
204
                 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
205
                 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
206
                 csVcpu = x_vcpu,
207
                 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
208
               }
209
            = cs
210
        inc_amem = Node.fMem node - Node.rMem node
211
        inc_amem' = if inc_amem > 0 then inc_amem else 0
212
        inc_adsk = Node.availDisk node
213
        inc_imem = truncate (Node.tMem node) - Node.nMem node
214
                   - Node.xMem node - Node.fMem node
215
        inc_icpu = Node.uCpu node
216
        inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
217
        inc_vcpu = Node.hiCpu node
218
        inc_acpu = Node.availCpu node
219

    
220
    in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
221
          , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
222
          , csAmem = x_amem + fromIntegral inc_amem'
223
          , csAdsk = x_adsk + fromIntegral inc_adsk
224
          , csAcpu = x_acpu + fromIntegral inc_acpu
225
          , csMmem = max x_mmem (fromIntegral inc_amem')
226
          , csMdsk = max x_mdsk (fromIntegral inc_adsk)
227
          , csMcpu = max x_mcpu (fromIntegral inc_acpu)
228
          , csImem = x_imem + fromIntegral inc_imem
229
          , csIdsk = x_idsk + fromIntegral inc_idsk
230
          , csIcpu = x_icpu + fromIntegral inc_icpu
231
          , csTmem = x_tmem + Node.tMem node
232
          , csTdsk = x_tdsk + Node.tDsk node
233
          , csTcpu = x_tcpu + Node.tCpu node
234
          , csVcpu = x_vcpu + fromIntegral inc_vcpu
235
          , csXmem = x_xmem + fromIntegral (Node.xMem node)
236
          , csNmem = x_nmem + fromIntegral (Node.nMem node)
237
          , csNinst = x_ninst + length (Node.pList node)
238
          }
239

    
240
-- | Compute the total free disk and memory in the cluster.
241
totalResources :: Node.List -> CStats
242
totalResources nl =
243
    let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
244
    in cs { csScore = compCV nl }
245

    
246
-- | Compute the delta between two cluster state.
247
--
248
-- This is used when doing allocations, to understand better the
249
-- available cluster resources. The return value is a triple of the
250
-- current used values, the delta that was still allocated, and what
251
-- was left unallocated.
252
computeAllocationDelta :: CStats -> CStats -> AllocStats
253
computeAllocationDelta cini cfin =
254
    let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
255
        CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
256
                csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
257
        rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem)
258
               (fromIntegral i_idsk)
259
        rfin = RSpec (fromIntegral (f_icpu - i_icpu))
260
               (fromIntegral (f_imem - i_imem))
261
               (fromIntegral (f_idsk - i_idsk))
262
        un_cpu = fromIntegral (v_cpu - f_icpu)::Int
263
        runa = RSpec un_cpu (truncate t_mem - fromIntegral f_imem)
264
               (truncate t_dsk - fromIntegral f_idsk)
265
    in (rini, rfin, runa)
266

    
267
-- | The names and weights of the individual elements in the CV list.
268
detailedCVInfo :: [(Double, String)]
269
detailedCVInfo = [ (1,  "free_mem_cv")
270
                 , (1,  "free_disk_cv")
271
                 , (1,  "n1_cnt")
272
                 , (1,  "reserved_mem_cv")
273
                 , (4,  "offline_all_cnt")
274
                 , (16, "offline_pri_cnt")
275
                 , (1,  "vcpu_ratio_cv")
276
                 , (1,  "cpu_load_cv")
277
                 , (1,  "mem_load_cv")
278
                 , (1,  "disk_load_cv")
279
                 , (1,  "net_load_cv")
280
                 , (2,  "pri_tags_score")
281
                 ]
282

    
283
detailedCVWeights :: [Double]
284
detailedCVWeights = map fst detailedCVInfo
285

    
286
-- | Compute the mem and disk covariance.
287
compDetailedCV :: [Node.Node] -> [Double]
288
compDetailedCV all_nodes =
289
    let
290
        (offline, nodes) = partition Node.offline all_nodes
291
        mem_l = map Node.pMem nodes
292
        dsk_l = map Node.pDsk nodes
293
        -- metric: memory covariance
294
        mem_cv = stdDev mem_l
295
        -- metric: disk covariance
296
        dsk_cv = stdDev dsk_l
297
        -- metric: count of instances living on N1 failing nodes
298
        n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
299
                                                   length (Node.pList n)) .
300
                   filter Node.failN1 $ nodes :: Double
301
        res_l = map Node.pRem nodes
302
        -- metric: reserved memory covariance
303
        res_cv = stdDev res_l
304
        -- offline instances metrics
305
        offline_ipri = sum . map (length . Node.pList) $ offline
306
        offline_isec = sum . map (length . Node.sList) $ offline
307
        -- metric: count of instances on offline nodes
308
        off_score = fromIntegral (offline_ipri + offline_isec)::Double
309
        -- metric: count of primary instances on offline nodes (this
310
        -- helps with evacuation/failover of primary instances on
311
        -- 2-node clusters with one node offline)
312
        off_pri_score = fromIntegral offline_ipri::Double
313
        cpu_l = map Node.pCpu nodes
314
        -- metric: covariance of vcpu/pcpu ratio
315
        cpu_cv = stdDev cpu_l
316
        -- metrics: covariance of cpu, memory, disk and network load
317
        (c_load, m_load, d_load, n_load) = unzip4 $
318
            map (\n ->
319
                     let DynUtil c1 m1 d1 n1 = Node.utilLoad n
320
                         DynUtil c2 m2 d2 n2 = Node.utilPool n
321
                     in (c1/c2, m1/m2, d1/d2, n1/n2)
322
                ) nodes
323
        -- metric: conflicting instance count
324
        pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
325
        pri_tags_score = fromIntegral pri_tags_inst::Double
326
    in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
327
       , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
328
       , pri_tags_score ]
329

    
330
-- | Compute the /total/ variance.
331
compCVNodes :: [Node.Node] -> Double
332
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
333

    
334
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
335
compCV :: Node.List -> Double
336
compCV = compCVNodes . Container.elems
337

    
338

    
339
-- | Compute online nodes from a 'Node.List'.
340
getOnline :: Node.List -> [Node.Node]
341
getOnline = filter (not . Node.offline) . Container.elems
342

    
343
-- * Balancing functions
344

    
345
-- | Compute best table. Note that the ordering of the arguments is important.
346
compareTables :: Table -> Table -> Table
347
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
348
    if a_cv > b_cv then b else a
349

    
350
-- | Applies an instance move to a given node list and instance.
351
applyMove :: Node.List -> Instance.Instance
352
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
353
-- Failover (f)
354
applyMove nl inst Failover =
355
    let old_pdx = Instance.pNode inst
356
        old_sdx = Instance.sNode inst
357
        old_p = Container.find old_pdx nl
358
        old_s = Container.find old_sdx nl
359
        int_p = Node.removePri old_p inst
360
        int_s = Node.removeSec old_s inst
361
        force_p = Node.offline old_p
362
        new_nl = do -- Maybe monad
363
          new_p <- Node.addPriEx force_p int_s inst
364
          new_s <- Node.addSec int_p inst old_sdx
365
          let new_inst = Instance.setBoth inst old_sdx old_pdx
366
          return (Container.addTwo old_pdx new_s old_sdx new_p nl,
367
                  new_inst, old_sdx, old_pdx)
368
    in new_nl
369

    
370
-- Replace the primary (f:, r:np, f)
371
applyMove nl inst (ReplacePrimary new_pdx) =
372
    let old_pdx = Instance.pNode inst
373
        old_sdx = Instance.sNode inst
374
        old_p = Container.find old_pdx nl
375
        old_s = Container.find old_sdx nl
376
        tgt_n = Container.find new_pdx nl
377
        int_p = Node.removePri old_p inst
378
        int_s = Node.removeSec old_s inst
379
        force_p = Node.offline old_p
380
        new_nl = do -- Maybe monad
381
          -- check that the current secondary can host the instance
382
          -- during the migration
383
          tmp_s <- Node.addPriEx force_p int_s inst
384
          let tmp_s' = Node.removePri tmp_s inst
385
          new_p <- Node.addPriEx force_p tgt_n inst
386
          new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
387
          let new_inst = Instance.setPri inst new_pdx
388
          return (Container.add new_pdx new_p $
389
                  Container.addTwo old_pdx int_p old_sdx new_s nl,
390
                  new_inst, new_pdx, old_sdx)
391
    in new_nl
392

    
393
-- Replace the secondary (r:ns)
394
applyMove nl inst (ReplaceSecondary new_sdx) =
395
    let old_pdx = Instance.pNode inst
396
        old_sdx = Instance.sNode inst
397
        old_s = Container.find old_sdx nl
398
        tgt_n = Container.find new_sdx nl
399
        int_s = Node.removeSec old_s inst
400
        force_s = Node.offline old_s
401
        new_inst = Instance.setSec inst new_sdx
402
        new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
403
                 \new_s -> return (Container.addTwo new_sdx
404
                                   new_s old_sdx int_s nl,
405
                                   new_inst, old_pdx, new_sdx)
406
    in new_nl
407

    
408
-- Replace the secondary and failover (r:np, f)
409
applyMove nl inst (ReplaceAndFailover new_pdx) =
410
    let old_pdx = Instance.pNode inst
411
        old_sdx = Instance.sNode inst
412
        old_p = Container.find old_pdx nl
413
        old_s = Container.find old_sdx nl
414
        tgt_n = Container.find new_pdx nl
415
        int_p = Node.removePri old_p inst
416
        int_s = Node.removeSec old_s inst
417
        force_s = Node.offline old_s
418
        new_nl = do -- Maybe monad
419
          new_p <- Node.addPri tgt_n inst
420
          new_s <- Node.addSecEx force_s int_p inst new_pdx
421
          let new_inst = Instance.setBoth inst new_pdx old_pdx
422
          return (Container.add new_pdx new_p $
423
                  Container.addTwo old_pdx new_s old_sdx int_s nl,
424
                  new_inst, new_pdx, old_pdx)
425
    in new_nl
426

    
427
-- Failver and replace the secondary (f, r:ns)
428
applyMove nl inst (FailoverAndReplace new_sdx) =
429
    let old_pdx = Instance.pNode inst
430
        old_sdx = Instance.sNode inst
431
        old_p = Container.find old_pdx nl
432
        old_s = Container.find old_sdx nl
433
        tgt_n = Container.find new_sdx nl
434
        int_p = Node.removePri old_p inst
435
        int_s = Node.removeSec old_s inst
436
        force_p = Node.offline old_p
437
        new_nl = do -- Maybe monad
438
          new_p <- Node.addPriEx force_p int_s inst
439
          new_s <- Node.addSecEx force_p tgt_n inst old_sdx
440
          let new_inst = Instance.setBoth inst old_sdx new_sdx
441
          return (Container.add new_sdx new_s $
442
                  Container.addTwo old_sdx new_p old_pdx int_p nl,
443
                  new_inst, old_sdx, new_sdx)
444
    in new_nl
445

    
446
-- | Tries to allocate an instance on one given node.
447
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
448
                 -> OpResult Node.AllocElement
449
allocateOnSingle nl inst new_pdx =
450
    let p = Container.find new_pdx nl
451
        new_inst = Instance.setBoth inst new_pdx Node.noSecondary
452
    in  Node.addPri p inst >>= \new_p -> do
453
      let new_nl = Container.add new_pdx new_p nl
454
          new_score = compCV nl
455
      return (new_nl, new_inst, [new_p], new_score)
456

    
457
-- | Tries to allocate an instance on a given pair of nodes.
458
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
459
               -> OpResult Node.AllocElement
460
allocateOnPair nl inst new_pdx new_sdx =
461
    let tgt_p = Container.find new_pdx nl
462
        tgt_s = Container.find new_sdx nl
463
    in do
464
      new_p <- Node.addPri tgt_p inst
465
      new_s <- Node.addSec tgt_s inst new_pdx
466
      let new_inst = Instance.setBoth inst new_pdx new_sdx
467
          new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
468
      return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
469

    
470
-- | Tries to perform an instance move and returns the best table
471
-- between the original one and the new one.
472
checkSingleStep :: Table -- ^ The original table
473
                -> Instance.Instance -- ^ The instance to move
474
                -> Table -- ^ The current best table
475
                -> IMove -- ^ The move to apply
476
                -> Table -- ^ The final best table
477
checkSingleStep ini_tbl target cur_tbl move =
478
    let
479
        Table ini_nl ini_il _ ini_plc = ini_tbl
480
        tmp_resu = applyMove ini_nl target move
481
    in
482
      case tmp_resu of
483
        OpFail _ -> cur_tbl
484
        OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
485
            let tgt_idx = Instance.idx target
486
                upd_cvar = compCV upd_nl
487
                upd_il = Container.add tgt_idx new_inst ini_il
488
                upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
489
                upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
490
            in
491
              compareTables cur_tbl upd_tbl
492

    
493
-- | Given the status of the current secondary as a valid new node and
494
-- the current candidate target node, generate the possible moves for
495
-- a instance.
496
possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
497
              -> Bool      -- ^ Whether we can change the primary node
498
              -> Ndx       -- ^ Target node candidate
499
              -> [IMove]   -- ^ List of valid result moves
500

    
501
possibleMoves _ False tdx =
502
    [ReplaceSecondary tdx]
503

    
504
possibleMoves True True tdx =
505
    [ReplaceSecondary tdx,
506
     ReplaceAndFailover tdx,
507
     ReplacePrimary tdx,
508
     FailoverAndReplace tdx]
509

    
510
possibleMoves False True tdx =
511
    [ReplaceSecondary tdx,
512
     ReplaceAndFailover tdx]
513

    
514
-- | Compute the best move for a given instance.
515
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
516
                  -> Bool              -- ^ Whether disk moves are allowed
517
                  -> Bool              -- ^ Whether instance moves are allowed
518
                  -> Table             -- ^ Original table
519
                  -> Instance.Instance -- ^ Instance to move
520
                  -> Table             -- ^ Best new table for this instance
521
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
522
    let
523
        opdx = Instance.pNode target
524
        osdx = Instance.sNode target
525
        nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
526
        use_secondary = elem osdx nodes_idx && inst_moves
527
        aft_failover = if use_secondary -- if allowed to failover
528
                       then checkSingleStep ini_tbl target ini_tbl Failover
529
                       else ini_tbl
530
        all_moves = if disk_moves
531
                    then concatMap
532
                         (possibleMoves use_secondary inst_moves) nodes
533
                    else []
534
    in
535
      -- iterate over the possible nodes for this instance
536
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
537

    
538
-- | Compute the best next move.
539
checkMove :: [Ndx]               -- ^ Allowed target node indices
540
          -> Bool                -- ^ Whether disk moves are allowed
541
          -> Bool                -- ^ Whether instance moves are allowed
542
          -> Table               -- ^ The current solution
543
          -> [Instance.Instance] -- ^ List of instances still to move
544
          -> Table               -- ^ The new solution
545
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
546
    let Table _ _ _ ini_plc = ini_tbl
547
        -- we're using rwhnf from the Control.Parallel.Strategies
548
        -- package; we don't need to use rnf as that would force too
549
        -- much evaluation in single-threaded cases, and in
550
        -- multi-threaded case the weak head normal form is enough to
551
        -- spark the evaluation
552
        tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
553
                               inst_moves ini_tbl)
554
                 victims
555
        -- iterate over all instances, computing the best move
556
        best_tbl = foldl' compareTables ini_tbl tables
557
        Table _ _ _ best_plc = best_tbl
558
    in if length best_plc == length ini_plc
559
       then ini_tbl -- no advancement
560
       else best_tbl
561

    
562
-- | Check if we are allowed to go deeper in the balancing.
563
doNextBalance :: Table     -- ^ The starting table
564
              -> Int       -- ^ Remaining length
565
              -> Score     -- ^ Score at which to stop
566
              -> Bool      -- ^ The resulting table and commands
567
doNextBalance ini_tbl max_rounds min_score =
568
    let Table _ _ ini_cv ini_plc = ini_tbl
569
        ini_plc_len = length ini_plc
570
    in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
571

    
572
-- | Run a balance move.
573
tryBalance :: Table       -- ^ The starting table
574
           -> Bool        -- ^ Allow disk moves
575
           -> Bool        -- ^ Allow instance moves
576
           -> Bool        -- ^ Only evacuate moves
577
           -> Score       -- ^ Min gain threshold
578
           -> Score       -- ^ Min gain
579
           -> Maybe Table -- ^ The resulting table and commands
580
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
581
    let Table ini_nl ini_il ini_cv _ = ini_tbl
582
        all_inst = Container.elems ini_il
583
        all_inst' = if evac_mode
584
                    then let bad_nodes = map Node.idx . filter Node.offline $
585
                                         Container.elems ini_nl
586
                         in filter (\e -> Instance.sNode e `elem` bad_nodes ||
587
                                          Instance.pNode e `elem` bad_nodes)
588
                            all_inst
589
                    else all_inst
590
        reloc_inst = filter Instance.movable all_inst'
591
        node_idx = map Node.idx . filter (not . Node.offline) $
592
                   Container.elems ini_nl
593
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
594
        (Table _ _ fin_cv _) = fin_tbl
595
    in
596
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
597
      then Just fin_tbl -- this round made success, return the new table
598
      else Nothing
599

    
600
-- * Allocation functions
601

    
602
-- | Build failure stats out of a list of failures.
603
collapseFailures :: [FailMode] -> FailStats
604
collapseFailures flst =
605
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
606
            [minBound..maxBound]
607

    
608
-- | Update current Allocation solution and failure stats with new
609
-- elements.
610
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
611
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
612

    
613
concatAllocs as (OpGood ns@(_, _, _, nscore)) =
614
    let -- Choose the old or new solution, based on the cluster score
615
        cntok = asAllocs as
616
        osols = asSolutions as
617
        nsols = case osols of
618
                  [] -> [ns]
619
                  (_, _, _, oscore):[] ->
620
                      if oscore < nscore
621
                      then osols
622
                      else [ns]
623
                  -- FIXME: here we simply concat to lists with more
624
                  -- than one element; we should instead abort, since
625
                  -- this is not a valid usage of this function
626
                  xs -> ns:xs
627
        nsuc = cntok + 1
628
    -- Note: we force evaluation of nsols here in order to keep the
629
    -- memory profile low - we know that we will need nsols for sure
630
    -- in the next cycle, so we force evaluation of nsols, since the
631
    -- foldl' in the caller will only evaluate the tuple, but not the
632
    -- elements of the tuple
633
    in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolutions = nsols }
634

    
635
-- | Given a solution, generates a reasonable description for it.
636
describeSolution :: AllocSolution -> String
637
describeSolution as =
638
  let fcnt = asFailures as
639
      sols = asSolutions as
640
      freasons =
641
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
642
        filter ((> 0) . snd) . collapseFailures $ fcnt
643
  in if null sols
644
     then "No valid allocation solutions, failure reasons: " ++
645
          (if null fcnt
646
           then "unknown reasons"
647
           else freasons)
648
     else let (_, _, nodes, cv) = head sols
649
          in printf ("score: %.8f, successes %d, failures %d (%s)" ++
650
                     " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
651
             (intercalate "/" . map Node.name $ nodes)
652

    
653
-- | Annotates a solution with the appropriate string.
654
annotateSolution :: AllocSolution -> AllocSolution
655
annotateSolution as = as { asLog = describeSolution as : asLog as }
656

    
657
-- | Reverses an evacuation solution.
658
--
659
-- Rationale: we always concat the results to the top of the lists, so
660
-- for proper jobset execution, we should reverse all lists.
661
reverseEvacSolution :: EvacSolution -> EvacSolution
662
reverseEvacSolution (EvacSolution f m o) =
663
    EvacSolution (reverse f) (reverse m) (reverse o)
664

    
665
-- | Generate the valid node allocation singles or pairs for a new instance.
666
genAllocNodes :: Group.List        -- ^ Group list
667
              -> Node.List         -- ^ The node map
668
              -> Int               -- ^ The number of nodes required
669
              -> Bool              -- ^ Whether to drop or not
670
                                   -- unallocable nodes
671
              -> Result AllocNodes -- ^ The (monadic) result
672
genAllocNodes gl nl count drop_unalloc =
673
    let filter_fn = if drop_unalloc
674
                    then filter (Group.isAllocable .
675
                                 flip Container.find gl . Node.group)
676
                    else id
677
        all_nodes = filter_fn $ getOnline nl
678
        all_pairs = liftM2 (,) all_nodes all_nodes
679
        ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y &&
680
                                      Node.group x == Node.group y) all_pairs
681
    in case count of
682
         1 -> Ok (Left (map Node.idx all_nodes))
683
         2 -> Ok (Right (map (\(p, s) -> (Node.idx p, Node.idx s)) ok_pairs))
684
         _ -> Bad "Unsupported number of nodes, only one or two  supported"
685

    
686
-- | Try to allocate an instance on the cluster.
687
tryAlloc :: (Monad m) =>
688
            Node.List         -- ^ The node list
689
         -> Instance.List     -- ^ The instance list
690
         -> Instance.Instance -- ^ The instance to allocate
691
         -> AllocNodes        -- ^ The allocation targets
692
         -> m AllocSolution   -- ^ Possible solution list
693
tryAlloc nl _ inst (Right ok_pairs) =
694
    let sols = foldl' (\cstate (p, s) ->
695
                           concatAllocs cstate $ allocateOnPair nl inst p s
696
                      ) emptyAllocSolution ok_pairs
697

    
698
    in if null ok_pairs -- means we have just one node
699
       then fail "Not enough online nodes"
700
       else return $ annotateSolution sols
701

    
702
tryAlloc nl _ inst (Left all_nodes) =
703
    let sols = foldl' (\cstate ->
704
                           concatAllocs cstate . allocateOnSingle nl inst
705
                      ) emptyAllocSolution all_nodes
706
    in if null all_nodes
707
       then fail "No online nodes"
708
       else return $ annotateSolution sols
709

    
710
-- | Given a group/result, describe it as a nice (list of) messages.
711
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
712
solutionDescription gl (groupId, result) =
713
  case result of
714
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
715
    Bad message -> [printf "Group %s: error %s" gname message]
716
  where grp = Container.find groupId gl
717
        gname = Group.name grp
718
        pol = apolToString (Group.allocPolicy grp)
719

    
720
-- | From a list of possibly bad and possibly empty solutions, filter
721
-- only the groups with a valid result. Note that the result will be
722
-- reversed compared to the original list.
723
filterMGResults :: Group.List
724
                -> [(Gdx, Result AllocSolution)]
725
                -> [(Gdx, AllocSolution)]
726
filterMGResults gl = foldl' fn []
727
    where unallocable = not . Group.isAllocable . flip Container.find gl
728
          fn accu (gdx, rasol) =
729
              case rasol of
730
                Bad _ -> accu
731
                Ok sol | null (asSolutions sol) -> accu
732
                       | unallocable gdx -> accu
733
                       | otherwise -> (gdx, sol):accu
734

    
735
-- | Sort multigroup results based on policy and score.
736
sortMGResults :: Group.List
737
             -> [(Gdx, AllocSolution)]
738
             -> [(Gdx, AllocSolution)]
739
sortMGResults gl sols =
740
    let extractScore (_, _, _, x) = x
741
        solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
742
                               (extractScore . head . asSolutions) sol)
743
    in sortBy (comparing solScore) sols
744

    
745
-- | Finds the best group for an instance on a multi-group cluster.
746
--
747
-- Only solutions in @preferred@ and @last_resort@ groups will be
748
-- accepted as valid, and additionally if the allowed groups parameter
749
-- is not null then allocation will only be run for those group
750
-- indices.
751
findBestAllocGroup :: Group.List           -- ^ The group list
752
                   -> Node.List            -- ^ The node list
753
                   -> Instance.List        -- ^ The instance list
754
                   -> Maybe [Gdx]          -- ^ The allowed groups
755
                   -> Instance.Instance    -- ^ The instance to allocate
756
                   -> Int                  -- ^ Required number of nodes
757
                   -> Result (Gdx, AllocSolution, [String])
758
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
759
  let groups = splitCluster mgnl mgil
760
      groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
761
                allowed_gdxs
762
      sols = map (\(gid, (nl, il)) ->
763
                   (gid, genAllocNodes mggl nl cnt False >>=
764
                       tryAlloc nl il inst))
765
             groups'::[(Gdx, Result AllocSolution)]
766
      all_msgs = concatMap (solutionDescription mggl) sols
767
      goodSols = filterMGResults mggl sols
768
      sortedSols = sortMGResults mggl goodSols
769
  in if null sortedSols
770
     then Bad $ intercalate ", " all_msgs
771
     else let (final_group, final_sol) = head sortedSols
772
          in return (final_group, final_sol, all_msgs)
773

    
774
-- | Try to allocate an instance on a multi-group cluster.
775
tryMGAlloc :: Group.List           -- ^ The group list
776
           -> Node.List            -- ^ The node list
777
           -> Instance.List        -- ^ The instance list
778
           -> Instance.Instance    -- ^ The instance to allocate
779
           -> Int                  -- ^ Required number of nodes
780
           -> Result AllocSolution -- ^ Possible solution list
781
tryMGAlloc mggl mgnl mgil inst cnt = do
782
  (best_group, solution, all_msgs) <-
783
      findBestAllocGroup mggl mgnl mgil Nothing inst cnt
784
  let group_name = Group.name $ Container.find best_group mggl
785
      selmsg = "Selected group: " ++ group_name
786
  return $ solution { asLog = selmsg:all_msgs }
787

    
788
-- | Try to relocate an instance on the cluster.
789
tryReloc :: (Monad m) =>
790
            Node.List       -- ^ The node list
791
         -> Instance.List   -- ^ The instance list
792
         -> Idx             -- ^ The index of the instance to move
793
         -> Int             -- ^ The number of nodes required
794
         -> [Ndx]           -- ^ Nodes which should not be used
795
         -> m AllocSolution -- ^ Solution list
796
tryReloc nl il xid 1 ex_idx =
797
    let all_nodes = getOnline nl
798
        inst = Container.find xid il
799
        ex_idx' = Instance.pNode inst:ex_idx
800
        valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
801
        valid_idxes = map Node.idx valid_nodes
802
        sols1 = foldl' (\cstate x ->
803
                            let em = do
804
                                  (mnl, i, _, _) <-
805
                                      applyMove nl inst (ReplaceSecondary x)
806
                                  return (mnl, i, [Container.find x mnl],
807
                                          compCV mnl)
808
                            in concatAllocs cstate em
809
                       ) emptyAllocSolution valid_idxes
810
    in return sols1
811

    
812
tryReloc _ _ _ reqn _  = fail $ "Unsupported number of relocation \
813
                                \destinations required (" ++ show reqn ++
814
                                                  "), only one supported"
815

    
816
-- | Change an instance's secondary node.
817
evacInstance :: (Monad m) =>
818
                [Ndx]                      -- ^ Excluded nodes
819
             -> Instance.List              -- ^ The current instance list
820
             -> (Node.List, AllocSolution) -- ^ The current state
821
             -> Idx                        -- ^ The instance to evacuate
822
             -> m (Node.List, AllocSolution)
823
evacInstance ex_ndx il (nl, old_as) idx = do
824
  -- FIXME: hardcoded one node here
825

    
826
  -- Longer explanation: evacuation is currently hardcoded to DRBD
827
  -- instances (which have one secondary); hence, even if the
828
  -- IAllocator protocol can request N nodes for an instance, and all
829
  -- the message parsing/loading pass this, this implementation only
830
  -- supports one; this situation needs to be revisited if we ever
831
  -- support more than one secondary, or if we change the storage
832
  -- model
833
  new_as <- tryReloc nl il idx 1 ex_ndx
834
  case asSolutions new_as of
835
    -- an individual relocation succeeded, we kind of compose the data
836
    -- from the two solutions
837
    csol@(nl', _, _, _):_ ->
838
        return (nl', new_as { asSolutions = csol:asSolutions old_as })
839
    -- this relocation failed, so we fail the entire evac
840
    _ -> fail $ "Can't evacuate instance " ++
841
         Instance.name (Container.find idx il) ++
842
             ": " ++ describeSolution new_as
843

    
844
-- | Try to evacuate a list of nodes.
845
tryEvac :: (Monad m) =>
846
            Node.List       -- ^ The node list
847
         -> Instance.List   -- ^ The instance list
848
         -> [Idx]           -- ^ Instances to be evacuated
849
         -> [Ndx]           -- ^ Restricted nodes (the ones being evacuated)
850
         -> m AllocSolution -- ^ Solution list
851
tryEvac nl il idxs ex_ndx = do
852
  (_, sol) <- foldM (evacInstance ex_ndx il) (nl, emptyAllocSolution) idxs
853
  return sol
854

    
855
-- | Function which fails if the requested mode is change secondary.
856
--
857
-- This is useful since except DRBD, no other disk template can
858
-- execute change secondary; thus, we can just call this function
859
-- instead of always checking for secondary mode. After the call to
860
-- this function, whatever mode we have is just a primary change.
861
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
862
failOnSecondaryChange ChangeSecondary dt =
863
    fail $ "Instances with disk template '" ++ dtToString dt ++
864
         "' can't execute change secondary"
865
failOnSecondaryChange _ _ = return ()
866

    
867
-- | Run evacuation for a single instance.
868
--
869
-- /Note:/ this function should correctly execute both intra-group
870
-- evacuations (in all modes) and inter-group evacuations (in the
871
-- 'ChangeAll' mode). Of course, this requires that the correct list
872
-- of target nodes is passed.
873
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
874
                 -> Instance.List     -- ^ Instance list (cluster-wide)
875
                 -> EvacMode          -- ^ The evacuation mode
876
                 -> Instance.Instance -- ^ The instance to be evacuated
877
                 -> Gdx               -- ^ The group we're targetting
878
                 -> [Ndx]             -- ^ The list of available nodes
879
                                      -- for allocation
880
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
881
nodeEvacInstance _ _ mode (Instance.Instance
882
                           {Instance.diskTemplate = dt@DTDiskless}) _ _ =
883
                  failOnSecondaryChange mode dt >>
884
                  fail "Diskless relocations not implemented yet"
885

    
886
nodeEvacInstance _ _ _ (Instance.Instance
887
                        {Instance.diskTemplate = DTPlain}) _ _ =
888
                  fail "Instances of type plain cannot be relocated"
889

    
890
nodeEvacInstance _ _ _ (Instance.Instance
891
                        {Instance.diskTemplate = DTFile}) _ _ =
892
                  fail "Instances of type file cannot be relocated"
893

    
894
nodeEvacInstance _ _ mode  (Instance.Instance
895
                            {Instance.diskTemplate = dt@DTSharedFile}) _ _ =
896
                  failOnSecondaryChange mode dt >>
897
                  fail "Shared file relocations not implemented yet"
898

    
899
nodeEvacInstance _ _ mode (Instance.Instance
900
                           {Instance.diskTemplate = dt@DTBlock}) _ _ =
901
                  failOnSecondaryChange mode dt >>
902
                  fail "Block device relocations not implemented yet"
903

    
904
nodeEvacInstance nl il ChangePrimary
905
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
906
                 _ _ =
907
  do
908
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
909
    let idx = Instance.idx inst
910
        il' = Container.add idx inst' il
911
        ops = iMoveToJob nl' il' idx Failover
912
    return (nl', il', ops)
913

    
914
nodeEvacInstance nl il ChangeSecondary
915
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
916
                 gdx avail_nodes =
917
  do
918
    (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
919
                            eitherToResult $
920
                            foldl' (evacDrbdSecondaryInner nl inst gdx)
921
                            (Left "no nodes available") avail_nodes
922
    let idx = Instance.idx inst
923
        il' = Container.add idx inst' il
924
        ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx)
925
    return (nl', il', ops)
926

    
927
nodeEvacInstance nl il ChangeAll
928
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
929
                 gdx avail_nodes =
930
  do
931
    let primary = Container.find (Instance.pNode inst) nl
932
        idx = Instance.idx inst
933
        no_nodes = Left "no nodes available"
934
    -- if the primary is offline, then we first failover
935
    (nl1, inst1, ops1) <-
936
        if Node.offline primary
937
        then do
938
          (nl', inst', _, _) <-
939
              annotateResult "Failing over to the secondary" $
940
              opToResult $ applyMove nl inst Failover
941
          return (nl', inst', [Failover])
942
        else return (nl, inst, [])
943
    -- we now need to execute a replace secondary to the future
944
    -- primary node
945
    (nl2, inst2, _, new_pdx) <- annotateResult "Searching for a new primary" $
946
                                eitherToResult $
947
                                foldl' (evacDrbdSecondaryInner nl1 inst1 gdx)
948
                                no_nodes avail_nodes
949
    let ops2 = ReplaceSecondary new_pdx:ops1
950
    -- since we chose the new primary, we remove it from the list of
951
    -- available nodes
952
    let avail_nodes_sec = new_pdx `delete` avail_nodes
953
    -- we now execute another failover, the primary stays fixed now
954
    (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
955
                          opToResult $ applyMove nl2 inst2 Failover
956
    let ops3 = Failover:ops2
957
    -- and finally another replace secondary, to the final secondary
958
    (nl4, inst4, _, new_sdx) <-
959
        annotateResult "Searching for a new secondary" $
960
        eitherToResult $
961
        foldl' (evacDrbdSecondaryInner nl3 inst3 gdx) no_nodes avail_nodes_sec
962
    let ops4 = ReplaceSecondary new_sdx:ops3
963
        il' = Container.add idx inst4 il
964
        ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
965
    return (nl4, il', ops)
966

    
967
-- | Inner fold function for changing secondary of a DRBD instance.
968
--
969
-- The "running" solution is either a @Left String@, which means we
970
-- don't have yet a working solution, or a @Right (...)@, which
971
-- represents a valid solution; it holds the modified node list, the
972
-- modified instance (after evacuation), the score of that solution,
973
-- and the new secondary node index.
974
evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list
975
                       -> Instance.Instance -- ^ Instance being evacuated
976
                       -> Gdx -- ^ The group index of the instance
977
                       -> Either String ( Node.List
978
                                        , Instance.Instance
979
                                        , Score
980
                                        , Ndx)  -- ^ Current best solution
981
                       -> Ndx  -- ^ Node we're evaluating as new secondary
982
                       -> Either String ( Node.List
983
                                        , Instance.Instance
984
                                        , Score
985
                                        , Ndx) -- ^ New best solution
986
evacDrbdSecondaryInner nl inst gdx accu ndx =
987
    case applyMove nl inst (ReplaceSecondary ndx) of
988
      OpFail fm ->
989
          case accu of
990
            Right _ -> accu
991
            Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
992
                      " failed: " ++ show fm
993
      OpGood (nl', inst', _, _) ->
994
          let nodes = Container.elems nl'
995
              -- The fromJust below is ugly (it can fail nastily), but
996
              -- at this point we should have any internal mismatches,
997
              -- and adding a monad here would be quite involved
998
              grpnodes = fromJust (gdx `lookup` (Node.computeGroups nodes))
999
              new_cv = compCVNodes grpnodes
1000
              new_accu = Right (nl', inst', new_cv, ndx)
1001
          in case accu of
1002
               Left _ -> new_accu
1003
               Right (_, _, old_cv, _) ->
1004
                   if old_cv < new_cv
1005
                   then accu
1006
                   else new_accu
1007

    
1008
-- | Computes the nodes in a given group which are available for
1009
-- allocation.
1010
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1011
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
1012
                    -> Gdx            -- ^ The group for which we
1013
                                      -- query the nodes
1014
                    -> Result [Ndx]   -- ^ List of available node indices
1015
availableGroupNodes group_nodes excl_ndx gdx = do
1016
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1017
                 Ok (lookup gdx group_nodes)
1018
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1019
  return avail_nodes
1020

    
1021
-- | Updates the evac solution with the results of an instance
1022
-- evacuation.
1023
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1024
                   -> Idx
1025
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1026
                   -> (Node.List, Instance.List, EvacSolution)
1027
updateEvacSolution (nl, il, es) idx (Bad msg) =
1028
    (nl, il, es { esFailed = (idx, msg):esFailed es})
1029
updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1030
    (nl, il, es { esMoved = new_elem:esMoved es
1031
                , esOpCodes = [opcodes]:esOpCodes es })
1032
     where inst = Container.find idx il
1033
           new_elem = (idx,
1034
                       instancePriGroup nl inst,
1035
                       Instance.allNodes inst)
1036

    
1037
-- | Node-evacuation IAllocator mode main function.
1038
tryNodeEvac :: Group.List    -- ^ The cluster groups
1039
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
1040
            -> Instance.List -- ^ Instance list (cluster-wide)
1041
            -> EvacMode      -- ^ The evacuation mode
1042
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
1043
            -> Result (Node.List, Instance.List, EvacSolution)
1044
tryNodeEvac _ ini_nl ini_il mode idxs =
1045
    let evac_ndx = nodesToEvacuate ini_il mode idxs
1046
        offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1047
        excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1048
        group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1049
                                             (Container.elems nl))) $
1050
                      splitCluster ini_nl ini_il
1051
        (fin_nl, fin_il, esol) =
1052
            foldl' (\state@(nl, il, _) inst ->
1053
                        let gdx = instancePriGroup nl inst in
1054
                        updateEvacSolution state (Instance.idx inst) $
1055
                        availableGroupNodes group_ndx
1056
                          excl_ndx gdx >>=
1057
                        nodeEvacInstance nl il mode inst gdx
1058
                   )
1059
            (ini_nl, ini_il, emptyEvacSolution)
1060
            (map (`Container.find` ini_il) idxs)
1061
    in return (fin_nl, fin_il, reverseEvacSolution esol)
1062

    
1063
-- | Change-group IAllocator mode main function.
1064
--
1065
-- This is very similar to 'tryNodeEvac', the only difference is that
1066
-- we don't choose as target group the current instance group, but
1067
-- instead:
1068
--
1069
--   1. at the start of the function, we compute which are the target
1070
--   groups; either no groups were passed in, in which case we choose
1071
--   all groups out of which we don't evacuate instance, or there were
1072
--   some groups passed, in which case we use those
1073
--
1074
--   2. for each instance, we use 'findBestAllocGroup' to choose the
1075
--   best group to hold the instance, and then we do what
1076
--   'tryNodeEvac' does, except for this group instead of the current
1077
--   instance group.
1078
--
1079
-- Note that the correct behaviour of this function relies on the
1080
-- function 'nodeEvacInstance' to be able to do correctly both
1081
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1082
tryChangeGroup :: Group.List    -- ^ The cluster groups
1083
               -> Node.List     -- ^ The node list (cluster-wide)
1084
               -> Instance.List -- ^ Instance list (cluster-wide)
1085
               -> [Gdx]         -- ^ Target groups; if empty, any
1086
                                -- groups not being evacuated
1087
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
1088
               -> Result (Node.List, Instance.List, EvacSolution)
1089
tryChangeGroup gl ini_nl ini_il gdxs idxs =
1090
    let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1091
                               flip Container.find ini_il) idxs
1092
        target_gdxs = (if null gdxs
1093
                       then Container.keys gl
1094
                       else gdxs) \\ evac_gdxs
1095
        offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1096
        excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1097
        group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1098
                                             (Container.elems nl))) $
1099
                      splitCluster ini_nl ini_il
1100
        (fin_nl, fin_il, esol) =
1101
            foldl' (\state@(nl, il, _) inst ->
1102
                        let solution = do
1103
                              let ncnt = Instance.requiredNodes $
1104
                                         Instance.diskTemplate inst
1105
                              (gdx, _, _) <- findBestAllocGroup gl nl il
1106
                                             (Just target_gdxs) inst ncnt
1107
                              av_nodes <- availableGroupNodes group_ndx
1108
                                          excl_ndx gdx
1109
                              nodeEvacInstance nl il ChangeAll inst
1110
                                       gdx av_nodes
1111
                        in updateEvacSolution state
1112
                               (Instance.idx inst) solution
1113
                   )
1114
            (ini_nl, ini_il, emptyEvacSolution)
1115
            (map (`Container.find` ini_il) idxs)
1116
    in return (fin_nl, fin_il, reverseEvacSolution esol)
1117

    
1118
-- | Recursively place instances on the cluster until we're out of space.
1119
iterateAlloc :: Node.List
1120
             -> Instance.List
1121
             -> Maybe Int
1122
             -> Instance.Instance
1123
             -> AllocNodes
1124
             -> [Instance.Instance]
1125
             -> [CStats]
1126
             -> Result AllocResult
1127
iterateAlloc nl il limit newinst allocnodes ixes cstats =
1128
      let depth = length ixes
1129
          newname = printf "new-%d" depth::String
1130
          newidx = length (Container.elems il) + depth
1131
          newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1132
          newlimit = fmap (flip (-) 1) limit
1133
      in case tryAlloc nl il newi2 allocnodes of
1134
           Bad s -> Bad s
1135
           Ok (AllocSolution { asFailures = errs, asSolutions = sols3 }) ->
1136
               let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1137
               case sols3 of
1138
                 [] -> newsol
1139
                 (xnl, xi, _, _):[] ->
1140
                     if limit == Just 0
1141
                     then newsol
1142
                     else iterateAlloc xnl (Container.add newidx xi il)
1143
                          newlimit newinst allocnodes (xi:ixes)
1144
                          (totalResources xnl:cstats)
1145
                 _ -> Bad "Internal error: multiple solutions for single\
1146
                          \ allocation"
1147

    
1148
-- | The core of the tiered allocation mode.
1149
tieredAlloc :: Node.List
1150
            -> Instance.List
1151
            -> Maybe Int
1152
            -> Instance.Instance
1153
            -> AllocNodes
1154
            -> [Instance.Instance]
1155
            -> [CStats]
1156
            -> Result AllocResult
1157
tieredAlloc nl il limit newinst allocnodes ixes cstats =
1158
    case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1159
      Bad s -> Bad s
1160
      Ok (errs, nl', il', ixes', cstats') ->
1161
          let newsol = Ok (errs, nl', il', ixes', cstats')
1162
              ixes_cnt = length ixes'
1163
              (stop, newlimit) = case limit of
1164
                                   Nothing -> (False, Nothing)
1165
                                   Just n -> (n <= ixes_cnt,
1166
                                              Just (n - ixes_cnt)) in
1167
          if stop then newsol else
1168
          case Instance.shrinkByType newinst . fst . last $
1169
               sortBy (comparing snd) errs of
1170
            Bad _ -> newsol
1171
            Ok newinst' -> tieredAlloc nl' il' newlimit
1172
                           newinst' allocnodes ixes' cstats'
1173

    
1174
-- | Compute the tiered spec string description from a list of
1175
-- allocated instances.
1176
tieredSpecMap :: [Instance.Instance]
1177
              -> [String]
1178
tieredSpecMap trl_ixes =
1179
    let fin_trl_ixes = reverse trl_ixes
1180
        ix_byspec = groupBy ((==) `on` Instance.specOf) fin_trl_ixes
1181
        spec_map = map (\ixs -> (Instance.specOf $ head ixs, length ixs))
1182
                   ix_byspec
1183
    in  map (\(spec, cnt) -> printf "%d,%d,%d=%d" (rspecMem spec)
1184
                             (rspecDsk spec) (rspecCpu spec) cnt) spec_map
1185

    
1186
-- * Formatting functions
1187

    
1188
-- | Given the original and final nodes, computes the relocation description.
1189
computeMoves :: Instance.Instance -- ^ The instance to be moved
1190
             -> String -- ^ The instance name
1191
             -> IMove  -- ^ The move being performed
1192
             -> String -- ^ New primary
1193
             -> String -- ^ New secondary
1194
             -> (String, [String])
1195
                -- ^ Tuple of moves and commands list; moves is containing
1196
                -- either @/f/@ for failover or @/r:name/@ for replace
1197
                -- secondary, while the command list holds gnt-instance
1198
                -- commands (without that prefix), e.g \"@failover instance1@\"
1199
computeMoves i inam mv c d =
1200
    case mv of
1201
      Failover -> ("f", [mig])
1202
      FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1203
      ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1204
      ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1205
      ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1206
    where morf = if Instance.running i then "migrate" else "failover"
1207
          mig = printf "%s -f %s" morf inam::String
1208
          rep n = printf "replace-disks -n %s %s" n inam
1209

    
1210
-- | Converts a placement to string format.
1211
printSolutionLine :: Node.List     -- ^ The node list
1212
                  -> Instance.List -- ^ The instance list
1213
                  -> Int           -- ^ Maximum node name length
1214
                  -> Int           -- ^ Maximum instance name length
1215
                  -> Placement     -- ^ The current placement
1216
                  -> Int           -- ^ The index of the placement in
1217
                                   -- the solution
1218
                  -> (String, [String])
1219
printSolutionLine nl il nmlen imlen plc pos =
1220
    let
1221
        pmlen = (2*nmlen + 1)
1222
        (i, p, s, mv, c) = plc
1223
        inst = Container.find i il
1224
        inam = Instance.alias inst
1225
        npri = Node.alias $ Container.find p nl
1226
        nsec = Node.alias $ Container.find s nl
1227
        opri = Node.alias $ Container.find (Instance.pNode inst) nl
1228
        osec = Node.alias $ Container.find (Instance.sNode inst) nl
1229
        (moves, cmds) =  computeMoves inst inam mv npri nsec
1230
        ostr = printf "%s:%s" opri osec::String
1231
        nstr = printf "%s:%s" npri nsec::String
1232
    in
1233
      (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
1234
       pos imlen inam pmlen ostr
1235
       pmlen nstr c moves,
1236
       cmds)
1237

    
1238
-- | Return the instance and involved nodes in an instance move.
1239
involvedNodes :: Instance.List -> Placement -> [Ndx]
1240
involvedNodes il plc =
1241
    let (i, np, ns, _, _) = plc
1242
        inst = Container.find i il
1243
        op = Instance.pNode inst
1244
        os = Instance.sNode inst
1245
    in nub [np, ns, op, os]
1246

    
1247
-- | Inner function for splitJobs, that either appends the next job to
1248
-- the current jobset, or starts a new jobset.
1249
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1250
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1251
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1252
    | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1253
    | otherwise = ([n]:cjs, ndx)
1254

    
1255
-- | Break a list of moves into independent groups. Note that this
1256
-- will reverse the order of jobs.
1257
splitJobs :: [MoveJob] -> [JobSet]
1258
splitJobs = fst . foldl mergeJobs ([], [])
1259

    
1260
-- | Given a list of commands, prefix them with @gnt-instance@ and
1261
-- also beautify the display a little.
1262
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1263
formatJob jsn jsl (sn, (_, _, _, cmds)) =
1264
    let out =
1265
            printf "  echo job %d/%d" jsn sn:
1266
            printf "  check":
1267
            map ("  gnt-instance " ++) cmds
1268
    in if sn == 1
1269
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1270
       else out
1271

    
1272
-- | Given a list of commands, prefix them with @gnt-instance@ and
1273
-- also beautify the display a little.
1274
formatCmds :: [JobSet] -> String
1275
formatCmds =
1276
    unlines .
1277
    concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1278
                             (zip [1..] js)) .
1279
    zip [1..]
1280

    
1281
-- | Print the node list.
1282
printNodes :: Node.List -> [String] -> String
1283
printNodes nl fs =
1284
    let fields = case fs of
1285
          [] -> Node.defaultFields
1286
          "+":rest -> Node.defaultFields ++ rest
1287
          _ -> fs
1288
        snl = sortBy (comparing Node.idx) (Container.elems nl)
1289
        (header, isnum) = unzip $ map Node.showHeader fields
1290
    in unlines . map ((:) ' ' .  intercalate " ") $
1291
       formatTable (header:map (Node.list fields) snl) isnum
1292

    
1293
-- | Print the instance list.
1294
printInsts :: Node.List -> Instance.List -> String
1295
printInsts nl il =
1296
    let sil = sortBy (comparing Instance.idx) (Container.elems il)
1297
        helper inst = [ if Instance.running inst then "R" else " "
1298
                      , Instance.name inst
1299
                      , Container.nameOf nl (Instance.pNode inst)
1300
                      , let sdx = Instance.sNode inst
1301
                        in if sdx == Node.noSecondary
1302
                           then  ""
1303
                           else Container.nameOf nl sdx
1304
                      , if Instance.autoBalance inst then "Y" else "N"
1305
                      , printf "%3d" $ Instance.vcpus inst
1306
                      , printf "%5d" $ Instance.mem inst
1307
                      , printf "%5d" $ Instance.dsk inst `div` 1024
1308
                      , printf "%5.3f" lC
1309
                      , printf "%5.3f" lM
1310
                      , printf "%5.3f" lD
1311
                      , printf "%5.3f" lN
1312
                      ]
1313
            where DynUtil lC lM lD lN = Instance.util inst
1314
        header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1315
                 , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1316
        isnum = False:False:False:False:False:repeat True
1317
    in unlines . map ((:) ' ' . intercalate " ") $
1318
       formatTable (header:map helper sil) isnum
1319

    
1320
-- | Shows statistics for a given node list.
1321
printStats :: Node.List -> String
1322
printStats nl =
1323
    let dcvs = compDetailedCV $ Container.elems nl
1324
        (weights, names) = unzip detailedCVInfo
1325
        hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1326
        formatted = map (\(w, header, val) ->
1327
                             printf "%s=%.8f(x%.2f)" header val w::String) hd
1328
    in intercalate ", " formatted
1329

    
1330
-- | Convert a placement into a list of OpCodes (basically a job).
1331
iMoveToJob :: Node.List -> Instance.List
1332
          -> Idx -> IMove -> [OpCodes.OpCode]
1333
iMoveToJob nl il idx move =
1334
    let inst = Container.find idx il
1335
        iname = Instance.name inst
1336
        lookNode  = Just . Container.nameOf nl
1337
        opF = OpCodes.OpInstanceMigrate iname True False True
1338
        opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1339
                OpCodes.ReplaceNewSecondary [] Nothing
1340
    in case move of
1341
         Failover -> [ opF ]
1342
         ReplacePrimary np -> [ opF, opR np, opF ]
1343
         ReplaceSecondary ns -> [ opR ns ]
1344
         ReplaceAndFailover np -> [ opR np, opF ]
1345
         FailoverAndReplace ns -> [ opF, opR ns ]
1346

    
1347
-- * Node group functions
1348

    
1349
-- | Computes the group of an instance.
1350
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1351
instanceGroup nl i =
1352
  let sidx = Instance.sNode i
1353
      pnode = Container.find (Instance.pNode i) nl
1354
      snode = if sidx == Node.noSecondary
1355
              then pnode
1356
              else Container.find sidx nl
1357
      pgroup = Node.group pnode
1358
      sgroup = Node.group snode
1359
  in if pgroup /= sgroup
1360
     then fail ("Instance placed accross two node groups, primary " ++
1361
                show pgroup ++ ", secondary " ++ show sgroup)
1362
     else return pgroup
1363

    
1364
-- | Computes the group of an instance per the primary node.
1365
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1366
instancePriGroup nl i =
1367
  let pnode = Container.find (Instance.pNode i) nl
1368
  in  Node.group pnode
1369

    
1370
-- | Compute the list of badly allocated instances (split across node
1371
-- groups).
1372
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1373
findSplitInstances nl =
1374
  filter (not . isOk . instanceGroup nl) . Container.elems
1375

    
1376
-- | Splits a cluster into the component node groups.
1377
splitCluster :: Node.List -> Instance.List ->
1378
                [(Gdx, (Node.List, Instance.List))]
1379
splitCluster nl il =
1380
  let ngroups = Node.computeGroups (Container.elems nl)
1381
  in map (\(guuid, nodes) ->
1382
           let nidxs = map Node.idx nodes
1383
               nodes' = zip nidxs nodes
1384
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1385
           in (guuid, (Container.fromList nodes', instances))) ngroups
1386

    
1387
-- | Compute the list of nodes that are to be evacuated, given a list
1388
-- of instances and an evacuation mode.
1389
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1390
                -> EvacMode      -- ^ The evacuation mode we're using
1391
                -> [Idx]         -- ^ List of instance indices being evacuated
1392
                -> IntSet.IntSet -- ^ Set of node indices
1393
nodesToEvacuate il mode =
1394
    IntSet.delete Node.noSecondary .
1395
    foldl' (\ns idx ->
1396
                let i = Container.find idx il
1397
                    pdx = Instance.pNode i
1398
                    sdx = Instance.sNode i
1399
                    dt = Instance.diskTemplate i
1400
                    withSecondary = case dt of
1401
                                      DTDrbd8 -> IntSet.insert sdx ns
1402
                                      _ -> ns
1403
                in case mode of
1404
                     ChangePrimary   -> IntSet.insert pdx ns
1405
                     ChangeSecondary -> withSecondary
1406
                     ChangeAll       -> IntSet.insert pdx withSecondary
1407
           ) IntSet.empty