Statistics
| Branch: | Tag: | Revision:

root / htools / Ganeti / HTools / Cluster.hs @ 1ab94e48

History | View | Annotate | Download (63.9 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
    (
31
     -- * Types
32
      AllocSolution(..)
33
    , EvacSolution(..)
34
    , Table(..)
35
    , CStats(..)
36
    , AllocStats
37
    -- * Generic functions
38
    , totalResources
39
    , computeAllocationDelta
40
    -- * First phase functions
41
    , computeBadItems
42
    -- * Second phase functions
43
    , printSolutionLine
44
    , formatCmds
45
    , involvedNodes
46
    , splitJobs
47
    -- * Display functions
48
    , printNodes
49
    , printInsts
50
    -- * Balacing functions
51
    , checkMove
52
    , doNextBalance
53
    , tryBalance
54
    , compCV
55
    , compCVNodes
56
    , compDetailedCV
57
    , printStats
58
    , iMoveToJob
59
    -- * IAllocator functions
60
    , genAllocNodes
61
    , tryAlloc
62
    , tryMGAlloc
63
    , tryReloc
64
    , tryEvac
65
    , tryNodeEvac
66
    , tryChangeGroup
67
    , collapseFailures
68
    -- * Allocation functions
69
    , iterateAlloc
70
    , tieredAlloc
71
     -- * Node group functions
72
    , instanceGroup
73
    , findSplitInstances
74
    , splitCluster
75
    ) where
76

    
77
import qualified Data.IntSet as IntSet
78
import Data.List
79
import Data.Maybe (fromJust)
80
import Data.Ord (comparing)
81
import Text.Printf (printf)
82
import Control.Monad
83

    
84
import qualified Ganeti.HTools.Container as Container
85
import qualified Ganeti.HTools.Instance as Instance
86
import qualified Ganeti.HTools.Node as Node
87
import qualified Ganeti.HTools.Group as Group
88
import Ganeti.HTools.Types
89
import Ganeti.HTools.Utils
90
import Ganeti.HTools.Compat
91
import qualified Ganeti.OpCodes as OpCodes
92

    
93
-- * Types
94

    
95
-- | Allocation\/relocation solution.
96
data AllocSolution = AllocSolution
97
  { asFailures  :: [FailMode]          -- ^ Failure counts
98
  , asAllocs    :: Int                 -- ^ Good allocation count
99
  , asSolutions :: [Node.AllocElement] -- ^ The actual result, length
100
                                       -- of the list depends on the
101
                                       -- allocation/relocation mode
102
  , asLog       :: [String]            -- ^ A list of informational messages
103
  }
104

    
105
-- | Node evacuation/group change iallocator result type. This result
106
-- type consists of actual opcodes (a restricted subset) that are
107
-- transmitted back to Ganeti.
108
data EvacSolution = EvacSolution
109
    { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
110
    , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
111
                                        -- relocated
112
    , esOpCodes :: [[OpCodes.OpCode]]   -- ^ List of jobs
113
    }
114

    
115
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
116
type AllocResult = (FailStats, Node.List, Instance.List,
117
                    [Instance.Instance], [CStats])
118

    
119
-- | A type denoting the valid allocation mode/pairs.
120
--
121
-- For a one-node allocation, this will be a @Left ['Node.Node']@,
122
-- whereas for a two-node allocation, this will be a @Right
123
-- [('Node.Node', 'Node.Node')]@.
124
type AllocNodes = Either [Ndx] [(Ndx, Ndx)]
125

    
126
-- | The empty solution we start with when computing allocations.
127
emptyAllocSolution :: AllocSolution
128
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
129
                                   , asSolutions = [], asLog = [] }
130

    
131
-- | The empty evac solution.
132
emptyEvacSolution :: EvacSolution
133
emptyEvacSolution = EvacSolution { esMoved = []
134
                                 , esFailed = []
135
                                 , esOpCodes = []
136
                                 }
137

    
138
-- | The complete state for the balancing solution.
139
data Table = Table Node.List Instance.List Score [Placement]
140
             deriving (Show, Read)
141

    
142
-- | Cluster statistics data type.
143
data CStats = CStats { csFmem :: Integer -- ^ Cluster free mem
144
                     , csFdsk :: Integer -- ^ Cluster free disk
145
                     , csAmem :: Integer -- ^ Cluster allocatable mem
146
                     , csAdsk :: Integer -- ^ Cluster allocatable disk
147
                     , csAcpu :: Integer -- ^ Cluster allocatable cpus
148
                     , csMmem :: Integer -- ^ Max node allocatable mem
149
                     , csMdsk :: Integer -- ^ Max node allocatable disk
150
                     , csMcpu :: Integer -- ^ Max node allocatable cpu
151
                     , csImem :: Integer -- ^ Instance used mem
152
                     , csIdsk :: Integer -- ^ Instance used disk
153
                     , csIcpu :: Integer -- ^ Instance used cpu
154
                     , csTmem :: Double  -- ^ Cluster total mem
155
                     , csTdsk :: Double  -- ^ Cluster total disk
156
                     , csTcpu :: Double  -- ^ Cluster total cpus
157
                     , csVcpu :: Integer -- ^ Cluster virtual cpus (if
158
                                         -- node pCpu has been set,
159
                                         -- otherwise -1)
160
                     , csXmem :: Integer -- ^ Unnacounted for mem
161
                     , csNmem :: Integer -- ^ Node own memory
162
                     , csScore :: Score  -- ^ The cluster score
163
                     , csNinst :: Int    -- ^ The total number of instances
164
                     }
165
            deriving (Show, Read)
166

    
167
-- | Currently used, possibly to allocate, unallocable.
168
type AllocStats = (RSpec, RSpec, RSpec)
169

    
170
-- * Utility functions
171

    
172
-- | Verifies the N+1 status and return the affected nodes.
173
verifyN1 :: [Node.Node] -> [Node.Node]
174
verifyN1 = filter Node.failN1
175

    
176
{-| Computes the pair of bad nodes and instances.
177

    
178
The bad node list is computed via a simple 'verifyN1' check, and the
179
bad instance list is the list of primary and secondary instances of
180
those nodes.
181

    
182
-}
183
computeBadItems :: Node.List -> Instance.List ->
184
                   ([Node.Node], [Instance.Instance])
185
computeBadItems nl il =
186
  let bad_nodes = verifyN1 $ getOnline nl
187
      bad_instances = map (`Container.find` il) .
188
                      sort . nub $
189
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
190
  in
191
    (bad_nodes, bad_instances)
192

    
193
-- | Zero-initializer for the CStats type.
194
emptyCStats :: CStats
195
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
196

    
197
-- | Update stats with data from a new node.
198
updateCStats :: CStats -> Node.Node -> CStats
199
updateCStats cs node =
200
    let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
201
                 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
202
                 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
203
                 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
204
                 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
205
                 csVcpu = x_vcpu,
206
                 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
207
               }
208
            = cs
209
        inc_amem = Node.fMem node - Node.rMem node
210
        inc_amem' = if inc_amem > 0 then inc_amem else 0
211
        inc_adsk = Node.availDisk node
212
        inc_imem = truncate (Node.tMem node) - Node.nMem node
213
                   - Node.xMem node - Node.fMem node
214
        inc_icpu = Node.uCpu node
215
        inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
216
        inc_vcpu = Node.hiCpu node
217
        inc_acpu = Node.availCpu node
218

    
219
    in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
220
          , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
221
          , csAmem = x_amem + fromIntegral inc_amem'
222
          , csAdsk = x_adsk + fromIntegral inc_adsk
223
          , csAcpu = x_acpu + fromIntegral inc_acpu
224
          , csMmem = max x_mmem (fromIntegral inc_amem')
225
          , csMdsk = max x_mdsk (fromIntegral inc_adsk)
226
          , csMcpu = max x_mcpu (fromIntegral inc_acpu)
227
          , csImem = x_imem + fromIntegral inc_imem
228
          , csIdsk = x_idsk + fromIntegral inc_idsk
229
          , csIcpu = x_icpu + fromIntegral inc_icpu
230
          , csTmem = x_tmem + Node.tMem node
231
          , csTdsk = x_tdsk + Node.tDsk node
232
          , csTcpu = x_tcpu + Node.tCpu node
233
          , csVcpu = x_vcpu + fromIntegral inc_vcpu
234
          , csXmem = x_xmem + fromIntegral (Node.xMem node)
235
          , csNmem = x_nmem + fromIntegral (Node.nMem node)
236
          , csNinst = x_ninst + length (Node.pList node)
237
          }
238

    
239
-- | Compute the total free disk and memory in the cluster.
240
totalResources :: Node.List -> CStats
241
totalResources nl =
242
    let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
243
    in cs { csScore = compCV nl }
244

    
245
-- | Compute the delta between two cluster state.
246
--
247
-- This is used when doing allocations, to understand better the
248
-- available cluster resources. The return value is a triple of the
249
-- current used values, the delta that was still allocated, and what
250
-- was left unallocated.
251
computeAllocationDelta :: CStats -> CStats -> AllocStats
252
computeAllocationDelta cini cfin =
253
    let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
254
        CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
255
                csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
256
        rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem)
257
               (fromIntegral i_idsk)
258
        rfin = RSpec (fromIntegral (f_icpu - i_icpu))
259
               (fromIntegral (f_imem - i_imem))
260
               (fromIntegral (f_idsk - i_idsk))
261
        un_cpu = fromIntegral (v_cpu - f_icpu)::Int
262
        runa = RSpec un_cpu (truncate t_mem - fromIntegral f_imem)
263
               (truncate t_dsk - fromIntegral f_idsk)
264
    in (rini, rfin, runa)
265

    
266
-- | The names and weights of the individual elements in the CV list.
267
detailedCVInfo :: [(Double, String)]
268
detailedCVInfo = [ (1,  "free_mem_cv")
269
                 , (1,  "free_disk_cv")
270
                 , (1,  "n1_cnt")
271
                 , (1,  "reserved_mem_cv")
272
                 , (4,  "offline_all_cnt")
273
                 , (16, "offline_pri_cnt")
274
                 , (1,  "vcpu_ratio_cv")
275
                 , (1,  "cpu_load_cv")
276
                 , (1,  "mem_load_cv")
277
                 , (1,  "disk_load_cv")
278
                 , (1,  "net_load_cv")
279
                 , (2,  "pri_tags_score")
280
                 ]
281

    
282
-- | Holds the weights used by 'compCVNodes' for each metric.
283
detailedCVWeights :: [Double]
284
detailedCVWeights = map fst detailedCVInfo
285

    
286
-- | Compute the mem and disk covariance.
287
compDetailedCV :: [Node.Node] -> [Double]
288
compDetailedCV all_nodes =
289
    let
290
        (offline, nodes) = partition Node.offline all_nodes
291
        mem_l = map Node.pMem nodes
292
        dsk_l = map Node.pDsk nodes
293
        -- metric: memory covariance
294
        mem_cv = stdDev mem_l
295
        -- metric: disk covariance
296
        dsk_cv = stdDev dsk_l
297
        -- metric: count of instances living on N1 failing nodes
298
        n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
299
                                                   length (Node.pList n)) .
300
                   filter Node.failN1 $ nodes :: Double
301
        res_l = map Node.pRem nodes
302
        -- metric: reserved memory covariance
303
        res_cv = stdDev res_l
304
        -- offline instances metrics
305
        offline_ipri = sum . map (length . Node.pList) $ offline
306
        offline_isec = sum . map (length . Node.sList) $ offline
307
        -- metric: count of instances on offline nodes
308
        off_score = fromIntegral (offline_ipri + offline_isec)::Double
309
        -- metric: count of primary instances on offline nodes (this
310
        -- helps with evacuation/failover of primary instances on
311
        -- 2-node clusters with one node offline)
312
        off_pri_score = fromIntegral offline_ipri::Double
313
        cpu_l = map Node.pCpu nodes
314
        -- metric: covariance of vcpu/pcpu ratio
315
        cpu_cv = stdDev cpu_l
316
        -- metrics: covariance of cpu, memory, disk and network load
317
        (c_load, m_load, d_load, n_load) = unzip4 $
318
            map (\n ->
319
                     let DynUtil c1 m1 d1 n1 = Node.utilLoad n
320
                         DynUtil c2 m2 d2 n2 = Node.utilPool n
321
                     in (c1/c2, m1/m2, d1/d2, n1/n2)
322
                ) nodes
323
        -- metric: conflicting instance count
324
        pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
325
        pri_tags_score = fromIntegral pri_tags_inst::Double
326
    in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
327
       , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
328
       , pri_tags_score ]
329

    
330
-- | Compute the /total/ variance.
331
compCVNodes :: [Node.Node] -> Double
332
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
333

    
334
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
335
compCV :: Node.List -> Double
336
compCV = compCVNodes . Container.elems
337

    
338
-- | Compute online nodes from a 'Node.List'.
339
getOnline :: Node.List -> [Node.Node]
340
getOnline = filter (not . Node.offline) . Container.elems
341

    
342
-- * Balancing functions
343

    
344
-- | Compute best table. Note that the ordering of the arguments is important.
345
compareTables :: Table -> Table -> Table
346
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
347
    if a_cv > b_cv then b else a
348

    
349
-- | Applies an instance move to a given node list and instance.
350
applyMove :: Node.List -> Instance.Instance
351
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
352
-- Failover (f)
353
applyMove nl inst Failover =
354
    let old_pdx = Instance.pNode inst
355
        old_sdx = Instance.sNode inst
356
        old_p = Container.find old_pdx nl
357
        old_s = Container.find old_sdx nl
358
        int_p = Node.removePri old_p inst
359
        int_s = Node.removeSec old_s inst
360
        force_p = Node.offline old_p
361
        new_nl = do -- Maybe monad
362
          new_p <- Node.addPriEx force_p int_s inst
363
          new_s <- Node.addSec int_p inst old_sdx
364
          let new_inst = Instance.setBoth inst old_sdx old_pdx
365
          return (Container.addTwo old_pdx new_s old_sdx new_p nl,
366
                  new_inst, old_sdx, old_pdx)
367
    in new_nl
368

    
369
-- Replace the primary (f:, r:np, f)
370
applyMove nl inst (ReplacePrimary new_pdx) =
371
    let old_pdx = Instance.pNode inst
372
        old_sdx = Instance.sNode inst
373
        old_p = Container.find old_pdx nl
374
        old_s = Container.find old_sdx nl
375
        tgt_n = Container.find new_pdx nl
376
        int_p = Node.removePri old_p inst
377
        int_s = Node.removeSec old_s inst
378
        force_p = Node.offline old_p
379
        new_nl = do -- Maybe monad
380
          -- check that the current secondary can host the instance
381
          -- during the migration
382
          tmp_s <- Node.addPriEx force_p int_s inst
383
          let tmp_s' = Node.removePri tmp_s inst
384
          new_p <- Node.addPriEx force_p tgt_n inst
385
          new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
386
          let new_inst = Instance.setPri inst new_pdx
387
          return (Container.add new_pdx new_p $
388
                  Container.addTwo old_pdx int_p old_sdx new_s nl,
389
                  new_inst, new_pdx, old_sdx)
390
    in new_nl
391

    
392
-- Replace the secondary (r:ns)
393
applyMove nl inst (ReplaceSecondary new_sdx) =
394
    let old_pdx = Instance.pNode inst
395
        old_sdx = Instance.sNode inst
396
        old_s = Container.find old_sdx nl
397
        tgt_n = Container.find new_sdx nl
398
        int_s = Node.removeSec old_s inst
399
        force_s = Node.offline old_s
400
        new_inst = Instance.setSec inst new_sdx
401
        new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
402
                 \new_s -> return (Container.addTwo new_sdx
403
                                   new_s old_sdx int_s nl,
404
                                   new_inst, old_pdx, new_sdx)
405
    in new_nl
406

    
407
-- Replace the secondary and failover (r:np, f)
408
applyMove nl inst (ReplaceAndFailover new_pdx) =
409
    let old_pdx = Instance.pNode inst
410
        old_sdx = Instance.sNode inst
411
        old_p = Container.find old_pdx nl
412
        old_s = Container.find old_sdx nl
413
        tgt_n = Container.find new_pdx nl
414
        int_p = Node.removePri old_p inst
415
        int_s = Node.removeSec old_s inst
416
        force_s = Node.offline old_s
417
        new_nl = do -- Maybe monad
418
          new_p <- Node.addPri tgt_n inst
419
          new_s <- Node.addSecEx force_s int_p inst new_pdx
420
          let new_inst = Instance.setBoth inst new_pdx old_pdx
421
          return (Container.add new_pdx new_p $
422
                  Container.addTwo old_pdx new_s old_sdx int_s nl,
423
                  new_inst, new_pdx, old_pdx)
424
    in new_nl
425

    
426
-- Failver and replace the secondary (f, r:ns)
427
applyMove nl inst (FailoverAndReplace new_sdx) =
428
    let old_pdx = Instance.pNode inst
429
        old_sdx = Instance.sNode inst
430
        old_p = Container.find old_pdx nl
431
        old_s = Container.find old_sdx nl
432
        tgt_n = Container.find new_sdx nl
433
        int_p = Node.removePri old_p inst
434
        int_s = Node.removeSec old_s inst
435
        force_p = Node.offline old_p
436
        new_nl = do -- Maybe monad
437
          new_p <- Node.addPriEx force_p int_s inst
438
          new_s <- Node.addSecEx force_p tgt_n inst old_sdx
439
          let new_inst = Instance.setBoth inst old_sdx new_sdx
440
          return (Container.add new_sdx new_s $
441
                  Container.addTwo old_sdx new_p old_pdx int_p nl,
442
                  new_inst, old_sdx, new_sdx)
443
    in new_nl
444

    
445
-- | Tries to allocate an instance on one given node.
446
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
447
                 -> OpResult Node.AllocElement
448
allocateOnSingle nl inst new_pdx =
449
    let p = Container.find new_pdx nl
450
        new_inst = Instance.setBoth inst new_pdx Node.noSecondary
451
    in  Node.addPri p inst >>= \new_p -> do
452
      let new_nl = Container.add new_pdx new_p nl
453
          new_score = compCV nl
454
      return (new_nl, new_inst, [new_p], new_score)
455

    
456
-- | Tries to allocate an instance on a given pair of nodes.
457
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
458
               -> OpResult Node.AllocElement
459
allocateOnPair nl inst new_pdx new_sdx =
460
    let tgt_p = Container.find new_pdx nl
461
        tgt_s = Container.find new_sdx nl
462
    in do
463
      new_p <- Node.addPri tgt_p inst
464
      new_s <- Node.addSec tgt_s inst new_pdx
465
      let new_inst = Instance.setBoth inst new_pdx new_sdx
466
          new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
467
      return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
468

    
469
-- | Tries to perform an instance move and returns the best table
470
-- between the original one and the new one.
471
checkSingleStep :: Table -- ^ The original table
472
                -> Instance.Instance -- ^ The instance to move
473
                -> Table -- ^ The current best table
474
                -> IMove -- ^ The move to apply
475
                -> Table -- ^ The final best table
476
checkSingleStep ini_tbl target cur_tbl move =
477
    let
478
        Table ini_nl ini_il _ ini_plc = ini_tbl
479
        tmp_resu = applyMove ini_nl target move
480
    in
481
      case tmp_resu of
482
        OpFail _ -> cur_tbl
483
        OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
484
            let tgt_idx = Instance.idx target
485
                upd_cvar = compCV upd_nl
486
                upd_il = Container.add tgt_idx new_inst ini_il
487
                upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
488
                upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
489
            in
490
              compareTables cur_tbl upd_tbl
491

    
492
-- | Given the status of the current secondary as a valid new node and
493
-- the current candidate target node, generate the possible moves for
494
-- a instance.
495
possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
496
              -> Bool      -- ^ Whether we can change the primary node
497
              -> Ndx       -- ^ Target node candidate
498
              -> [IMove]   -- ^ List of valid result moves
499

    
500
possibleMoves _ False tdx =
501
    [ReplaceSecondary tdx]
502

    
503
possibleMoves True True tdx =
504
    [ReplaceSecondary tdx,
505
     ReplaceAndFailover tdx,
506
     ReplacePrimary tdx,
507
     FailoverAndReplace tdx]
508

    
509
possibleMoves False True tdx =
510
    [ReplaceSecondary tdx,
511
     ReplaceAndFailover tdx]
512

    
513
-- | Compute the best move for a given instance.
514
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
515
                  -> Bool              -- ^ Whether disk moves are allowed
516
                  -> Bool              -- ^ Whether instance moves are allowed
517
                  -> Table             -- ^ Original table
518
                  -> Instance.Instance -- ^ Instance to move
519
                  -> Table             -- ^ Best new table for this instance
520
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
521
    let
522
        opdx = Instance.pNode target
523
        osdx = Instance.sNode target
524
        nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
525
        use_secondary = elem osdx nodes_idx && inst_moves
526
        aft_failover = if use_secondary -- if allowed to failover
527
                       then checkSingleStep ini_tbl target ini_tbl Failover
528
                       else ini_tbl
529
        all_moves = if disk_moves
530
                    then concatMap
531
                         (possibleMoves use_secondary inst_moves) nodes
532
                    else []
533
    in
534
      -- iterate over the possible nodes for this instance
535
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
536

    
537
-- | Compute the best next move.
538
checkMove :: [Ndx]               -- ^ Allowed target node indices
539
          -> Bool                -- ^ Whether disk moves are allowed
540
          -> Bool                -- ^ Whether instance moves are allowed
541
          -> Table               -- ^ The current solution
542
          -> [Instance.Instance] -- ^ List of instances still to move
543
          -> Table               -- ^ The new solution
544
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
545
    let Table _ _ _ ini_plc = ini_tbl
546
        -- we're using rwhnf from the Control.Parallel.Strategies
547
        -- package; we don't need to use rnf as that would force too
548
        -- much evaluation in single-threaded cases, and in
549
        -- multi-threaded case the weak head normal form is enough to
550
        -- spark the evaluation
551
        tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
552
                               inst_moves ini_tbl)
553
                 victims
554
        -- iterate over all instances, computing the best move
555
        best_tbl = foldl' compareTables ini_tbl tables
556
        Table _ _ _ best_plc = best_tbl
557
    in if length best_plc == length ini_plc
558
       then ini_tbl -- no advancement
559
       else best_tbl
560

    
561
-- | Check if we are allowed to go deeper in the balancing.
562
doNextBalance :: Table     -- ^ The starting table
563
              -> Int       -- ^ Remaining length
564
              -> Score     -- ^ Score at which to stop
565
              -> Bool      -- ^ The resulting table and commands
566
doNextBalance ini_tbl max_rounds min_score =
567
    let Table _ _ ini_cv ini_plc = ini_tbl
568
        ini_plc_len = length ini_plc
569
    in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
570

    
571
-- | Run a balance move.
572
tryBalance :: Table       -- ^ The starting table
573
           -> Bool        -- ^ Allow disk moves
574
           -> Bool        -- ^ Allow instance moves
575
           -> Bool        -- ^ Only evacuate moves
576
           -> Score       -- ^ Min gain threshold
577
           -> Score       -- ^ Min gain
578
           -> Maybe Table -- ^ The resulting table and commands
579
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
580
    let Table ini_nl ini_il ini_cv _ = ini_tbl
581
        all_inst = Container.elems ini_il
582
        all_inst' = if evac_mode
583
                    then let bad_nodes = map Node.idx . filter Node.offline $
584
                                         Container.elems ini_nl
585
                         in filter (any (`elem` bad_nodes) . Instance.allNodes)
586
                            all_inst
587
                    else all_inst
588
        reloc_inst = filter Instance.movable all_inst'
589
        node_idx = map Node.idx . filter (not . Node.offline) $
590
                   Container.elems ini_nl
591
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
592
        (Table _ _ fin_cv _) = fin_tbl
593
    in
594
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
595
      then Just fin_tbl -- this round made success, return the new table
596
      else Nothing
597

    
598
-- * Allocation functions
599

    
600
-- | Build failure stats out of a list of failures.
601
collapseFailures :: [FailMode] -> FailStats
602
collapseFailures flst =
603
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
604
            [minBound..maxBound]
605

    
606
-- | Update current Allocation solution and failure stats with new
607
-- elements.
608
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
609
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
610

    
611
concatAllocs as (OpGood ns@(_, _, _, nscore)) =
612
    let -- Choose the old or new solution, based on the cluster score
613
        cntok = asAllocs as
614
        osols = asSolutions as
615
        nsols = case osols of
616
                  [] -> [ns]
617
                  (_, _, _, oscore):[] ->
618
                      if oscore < nscore
619
                      then osols
620
                      else [ns]
621
                  -- FIXME: here we simply concat to lists with more
622
                  -- than one element; we should instead abort, since
623
                  -- this is not a valid usage of this function
624
                  xs -> ns:xs
625
        nsuc = cntok + 1
626
    -- Note: we force evaluation of nsols here in order to keep the
627
    -- memory profile low - we know that we will need nsols for sure
628
    -- in the next cycle, so we force evaluation of nsols, since the
629
    -- foldl' in the caller will only evaluate the tuple, but not the
630
    -- elements of the tuple
631
    in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolutions = nsols }
632

    
633
-- | Given a solution, generates a reasonable description for it.
634
describeSolution :: AllocSolution -> String
635
describeSolution as =
636
  let fcnt = asFailures as
637
      sols = asSolutions as
638
      freasons =
639
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
640
        filter ((> 0) . snd) . collapseFailures $ fcnt
641
  in if null sols
642
     then "No valid allocation solutions, failure reasons: " ++
643
          (if null fcnt
644
           then "unknown reasons"
645
           else freasons)
646
     else let (_, _, nodes, cv) = head sols
647
          in printf ("score: %.8f, successes %d, failures %d (%s)" ++
648
                     " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
649
             (intercalate "/" . map Node.name $ nodes)
650

    
651
-- | Annotates a solution with the appropriate string.
652
annotateSolution :: AllocSolution -> AllocSolution
653
annotateSolution as = as { asLog = describeSolution as : asLog as }
654

    
655
-- | Reverses an evacuation solution.
656
--
657
-- Rationale: we always concat the results to the top of the lists, so
658
-- for proper jobset execution, we should reverse all lists.
659
reverseEvacSolution :: EvacSolution -> EvacSolution
660
reverseEvacSolution (EvacSolution f m o) =
661
    EvacSolution (reverse f) (reverse m) (reverse o)
662

    
663
-- | Generate the valid node allocation singles or pairs for a new instance.
664
genAllocNodes :: Group.List        -- ^ Group list
665
              -> Node.List         -- ^ The node map
666
              -> Int               -- ^ The number of nodes required
667
              -> Bool              -- ^ Whether to drop or not
668
                                   -- unallocable nodes
669
              -> Result AllocNodes -- ^ The (monadic) result
670
genAllocNodes gl nl count drop_unalloc =
671
    let filter_fn = if drop_unalloc
672
                    then filter (Group.isAllocable .
673
                                 flip Container.find gl . Node.group)
674
                    else id
675
        all_nodes = filter_fn $ getOnline nl
676
        all_pairs = liftM2 (,) all_nodes all_nodes
677
        ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y &&
678
                                      Node.group x == Node.group y) all_pairs
679
    in case count of
680
         1 -> Ok (Left (map Node.idx all_nodes))
681
         2 -> Ok (Right (map (\(p, s) -> (Node.idx p, Node.idx s)) ok_pairs))
682
         _ -> Bad "Unsupported number of nodes, only one or two  supported"
683

    
684
-- | Try to allocate an instance on the cluster.
685
tryAlloc :: (Monad m) =>
686
            Node.List         -- ^ The node list
687
         -> Instance.List     -- ^ The instance list
688
         -> Instance.Instance -- ^ The instance to allocate
689
         -> AllocNodes        -- ^ The allocation targets
690
         -> m AllocSolution   -- ^ Possible solution list
691
tryAlloc nl _ inst (Right ok_pairs) =
692
    let sols = foldl' (\cstate (p, s) ->
693
                           concatAllocs cstate $ allocateOnPair nl inst p s
694
                      ) emptyAllocSolution ok_pairs
695

    
696
    in if null ok_pairs -- means we have just one node
697
       then fail "Not enough online nodes"
698
       else return $ annotateSolution sols
699

    
700
tryAlloc nl _ inst (Left all_nodes) =
701
    let sols = foldl' (\cstate ->
702
                           concatAllocs cstate . allocateOnSingle nl inst
703
                      ) emptyAllocSolution all_nodes
704
    in if null all_nodes
705
       then fail "No online nodes"
706
       else return $ annotateSolution sols
707

    
708
-- | Given a group/result, describe it as a nice (list of) messages.
709
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
710
solutionDescription gl (groupId, result) =
711
  case result of
712
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
713
    Bad message -> [printf "Group %s: error %s" gname message]
714
  where grp = Container.find groupId gl
715
        gname = Group.name grp
716
        pol = apolToString (Group.allocPolicy grp)
717

    
718
-- | From a list of possibly bad and possibly empty solutions, filter
719
-- only the groups with a valid result. Note that the result will be
720
-- reversed compared to the original list.
721
filterMGResults :: Group.List
722
                -> [(Gdx, Result AllocSolution)]
723
                -> [(Gdx, AllocSolution)]
724
filterMGResults gl = foldl' fn []
725
    where unallocable = not . Group.isAllocable . flip Container.find gl
726
          fn accu (gdx, rasol) =
727
              case rasol of
728
                Bad _ -> accu
729
                Ok sol | null (asSolutions sol) -> accu
730
                       | unallocable gdx -> accu
731
                       | otherwise -> (gdx, sol):accu
732

    
733
-- | Sort multigroup results based on policy and score.
734
sortMGResults :: Group.List
735
             -> [(Gdx, AllocSolution)]
736
             -> [(Gdx, AllocSolution)]
737
sortMGResults gl sols =
738
    let extractScore (_, _, _, x) = x
739
        solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
740
                               (extractScore . head . asSolutions) sol)
741
    in sortBy (comparing solScore) sols
742

    
743
-- | Finds the best group for an instance on a multi-group cluster.
744
--
745
-- Only solutions in @preferred@ and @last_resort@ groups will be
746
-- accepted as valid, and additionally if the allowed groups parameter
747
-- is not null then allocation will only be run for those group
748
-- indices.
749
findBestAllocGroup :: Group.List           -- ^ The group list
750
                   -> Node.List            -- ^ The node list
751
                   -> Instance.List        -- ^ The instance list
752
                   -> Maybe [Gdx]          -- ^ The allowed groups
753
                   -> Instance.Instance    -- ^ The instance to allocate
754
                   -> Int                  -- ^ Required number of nodes
755
                   -> Result (Gdx, AllocSolution, [String])
756
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
757
  let groups = splitCluster mgnl mgil
758
      groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
759
                allowed_gdxs
760
      sols = map (\(gid, (nl, il)) ->
761
                   (gid, genAllocNodes mggl nl cnt False >>=
762
                       tryAlloc nl il inst))
763
             groups'::[(Gdx, Result AllocSolution)]
764
      all_msgs = concatMap (solutionDescription mggl) sols
765
      goodSols = filterMGResults mggl sols
766
      sortedSols = sortMGResults mggl goodSols
767
  in if null sortedSols
768
     then Bad $ intercalate ", " all_msgs
769
     else let (final_group, final_sol) = head sortedSols
770
          in return (final_group, final_sol, all_msgs)
771

    
772
-- | Try to allocate an instance on a multi-group cluster.
773
tryMGAlloc :: Group.List           -- ^ The group list
774
           -> Node.List            -- ^ The node list
775
           -> Instance.List        -- ^ The instance list
776
           -> Instance.Instance    -- ^ The instance to allocate
777
           -> Int                  -- ^ Required number of nodes
778
           -> Result AllocSolution -- ^ Possible solution list
779
tryMGAlloc mggl mgnl mgil inst cnt = do
780
  (best_group, solution, all_msgs) <-
781
      findBestAllocGroup mggl mgnl mgil Nothing inst cnt
782
  let group_name = Group.name $ Container.find best_group mggl
783
      selmsg = "Selected group: " ++ group_name
784
  return $ solution { asLog = selmsg:all_msgs }
785

    
786
-- | Try to relocate an instance on the cluster.
787
tryReloc :: (Monad m) =>
788
            Node.List       -- ^ The node list
789
         -> Instance.List   -- ^ The instance list
790
         -> Idx             -- ^ The index of the instance to move
791
         -> Int             -- ^ The number of nodes required
792
         -> [Ndx]           -- ^ Nodes which should not be used
793
         -> m AllocSolution -- ^ Solution list
794
tryReloc nl il xid 1 ex_idx =
795
    let all_nodes = getOnline nl
796
        inst = Container.find xid il
797
        ex_idx' = Instance.pNode inst:ex_idx
798
        valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
799
        valid_idxes = map Node.idx valid_nodes
800
        sols1 = foldl' (\cstate x ->
801
                            let em = do
802
                                  (mnl, i, _, _) <-
803
                                      applyMove nl inst (ReplaceSecondary x)
804
                                  return (mnl, i, [Container.find x mnl],
805
                                          compCV mnl)
806
                            in concatAllocs cstate em
807
                       ) emptyAllocSolution valid_idxes
808
    in return sols1
809

    
810
tryReloc _ _ _ reqn _  = fail $ "Unsupported number of relocation \
811
                                \destinations required (" ++ show reqn ++
812
                                                  "), only one supported"
813

    
814
-- | Change an instance's secondary node.
815
evacInstance :: (Monad m) =>
816
                [Ndx]                      -- ^ Excluded nodes
817
             -> Instance.List              -- ^ The current instance list
818
             -> (Node.List, AllocSolution) -- ^ The current state
819
             -> Idx                        -- ^ The instance to evacuate
820
             -> m (Node.List, AllocSolution)
821
evacInstance ex_ndx il (nl, old_as) idx = do
822
  -- FIXME: hardcoded one node here
823

    
824
  -- Longer explanation: evacuation is currently hardcoded to DRBD
825
  -- instances (which have one secondary); hence, even if the
826
  -- IAllocator protocol can request N nodes for an instance, and all
827
  -- the message parsing/loading pass this, this implementation only
828
  -- supports one; this situation needs to be revisited if we ever
829
  -- support more than one secondary, or if we change the storage
830
  -- model
831
  new_as <- tryReloc nl il idx 1 ex_ndx
832
  case asSolutions new_as of
833
    -- an individual relocation succeeded, we kind of compose the data
834
    -- from the two solutions
835
    csol@(nl', _, _, _):_ ->
836
        return (nl', new_as { asSolutions = csol:asSolutions old_as })
837
    -- this relocation failed, so we fail the entire evac
838
    _ -> fail $ "Can't evacuate instance " ++
839
         Instance.name (Container.find idx il) ++
840
             ": " ++ describeSolution new_as
841

    
842
-- | Try to evacuate a list of nodes.
843
tryEvac :: (Monad m) =>
844
            Node.List       -- ^ The node list
845
         -> Instance.List   -- ^ The instance list
846
         -> [Idx]           -- ^ Instances to be evacuated
847
         -> [Ndx]           -- ^ Restricted nodes (the ones being evacuated)
848
         -> m AllocSolution -- ^ Solution list
849
tryEvac nl il idxs ex_ndx = do
850
  (_, sol) <- foldM (evacInstance ex_ndx il) (nl, emptyAllocSolution) idxs
851
  return sol
852

    
853
-- | Function which fails if the requested mode is change secondary.
854
--
855
-- This is useful since except DRBD, no other disk template can
856
-- execute change secondary; thus, we can just call this function
857
-- instead of always checking for secondary mode. After the call to
858
-- this function, whatever mode we have is just a primary change.
859
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
860
failOnSecondaryChange ChangeSecondary dt =
861
    fail $ "Instances with disk template '" ++ dtToString dt ++
862
         "' can't execute change secondary"
863
failOnSecondaryChange _ _ = return ()
864

    
865
-- | Run evacuation for a single instance.
866
--
867
-- /Note:/ this function should correctly execute both intra-group
868
-- evacuations (in all modes) and inter-group evacuations (in the
869
-- 'ChangeAll' mode). Of course, this requires that the correct list
870
-- of target nodes is passed.
871
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
872
                 -> Instance.List     -- ^ Instance list (cluster-wide)
873
                 -> EvacMode          -- ^ The evacuation mode
874
                 -> Instance.Instance -- ^ The instance to be evacuated
875
                 -> Gdx               -- ^ The group we're targetting
876
                 -> [Ndx]             -- ^ The list of available nodes
877
                                      -- for allocation
878
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
879
nodeEvacInstance _ _ mode (Instance.Instance
880
                           {Instance.diskTemplate = dt@DTDiskless}) _ _ =
881
                  failOnSecondaryChange mode dt >>
882
                  fail "Diskless relocations not implemented yet"
883

    
884
nodeEvacInstance _ _ _ (Instance.Instance
885
                        {Instance.diskTemplate = DTPlain}) _ _ =
886
                  fail "Instances of type plain cannot be relocated"
887

    
888
nodeEvacInstance _ _ _ (Instance.Instance
889
                        {Instance.diskTemplate = DTFile}) _ _ =
890
                  fail "Instances of type file cannot be relocated"
891

    
892
nodeEvacInstance _ _ mode  (Instance.Instance
893
                            {Instance.diskTemplate = dt@DTSharedFile}) _ _ =
894
                  failOnSecondaryChange mode dt >>
895
                  fail "Shared file relocations not implemented yet"
896

    
897
nodeEvacInstance _ _ mode (Instance.Instance
898
                           {Instance.diskTemplate = dt@DTBlock}) _ _ =
899
                  failOnSecondaryChange mode dt >>
900
                  fail "Block device relocations not implemented yet"
901

    
902
nodeEvacInstance nl il ChangePrimary
903
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
904
                 _ _ =
905
  do
906
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
907
    let idx = Instance.idx inst
908
        il' = Container.add idx inst' il
909
        ops = iMoveToJob nl' il' idx Failover
910
    return (nl', il', ops)
911

    
912
nodeEvacInstance nl il ChangeSecondary
913
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
914
                 gdx avail_nodes =
915
  do
916
    (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
917
                            eitherToResult $
918
                            foldl' (evacDrbdSecondaryInner nl inst gdx)
919
                            (Left "no nodes available") avail_nodes
920
    let idx = Instance.idx inst
921
        il' = Container.add idx inst' il
922
        ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx)
923
    return (nl', il', ops)
924

    
925
-- The algorithm for ChangeAll is as follows:
926
--
927
-- * generate all (primary, secondary) node pairs for the target groups
928
-- * for each pair, execute the needed moves (r:s, f, r:s) and compute
929
--   the final node list state and group score
930
-- * select the best choice via a foldl that uses the same Either
931
--   String solution as the ChangeSecondary mode
932
nodeEvacInstance nl il ChangeAll
933
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
934
                 gdx avail_nodes =
935
  do
936
    let no_nodes = Left "no nodes available"
937
        node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
938
    (nl', il', ops, _) <-
939
        annotateResult "Can't find any good nodes for relocation" $
940
        eitherToResult $
941
        foldl'
942
        (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
943
                          Bad msg ->
944
                              case accu of
945
                                Right _ -> accu
946
                                -- we don't need more details (which
947
                                -- nodes, etc.) as we only selected
948
                                -- this group if we can allocate on
949
                                -- it, hence failures will not
950
                                -- propagate out of this fold loop
951
                                Left _ -> Left $ "Allocation failed: " ++ msg
952
                          Ok result@(_, _, _, new_cv) ->
953
                              let new_accu = Right result in
954
                              case accu of
955
                                Left _ -> new_accu
956
                                Right (_, _, _, old_cv) ->
957
                                    if old_cv < new_cv
958
                                    then accu
959
                                    else new_accu
960
        ) no_nodes node_pairs
961

    
962
    return (nl', il', ops)
963

    
964
-- | Inner fold function for changing secondary of a DRBD instance.
965
--
966
-- The running solution is either a @Left String@, which means we
967
-- don't have yet a working solution, or a @Right (...)@, which
968
-- represents a valid solution; it holds the modified node list, the
969
-- modified instance (after evacuation), the score of that solution,
970
-- and the new secondary node index.
971
evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list
972
                       -> Instance.Instance -- ^ Instance being evacuated
973
                       -> Gdx -- ^ The group index of the instance
974
                       -> Either String ( Node.List
975
                                        , Instance.Instance
976
                                        , Score
977
                                        , Ndx)  -- ^ Current best solution
978
                       -> Ndx  -- ^ Node we're evaluating as new secondary
979
                       -> Either String ( Node.List
980
                                        , Instance.Instance
981
                                        , Score
982
                                        , Ndx) -- ^ New best solution
983
evacDrbdSecondaryInner nl inst gdx accu ndx =
984
    case applyMove nl inst (ReplaceSecondary ndx) of
985
      OpFail fm ->
986
          case accu of
987
            Right _ -> accu
988
            Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
989
                      " failed: " ++ show fm
990
      OpGood (nl', inst', _, _) ->
991
          let nodes = Container.elems nl'
992
              -- The fromJust below is ugly (it can fail nastily), but
993
              -- at this point we should have any internal mismatches,
994
              -- and adding a monad here would be quite involved
995
              grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
996
              new_cv = compCVNodes grpnodes
997
              new_accu = Right (nl', inst', new_cv, ndx)
998
          in case accu of
999
               Left _ -> new_accu
1000
               Right (_, _, old_cv, _) ->
1001
                   if old_cv < new_cv
1002
                   then accu
1003
                   else new_accu
1004

    
1005
-- | Compute result of changing all nodes of a DRBD instance.
1006
--
1007
-- Given the target primary and secondary node (which might be in a
1008
-- different group or not), this function will 'execute' all the
1009
-- required steps and assuming all operations succceed, will return
1010
-- the modified node and instance lists, the opcodes needed for this
1011
-- and the new group score.
1012
evacDrbdAllInner :: Node.List         -- ^ Cluster node list
1013
                 -> Instance.List     -- ^ Cluster instance list
1014
                 -> Instance.Instance -- ^ The instance to be moved
1015
                 -> Gdx               -- ^ The target group index
1016
                                      -- (which can differ from the
1017
                                      -- current group of the
1018
                                      -- instance)
1019
                 -> (Ndx, Ndx)        -- ^ Tuple of new
1020
                                      -- primary\/secondary nodes
1021
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
1022
evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) =
1023
  do
1024
    let primary = Container.find (Instance.pNode inst) nl
1025
        idx = Instance.idx inst
1026
    -- if the primary is offline, then we first failover
1027
    (nl1, inst1, ops1) <-
1028
        if Node.offline primary
1029
        then do
1030
          (nl', inst', _, _) <-
1031
              annotateResult "Failing over to the secondary" $
1032
              opToResult $ applyMove nl inst Failover
1033
          return (nl', inst', [Failover])
1034
        else return (nl, inst, [])
1035
    let (o1, o2, o3) = (ReplaceSecondary t_pdx,
1036
                        Failover,
1037
                        ReplaceSecondary t_sdx)
1038
    -- we now need to execute a replace secondary to the future
1039
    -- primary node
1040
    (nl2, inst2, _, _) <-
1041
        annotateResult "Changing secondary to new primary" $
1042
        opToResult $
1043
        applyMove nl1 inst1 o1
1044
    let ops2 = o1:ops1
1045
    -- we now execute another failover, the primary stays fixed now
1046
    (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
1047
                          opToResult $ applyMove nl2 inst2 o2
1048
    let ops3 = o2:ops2
1049
    -- and finally another replace secondary, to the final secondary
1050
    (nl4, inst4, _, _) <-
1051
        annotateResult "Changing secondary to final secondary" $
1052
        opToResult $
1053
        applyMove nl3 inst3 o3
1054
    let ops4 = o3:ops3
1055
        il' = Container.add idx inst4 il
1056
        ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1057
    let nodes = Container.elems nl4
1058
        -- The fromJust below is ugly (it can fail nastily), but
1059
        -- at this point we should have any internal mismatches,
1060
        -- and adding a monad here would be quite involved
1061
        grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1062
        new_cv = compCVNodes grpnodes
1063
    return (nl4, il', ops, new_cv)
1064

    
1065
-- | Computes the nodes in a given group which are available for
1066
-- allocation.
1067
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1068
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
1069
                    -> Gdx            -- ^ The group for which we
1070
                                      -- query the nodes
1071
                    -> Result [Ndx]   -- ^ List of available node indices
1072
availableGroupNodes group_nodes excl_ndx gdx = do
1073
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1074
                 Ok (lookup gdx group_nodes)
1075
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1076
  return avail_nodes
1077

    
1078
-- | Updates the evac solution with the results of an instance
1079
-- evacuation.
1080
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1081
                   -> Idx
1082
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1083
                   -> (Node.List, Instance.List, EvacSolution)
1084
updateEvacSolution (nl, il, es) idx (Bad msg) =
1085
    (nl, il, es { esFailed = (idx, msg):esFailed es})
1086
updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1087
    (nl, il, es { esMoved = new_elem:esMoved es
1088
                , esOpCodes = opcodes:esOpCodes es })
1089
     where inst = Container.find idx il
1090
           new_elem = (idx,
1091
                       instancePriGroup nl inst,
1092
                       Instance.allNodes inst)
1093

    
1094
-- | Node-evacuation IAllocator mode main function.
1095
tryNodeEvac :: Group.List    -- ^ The cluster groups
1096
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
1097
            -> Instance.List -- ^ Instance list (cluster-wide)
1098
            -> EvacMode      -- ^ The evacuation mode
1099
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
1100
            -> Result (Node.List, Instance.List, EvacSolution)
1101
tryNodeEvac _ ini_nl ini_il mode idxs =
1102
    let evac_ndx = nodesToEvacuate ini_il mode idxs
1103
        offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1104
        excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1105
        group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1106
                                             (Container.elems nl))) $
1107
                      splitCluster ini_nl ini_il
1108
        (fin_nl, fin_il, esol) =
1109
            foldl' (\state@(nl, il, _) inst ->
1110
                        let gdx = instancePriGroup nl inst in
1111
                        updateEvacSolution state (Instance.idx inst) $
1112
                        availableGroupNodes group_ndx
1113
                          excl_ndx gdx >>=
1114
                        nodeEvacInstance nl il mode inst gdx
1115
                   )
1116
            (ini_nl, ini_il, emptyEvacSolution)
1117
            (map (`Container.find` ini_il) idxs)
1118
    in return (fin_nl, fin_il, reverseEvacSolution esol)
1119

    
1120
-- | Change-group IAllocator mode main function.
1121
--
1122
-- This is very similar to 'tryNodeEvac', the only difference is that
1123
-- we don't choose as target group the current instance group, but
1124
-- instead:
1125
--
1126
--   1. at the start of the function, we compute which are the target
1127
--   groups; either no groups were passed in, in which case we choose
1128
--   all groups out of which we don't evacuate instance, or there were
1129
--   some groups passed, in which case we use those
1130
--
1131
--   2. for each instance, we use 'findBestAllocGroup' to choose the
1132
--   best group to hold the instance, and then we do what
1133
--   'tryNodeEvac' does, except for this group instead of the current
1134
--   instance group.
1135
--
1136
-- Note that the correct behaviour of this function relies on the
1137
-- function 'nodeEvacInstance' to be able to do correctly both
1138
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1139
tryChangeGroup :: Group.List    -- ^ The cluster groups
1140
               -> Node.List     -- ^ The node list (cluster-wide)
1141
               -> Instance.List -- ^ Instance list (cluster-wide)
1142
               -> [Gdx]         -- ^ Target groups; if empty, any
1143
                                -- groups not being evacuated
1144
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
1145
               -> Result (Node.List, Instance.List, EvacSolution)
1146
tryChangeGroup gl ini_nl ini_il gdxs idxs =
1147
    let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1148
                               flip Container.find ini_il) idxs
1149
        target_gdxs = (if null gdxs
1150
                       then Container.keys gl
1151
                       else gdxs) \\ evac_gdxs
1152
        offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1153
        excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1154
        group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1155
                                             (Container.elems nl))) $
1156
                      splitCluster ini_nl ini_il
1157
        (fin_nl, fin_il, esol) =
1158
            foldl' (\state@(nl, il, _) inst ->
1159
                        let solution = do
1160
                              let ncnt = Instance.requiredNodes $
1161
                                         Instance.diskTemplate inst
1162
                              (gdx, _, _) <- findBestAllocGroup gl nl il
1163
                                             (Just target_gdxs) inst ncnt
1164
                              av_nodes <- availableGroupNodes group_ndx
1165
                                          excl_ndx gdx
1166
                              nodeEvacInstance nl il ChangeAll inst
1167
                                       gdx av_nodes
1168
                        in updateEvacSolution state
1169
                               (Instance.idx inst) solution
1170
                   )
1171
            (ini_nl, ini_il, emptyEvacSolution)
1172
            (map (`Container.find` ini_il) idxs)
1173
    in return (fin_nl, fin_il, reverseEvacSolution esol)
1174

    
1175
-- | Recursively place instances on the cluster until we're out of space.
1176
iterateAlloc :: Node.List
1177
             -> Instance.List
1178
             -> Maybe Int
1179
             -> Instance.Instance
1180
             -> AllocNodes
1181
             -> [Instance.Instance]
1182
             -> [CStats]
1183
             -> Result AllocResult
1184
iterateAlloc nl il limit newinst allocnodes ixes cstats =
1185
      let depth = length ixes
1186
          newname = printf "new-%d" depth::String
1187
          newidx = length (Container.elems il) + depth
1188
          newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1189
          newlimit = fmap (flip (-) 1) limit
1190
      in case tryAlloc nl il newi2 allocnodes of
1191
           Bad s -> Bad s
1192
           Ok (AllocSolution { asFailures = errs, asSolutions = sols3 }) ->
1193
               let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1194
               case sols3 of
1195
                 [] -> newsol
1196
                 (xnl, xi, _, _):[] ->
1197
                     if limit == Just 0
1198
                     then newsol
1199
                     else iterateAlloc xnl (Container.add newidx xi il)
1200
                          newlimit newinst allocnodes (xi:ixes)
1201
                          (totalResources xnl:cstats)
1202
                 _ -> Bad "Internal error: multiple solutions for single\
1203
                          \ allocation"
1204

    
1205
-- | The core of the tiered allocation mode.
1206
tieredAlloc :: Node.List
1207
            -> Instance.List
1208
            -> Maybe Int
1209
            -> Instance.Instance
1210
            -> AllocNodes
1211
            -> [Instance.Instance]
1212
            -> [CStats]
1213
            -> Result AllocResult
1214
tieredAlloc nl il limit newinst allocnodes ixes cstats =
1215
    case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1216
      Bad s -> Bad s
1217
      Ok (errs, nl', il', ixes', cstats') ->
1218
          let newsol = Ok (errs, nl', il', ixes', cstats')
1219
              ixes_cnt = length ixes'
1220
              (stop, newlimit) = case limit of
1221
                                   Nothing -> (False, Nothing)
1222
                                   Just n -> (n <= ixes_cnt,
1223
                                              Just (n - ixes_cnt)) in
1224
          if stop then newsol else
1225
          case Instance.shrinkByType newinst . fst . last $
1226
               sortBy (comparing snd) errs of
1227
            Bad _ -> newsol
1228
            Ok newinst' -> tieredAlloc nl' il' newlimit
1229
                           newinst' allocnodes ixes' cstats'
1230

    
1231
-- * Formatting functions
1232

    
1233
-- | Given the original and final nodes, computes the relocation description.
1234
computeMoves :: Instance.Instance -- ^ The instance to be moved
1235
             -> String -- ^ The instance name
1236
             -> IMove  -- ^ The move being performed
1237
             -> String -- ^ New primary
1238
             -> String -- ^ New secondary
1239
             -> (String, [String])
1240
                -- ^ Tuple of moves and commands list; moves is containing
1241
                -- either @/f/@ for failover or @/r:name/@ for replace
1242
                -- secondary, while the command list holds gnt-instance
1243
                -- commands (without that prefix), e.g \"@failover instance1@\"
1244
computeMoves i inam mv c d =
1245
    case mv of
1246
      Failover -> ("f", [mig])
1247
      FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1248
      ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1249
      ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1250
      ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1251
    where morf = if Instance.running i then "migrate" else "failover"
1252
          mig = printf "%s -f %s" morf inam::String
1253
          rep n = printf "replace-disks -n %s %s" n inam
1254

    
1255
-- | Converts a placement to string format.
1256
printSolutionLine :: Node.List     -- ^ The node list
1257
                  -> Instance.List -- ^ The instance list
1258
                  -> Int           -- ^ Maximum node name length
1259
                  -> Int           -- ^ Maximum instance name length
1260
                  -> Placement     -- ^ The current placement
1261
                  -> Int           -- ^ The index of the placement in
1262
                                   -- the solution
1263
                  -> (String, [String])
1264
printSolutionLine nl il nmlen imlen plc pos =
1265
    let
1266
        pmlen = (2*nmlen + 1)
1267
        (i, p, s, mv, c) = plc
1268
        inst = Container.find i il
1269
        inam = Instance.alias inst
1270
        npri = Node.alias $ Container.find p nl
1271
        nsec = Node.alias $ Container.find s nl
1272
        opri = Node.alias $ Container.find (Instance.pNode inst) nl
1273
        osec = Node.alias $ Container.find (Instance.sNode inst) nl
1274
        (moves, cmds) =  computeMoves inst inam mv npri nsec
1275
        ostr = printf "%s:%s" opri osec::String
1276
        nstr = printf "%s:%s" npri nsec::String
1277
    in
1278
      (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
1279
       pos imlen inam pmlen ostr
1280
       pmlen nstr c moves,
1281
       cmds)
1282

    
1283
-- | Return the instance and involved nodes in an instance move.
1284
--
1285
-- Note that the output list length can vary, and is not required nor
1286
-- guaranteed to be of any specific length.
1287
involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
1288
                               -- the instance from its index; note
1289
                               -- that this /must/ be the original
1290
                               -- instance list, so that we can
1291
                               -- retrieve the old nodes
1292
              -> Placement     -- ^ The placement we're investigating,
1293
                               -- containing the new nodes and
1294
                               -- instance index
1295
              -> [Ndx]         -- ^ Resulting list of node indices
1296
involvedNodes il plc =
1297
    let (i, np, ns, _, _) = plc
1298
        inst = Container.find i il
1299
    in nub $ [np, ns] ++ Instance.allNodes inst
1300

    
1301
-- | Inner function for splitJobs, that either appends the next job to
1302
-- the current jobset, or starts a new jobset.
1303
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1304
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1305
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1306
    | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1307
    | otherwise = ([n]:cjs, ndx)
1308

    
1309
-- | Break a list of moves into independent groups. Note that this
1310
-- will reverse the order of jobs.
1311
splitJobs :: [MoveJob] -> [JobSet]
1312
splitJobs = fst . foldl mergeJobs ([], [])
1313

    
1314
-- | Given a list of commands, prefix them with @gnt-instance@ and
1315
-- also beautify the display a little.
1316
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1317
formatJob jsn jsl (sn, (_, _, _, cmds)) =
1318
    let out =
1319
            printf "  echo job %d/%d" jsn sn:
1320
            printf "  check":
1321
            map ("  gnt-instance " ++) cmds
1322
    in if sn == 1
1323
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1324
       else out
1325

    
1326
-- | Given a list of commands, prefix them with @gnt-instance@ and
1327
-- also beautify the display a little.
1328
formatCmds :: [JobSet] -> String
1329
formatCmds =
1330
    unlines .
1331
    concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1332
                             (zip [1..] js)) .
1333
    zip [1..]
1334

    
1335
-- | Print the node list.
1336
printNodes :: Node.List -> [String] -> String
1337
printNodes nl fs =
1338
    let fields = case fs of
1339
          [] -> Node.defaultFields
1340
          "+":rest -> Node.defaultFields ++ rest
1341
          _ -> fs
1342
        snl = sortBy (comparing Node.idx) (Container.elems nl)
1343
        (header, isnum) = unzip $ map Node.showHeader fields
1344
    in unlines . map ((:) ' ' .  intercalate " ") $
1345
       formatTable (header:map (Node.list fields) snl) isnum
1346

    
1347
-- | Print the instance list.
1348
printInsts :: Node.List -> Instance.List -> String
1349
printInsts nl il =
1350
    let sil = sortBy (comparing Instance.idx) (Container.elems il)
1351
        helper inst = [ if Instance.running inst then "R" else " "
1352
                      , Instance.name inst
1353
                      , Container.nameOf nl (Instance.pNode inst)
1354
                      , let sdx = Instance.sNode inst
1355
                        in if sdx == Node.noSecondary
1356
                           then  ""
1357
                           else Container.nameOf nl sdx
1358
                      , if Instance.autoBalance inst then "Y" else "N"
1359
                      , printf "%3d" $ Instance.vcpus inst
1360
                      , printf "%5d" $ Instance.mem inst
1361
                      , printf "%5d" $ Instance.dsk inst `div` 1024
1362
                      , printf "%5.3f" lC
1363
                      , printf "%5.3f" lM
1364
                      , printf "%5.3f" lD
1365
                      , printf "%5.3f" lN
1366
                      ]
1367
            where DynUtil lC lM lD lN = Instance.util inst
1368
        header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1369
                 , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1370
        isnum = False:False:False:False:False:repeat True
1371
    in unlines . map ((:) ' ' . intercalate " ") $
1372
       formatTable (header:map helper sil) isnum
1373

    
1374
-- | Shows statistics for a given node list.
1375
printStats :: Node.List -> String
1376
printStats nl =
1377
    let dcvs = compDetailedCV $ Container.elems nl
1378
        (weights, names) = unzip detailedCVInfo
1379
        hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1380
        formatted = map (\(w, header, val) ->
1381
                             printf "%s=%.8f(x%.2f)" header val w::String) hd
1382
    in intercalate ", " formatted
1383

    
1384
-- | Convert a placement into a list of OpCodes (basically a job).
1385
iMoveToJob :: Node.List        -- ^ The node list; only used for node
1386
                               -- names, so any version is good
1387
                               -- (before or after the operation)
1388
           -> Instance.List    -- ^ The instance list; also used for
1389
                               -- names only
1390
           -> Idx              -- ^ The index of the instance being
1391
                               -- moved
1392
           -> IMove            -- ^ The actual move to be described
1393
           -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
1394
                               -- the given move
1395
iMoveToJob nl il idx move =
1396
    let inst = Container.find idx il
1397
        iname = Instance.name inst
1398
        lookNode  = Just . Container.nameOf nl
1399
        opF = OpCodes.OpInstanceMigrate iname True False True Nothing
1400
        opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1401
                OpCodes.ReplaceNewSecondary [] Nothing
1402
    in case move of
1403
         Failover -> [ opF ]
1404
         ReplacePrimary np -> [ opF, opR np, opF ]
1405
         ReplaceSecondary ns -> [ opR ns ]
1406
         ReplaceAndFailover np -> [ opR np, opF ]
1407
         FailoverAndReplace ns -> [ opF, opR ns ]
1408

    
1409
-- * Node group functions
1410

    
1411
-- | Computes the group of an instance.
1412
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1413
instanceGroup nl i =
1414
  let sidx = Instance.sNode i
1415
      pnode = Container.find (Instance.pNode i) nl
1416
      snode = if sidx == Node.noSecondary
1417
              then pnode
1418
              else Container.find sidx nl
1419
      pgroup = Node.group pnode
1420
      sgroup = Node.group snode
1421
  in if pgroup /= sgroup
1422
     then fail ("Instance placed accross two node groups, primary " ++
1423
                show pgroup ++ ", secondary " ++ show sgroup)
1424
     else return pgroup
1425

    
1426
-- | Computes the group of an instance per the primary node.
1427
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1428
instancePriGroup nl i =
1429
  let pnode = Container.find (Instance.pNode i) nl
1430
  in  Node.group pnode
1431

    
1432
-- | Compute the list of badly allocated instances (split across node
1433
-- groups).
1434
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1435
findSplitInstances nl =
1436
  filter (not . isOk . instanceGroup nl) . Container.elems
1437

    
1438
-- | Splits a cluster into the component node groups.
1439
splitCluster :: Node.List -> Instance.List ->
1440
                [(Gdx, (Node.List, Instance.List))]
1441
splitCluster nl il =
1442
  let ngroups = Node.computeGroups (Container.elems nl)
1443
  in map (\(guuid, nodes) ->
1444
           let nidxs = map Node.idx nodes
1445
               nodes' = zip nidxs nodes
1446
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1447
           in (guuid, (Container.fromList nodes', instances))) ngroups
1448

    
1449
-- | Compute the list of nodes that are to be evacuated, given a list
1450
-- of instances and an evacuation mode.
1451
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1452
                -> EvacMode      -- ^ The evacuation mode we're using
1453
                -> [Idx]         -- ^ List of instance indices being evacuated
1454
                -> IntSet.IntSet -- ^ Set of node indices
1455
nodesToEvacuate il mode =
1456
    IntSet.delete Node.noSecondary .
1457
    foldl' (\ns idx ->
1458
                let i = Container.find idx il
1459
                    pdx = Instance.pNode i
1460
                    sdx = Instance.sNode i
1461
                    dt = Instance.diskTemplate i
1462
                    withSecondary = case dt of
1463
                                      DTDrbd8 -> IntSet.insert sdx ns
1464
                                      _ -> ns
1465
                in case mode of
1466
                     ChangePrimary   -> IntSet.insert pdx ns
1467
                     ChangeSecondary -> withSecondary
1468
                     ChangeAll       -> IntSet.insert pdx withSecondary
1469
           ) IntSet.empty