Statistics
| Branch: | Tag: | Revision:

root / htools / Ganeti / HTools / Cluster.hs @ 8a8ed513

History | View | Annotate | Download (63.2 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
    (
31
     -- * Types
32
      AllocSolution(..)
33
    , EvacSolution(..)
34
    , Table(..)
35
    , CStats(..)
36
    , AllocStats
37
    , AllocResult
38
    , AllocMethod
39
    -- * Generic functions
40
    , totalResources
41
    , computeAllocationDelta
42
    -- * First phase functions
43
    , computeBadItems
44
    -- * Second phase functions
45
    , printSolutionLine
46
    , formatCmds
47
    , involvedNodes
48
    , splitJobs
49
    -- * Display functions
50
    , printNodes
51
    , printInsts
52
    -- * Balacing functions
53
    , checkMove
54
    , doNextBalance
55
    , tryBalance
56
    , compCV
57
    , compCVNodes
58
    , compDetailedCV
59
    , printStats
60
    , iMoveToJob
61
    -- * IAllocator functions
62
    , genAllocNodes
63
    , tryAlloc
64
    , tryMGAlloc
65
    , tryReloc
66
    , tryNodeEvac
67
    , tryChangeGroup
68
    , collapseFailures
69
    -- * Allocation functions
70
    , iterateAlloc
71
    , tieredAlloc
72
     -- * Node group functions
73
    , instanceGroup
74
    , findSplitInstances
75
    , splitCluster
76
    ) where
77

    
78
import qualified Data.IntSet as IntSet
79
import Data.List
80
import Data.Maybe (fromJust, isNothing)
81
import Data.Ord (comparing)
82
import Text.Printf (printf)
83

    
84
import qualified Ganeti.HTools.Container as Container
85
import qualified Ganeti.HTools.Instance as Instance
86
import qualified Ganeti.HTools.Node as Node
87
import qualified Ganeti.HTools.Group as Group
88
import Ganeti.HTools.Types
89
import Ganeti.HTools.Utils
90
import Ganeti.HTools.Compat
91
import qualified Ganeti.OpCodes as OpCodes
92

    
93
-- * Types
94

    
95
-- | Allocation\/relocation solution.
96
data AllocSolution = AllocSolution
97
  { asFailures :: [FailMode]              -- ^ Failure counts
98
  , asAllocs   :: Int                     -- ^ Good allocation count
99
  , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
100
  , asLog      :: [String]                -- ^ Informational messages
101
  }
102

    
103
-- | Node evacuation/group change iallocator result type. This result
104
-- type consists of actual opcodes (a restricted subset) that are
105
-- transmitted back to Ganeti.
106
data EvacSolution = EvacSolution
107
    { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
108
    , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
109
                                        -- relocated
110
    , esOpCodes :: [[OpCodes.OpCode]]   -- ^ List of jobs
111
    }
112

    
113
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
114
type AllocResult = (FailStats, Node.List, Instance.List,
115
                    [Instance.Instance], [CStats])
116

    
117
-- | A type denoting the valid allocation mode/pairs.
118
--
119
-- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
120
-- for a two-node allocation, this will be a @Right [('Ndx',
121
-- ['Ndx'])]@. In the latter case, the list is basically an
122
-- association list, grouped by primary node and holding the potential
123
-- secondary nodes in the sub-list.
124
type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
125

    
126
-- | The empty solution we start with when computing allocations.
127
emptyAllocSolution :: AllocSolution
128
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
129
                                   , asSolution = Nothing, asLog = [] }
130

    
131
-- | The empty evac solution.
132
emptyEvacSolution :: EvacSolution
133
emptyEvacSolution = EvacSolution { esMoved = []
134
                                 , esFailed = []
135
                                 , esOpCodes = []
136
                                 }
137

    
138
-- | The complete state for the balancing solution.
139
data Table = Table Node.List Instance.List Score [Placement]
140
             deriving (Show, Read)
141

    
142
-- | Cluster statistics data type.
143
data CStats = CStats { csFmem :: Integer -- ^ Cluster free mem
144
                     , csFdsk :: Integer -- ^ Cluster free disk
145
                     , csAmem :: Integer -- ^ Cluster allocatable mem
146
                     , csAdsk :: Integer -- ^ Cluster allocatable disk
147
                     , csAcpu :: Integer -- ^ Cluster allocatable cpus
148
                     , csMmem :: Integer -- ^ Max node allocatable mem
149
                     , csMdsk :: Integer -- ^ Max node allocatable disk
150
                     , csMcpu :: Integer -- ^ Max node allocatable cpu
151
                     , csImem :: Integer -- ^ Instance used mem
152
                     , csIdsk :: Integer -- ^ Instance used disk
153
                     , csIcpu :: Integer -- ^ Instance used cpu
154
                     , csTmem :: Double  -- ^ Cluster total mem
155
                     , csTdsk :: Double  -- ^ Cluster total disk
156
                     , csTcpu :: Double  -- ^ Cluster total cpus
157
                     , csVcpu :: Integer -- ^ Cluster virtual cpus (if
158
                                         -- node pCpu has been set,
159
                                         -- otherwise -1)
160
                     , csXmem :: Integer -- ^ Unnacounted for mem
161
                     , csNmem :: Integer -- ^ Node own memory
162
                     , csScore :: Score  -- ^ The cluster score
163
                     , csNinst :: Int    -- ^ The total number of instances
164
                     }
165
            deriving (Show, Read)
166

    
167
-- | Currently used, possibly to allocate, unallocable.
168
type AllocStats = (RSpec, RSpec, RSpec)
169

    
170
-- | A simple type for allocation functions.
171
type AllocMethod =  Node.List           -- ^ Node list
172
                 -> Instance.List       -- ^ Instance list
173
                 -> Maybe Int           -- ^ Optional allocation limit
174
                 -> Instance.Instance   -- ^ Instance spec for allocation
175
                 -> AllocNodes          -- ^ Which nodes we should allocate on
176
                 -> [Instance.Instance] -- ^ Allocated instances
177
                 -> [CStats]            -- ^ Running cluster stats
178
                 -> Result AllocResult  -- ^ Allocation result
179

    
180
-- * Utility functions
181

    
182
-- | Verifies the N+1 status and return the affected nodes.
183
verifyN1 :: [Node.Node] -> [Node.Node]
184
verifyN1 = filter Node.failN1
185

    
186
{-| Computes the pair of bad nodes and instances.
187

    
188
The bad node list is computed via a simple 'verifyN1' check, and the
189
bad instance list is the list of primary and secondary instances of
190
those nodes.
191

    
192
-}
193
computeBadItems :: Node.List -> Instance.List ->
194
                   ([Node.Node], [Instance.Instance])
195
computeBadItems nl il =
196
  let bad_nodes = verifyN1 $ getOnline nl
197
      bad_instances = map (`Container.find` il) .
198
                      sort . nub $
199
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
200
  in
201
    (bad_nodes, bad_instances)
202

    
203
-- | Zero-initializer for the CStats type.
204
emptyCStats :: CStats
205
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
206

    
207
-- | Update stats with data from a new node.
208
updateCStats :: CStats -> Node.Node -> CStats
209
updateCStats cs node =
210
    let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
211
                 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
212
                 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
213
                 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
214
                 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
215
                 csVcpu = x_vcpu,
216
                 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
217
               }
218
            = cs
219
        inc_amem = Node.fMem node - Node.rMem node
220
        inc_amem' = if inc_amem > 0 then inc_amem else 0
221
        inc_adsk = Node.availDisk node
222
        inc_imem = truncate (Node.tMem node) - Node.nMem node
223
                   - Node.xMem node - Node.fMem node
224
        inc_icpu = Node.uCpu node
225
        inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
226
        inc_vcpu = Node.hiCpu node
227
        inc_acpu = Node.availCpu node
228

    
229
    in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
230
          , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
231
          , csAmem = x_amem + fromIntegral inc_amem'
232
          , csAdsk = x_adsk + fromIntegral inc_adsk
233
          , csAcpu = x_acpu + fromIntegral inc_acpu
234
          , csMmem = max x_mmem (fromIntegral inc_amem')
235
          , csMdsk = max x_mdsk (fromIntegral inc_adsk)
236
          , csMcpu = max x_mcpu (fromIntegral inc_acpu)
237
          , csImem = x_imem + fromIntegral inc_imem
238
          , csIdsk = x_idsk + fromIntegral inc_idsk
239
          , csIcpu = x_icpu + fromIntegral inc_icpu
240
          , csTmem = x_tmem + Node.tMem node
241
          , csTdsk = x_tdsk + Node.tDsk node
242
          , csTcpu = x_tcpu + Node.tCpu node
243
          , csVcpu = x_vcpu + fromIntegral inc_vcpu
244
          , csXmem = x_xmem + fromIntegral (Node.xMem node)
245
          , csNmem = x_nmem + fromIntegral (Node.nMem node)
246
          , csNinst = x_ninst + length (Node.pList node)
247
          }
248

    
249
-- | Compute the total free disk and memory in the cluster.
250
totalResources :: Node.List -> CStats
251
totalResources nl =
252
    let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
253
    in cs { csScore = compCV nl }
254

    
255
-- | Compute the delta between two cluster state.
256
--
257
-- This is used when doing allocations, to understand better the
258
-- available cluster resources. The return value is a triple of the
259
-- current used values, the delta that was still allocated, and what
260
-- was left unallocated.
261
computeAllocationDelta :: CStats -> CStats -> AllocStats
262
computeAllocationDelta cini cfin =
263
    let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
264
        CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
265
                csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
266
        rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem)
267
               (fromIntegral i_idsk)
268
        rfin = RSpec (fromIntegral (f_icpu - i_icpu))
269
               (fromIntegral (f_imem - i_imem))
270
               (fromIntegral (f_idsk - i_idsk))
271
        un_cpu = fromIntegral (v_cpu - f_icpu)::Int
272
        runa = RSpec un_cpu (truncate t_mem - fromIntegral f_imem)
273
               (truncate t_dsk - fromIntegral f_idsk)
274
    in (rini, rfin, runa)
275

    
276
-- | The names and weights of the individual elements in the CV list.
277
detailedCVInfo :: [(Double, String)]
278
detailedCVInfo = [ (1,  "free_mem_cv")
279
                 , (1,  "free_disk_cv")
280
                 , (1,  "n1_cnt")
281
                 , (1,  "reserved_mem_cv")
282
                 , (4,  "offline_all_cnt")
283
                 , (16, "offline_pri_cnt")
284
                 , (1,  "vcpu_ratio_cv")
285
                 , (1,  "cpu_load_cv")
286
                 , (1,  "mem_load_cv")
287
                 , (1,  "disk_load_cv")
288
                 , (1,  "net_load_cv")
289
                 , (2,  "pri_tags_score")
290
                 ]
291

    
292
-- | Holds the weights used by 'compCVNodes' for each metric.
293
detailedCVWeights :: [Double]
294
detailedCVWeights = map fst detailedCVInfo
295

    
296
-- | Compute the mem and disk covariance.
297
compDetailedCV :: [Node.Node] -> [Double]
298
compDetailedCV all_nodes =
299
    let
300
        (offline, nodes) = partition Node.offline all_nodes
301
        mem_l = map Node.pMem nodes
302
        dsk_l = map Node.pDsk nodes
303
        -- metric: memory covariance
304
        mem_cv = stdDev mem_l
305
        -- metric: disk covariance
306
        dsk_cv = stdDev dsk_l
307
        -- metric: count of instances living on N1 failing nodes
308
        n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
309
                                                   length (Node.pList n)) .
310
                   filter Node.failN1 $ nodes :: Double
311
        res_l = map Node.pRem nodes
312
        -- metric: reserved memory covariance
313
        res_cv = stdDev res_l
314
        -- offline instances metrics
315
        offline_ipri = sum . map (length . Node.pList) $ offline
316
        offline_isec = sum . map (length . Node.sList) $ offline
317
        -- metric: count of instances on offline nodes
318
        off_score = fromIntegral (offline_ipri + offline_isec)::Double
319
        -- metric: count of primary instances on offline nodes (this
320
        -- helps with evacuation/failover of primary instances on
321
        -- 2-node clusters with one node offline)
322
        off_pri_score = fromIntegral offline_ipri::Double
323
        cpu_l = map Node.pCpu nodes
324
        -- metric: covariance of vcpu/pcpu ratio
325
        cpu_cv = stdDev cpu_l
326
        -- metrics: covariance of cpu, memory, disk and network load
327
        (c_load, m_load, d_load, n_load) = unzip4 $
328
            map (\n ->
329
                     let DynUtil c1 m1 d1 n1 = Node.utilLoad n
330
                         DynUtil c2 m2 d2 n2 = Node.utilPool n
331
                     in (c1/c2, m1/m2, d1/d2, n1/n2)
332
                ) nodes
333
        -- metric: conflicting instance count
334
        pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
335
        pri_tags_score = fromIntegral pri_tags_inst::Double
336
    in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
337
       , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
338
       , pri_tags_score ]
339

    
340
-- | Compute the /total/ variance.
341
compCVNodes :: [Node.Node] -> Double
342
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
343

    
344
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
345
compCV :: Node.List -> Double
346
compCV = compCVNodes . Container.elems
347

    
348
-- | Compute online nodes from a 'Node.List'.
349
getOnline :: Node.List -> [Node.Node]
350
getOnline = filter (not . Node.offline) . Container.elems
351

    
352
-- * Balancing functions
353

    
354
-- | Compute best table. Note that the ordering of the arguments is important.
355
compareTables :: Table -> Table -> Table
356
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
357
    if a_cv > b_cv then b else a
358

    
359
-- | Applies an instance move to a given node list and instance.
360
applyMove :: Node.List -> Instance.Instance
361
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
362
-- Failover (f)
363
applyMove nl inst Failover =
364
    let old_pdx = Instance.pNode inst
365
        old_sdx = Instance.sNode inst
366
        old_p = Container.find old_pdx nl
367
        old_s = Container.find old_sdx nl
368
        int_p = Node.removePri old_p inst
369
        int_s = Node.removeSec old_s inst
370
        force_p = Node.offline old_p
371
        new_nl = do -- Maybe monad
372
          new_p <- Node.addPriEx force_p int_s inst
373
          new_s <- Node.addSec int_p inst old_sdx
374
          let new_inst = Instance.setBoth inst old_sdx old_pdx
375
          return (Container.addTwo old_pdx new_s old_sdx new_p nl,
376
                  new_inst, old_sdx, old_pdx)
377
    in new_nl
378

    
379
-- Replace the primary (f:, r:np, f)
380
applyMove nl inst (ReplacePrimary new_pdx) =
381
    let old_pdx = Instance.pNode inst
382
        old_sdx = Instance.sNode inst
383
        old_p = Container.find old_pdx nl
384
        old_s = Container.find old_sdx nl
385
        tgt_n = Container.find new_pdx nl
386
        int_p = Node.removePri old_p inst
387
        int_s = Node.removeSec old_s inst
388
        force_p = Node.offline old_p
389
        new_nl = do -- Maybe monad
390
          -- check that the current secondary can host the instance
391
          -- during the migration
392
          tmp_s <- Node.addPriEx force_p int_s inst
393
          let tmp_s' = Node.removePri tmp_s inst
394
          new_p <- Node.addPriEx force_p tgt_n inst
395
          new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
396
          let new_inst = Instance.setPri inst new_pdx
397
          return (Container.add new_pdx new_p $
398
                  Container.addTwo old_pdx int_p old_sdx new_s nl,
399
                  new_inst, new_pdx, old_sdx)
400
    in new_nl
401

    
402
-- Replace the secondary (r:ns)
403
applyMove nl inst (ReplaceSecondary new_sdx) =
404
    let old_pdx = Instance.pNode inst
405
        old_sdx = Instance.sNode inst
406
        old_s = Container.find old_sdx nl
407
        tgt_n = Container.find new_sdx nl
408
        int_s = Node.removeSec old_s inst
409
        force_s = Node.offline old_s
410
        new_inst = Instance.setSec inst new_sdx
411
        new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
412
                 \new_s -> return (Container.addTwo new_sdx
413
                                   new_s old_sdx int_s nl,
414
                                   new_inst, old_pdx, new_sdx)
415
    in new_nl
416

    
417
-- Replace the secondary and failover (r:np, f)
418
applyMove nl inst (ReplaceAndFailover new_pdx) =
419
    let old_pdx = Instance.pNode inst
420
        old_sdx = Instance.sNode inst
421
        old_p = Container.find old_pdx nl
422
        old_s = Container.find old_sdx nl
423
        tgt_n = Container.find new_pdx nl
424
        int_p = Node.removePri old_p inst
425
        int_s = Node.removeSec old_s inst
426
        force_s = Node.offline old_s
427
        new_nl = do -- Maybe monad
428
          new_p <- Node.addPri tgt_n inst
429
          new_s <- Node.addSecEx force_s int_p inst new_pdx
430
          let new_inst = Instance.setBoth inst new_pdx old_pdx
431
          return (Container.add new_pdx new_p $
432
                  Container.addTwo old_pdx new_s old_sdx int_s nl,
433
                  new_inst, new_pdx, old_pdx)
434
    in new_nl
435

    
436
-- Failver and replace the secondary (f, r:ns)
437
applyMove nl inst (FailoverAndReplace new_sdx) =
438
    let old_pdx = Instance.pNode inst
439
        old_sdx = Instance.sNode inst
440
        old_p = Container.find old_pdx nl
441
        old_s = Container.find old_sdx nl
442
        tgt_n = Container.find new_sdx nl
443
        int_p = Node.removePri old_p inst
444
        int_s = Node.removeSec old_s inst
445
        force_p = Node.offline old_p
446
        new_nl = do -- Maybe monad
447
          new_p <- Node.addPriEx force_p int_s inst
448
          new_s <- Node.addSecEx force_p tgt_n inst old_sdx
449
          let new_inst = Instance.setBoth inst old_sdx new_sdx
450
          return (Container.add new_sdx new_s $
451
                  Container.addTwo old_sdx new_p old_pdx int_p nl,
452
                  new_inst, old_sdx, new_sdx)
453
    in new_nl
454

    
455
-- | Tries to allocate an instance on one given node.
456
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
457
                 -> OpResult Node.AllocElement
458
allocateOnSingle nl inst new_pdx =
459
    let p = Container.find new_pdx nl
460
        new_inst = Instance.setBoth inst new_pdx Node.noSecondary
461
    in  Node.addPri p inst >>= \new_p -> do
462
      let new_nl = Container.add new_pdx new_p nl
463
          new_score = compCV nl
464
      return (new_nl, new_inst, [new_p], new_score)
465

    
466
-- | Tries to allocate an instance on a given pair of nodes.
467
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
468
               -> OpResult Node.AllocElement
469
allocateOnPair nl inst new_pdx new_sdx =
470
    let tgt_p = Container.find new_pdx nl
471
        tgt_s = Container.find new_sdx nl
472
    in do
473
      new_p <- Node.addPri tgt_p inst
474
      new_s <- Node.addSec tgt_s inst new_pdx
475
      let new_inst = Instance.setBoth inst new_pdx new_sdx
476
          new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
477
      return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
478

    
479
-- | Tries to perform an instance move and returns the best table
480
-- between the original one and the new one.
481
checkSingleStep :: Table -- ^ The original table
482
                -> Instance.Instance -- ^ The instance to move
483
                -> Table -- ^ The current best table
484
                -> IMove -- ^ The move to apply
485
                -> Table -- ^ The final best table
486
checkSingleStep ini_tbl target cur_tbl move =
487
    let
488
        Table ini_nl ini_il _ ini_plc = ini_tbl
489
        tmp_resu = applyMove ini_nl target move
490
    in
491
      case tmp_resu of
492
        OpFail _ -> cur_tbl
493
        OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
494
            let tgt_idx = Instance.idx target
495
                upd_cvar = compCV upd_nl
496
                upd_il = Container.add tgt_idx new_inst ini_il
497
                upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
498
                upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
499
            in
500
              compareTables cur_tbl upd_tbl
501

    
502
-- | Given the status of the current secondary as a valid new node and
503
-- the current candidate target node, generate the possible moves for
504
-- a instance.
505
possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
506
              -> Bool      -- ^ Whether we can change the primary node
507
              -> Ndx       -- ^ Target node candidate
508
              -> [IMove]   -- ^ List of valid result moves
509

    
510
possibleMoves _ False tdx =
511
    [ReplaceSecondary tdx]
512

    
513
possibleMoves True True tdx =
514
    [ReplaceSecondary tdx,
515
     ReplaceAndFailover tdx,
516
     ReplacePrimary tdx,
517
     FailoverAndReplace tdx]
518

    
519
possibleMoves False True tdx =
520
    [ReplaceSecondary tdx,
521
     ReplaceAndFailover tdx]
522

    
523
-- | Compute the best move for a given instance.
524
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
525
                  -> Bool              -- ^ Whether disk moves are allowed
526
                  -> Bool              -- ^ Whether instance moves are allowed
527
                  -> Table             -- ^ Original table
528
                  -> Instance.Instance -- ^ Instance to move
529
                  -> Table             -- ^ Best new table for this instance
530
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
531
    let
532
        opdx = Instance.pNode target
533
        osdx = Instance.sNode target
534
        nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
535
        use_secondary = elem osdx nodes_idx && inst_moves
536
        aft_failover = if use_secondary -- if allowed to failover
537
                       then checkSingleStep ini_tbl target ini_tbl Failover
538
                       else ini_tbl
539
        all_moves = if disk_moves
540
                    then concatMap
541
                         (possibleMoves use_secondary inst_moves) nodes
542
                    else []
543
    in
544
      -- iterate over the possible nodes for this instance
545
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
546

    
547
-- | Compute the best next move.
548
checkMove :: [Ndx]               -- ^ Allowed target node indices
549
          -> Bool                -- ^ Whether disk moves are allowed
550
          -> Bool                -- ^ Whether instance moves are allowed
551
          -> Table               -- ^ The current solution
552
          -> [Instance.Instance] -- ^ List of instances still to move
553
          -> Table               -- ^ The new solution
554
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
555
    let Table _ _ _ ini_plc = ini_tbl
556
        -- we're using rwhnf from the Control.Parallel.Strategies
557
        -- package; we don't need to use rnf as that would force too
558
        -- much evaluation in single-threaded cases, and in
559
        -- multi-threaded case the weak head normal form is enough to
560
        -- spark the evaluation
561
        tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
562
                               inst_moves ini_tbl)
563
                 victims
564
        -- iterate over all instances, computing the best move
565
        best_tbl = foldl' compareTables ini_tbl tables
566
        Table _ _ _ best_plc = best_tbl
567
    in if length best_plc == length ini_plc
568
       then ini_tbl -- no advancement
569
       else best_tbl
570

    
571
-- | Check if we are allowed to go deeper in the balancing.
572
doNextBalance :: Table     -- ^ The starting table
573
              -> Int       -- ^ Remaining length
574
              -> Score     -- ^ Score at which to stop
575
              -> Bool      -- ^ The resulting table and commands
576
doNextBalance ini_tbl max_rounds min_score =
577
    let Table _ _ ini_cv ini_plc = ini_tbl
578
        ini_plc_len = length ini_plc
579
    in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
580

    
581
-- | Run a balance move.
582
tryBalance :: Table       -- ^ The starting table
583
           -> Bool        -- ^ Allow disk moves
584
           -> Bool        -- ^ Allow instance moves
585
           -> Bool        -- ^ Only evacuate moves
586
           -> Score       -- ^ Min gain threshold
587
           -> Score       -- ^ Min gain
588
           -> Maybe Table -- ^ The resulting table and commands
589
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
590
    let Table ini_nl ini_il ini_cv _ = ini_tbl
591
        all_inst = Container.elems ini_il
592
        all_inst' = if evac_mode
593
                    then let bad_nodes = map Node.idx . filter Node.offline $
594
                                         Container.elems ini_nl
595
                         in filter (any (`elem` bad_nodes) . Instance.allNodes)
596
                            all_inst
597
                    else all_inst
598
        reloc_inst = filter Instance.movable all_inst'
599
        node_idx = map Node.idx . filter (not . Node.offline) $
600
                   Container.elems ini_nl
601
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
602
        (Table _ _ fin_cv _) = fin_tbl
603
    in
604
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
605
      then Just fin_tbl -- this round made success, return the new table
606
      else Nothing
607

    
608
-- * Allocation functions
609

    
610
-- | Build failure stats out of a list of failures.
611
collapseFailures :: [FailMode] -> FailStats
612
collapseFailures flst =
613
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
614
            [minBound..maxBound]
615

    
616
-- | Compares two Maybe AllocElement and chooses the besst score.
617
bestAllocElement :: Maybe Node.AllocElement
618
                 -> Maybe Node.AllocElement
619
                 -> Maybe Node.AllocElement
620
bestAllocElement a Nothing = a
621
bestAllocElement Nothing b = b
622
bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
623
    if ascore < bscore then a else b
624

    
625
-- | Update current Allocation solution and failure stats with new
626
-- elements.
627
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
628
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
629

    
630
concatAllocs as (OpGood ns) =
631
    let -- Choose the old or new solution, based on the cluster score
632
        cntok = asAllocs as
633
        osols = asSolution as
634
        nsols = bestAllocElement osols (Just ns)
635
        nsuc = cntok + 1
636
    -- Note: we force evaluation of nsols here in order to keep the
637
    -- memory profile low - we know that we will need nsols for sure
638
    -- in the next cycle, so we force evaluation of nsols, since the
639
    -- foldl' in the caller will only evaluate the tuple, but not the
640
    -- elements of the tuple
641
    in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
642

    
643
-- | Sums two 'AllocSolution' structures.
644
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
645
sumAllocs (AllocSolution aFails aAllocs aSols aLog)
646
          (AllocSolution bFails bAllocs bSols bLog) =
647
    -- note: we add b first, since usually it will be smaller; when
648
    -- fold'ing, a will grow and grow whereas b is the per-group
649
    -- result, hence smaller
650
    let nFails  = bFails ++ aFails
651
        nAllocs = aAllocs + bAllocs
652
        nSols   = bestAllocElement aSols bSols
653
        nLog    = bLog ++ aLog
654
    in AllocSolution nFails nAllocs nSols nLog
655

    
656
-- | Given a solution, generates a reasonable description for it.
657
describeSolution :: AllocSolution -> String
658
describeSolution as =
659
  let fcnt = asFailures as
660
      sols = asSolution as
661
      freasons =
662
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
663
        filter ((> 0) . snd) . collapseFailures $ fcnt
664
  in case sols of
665
     Nothing -> "No valid allocation solutions, failure reasons: " ++
666
                (if null fcnt then "unknown reasons" else freasons)
667
     Just (_, _, nodes, cv) ->
668
         printf ("score: %.8f, successes %d, failures %d (%s)" ++
669
                 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
670
               (intercalate "/" . map Node.name $ nodes)
671

    
672
-- | Annotates a solution with the appropriate string.
673
annotateSolution :: AllocSolution -> AllocSolution
674
annotateSolution as = as { asLog = describeSolution as : asLog as }
675

    
676
-- | Reverses an evacuation solution.
677
--
678
-- Rationale: we always concat the results to the top of the lists, so
679
-- for proper jobset execution, we should reverse all lists.
680
reverseEvacSolution :: EvacSolution -> EvacSolution
681
reverseEvacSolution (EvacSolution f m o) =
682
    EvacSolution (reverse f) (reverse m) (reverse o)
683

    
684
-- | Generate the valid node allocation singles or pairs for a new instance.
685
genAllocNodes :: Group.List        -- ^ Group list
686
              -> Node.List         -- ^ The node map
687
              -> Int               -- ^ The number of nodes required
688
              -> Bool              -- ^ Whether to drop or not
689
                                   -- unallocable nodes
690
              -> Result AllocNodes -- ^ The (monadic) result
691
genAllocNodes gl nl count drop_unalloc =
692
    let filter_fn = if drop_unalloc
693
                    then filter (Group.isAllocable .
694
                                 flip Container.find gl . Node.group)
695
                    else id
696
        all_nodes = filter_fn $ getOnline nl
697
        all_pairs = [(Node.idx p,
698
                      [Node.idx s | s <- all_nodes,
699
                                         Node.idx p /= Node.idx s,
700
                                         Node.group p == Node.group s]) |
701
                     p <- all_nodes]
702
    in case count of
703
         1 -> Ok (Left (map Node.idx all_nodes))
704
         2 -> Ok (Right (filter (not . null . snd) all_pairs))
705
         _ -> Bad "Unsupported number of nodes, only one or two  supported"
706

    
707
-- | Try to allocate an instance on the cluster.
708
tryAlloc :: (Monad m) =>
709
            Node.List         -- ^ The node list
710
         -> Instance.List     -- ^ The instance list
711
         -> Instance.Instance -- ^ The instance to allocate
712
         -> AllocNodes        -- ^ The allocation targets
713
         -> m AllocSolution   -- ^ Possible solution list
714
tryAlloc _  _ _    (Right []) = fail "Not enough online nodes"
715
tryAlloc nl _ inst (Right ok_pairs) =
716
    let psols = parMap rwhnf (\(p, ss) ->
717
                                  foldl' (\cstate ->
718
                                          concatAllocs cstate .
719
                                          allocateOnPair nl inst p)
720
                                  emptyAllocSolution ss) ok_pairs
721
        sols = foldl' sumAllocs emptyAllocSolution psols
722
    in return $ annotateSolution sols
723

    
724
tryAlloc _  _ _    (Left []) = fail "No online nodes"
725
tryAlloc nl _ inst (Left all_nodes) =
726
    let sols = foldl' (\cstate ->
727
                           concatAllocs cstate . allocateOnSingle nl inst
728
                      ) emptyAllocSolution all_nodes
729
    in return $ annotateSolution sols
730

    
731
-- | Given a group/result, describe it as a nice (list of) messages.
732
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
733
solutionDescription gl (groupId, result) =
734
  case result of
735
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
736
    Bad message -> [printf "Group %s: error %s" gname message]
737
  where grp = Container.find groupId gl
738
        gname = Group.name grp
739
        pol = allocPolicyToRaw (Group.allocPolicy grp)
740

    
741
-- | From a list of possibly bad and possibly empty solutions, filter
742
-- only the groups with a valid result. Note that the result will be
743
-- reversed compared to the original list.
744
filterMGResults :: Group.List
745
                -> [(Gdx, Result AllocSolution)]
746
                -> [(Gdx, AllocSolution)]
747
filterMGResults gl = foldl' fn []
748
    where unallocable = not . Group.isAllocable . flip Container.find gl
749
          fn accu (gdx, rasol) =
750
              case rasol of
751
                Bad _ -> accu
752
                Ok sol | isNothing (asSolution sol) -> accu
753
                       | unallocable gdx -> accu
754
                       | otherwise -> (gdx, sol):accu
755

    
756
-- | Sort multigroup results based on policy and score.
757
sortMGResults :: Group.List
758
             -> [(Gdx, AllocSolution)]
759
             -> [(Gdx, AllocSolution)]
760
sortMGResults gl sols =
761
    let extractScore (_, _, _, x) = x
762
        solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
763
                               (extractScore . fromJust . asSolution) sol)
764
    in sortBy (comparing solScore) sols
765

    
766
-- | Finds the best group for an instance on a multi-group cluster.
767
--
768
-- Only solutions in @preferred@ and @last_resort@ groups will be
769
-- accepted as valid, and additionally if the allowed groups parameter
770
-- is not null then allocation will only be run for those group
771
-- indices.
772
findBestAllocGroup :: Group.List           -- ^ The group list
773
                   -> Node.List            -- ^ The node list
774
                   -> Instance.List        -- ^ The instance list
775
                   -> Maybe [Gdx]          -- ^ The allowed groups
776
                   -> Instance.Instance    -- ^ The instance to allocate
777
                   -> Int                  -- ^ Required number of nodes
778
                   -> Result (Gdx, AllocSolution, [String])
779
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
780
  let groups = splitCluster mgnl mgil
781
      groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
782
                allowed_gdxs
783
      sols = map (\(gid, (nl, il)) ->
784
                   (gid, genAllocNodes mggl nl cnt False >>=
785
                       tryAlloc nl il inst))
786
             groups'::[(Gdx, Result AllocSolution)]
787
      all_msgs = concatMap (solutionDescription mggl) sols
788
      goodSols = filterMGResults mggl sols
789
      sortedSols = sortMGResults mggl goodSols
790
  in if null sortedSols
791
     then Bad $ intercalate ", " all_msgs
792
     else let (final_group, final_sol) = head sortedSols
793
          in return (final_group, final_sol, all_msgs)
794

    
795
-- | Try to allocate an instance on a multi-group cluster.
796
tryMGAlloc :: Group.List           -- ^ The group list
797
           -> Node.List            -- ^ The node list
798
           -> Instance.List        -- ^ The instance list
799
           -> Instance.Instance    -- ^ The instance to allocate
800
           -> Int                  -- ^ Required number of nodes
801
           -> Result AllocSolution -- ^ Possible solution list
802
tryMGAlloc mggl mgnl mgil inst cnt = do
803
  (best_group, solution, all_msgs) <-
804
      findBestAllocGroup mggl mgnl mgil Nothing inst cnt
805
  let group_name = Group.name $ Container.find best_group mggl
806
      selmsg = "Selected group: " ++ group_name
807
  return $ solution { asLog = selmsg:all_msgs }
808

    
809
-- | Try to relocate an instance on the cluster.
810
tryReloc :: (Monad m) =>
811
            Node.List       -- ^ The node list
812
         -> Instance.List   -- ^ The instance list
813
         -> Idx             -- ^ The index of the instance to move
814
         -> Int             -- ^ The number of nodes required
815
         -> [Ndx]           -- ^ Nodes which should not be used
816
         -> m AllocSolution -- ^ Solution list
817
tryReloc nl il xid 1 ex_idx =
818
    let all_nodes = getOnline nl
819
        inst = Container.find xid il
820
        ex_idx' = Instance.pNode inst:ex_idx
821
        valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
822
        valid_idxes = map Node.idx valid_nodes
823
        sols1 = foldl' (\cstate x ->
824
                            let em = do
825
                                  (mnl, i, _, _) <-
826
                                      applyMove nl inst (ReplaceSecondary x)
827
                                  return (mnl, i, [Container.find x mnl],
828
                                          compCV mnl)
829
                            in concatAllocs cstate em
830
                       ) emptyAllocSolution valid_idxes
831
    in return sols1
832

    
833
tryReloc _ _ _ reqn _  = fail $ "Unsupported number of relocation \
834
                                \destinations required (" ++ show reqn ++
835
                                                  "), only one supported"
836

    
837
-- | Function which fails if the requested mode is change secondary.
838
--
839
-- This is useful since except DRBD, no other disk template can
840
-- execute change secondary; thus, we can just call this function
841
-- instead of always checking for secondary mode. After the call to
842
-- this function, whatever mode we have is just a primary change.
843
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
844
failOnSecondaryChange ChangeSecondary dt =
845
    fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++
846
         "' can't execute change secondary"
847
failOnSecondaryChange _ _ = return ()
848

    
849
-- | Run evacuation for a single instance.
850
--
851
-- /Note:/ this function should correctly execute both intra-group
852
-- evacuations (in all modes) and inter-group evacuations (in the
853
-- 'ChangeAll' mode). Of course, this requires that the correct list
854
-- of target nodes is passed.
855
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
856
                 -> Instance.List     -- ^ Instance list (cluster-wide)
857
                 -> EvacMode          -- ^ The evacuation mode
858
                 -> Instance.Instance -- ^ The instance to be evacuated
859
                 -> Gdx               -- ^ The group we're targetting
860
                 -> [Ndx]             -- ^ The list of available nodes
861
                                      -- for allocation
862
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
863
nodeEvacInstance _ _ mode (Instance.Instance
864
                           {Instance.diskTemplate = dt@DTDiskless}) _ _ =
865
                  failOnSecondaryChange mode dt >>
866
                  fail "Diskless relocations not implemented yet"
867

    
868
nodeEvacInstance _ _ _ (Instance.Instance
869
                        {Instance.diskTemplate = DTPlain}) _ _ =
870
                  fail "Instances of type plain cannot be relocated"
871

    
872
nodeEvacInstance _ _ _ (Instance.Instance
873
                        {Instance.diskTemplate = DTFile}) _ _ =
874
                  fail "Instances of type file cannot be relocated"
875

    
876
nodeEvacInstance _ _ mode  (Instance.Instance
877
                            {Instance.diskTemplate = dt@DTSharedFile}) _ _ =
878
                  failOnSecondaryChange mode dt >>
879
                  fail "Shared file relocations not implemented yet"
880

    
881
nodeEvacInstance _ _ mode (Instance.Instance
882
                           {Instance.diskTemplate = dt@DTBlock}) _ _ =
883
                  failOnSecondaryChange mode dt >>
884
                  fail "Block device relocations not implemented yet"
885

    
886
nodeEvacInstance nl il ChangePrimary
887
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
888
                 _ _ =
889
  do
890
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
891
    let idx = Instance.idx inst
892
        il' = Container.add idx inst' il
893
        ops = iMoveToJob nl' il' idx Failover
894
    return (nl', il', ops)
895

    
896
nodeEvacInstance nl il ChangeSecondary
897
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
898
                 gdx avail_nodes =
899
  do
900
    (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
901
                            eitherToResult $
902
                            foldl' (evacDrbdSecondaryInner nl inst gdx)
903
                            (Left "no nodes available") avail_nodes
904
    let idx = Instance.idx inst
905
        il' = Container.add idx inst' il
906
        ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx)
907
    return (nl', il', ops)
908

    
909
-- The algorithm for ChangeAll is as follows:
910
--
911
-- * generate all (primary, secondary) node pairs for the target groups
912
-- * for each pair, execute the needed moves (r:s, f, r:s) and compute
913
--   the final node list state and group score
914
-- * select the best choice via a foldl that uses the same Either
915
--   String solution as the ChangeSecondary mode
916
nodeEvacInstance nl il ChangeAll
917
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
918
                 gdx avail_nodes =
919
  do
920
    let no_nodes = Left "no nodes available"
921
        node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
922
    (nl', il', ops, _) <-
923
        annotateResult "Can't find any good nodes for relocation" $
924
        eitherToResult $
925
        foldl'
926
        (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
927
                          Bad msg ->
928
                              case accu of
929
                                Right _ -> accu
930
                                -- we don't need more details (which
931
                                -- nodes, etc.) as we only selected
932
                                -- this group if we can allocate on
933
                                -- it, hence failures will not
934
                                -- propagate out of this fold loop
935
                                Left _ -> Left $ "Allocation failed: " ++ msg
936
                          Ok result@(_, _, _, new_cv) ->
937
                              let new_accu = Right result in
938
                              case accu of
939
                                Left _ -> new_accu
940
                                Right (_, _, _, old_cv) ->
941
                                    if old_cv < new_cv
942
                                    then accu
943
                                    else new_accu
944
        ) no_nodes node_pairs
945

    
946
    return (nl', il', ops)
947

    
948
-- | Inner fold function for changing secondary of a DRBD instance.
949
--
950
-- The running solution is either a @Left String@, which means we
951
-- don't have yet a working solution, or a @Right (...)@, which
952
-- represents a valid solution; it holds the modified node list, the
953
-- modified instance (after evacuation), the score of that solution,
954
-- and the new secondary node index.
955
evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list
956
                       -> Instance.Instance -- ^ Instance being evacuated
957
                       -> Gdx -- ^ The group index of the instance
958
                       -> Either String ( Node.List
959
                                        , Instance.Instance
960
                                        , Score
961
                                        , Ndx)  -- ^ Current best solution
962
                       -> Ndx  -- ^ Node we're evaluating as new secondary
963
                       -> Either String ( Node.List
964
                                        , Instance.Instance
965
                                        , Score
966
                                        , Ndx) -- ^ New best solution
967
evacDrbdSecondaryInner nl inst gdx accu ndx =
968
    case applyMove nl inst (ReplaceSecondary ndx) of
969
      OpFail fm ->
970
          case accu of
971
            Right _ -> accu
972
            Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
973
                      " failed: " ++ show fm
974
      OpGood (nl', inst', _, _) ->
975
          let nodes = Container.elems nl'
976
              -- The fromJust below is ugly (it can fail nastily), but
977
              -- at this point we should have any internal mismatches,
978
              -- and adding a monad here would be quite involved
979
              grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
980
              new_cv = compCVNodes grpnodes
981
              new_accu = Right (nl', inst', new_cv, ndx)
982
          in case accu of
983
               Left _ -> new_accu
984
               Right (_, _, old_cv, _) ->
985
                   if old_cv < new_cv
986
                   then accu
987
                   else new_accu
988

    
989
-- | Compute result of changing all nodes of a DRBD instance.
990
--
991
-- Given the target primary and secondary node (which might be in a
992
-- different group or not), this function will 'execute' all the
993
-- required steps and assuming all operations succceed, will return
994
-- the modified node and instance lists, the opcodes needed for this
995
-- and the new group score.
996
evacDrbdAllInner :: Node.List         -- ^ Cluster node list
997
                 -> Instance.List     -- ^ Cluster instance list
998
                 -> Instance.Instance -- ^ The instance to be moved
999
                 -> Gdx               -- ^ The target group index
1000
                                      -- (which can differ from the
1001
                                      -- current group of the
1002
                                      -- instance)
1003
                 -> (Ndx, Ndx)        -- ^ Tuple of new
1004
                                      -- primary\/secondary nodes
1005
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
1006
evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) =
1007
  do
1008
    let primary = Container.find (Instance.pNode inst) nl
1009
        idx = Instance.idx inst
1010
    -- if the primary is offline, then we first failover
1011
    (nl1, inst1, ops1) <-
1012
        if Node.offline primary
1013
        then do
1014
          (nl', inst', _, _) <-
1015
              annotateResult "Failing over to the secondary" $
1016
              opToResult $ applyMove nl inst Failover
1017
          return (nl', inst', [Failover])
1018
        else return (nl, inst, [])
1019
    let (o1, o2, o3) = (ReplaceSecondary t_pdx,
1020
                        Failover,
1021
                        ReplaceSecondary t_sdx)
1022
    -- we now need to execute a replace secondary to the future
1023
    -- primary node
1024
    (nl2, inst2, _, _) <-
1025
        annotateResult "Changing secondary to new primary" $
1026
        opToResult $
1027
        applyMove nl1 inst1 o1
1028
    let ops2 = o1:ops1
1029
    -- we now execute another failover, the primary stays fixed now
1030
    (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
1031
                          opToResult $ applyMove nl2 inst2 o2
1032
    let ops3 = o2:ops2
1033
    -- and finally another replace secondary, to the final secondary
1034
    (nl4, inst4, _, _) <-
1035
        annotateResult "Changing secondary to final secondary" $
1036
        opToResult $
1037
        applyMove nl3 inst3 o3
1038
    let ops4 = o3:ops3
1039
        il' = Container.add idx inst4 il
1040
        ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1041
    let nodes = Container.elems nl4
1042
        -- The fromJust below is ugly (it can fail nastily), but
1043
        -- at this point we should have any internal mismatches,
1044
        -- and adding a monad here would be quite involved
1045
        grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1046
        new_cv = compCVNodes grpnodes
1047
    return (nl4, il', ops, new_cv)
1048

    
1049
-- | Computes the nodes in a given group which are available for
1050
-- allocation.
1051
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1052
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
1053
                    -> Gdx            -- ^ The group for which we
1054
                                      -- query the nodes
1055
                    -> Result [Ndx]   -- ^ List of available node indices
1056
availableGroupNodes group_nodes excl_ndx gdx = do
1057
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1058
                 Ok (lookup gdx group_nodes)
1059
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1060
  return avail_nodes
1061

    
1062
-- | Updates the evac solution with the results of an instance
1063
-- evacuation.
1064
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1065
                   -> Idx
1066
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1067
                   -> (Node.List, Instance.List, EvacSolution)
1068
updateEvacSolution (nl, il, es) idx (Bad msg) =
1069
    (nl, il, es { esFailed = (idx, msg):esFailed es})
1070
updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1071
    (nl, il, es { esMoved = new_elem:esMoved es
1072
                , esOpCodes = opcodes:esOpCodes es })
1073
     where inst = Container.find idx il
1074
           new_elem = (idx,
1075
                       instancePriGroup nl inst,
1076
                       Instance.allNodes inst)
1077

    
1078
-- | Node-evacuation IAllocator mode main function.
1079
tryNodeEvac :: Group.List    -- ^ The cluster groups
1080
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
1081
            -> Instance.List -- ^ Instance list (cluster-wide)
1082
            -> EvacMode      -- ^ The evacuation mode
1083
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
1084
            -> Result (Node.List, Instance.List, EvacSolution)
1085
tryNodeEvac _ ini_nl ini_il mode idxs =
1086
    let evac_ndx = nodesToEvacuate ini_il mode idxs
1087
        offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1088
        excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1089
        group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1090
                                             (Container.elems nl))) $
1091
                      splitCluster ini_nl ini_il
1092
        (fin_nl, fin_il, esol) =
1093
            foldl' (\state@(nl, il, _) inst ->
1094
                        let gdx = instancePriGroup nl inst
1095
                            pdx = Instance.pNode inst in
1096
                        updateEvacSolution state (Instance.idx inst) $
1097
                        availableGroupNodes group_ndx
1098
                          (IntSet.insert pdx excl_ndx) gdx >>=
1099
                        nodeEvacInstance nl il mode inst gdx
1100
                   )
1101
            (ini_nl, ini_il, emptyEvacSolution)
1102
            (map (`Container.find` ini_il) idxs)
1103
    in return (fin_nl, fin_il, reverseEvacSolution esol)
1104

    
1105
-- | Change-group IAllocator mode main function.
1106
--
1107
-- This is very similar to 'tryNodeEvac', the only difference is that
1108
-- we don't choose as target group the current instance group, but
1109
-- instead:
1110
--
1111
--   1. at the start of the function, we compute which are the target
1112
--   groups; either no groups were passed in, in which case we choose
1113
--   all groups out of which we don't evacuate instance, or there were
1114
--   some groups passed, in which case we use those
1115
--
1116
--   2. for each instance, we use 'findBestAllocGroup' to choose the
1117
--   best group to hold the instance, and then we do what
1118
--   'tryNodeEvac' does, except for this group instead of the current
1119
--   instance group.
1120
--
1121
-- Note that the correct behaviour of this function relies on the
1122
-- function 'nodeEvacInstance' to be able to do correctly both
1123
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1124
tryChangeGroup :: Group.List    -- ^ The cluster groups
1125
               -> Node.List     -- ^ The node list (cluster-wide)
1126
               -> Instance.List -- ^ Instance list (cluster-wide)
1127
               -> [Gdx]         -- ^ Target groups; if empty, any
1128
                                -- groups not being evacuated
1129
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
1130
               -> Result (Node.List, Instance.List, EvacSolution)
1131
tryChangeGroup gl ini_nl ini_il gdxs idxs =
1132
    let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1133
                               flip Container.find ini_il) idxs
1134
        target_gdxs = (if null gdxs
1135
                       then Container.keys gl
1136
                       else gdxs) \\ evac_gdxs
1137
        offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1138
        excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1139
        group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1140
                                             (Container.elems nl))) $
1141
                      splitCluster ini_nl ini_il
1142
        (fin_nl, fin_il, esol) =
1143
            foldl' (\state@(nl, il, _) inst ->
1144
                        let solution = do
1145
                              let ncnt = Instance.requiredNodes $
1146
                                         Instance.diskTemplate inst
1147
                              (gdx, _, _) <- findBestAllocGroup gl nl il
1148
                                             (Just target_gdxs) inst ncnt
1149
                              av_nodes <- availableGroupNodes group_ndx
1150
                                          excl_ndx gdx
1151
                              nodeEvacInstance nl il ChangeAll inst
1152
                                       gdx av_nodes
1153
                        in updateEvacSolution state
1154
                               (Instance.idx inst) solution
1155
                   )
1156
            (ini_nl, ini_il, emptyEvacSolution)
1157
            (map (`Container.find` ini_il) idxs)
1158
    in return (fin_nl, fin_il, reverseEvacSolution esol)
1159

    
1160
-- | Standard-sized allocation method.
1161
--
1162
-- This places instances of the same size on the cluster until we're
1163
-- out of space. The result will be a list of identically-sized
1164
-- instances.
1165
iterateAlloc :: AllocMethod
1166
iterateAlloc nl il limit newinst allocnodes ixes cstats =
1167
      let depth = length ixes
1168
          newname = printf "new-%d" depth::String
1169
          newidx = length (Container.elems il) + depth
1170
          newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1171
          newlimit = fmap (flip (-) 1) limit
1172
      in case tryAlloc nl il newi2 allocnodes of
1173
           Bad s -> Bad s
1174
           Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
1175
               let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1176
               case sols3 of
1177
                 Nothing -> newsol
1178
                 Just (xnl, xi, _, _) ->
1179
                     if limit == Just 0
1180
                     then newsol
1181
                     else iterateAlloc xnl (Container.add newidx xi il)
1182
                          newlimit newinst allocnodes (xi:ixes)
1183
                          (totalResources xnl:cstats)
1184

    
1185
-- | Tiered allocation method.
1186
--
1187
-- This places instances on the cluster, and decreases the spec until
1188
-- we can allocate again. The result will be a list of decreasing
1189
-- instance specs.
1190
tieredAlloc :: AllocMethod
1191
tieredAlloc nl il limit newinst allocnodes ixes cstats =
1192
    case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1193
      Bad s -> Bad s
1194
      Ok (errs, nl', il', ixes', cstats') ->
1195
          let newsol = Ok (errs, nl', il', ixes', cstats')
1196
              ixes_cnt = length ixes'
1197
              (stop, newlimit) = case limit of
1198
                                   Nothing -> (False, Nothing)
1199
                                   Just n -> (n <= ixes_cnt,
1200
                                              Just (n - ixes_cnt)) in
1201
          if stop then newsol else
1202
          case Instance.shrinkByType newinst . fst . last $
1203
               sortBy (comparing snd) errs of
1204
            Bad _ -> newsol
1205
            Ok newinst' -> tieredAlloc nl' il' newlimit
1206
                           newinst' allocnodes ixes' cstats'
1207

    
1208
-- * Formatting functions
1209

    
1210
-- | Given the original and final nodes, computes the relocation description.
1211
computeMoves :: Instance.Instance -- ^ The instance to be moved
1212
             -> String -- ^ The instance name
1213
             -> IMove  -- ^ The move being performed
1214
             -> String -- ^ New primary
1215
             -> String -- ^ New secondary
1216
             -> (String, [String])
1217
                -- ^ Tuple of moves and commands list; moves is containing
1218
                -- either @/f/@ for failover or @/r:name/@ for replace
1219
                -- secondary, while the command list holds gnt-instance
1220
                -- commands (without that prefix), e.g \"@failover instance1@\"
1221
computeMoves i inam mv c d =
1222
    case mv of
1223
      Failover -> ("f", [mig])
1224
      FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1225
      ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1226
      ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1227
      ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1228
    where morf = if Instance.instanceRunning i then "migrate" else "failover"
1229
          mig = printf "%s -f %s" morf inam::String
1230
          rep n = printf "replace-disks -n %s %s" n inam
1231

    
1232
-- | Converts a placement to string format.
1233
printSolutionLine :: Node.List     -- ^ The node list
1234
                  -> Instance.List -- ^ The instance list
1235
                  -> Int           -- ^ Maximum node name length
1236
                  -> Int           -- ^ Maximum instance name length
1237
                  -> Placement     -- ^ The current placement
1238
                  -> Int           -- ^ The index of the placement in
1239
                                   -- the solution
1240
                  -> (String, [String])
1241
printSolutionLine nl il nmlen imlen plc pos =
1242
    let
1243
        pmlen = (2*nmlen + 1)
1244
        (i, p, s, mv, c) = plc
1245
        inst = Container.find i il
1246
        inam = Instance.alias inst
1247
        npri = Node.alias $ Container.find p nl
1248
        nsec = Node.alias $ Container.find s nl
1249
        opri = Node.alias $ Container.find (Instance.pNode inst) nl
1250
        osec = Node.alias $ Container.find (Instance.sNode inst) nl
1251
        (moves, cmds) =  computeMoves inst inam mv npri nsec
1252
        ostr = printf "%s:%s" opri osec::String
1253
        nstr = printf "%s:%s" npri nsec::String
1254
    in
1255
      (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
1256
       pos imlen inam pmlen ostr
1257
       pmlen nstr c moves,
1258
       cmds)
1259

    
1260
-- | Return the instance and involved nodes in an instance move.
1261
--
1262
-- Note that the output list length can vary, and is not required nor
1263
-- guaranteed to be of any specific length.
1264
involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
1265
                               -- the instance from its index; note
1266
                               -- that this /must/ be the original
1267
                               -- instance list, so that we can
1268
                               -- retrieve the old nodes
1269
              -> Placement     -- ^ The placement we're investigating,
1270
                               -- containing the new nodes and
1271
                               -- instance index
1272
              -> [Ndx]         -- ^ Resulting list of node indices
1273
involvedNodes il plc =
1274
    let (i, np, ns, _, _) = plc
1275
        inst = Container.find i il
1276
    in nub $ [np, ns] ++ Instance.allNodes inst
1277

    
1278
-- | Inner function for splitJobs, that either appends the next job to
1279
-- the current jobset, or starts a new jobset.
1280
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1281
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1282
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1283
    | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1284
    | otherwise = ([n]:cjs, ndx)
1285

    
1286
-- | Break a list of moves into independent groups. Note that this
1287
-- will reverse the order of jobs.
1288
splitJobs :: [MoveJob] -> [JobSet]
1289
splitJobs = fst . foldl mergeJobs ([], [])
1290

    
1291
-- | Given a list of commands, prefix them with @gnt-instance@ and
1292
-- also beautify the display a little.
1293
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1294
formatJob jsn jsl (sn, (_, _, _, cmds)) =
1295
    let out =
1296
            printf "  echo job %d/%d" jsn sn:
1297
            printf "  check":
1298
            map ("  gnt-instance " ++) cmds
1299
    in if sn == 1
1300
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1301
       else out
1302

    
1303
-- | Given a list of commands, prefix them with @gnt-instance@ and
1304
-- also beautify the display a little.
1305
formatCmds :: [JobSet] -> String
1306
formatCmds =
1307
    unlines .
1308
    concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1309
                             (zip [1..] js)) .
1310
    zip [1..]
1311

    
1312
-- | Print the node list.
1313
printNodes :: Node.List -> [String] -> String
1314
printNodes nl fs =
1315
    let fields = case fs of
1316
          [] -> Node.defaultFields
1317
          "+":rest -> Node.defaultFields ++ rest
1318
          _ -> fs
1319
        snl = sortBy (comparing Node.idx) (Container.elems nl)
1320
        (header, isnum) = unzip $ map Node.showHeader fields
1321
    in unlines . map ((:) ' ' .  intercalate " ") $
1322
       formatTable (header:map (Node.list fields) snl) isnum
1323

    
1324
-- | Print the instance list.
1325
printInsts :: Node.List -> Instance.List -> String
1326
printInsts nl il =
1327
    let sil = sortBy (comparing Instance.idx) (Container.elems il)
1328
        helper inst = [ if Instance.instanceRunning inst then "R" else " "
1329
                      , Instance.name inst
1330
                      , Container.nameOf nl (Instance.pNode inst)
1331
                      , let sdx = Instance.sNode inst
1332
                        in if sdx == Node.noSecondary
1333
                           then  ""
1334
                           else Container.nameOf nl sdx
1335
                      , if Instance.autoBalance inst then "Y" else "N"
1336
                      , printf "%3d" $ Instance.vcpus inst
1337
                      , printf "%5d" $ Instance.mem inst
1338
                      , printf "%5d" $ Instance.dsk inst `div` 1024
1339
                      , printf "%5.3f" lC
1340
                      , printf "%5.3f" lM
1341
                      , printf "%5.3f" lD
1342
                      , printf "%5.3f" lN
1343
                      ]
1344
            where DynUtil lC lM lD lN = Instance.util inst
1345
        header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1346
                 , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1347
        isnum = False:False:False:False:False:repeat True
1348
    in unlines . map ((:) ' ' . intercalate " ") $
1349
       formatTable (header:map helper sil) isnum
1350

    
1351
-- | Shows statistics for a given node list.
1352
printStats :: Node.List -> String
1353
printStats nl =
1354
    let dcvs = compDetailedCV $ Container.elems nl
1355
        (weights, names) = unzip detailedCVInfo
1356
        hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1357
        formatted = map (\(w, header, val) ->
1358
                             printf "%s=%.8f(x%.2f)" header val w::String) hd
1359
    in intercalate ", " formatted
1360

    
1361
-- | Convert a placement into a list of OpCodes (basically a job).
1362
iMoveToJob :: Node.List        -- ^ The node list; only used for node
1363
                               -- names, so any version is good
1364
                               -- (before or after the operation)
1365
           -> Instance.List    -- ^ The instance list; also used for
1366
                               -- names only
1367
           -> Idx              -- ^ The index of the instance being
1368
                               -- moved
1369
           -> IMove            -- ^ The actual move to be described
1370
           -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
1371
                               -- the given move
1372
iMoveToJob nl il idx move =
1373
    let inst = Container.find idx il
1374
        iname = Instance.name inst
1375
        lookNode  = Just . Container.nameOf nl
1376
        opF = OpCodes.OpInstanceMigrate iname True False True Nothing
1377
        opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1378
                OpCodes.ReplaceNewSecondary [] Nothing
1379
    in case move of
1380
         Failover -> [ opF ]
1381
         ReplacePrimary np -> [ opF, opR np, opF ]
1382
         ReplaceSecondary ns -> [ opR ns ]
1383
         ReplaceAndFailover np -> [ opR np, opF ]
1384
         FailoverAndReplace ns -> [ opF, opR ns ]
1385

    
1386
-- * Node group functions
1387

    
1388
-- | Computes the group of an instance.
1389
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1390
instanceGroup nl i =
1391
  let sidx = Instance.sNode i
1392
      pnode = Container.find (Instance.pNode i) nl
1393
      snode = if sidx == Node.noSecondary
1394
              then pnode
1395
              else Container.find sidx nl
1396
      pgroup = Node.group pnode
1397
      sgroup = Node.group snode
1398
  in if pgroup /= sgroup
1399
     then fail ("Instance placed accross two node groups, primary " ++
1400
                show pgroup ++ ", secondary " ++ show sgroup)
1401
     else return pgroup
1402

    
1403
-- | Computes the group of an instance per the primary node.
1404
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1405
instancePriGroup nl i =
1406
  let pnode = Container.find (Instance.pNode i) nl
1407
  in  Node.group pnode
1408

    
1409
-- | Compute the list of badly allocated instances (split across node
1410
-- groups).
1411
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1412
findSplitInstances nl =
1413
  filter (not . isOk . instanceGroup nl) . Container.elems
1414

    
1415
-- | Splits a cluster into the component node groups.
1416
splitCluster :: Node.List -> Instance.List ->
1417
                [(Gdx, (Node.List, Instance.List))]
1418
splitCluster nl il =
1419
  let ngroups = Node.computeGroups (Container.elems nl)
1420
  in map (\(guuid, nodes) ->
1421
           let nidxs = map Node.idx nodes
1422
               nodes' = zip nidxs nodes
1423
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1424
           in (guuid, (Container.fromList nodes', instances))) ngroups
1425

    
1426
-- | Compute the list of nodes that are to be evacuated, given a list
1427
-- of instances and an evacuation mode.
1428
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1429
                -> EvacMode      -- ^ The evacuation mode we're using
1430
                -> [Idx]         -- ^ List of instance indices being evacuated
1431
                -> IntSet.IntSet -- ^ Set of node indices
1432
nodesToEvacuate il mode =
1433
    IntSet.delete Node.noSecondary .
1434
    foldl' (\ns idx ->
1435
                let i = Container.find idx il
1436
                    pdx = Instance.pNode i
1437
                    sdx = Instance.sNode i
1438
                    dt = Instance.diskTemplate i
1439
                    withSecondary = case dt of
1440
                                      DTDrbd8 -> IntSet.insert sdx ns
1441
                                      _ -> ns
1442
                in case mode of
1443
                     ChangePrimary   -> IntSet.insert pdx ns
1444
                     ChangeSecondary -> withSecondary
1445
                     ChangeAll       -> IntSet.insert pdx withSecondary
1446
           ) IntSet.empty