Statistics
| Branch: | Tag: | Revision:

root / htools / Ganeti / HTools / Cluster.hs @ 7eda951b

History | View | Annotate | Download (63.2 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
    (
     -- * Types
      AllocSolution(..)
    , EvacSolution(..)
    , Table(..)
    , CStats(..)
    , AllocStats
    , AllocMethod
    -- * Generic functions
    , totalResources
    , computeAllocationDelta
    -- * First phase functions
    , computeBadItems
    -- * Second phase functions
    , printSolutionLine
    , formatCmds
    , involvedNodes
    , splitJobs
    -- * Display functions
    , printNodes
    , printInsts
    -- * Balancing functions
    , checkMove
    , doNextBalance
    , tryBalance
    , compCV
    , compCVNodes
    , compDetailedCV
    , printStats
    , iMoveToJob
    -- * IAllocator functions
    , genAllocNodes
    , tryAlloc
    , tryMGAlloc
    , tryReloc
    , tryNodeEvac
    , tryChangeGroup
    , collapseFailures
    -- * Allocation functions
    , iterateAlloc
    , tieredAlloc
     -- * Node group functions
    , instanceGroup
    , findSplitInstances
    , splitCluster
    ) where
76

    
77
import qualified Data.IntSet as IntSet
78
import Data.List
79
import Data.Maybe (fromJust, isNothing)
80
import Data.Ord (comparing)
81
import Text.Printf (printf)
82

    
83
import qualified Ganeti.HTools.Container as Container
84
import qualified Ganeti.HTools.Instance as Instance
85
import qualified Ganeti.HTools.Node as Node
86
import qualified Ganeti.HTools.Group as Group
87
import Ganeti.HTools.Types
88
import Ganeti.HTools.Utils
89
import Ganeti.HTools.Compat
90
import qualified Ganeti.OpCodes as OpCodes
91

    
92
-- * Types
93

    
94
-- | The result of an allocation or relocation attempt.
data AllocSolution = AllocSolution
  { asFailures :: [FailMode]              -- ^ Counts of the failed attempts
  , asAllocs   :: Int                     -- ^ Number of successful allocations
  , asSolution :: Maybe Node.AllocElement -- ^ The best allocation found, if any
  , asLog      :: [String]                -- ^ Informational log messages
  }
101

    
102
-- | Result type for node evacuation/group change requests. It
-- carries the actual opcodes (a restricted subset) that are
-- transmitted back to Ganeti for execution.
data EvacSolution = EvacSolution
  { esMoved   :: [(Idx, Gdx, [Ndx])] -- ^ Instances moved successfully
  , esFailed  :: [(Idx, String)]     -- ^ Instances that could not be relocated
  , esOpCodes :: [[OpCodes.OpCode]]  -- ^ The jobs (lists of opcodes) to submit
  }
111

    
112
-- | The result tuple produced by the iterative allocation drivers
-- ('iterateAlloc' and 'tieredAlloc').
type AllocResult = (FailStats, Node.List, Instance.List,
                    [Instance.Instance], [CStats])
115

    
116
-- | A type denoting the valid allocation mode/pairs.
--
-- A one-node allocation uses @Left ['Ndx']@; a two-node allocation
-- uses @Right [('Ndx', ['Ndx'])]@, i.e. an association list keyed by
-- the primary node and holding the candidate secondary nodes in the
-- sub-list.
type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
124

    
125
-- | The initial (empty) solution used when starting an allocation.
emptyAllocSolution :: AllocSolution
emptyAllocSolution =
  AllocSolution { asFailures = []
                , asAllocs = 0
                , asSolution = Nothing
                , asLog = []
                }
129

    
130
-- | The initial (empty) evacuation solution.
emptyEvacSolution :: EvacSolution
emptyEvacSolution =
  EvacSolution { esMoved = []
               , esFailed = []
               , esOpCodes = []
               }
136

    
137
-- | The complete state of a balancing solution: the node and
-- instance maps, the current cluster score and the placements done
-- so far.
data Table = Table Node.List Instance.List Score [Placement]
             deriving (Show, Read)
140

    
141
-- | Cluster statistics data type.
data CStats = CStats
  { csFmem  :: Integer -- ^ Cluster free mem
  , csFdsk  :: Integer -- ^ Cluster free disk
  , csAmem  :: Integer -- ^ Cluster allocatable mem
  , csAdsk  :: Integer -- ^ Cluster allocatable disk
  , csAcpu  :: Integer -- ^ Cluster allocatable cpus
  , csMmem  :: Integer -- ^ Max node allocatable mem
  , csMdsk  :: Integer -- ^ Max node allocatable disk
  , csMcpu  :: Integer -- ^ Max node allocatable cpu
  , csImem  :: Integer -- ^ Instance used mem
  , csIdsk  :: Integer -- ^ Instance used disk
  , csIcpu  :: Integer -- ^ Instance used cpu
  , csTmem  :: Double  -- ^ Cluster total mem
  , csTdsk  :: Double  -- ^ Cluster total disk
  , csTcpu  :: Double  -- ^ Cluster total cpus
  , csVcpu  :: Integer -- ^ Cluster virtual cpus (if node pCpu has
                       -- been set, otherwise -1)
  , csXmem  :: Integer -- ^ Unaccounted for mem
  , csNmem  :: Integer -- ^ Node own memory
  , csScore :: Score   -- ^ The cluster score
  , csNinst :: Int     -- ^ The total number of instances
  } deriving (Show, Read)
165

    
166
-- | Resource statistics triple: currently used, possible to
-- allocate, and unallocable resources (in this order).
type AllocStats = (RSpec, RSpec, RSpec)
168

    
169
-- | The common signature of the allocation driver functions.
type AllocMethod =  Node.List           -- ^ Node list
                 -> Instance.List       -- ^ Instance list
                 -> Maybe Int           -- ^ Optional allocation limit
                 -> Instance.Instance   -- ^ Instance spec for allocation
                 -> AllocNodes          -- ^ Which nodes we should allocate on
                 -> [Instance.Instance] -- ^ Allocated instances
                 -> [CStats]            -- ^ Running cluster stats
                 -> Result AllocResult  -- ^ Allocation result
178

    
179
-- * Utility functions
180

    
181
-- | Returns the nodes that fail the N+1 redundancy check.
verifyN1 :: [Node.Node] -> [Node.Node]
verifyN1 nodes = [n | n <- nodes, Node.failN1 n]
184

    
185
{-| Computes the pair of bad nodes and instances.

The bad node list is the set of online nodes failing the 'verifyN1'
check, and the bad instance list contains every primary and secondary
instance hosted on those nodes.

-}
computeBadItems :: Node.List -> Instance.List ->
                   ([Node.Node], [Instance.Instance])
computeBadItems nl il =
  let bad_nodes = verifyN1 $ getOnline nl
      bad_idxs = sort . nub $
                 concatMap (\n -> Node.sList n ++ Node.pList n) bad_nodes
      bad_instances = map (`Container.find` il) bad_idxs
  in (bad_nodes, bad_instances)
201

    
202
-- | A 'CStats' value with every counter set to zero.
emptyCStats :: CStats
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
205

    
206
-- | Update stats with data from one more node.
updateCStats :: CStats -> Node.Node -> CStats
updateCStats cs node =
  let -- free memory reserved for N+1 redundancy is not allocatable
      inc_amem = Node.fMem node - Node.rMem node
      inc_amem' = max 0 inc_amem
      inc_adsk = Node.availDisk node
      -- memory used by instances: total minus the node's own, the
      -- unaccounted-for and the free memory
      inc_imem = truncate (Node.tMem node) - Node.nMem node
                 - Node.xMem node - Node.fMem node
      inc_icpu = Node.uCpu node
      inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
      inc_vcpu = Node.hiCpu node
      inc_acpu = Node.availCpu node
  in cs { csFmem = csFmem cs + fromIntegral (Node.fMem node)
        , csFdsk = csFdsk cs + fromIntegral (Node.fDsk node)
        , csAmem = csAmem cs + fromIntegral inc_amem'
        , csAdsk = csAdsk cs + fromIntegral inc_adsk
        , csAcpu = csAcpu cs + fromIntegral inc_acpu
        , csMmem = max (csMmem cs) (fromIntegral inc_amem')
        , csMdsk = max (csMdsk cs) (fromIntegral inc_adsk)
        , csMcpu = max (csMcpu cs) (fromIntegral inc_acpu)
        , csImem = csImem cs + fromIntegral inc_imem
        , csIdsk = csIdsk cs + fromIntegral inc_idsk
        , csIcpu = csIcpu cs + fromIntegral inc_icpu
        , csTmem = csTmem cs + Node.tMem node
        , csTdsk = csTdsk cs + Node.tDsk node
        , csTcpu = csTcpu cs + Node.tCpu node
        , csVcpu = csVcpu cs + fromIntegral inc_vcpu
        , csXmem = csXmem cs + fromIntegral (Node.xMem node)
        , csNmem = csNmem cs + fromIntegral (Node.nMem node)
        , csNinst = csNinst cs + length (Node.pList node)
        }
247

    
248
-- | Compute the cluster-wide statistics (free/used disk, memory,
-- etc.) plus the current cluster score.
totalResources :: Node.List -> CStats
totalResources nl =
  let stats = foldl' updateCStats emptyCStats (Container.elems nl)
  in stats { csScore = compCV nl }
253

    
254
-- | Compute the delta between two cluster states.
--
-- This is used when doing allocations, to understand better the
-- available cluster resources. The return value is a triple of the
-- currently used values, the delta that was still allocated, and
-- what was left unallocated.
computeAllocationDelta :: CStats -> CStats -> AllocStats
computeAllocationDelta cini cfin =
  let i_icpu = csIcpu cini
      i_imem = csImem cini
      i_idsk = csIdsk cini
      f_icpu = csIcpu cfin
      f_imem = csImem cfin
      f_idsk = csIdsk cfin
      -- resources in use at the initial state
      rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem)
             (fromIntegral i_idsk)
      -- what the allocation run managed to add on top of that
      rfin = RSpec (fromIntegral (f_icpu - i_icpu))
             (fromIntegral (f_imem - i_imem))
             (fromIntegral (f_idsk - i_idsk))
      un_cpu = fromIntegral (csVcpu cfin - f_icpu)::Int
      -- what remained unallocatable
      runa = RSpec un_cpu
             (truncate (csTmem cfin) - fromIntegral f_imem)
             (truncate (csTdsk cfin) - fromIntegral f_idsk)
  in (rini, rfin, runa)
274

    
275
-- | The weights and names of the individual elements in the CV list,
-- in the same order as produced by 'compDetailedCV'.
detailedCVInfo :: [(Double, String)]
detailedCVInfo =
  [ (1,  "free_mem_cv")
  , (1,  "free_disk_cv")
  , (1,  "n1_cnt")
  , (1,  "reserved_mem_cv")
  , (4,  "offline_all_cnt")
  , (16, "offline_pri_cnt")
  , (1,  "vcpu_ratio_cv")
  , (1,  "cpu_load_cv")
  , (1,  "mem_load_cv")
  , (1,  "disk_load_cv")
  , (1,  "net_load_cv")
  , (2,  "pri_tags_score")
  ]
290

    
291
-- | The weights used by 'compCVNodes' for each metric, extracted
-- from 'detailedCVInfo'.
detailedCVWeights :: [Double]
detailedCVWeights = map fst detailedCVInfo
294

    
295
-- | Compute the list of detailed cluster metrics; the order must
-- stay in sync with 'detailedCVInfo'.
compDetailedCV :: [Node.Node] -> [Double]
compDetailedCV all_nodes =
  [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
  , stdDev c_load, stdDev m_load, stdDev d_load, stdDev n_load
  , pri_tags_score ]
  where
    (offline, nodes) = partition Node.offline all_nodes
    -- metric: covariance of the free-memory percentages
    mem_cv = stdDev $ map Node.pMem nodes
    -- metric: covariance of the free-disk percentages
    dsk_cv = stdDev $ map Node.pDsk nodes
    -- metric: count of instances living on N+1-failing nodes
    n1_score = fromIntegral . sum
               . map (\n -> length (Node.sList n) + length (Node.pList n))
               . filter Node.failN1 $ nodes :: Double
    -- metric: reserved memory covariance
    res_cv = stdDev $ map Node.pRem nodes
    -- instance counts on offline nodes
    offline_ipri = sum . map (length . Node.pList) $ offline
    offline_isec = sum . map (length . Node.sList) $ offline
    -- metric: count of instances on offline nodes
    off_score = fromIntegral (offline_ipri + offline_isec) :: Double
    -- metric: count of primary instances on offline nodes (this
    -- helps with evacuation/failover of primary instances on 2-node
    -- clusters with one node offline)
    off_pri_score = fromIntegral offline_ipri :: Double
    -- metric: covariance of the vcpu/pcpu ratio
    cpu_cv = stdDev $ map Node.pCpu nodes
    -- metrics: per-node relative cpu, memory, disk and network load
    (c_load, m_load, d_load, n_load) =
      unzip4 $ map (\n -> let DynUtil c1 m1 d1 n1 = Node.utilLoad n
                              DynUtil c2 m2 d2 n2 = Node.utilPool n
                          in (c1 / c2, m1 / m2, d1 / d2, n1 / n2)) nodes
    -- metric: count of primary instances with conflicting tags
    pri_tags_score =
      fromIntegral (sum $ map Node.conflictingPrimaries nodes) :: Double
338

    
339
-- | Compute the /total/ variance: the weighted sum of the detailed
-- metrics.
compCVNodes :: [Node.Node] -> Double
compCVNodes nodes = sum $ zipWith (*) detailedCVWeights (compDetailedCV nodes)
342

    
343
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
compCV :: Node.List -> Double
compCV nl = compCVNodes (Container.elems nl)
346

    
347
-- | Return the online nodes of a 'Node.List'.
getOnline :: Node.List -> [Node.Node]
getOnline nl = filter (not . Node.offline) (Container.elems nl)
350

    
351
-- * Balancing functions
352

    
353
-- | Chooses the better (lower-score) of two tables. On equal scores
-- the first argument wins, so the ordering of the arguments is
-- important.
compareTables :: Table -> Table -> Table
compareTables a@(Table _ _ cva _) b@(Table _ _ cvb _)
  | cva > cvb = b
  | otherwise = a
357

    
358
-- | Applies an instance move to a given node list and instance,
-- returning the updated node list, the updated instance and the
-- (new) primary and secondary node indices.
applyMove :: Node.List -> Instance.Instance
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)

-- Failover (f)
applyMove nl inst Failover =
  let old_pdx = Instance.pNode inst
      old_sdx = Instance.sNode inst
      old_p = Container.find old_pdx nl
      old_s = Container.find old_sdx nl
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      force_p = Node.offline old_p
  in do
    -- the old secondary becomes the new primary ...
    new_p <- Node.addPriEx force_p int_s inst
    -- ... and the old primary the new secondary
    new_s <- Node.addSec int_p inst old_sdx
    let new_inst = Instance.setBoth inst old_sdx old_pdx
    return (Container.addTwo old_pdx new_s old_sdx new_p nl,
            new_inst, old_sdx, old_pdx)

-- Replace the primary (f:, r:np, f)
applyMove nl inst (ReplacePrimary new_pdx) =
  let old_pdx = Instance.pNode inst
      old_sdx = Instance.sNode inst
      old_p = Container.find old_pdx nl
      old_s = Container.find old_sdx nl
      tgt_n = Container.find new_pdx nl
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      force_p = Node.offline old_p
  in do
    -- check that the current secondary can host the instance during
    -- the intermediate migration step
    tmp_s <- Node.addPriEx force_p int_s inst
    let tmp_s' = Node.removePri tmp_s inst
    new_p <- Node.addPriEx force_p tgt_n inst
    new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
    let new_inst = Instance.setPri inst new_pdx
    return (Container.add new_pdx new_p $
            Container.addTwo old_pdx int_p old_sdx new_s nl,
            new_inst, new_pdx, old_sdx)

-- Replace the secondary (r:ns)
applyMove nl inst (ReplaceSecondary new_sdx) =
  let old_pdx = Instance.pNode inst
      old_sdx = Instance.sNode inst
      old_s = Container.find old_sdx nl
      tgt_n = Container.find new_sdx nl
      int_s = Node.removeSec old_s inst
      force_s = Node.offline old_s
      new_inst = Instance.setSec inst new_sdx
  in do
    new_s <- Node.addSecEx force_s tgt_n inst old_pdx
    return (Container.addTwo new_sdx new_s old_sdx int_s nl,
            new_inst, old_pdx, new_sdx)

-- Replace the secondary and failover (r:np, f)
applyMove nl inst (ReplaceAndFailover new_pdx) =
  let old_pdx = Instance.pNode inst
      old_sdx = Instance.sNode inst
      old_p = Container.find old_pdx nl
      old_s = Container.find old_sdx nl
      tgt_n = Container.find new_pdx nl
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      force_s = Node.offline old_s
  in do
    new_p <- Node.addPri tgt_n inst
    new_s <- Node.addSecEx force_s int_p inst new_pdx
    let new_inst = Instance.setBoth inst new_pdx old_pdx
    return (Container.add new_pdx new_p $
            Container.addTwo old_pdx new_s old_sdx int_s nl,
            new_inst, new_pdx, old_pdx)

-- Failover and replace the secondary (f, r:ns)
applyMove nl inst (FailoverAndReplace new_sdx) =
  let old_pdx = Instance.pNode inst
      old_sdx = Instance.sNode inst
      old_p = Container.find old_pdx nl
      old_s = Container.find old_sdx nl
      tgt_n = Container.find new_sdx nl
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      force_p = Node.offline old_p
  in do
    new_p <- Node.addPriEx force_p int_s inst
    new_s <- Node.addSecEx force_p tgt_n inst old_sdx
    let new_inst = Instance.setBoth inst old_sdx new_sdx
    return (Container.add new_sdx new_s $
            Container.addTwo old_sdx new_p old_pdx int_p nl,
            new_inst, old_sdx, new_sdx)
453

    
454
-- | Tries to allocate an instance on one given node.
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
                 -> OpResult Node.AllocElement
allocateOnSingle nl inst new_pdx =
    let p = Container.find new_pdx nl
        new_inst = Instance.setBoth inst new_pdx Node.noSecondary
    in  Node.addPri p inst >>= \new_p -> do
      let new_nl = Container.add new_pdx new_p nl
          -- score the cluster *after* the allocation, consistent
          -- with 'allocateOnPair'; scoring the old node list would
          -- make every candidate node look equally good
          new_score = compCV new_nl
      return (new_nl, new_inst, [new_p], new_score)
464

    
465
-- | Tries to allocate an instance on a given pair of nodes.
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
               -> OpResult Node.AllocElement
allocateOnPair nl inst new_pdx new_sdx = do
  new_p <- Node.addPri (Container.find new_pdx nl) inst
  new_s <- Node.addSec (Container.find new_sdx nl) inst new_pdx
  let new_inst = Instance.setBoth inst new_pdx new_sdx
      new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
  -- the score reflects the cluster state after the allocation
  return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
477

    
478
-- | Tries to perform an instance move and returns the best table
-- between the original one and the new one.
checkSingleStep :: Table -- ^ The original table
                -> Instance.Instance -- ^ The instance to move
                -> Table -- ^ The current best table
                -> IMove -- ^ The move to apply
                -> Table -- ^ The final best table
checkSingleStep ini_tbl target cur_tbl move =
  case applyMove ini_nl target move of
    -- the move is not possible; keep the current best
    OpFail _ -> cur_tbl
    OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
      let tgt_idx = Instance.idx target
          upd_cvar = compCV upd_nl
          upd_il = Container.add tgt_idx new_inst ini_il
          upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
      in compareTables cur_tbl (Table upd_nl upd_il upd_cvar upd_plc)
  where
    Table ini_nl ini_il _ ini_plc = ini_tbl
500

    
501
-- | Given the status of the current secondary as a valid new node
-- and the current candidate target node, generate the possible moves
-- for an instance.
possibleMoves :: Bool    -- ^ Whether the secondary node is a valid new node
              -> Bool    -- ^ Whether we can change the primary node
              -> Ndx     -- ^ Target node candidate
              -> [IMove] -- ^ List of valid result moves

possibleMoves _ False tdx = [ReplaceSecondary tdx]

possibleMoves True True tdx = [ ReplaceSecondary tdx
                              , ReplaceAndFailover tdx
                              , ReplacePrimary tdx
                              , FailoverAndReplace tdx
                              ]

possibleMoves False True tdx = [ ReplaceSecondary tdx
                               , ReplaceAndFailover tdx
                               ]
521

    
522
-- | Compute the best move for a given instance.
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
                  -> Bool              -- ^ Whether disk moves are allowed
                  -> Bool              -- ^ Whether instance moves are allowed
                  -> Table             -- ^ Original table
                  -> Instance.Instance -- ^ Instance to move
                  -> Table             -- ^ Best new table for this instance
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
  -- iterate over the possible target nodes for this instance
  foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
  where
    opdx = Instance.pNode target
    osdx = Instance.sNode target
    -- the current nodes are not valid targets
    nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
    use_secondary = elem osdx nodes_idx && inst_moves
    -- if a failover to the secondary is allowed, evaluate it first
    aft_failover
      | use_secondary = checkSingleStep ini_tbl target ini_tbl Failover
      | otherwise = ini_tbl
    all_moves
      | disk_moves = concatMap (possibleMoves use_secondary inst_moves) nodes
      | otherwise = []
545

    
546
-- | Compute the best next move.
checkMove :: [Ndx]               -- ^ Allowed target node indices
          -> Bool                -- ^ Whether disk moves are allowed
          -> Bool                -- ^ Whether instance moves are allowed
          -> Table               -- ^ The current solution
          -> [Instance.Instance] -- ^ List of instances still to move
          -> Table               -- ^ The new solution
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
  let Table _ _ _ ini_plc = ini_tbl
      -- we're using rwhnf from the Control.Parallel.Strategies
      -- package; we don't need to use rnf as that would force too
      -- much evaluation in single-threaded cases, and in the
      -- multi-threaded case the weak head normal form is enough to
      -- spark the evaluation
      tables = parMap rwhnf
               (checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl)
               victims
      -- over all instances, pick the best resulting table
      best_tbl = foldl' compareTables ini_tbl tables
      Table _ _ _ best_plc = best_tbl
  in if length best_plc == length ini_plc
       then ini_tbl -- no placement was added, hence no advancement
       else best_tbl
569

    
570
-- | Check if we are allowed to go deeper in the balancing.
doNextBalance :: Table -- ^ The starting table
              -> Int   -- ^ Remaining length
              -> Score -- ^ Score at which to stop
              -> Bool  -- ^ Whether balancing should continue
doNextBalance ini_tbl max_rounds min_score =
  let Table _ _ ini_cv ini_plc = ini_tbl
      rounds_done = length ini_plc
  in (max_rounds < 0 || rounds_done < max_rounds) && ini_cv > min_score
579

    
580
-- | Run a single balancing round.
tryBalance :: Table       -- ^ The starting table
           -> Bool        -- ^ Allow disk moves
           -> Bool        -- ^ Allow instance moves
           -> Bool        -- ^ Only evacuate moves
           -> Score       -- ^ Min gain threshold
           -> Score       -- ^ Min gain
           -> Maybe Table -- ^ The resulting table and commands
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
  let Table ini_nl ini_il ini_cv _ = ini_tbl
      all_inst = Container.elems ini_il
      -- in evacuation mode, only consider instances touching an
      -- offline node
      all_inst' =
        if evac_mode
          then let bad_nodes = map Node.idx . filter Node.offline $
                               Container.elems ini_nl
               in filter (any (`elem` bad_nodes) . Instance.allNodes) all_inst
          else all_inst
      reloc_inst = filter Instance.movable all_inst'
      node_idx = map Node.idx . filter (not . Node.offline) $
                 Container.elems ini_nl
      fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
      Table _ _ fin_cv _ = fin_tbl
  in if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
       then Just fin_tbl -- this round improved the score, return the new table
       else Nothing
606

    
607
-- * Allocation functions
608

    
609
-- | Build failure stats (a count per failure mode) out of a list of
-- failures.
collapseFailures :: [FailMode] -> FailStats
collapseFailures flst =
  map (\fm -> (fm, length $ filter (== fm) flst)) [minBound..maxBound]
614

    
615
-- | Compares two Maybe AllocElement and chooses the one with the
-- best (lowest) score; a 'Just' always wins over 'Nothing'.
bestAllocElement :: Maybe Node.AllocElement
                 -> Maybe Node.AllocElement
                 -> Maybe Node.AllocElement
bestAllocElement a Nothing = a
bestAllocElement Nothing b = b
bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore))
  | ascore < bscore = a
  | otherwise = b
623

    
624
-- | Update the current allocation solution and failure stats with a
-- new element.
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
concatAllocs as (OpGood ns) =
  let -- choose the old or the new solution, based on cluster score
      nsols = bestAllocElement (asSolution as) (Just ns)
      nsuc = asAllocs as + 1
  -- Note: we force evaluation of nsols (and nsuc) here in order to
  -- keep the memory profile low - we know we will need nsols for
  -- sure in the next cycle, and the foldl' in the caller only
  -- evaluates the record itself, not its fields
  in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
641

    
642
-- | Sums two 'AllocSolution' structures.
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
sumAllocs (AllocSolution aFails aAllocs aSols aLog)
          (AllocSolution bFails bAllocs bSols bLog) =
  -- note: b's lists are prepended since usually b is smaller: when
  -- fold'ing, a is the growing accumulator whereas b is the small
  -- per-group result
  AllocSolution (bFails ++ aFails)
                (aAllocs + bAllocs)
                (bestAllocElement aSols bSols)
                (bLog ++ aLog)
654

    
655
-- | Given a solution, generates a reasonable description for it.
describeSolution :: AllocSolution -> String
describeSolution as =
  let fcnt = asFailures as
      -- render only the failure modes that actually occurred
      freasons =
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
        filter ((> 0) . snd) . collapseFailures $ fcnt
  in case asSolution as of
       Nothing -> "No valid allocation solutions, failure reasons: " ++
                  (if null fcnt then "unknown reasons" else freasons)
       Just (_, _, nodes, cv) ->
         printf ("score: %.8f, successes %d, failures %d (%s)" ++
                 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
                (intercalate "/" . map Node.name $ nodes)
670

    
671
-- | Prepends a human-readable description of a solution to its log.
annotateSolution :: AllocSolution -> AllocSolution
annotateSolution sol = sol { asLog = describeSolution sol : asLog sol }
674

    
675
-- | Reverses an evacuation solution.
--
-- Rationale: results are always prepended to the lists while they
-- are built, so for proper jobset execution order all the lists must
-- be reversed at the end.
reverseEvacSolution :: EvacSolution -> EvacSolution
reverseEvacSolution es =
  EvacSolution { esMoved = reverse (esMoved es)
               , esFailed = reverse (esFailed es)
               , esOpCodes = reverse (esOpCodes es)
               }
682

    
683
-- | Generate the valid node allocation singles or pairs for a new
-- instance.
genAllocNodes :: Group.List        -- ^ Group list
              -> Node.List         -- ^ The node map
              -> Int               -- ^ The number of nodes required
              -> Bool              -- ^ Whether to drop or not
                                   -- unallocable nodes
              -> Result AllocNodes -- ^ The (monadic) result
genAllocNodes gl nl count drop_unalloc =
  let filter_fn = if drop_unalloc
                    then filter (Group.isAllocable .
                                 flip Container.find gl . Node.group)
                    else id
      all_nodes = filter_fn $ getOnline nl
      -- for pairs, the secondary must be a different node from the
      -- same node group as the primary
      all_pairs = [ (Node.idx p,
                     [Node.idx s | s <- all_nodes,
                                   Node.idx p /= Node.idx s,
                                   Node.group p == Node.group s])
                  | p <- all_nodes ]
  in case count of
       1 -> Ok (Left (map Node.idx all_nodes))
       2 -> Ok (Right (filter (not . null . snd) all_pairs))
       _ -> Bad "Unsupported number of nodes, only one or two  supported"
705

    
706
-- | Try to allocate an instance on the cluster.
tryAlloc :: (Monad m) =>
            Node.List         -- ^ The node list
         -> Instance.List     -- ^ The instance list
         -> Instance.Instance -- ^ The instance to allocate
         -> AllocNodes        -- ^ The allocation targets
         -> m AllocSolution   -- ^ Possible solution list
tryAlloc _  _ _    (Right []) = fail "Not enough online nodes"
tryAlloc nl _ inst (Right ok_pairs) =
    let -- evaluate all secondaries of one primary, accumulating the best
        pairEval (pdx, sdxs) =
            foldl' (\acc sdx ->
                        concatAllocs acc $ allocateOnPair nl inst pdx sdx)
                   emptyAllocSolution sdxs
        -- per-primary evaluation runs in parallel
        partials = parMap rwhnf pairEval ok_pairs
        merged = foldl' sumAllocs emptyAllocSolution partials
    in return $ annotateSolution merged

tryAlloc _  _ _    (Left []) = fail "No online nodes"
tryAlloc nl _ inst (Left all_nodes) =
    let step acc ndx = concatAllocs acc $ allocateOnSingle nl inst ndx
        best = foldl' step emptyAllocSolution all_nodes
    in return $ annotateSolution best
729

    
730
-- | Given a group/result, describe it as a nice (list of) messages.
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
solutionDescription gl (groupId, result) =
  let grp = Container.find groupId gl
      gname = Group.name grp
      pol = allocPolicyToString (Group.allocPolicy grp)
  in case result of
       Ok solution -> map (printf "Group %s (%s): %s" gname pol)
                      (asLog solution)
       Bad message -> [printf "Group %s: error %s" gname message]
739

    
740
-- | From a list of possibly bad and possibly empty solutions, filter
-- only the groups with a valid result. Note that the result will be
-- reversed compared to the original list.
filterMGResults :: Group.List
                -> [(Gdx, Result AllocSolution)]
                -> [(Gdx, AllocSolution)]
filterMGResults gl = foldl' pick []
    where groupAllocable = Group.isAllocable . flip Container.find gl
          -- keep only groups whose allocation succeeded, produced a
          -- concrete solution, and which accept allocations
          pick accu (gdx, rasol) =
              case rasol of
                Ok sol | not (isNothing (asSolution sol))
                         && groupAllocable gdx -> (gdx, sol) : accu
                _ -> accu
754

    
755
-- | Sort multigroup results based on policy and score.
sortMGResults :: Group.List
             -> [(Gdx, AllocSolution)]
             -> [(Gdx, AllocSolution)]
sortMGResults gl sols = sortBy (comparing solKey) sols
    where score (_, _, _, cv) = cv
          -- order first by the group's allocation policy, then by the
          -- solution's cluster score (lower is better)
          solKey (gdx, sol) =
              (Group.allocPolicy (Container.find gdx gl),
               score . fromJust $ asSolution sol)
764

    
765
-- | Finds the best group for an instance on a multi-group cluster.
--
-- Only solutions in @preferred@ and @last_resort@ groups will be
-- accepted as valid, and additionally if the allowed groups parameter
-- is not null then allocation will only be run for those group
-- indices.
findBestAllocGroup :: Group.List           -- ^ The group list
                   -> Node.List            -- ^ The node list
                   -> Instance.List        -- ^ The instance list
                   -> Maybe [Gdx]          -- ^ The allowed groups
                   -> Instance.Instance    -- ^ The instance to allocate
                   -> Int                  -- ^ Required number of nodes
                   -> Result (Gdx, AllocSolution, [String])
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
  -- split the cluster into per-group (node list, instance list) pairs
  let groups = splitCluster mgnl mgil
      -- when a restriction list was given, keep only those groups
      groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
                allowed_gdxs
      -- run a normal allocation inside each candidate group
      sols = map (\(gid, (nl, il)) ->
                   (gid, genAllocNodes mggl nl cnt False >>=
                       tryAlloc nl il inst))
             groups'::[(Gdx, Result AllocSolution)]
      all_msgs = concatMap (solutionDescription mggl) sols
      -- drop failed/unallocable groups, then sort by policy and score;
      -- the head of sortedSols is thus the best valid choice
      goodSols = filterMGResults mggl sols
      sortedSols = sortMGResults mggl goodSols
  in if null sortedSols
     then Bad $ intercalate ", " all_msgs
     else let (final_group, final_sol) = head sortedSols
          in return (final_group, final_sol, all_msgs)
793

    
794
-- | Try to allocate an instance on a multi-group cluster.
tryMGAlloc :: Group.List           -- ^ The group list
           -> Node.List            -- ^ The node list
           -> Instance.List        -- ^ The instance list
           -> Instance.Instance    -- ^ The instance to allocate
           -> Int                  -- ^ Required number of nodes
           -> Result AllocSolution -- ^ Possible solution list
tryMGAlloc mggl mgnl mgil inst cnt = do
  -- pick the best target group, then tag the solution log with it
  (best_group, solution, all_msgs) <-
      findBestAllocGroup mggl mgnl mgil Nothing inst cnt
  let grp_name = Group.name $ Container.find best_group mggl
      sel_msg = "Selected group: " ++ grp_name
  return $ solution { asLog = sel_msg : all_msgs }
807

    
808
-- | Try to relocate an instance on the cluster.
tryReloc :: (Monad m) =>
            Node.List       -- ^ The node list
         -> Instance.List   -- ^ The instance list
         -> Idx             -- ^ The index of the instance to move
         -> Int             -- ^ The number of nodes required
         -> [Ndx]           -- ^ Nodes which should not be used
         -> m AllocSolution -- ^ Solution list
tryReloc nl il xid 1 ex_idx =
    let all_nodes = getOnline nl
        inst = Container.find xid il
        -- the current primary can never host the new secondary
        ex_idx' = Instance.pNode inst:ex_idx
        valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
        valid_idxes = map Node.idx valid_nodes
        -- try a ReplaceSecondary move towards each candidate node,
        -- folding all attempts into one accumulated solution
        sols1 = foldl' (\cstate x ->
                            let em = do
                                  (mnl, i, _, _) <-
                                      applyMove nl inst (ReplaceSecondary x)
                                  return (mnl, i, [Container.find x mnl],
                                          compCV mnl)
                            in concatAllocs cstate em
                       ) emptyAllocSolution valid_idxes
    in return sols1

-- only single-destination relocation (new secondary) is supported
tryReloc _ _ _ reqn _  = fail $ "Unsupported number of relocation \
                                \destinations required (" ++ show reqn ++
                                                  "), only one supported"
835

    
836
-- | Function which fails if the requested mode is change secondary.
--
-- This is useful since except DRBD, no other disk template can
-- execute change secondary; thus, we can just call this function
-- instead of always checking for secondary mode. After the call to
-- this function, whatever mode we have is just a primary change.
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
failOnSecondaryChange mode dt =
    case mode of
      ChangeSecondary ->
          fail $ "Instances with disk template '" ++
               diskTemplateToString dt ++
               "' can't execute change secondary"
      _ -> return ()
847

    
848
-- | Run evacuation for a single instance.
--
-- /Note:/ this function should correctly execute both intra-group
-- evacuations (in all modes) and inter-group evacuations (in the
-- 'ChangeAll' mode). Of course, this requires that the correct list
-- of target nodes is passed.
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
                 -> Instance.List     -- ^ Instance list (cluster-wide)
                 -> EvacMode          -- ^ The evacuation mode
                 -> Instance.Instance -- ^ The instance to be evacuated
                 -> Gdx               -- ^ The group we're targetting
                 -> [Ndx]             -- ^ The list of available nodes
                                      -- for allocation
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
-- diskless: secondary changes rejected first, relocation unimplemented
nodeEvacInstance _ _ mode (Instance.Instance
                           {Instance.diskTemplate = dt@DTDiskless}) _ _ =
                  failOnSecondaryChange mode dt >>
                  fail "Diskless relocations not implemented yet"

nodeEvacInstance _ _ _ (Instance.Instance
                        {Instance.diskTemplate = DTPlain}) _ _ =
                  fail "Instances of type plain cannot be relocated"

nodeEvacInstance _ _ _ (Instance.Instance
                        {Instance.diskTemplate = DTFile}) _ _ =
                  fail "Instances of type file cannot be relocated"

nodeEvacInstance _ _ mode  (Instance.Instance
                            {Instance.diskTemplate = dt@DTSharedFile}) _ _ =
                  failOnSecondaryChange mode dt >>
                  fail "Shared file relocations not implemented yet"

nodeEvacInstance _ _ mode (Instance.Instance
                           {Instance.diskTemplate = dt@DTBlock}) _ _ =
                  failOnSecondaryChange mode dt >>
                  fail "Block device relocations not implemented yet"

-- DRBD, ChangePrimary: a plain failover to the secondary suffices
nodeEvacInstance nl il ChangePrimary
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
                 _ _ =
  do
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
    let idx = Instance.idx inst
        il' = Container.add idx inst' il
        ops = iMoveToJob nl' il' idx Failover
    return (nl', il', ops)

-- DRBD, ChangeSecondary: fold over the candidates, keeping the best
-- scoring ReplaceSecondary move
nodeEvacInstance nl il ChangeSecondary
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
                 gdx avail_nodes =
  do
    (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
                            eitherToResult $
                            foldl' (evacDrbdSecondaryInner nl inst gdx)
                            (Left "no nodes available") avail_nodes
    let idx = Instance.idx inst
        il' = Container.add idx inst' il
        ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx)
    return (nl', il', ops)

-- The algorithm for ChangeAll is as follows:
--
-- * generate all (primary, secondary) node pairs for the target groups
-- * for each pair, execute the needed moves (r:s, f, r:s) and compute
--   the final node list state and group score
-- * select the best choice via a foldl that uses the same Either
--   String solution as the ChangeSecondary mode
nodeEvacInstance nl il ChangeAll
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
                 gdx avail_nodes =
  do
    let no_nodes = Left "no nodes available"
        node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
    (nl', il', ops, _) <-
        annotateResult "Can't find any good nodes for relocation" $
        eitherToResult $
        foldl'
        (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
                          Bad msg ->
                              case accu of
                                Right _ -> accu
                                -- we don't need more details (which
                                -- nodes, etc.) as we only selected
                                -- this group if we can allocate on
                                -- it, hence failures will not
                                -- propagate out of this fold loop
                                Left _ -> Left $ "Allocation failed: " ++ msg
                          Ok result@(_, _, _, new_cv) ->
                              let new_accu = Right result in
                              case accu of
                                Left _ -> new_accu
                                -- keep the candidate with the lower
                                -- (better) cluster score
                                Right (_, _, _, old_cv) ->
                                    if old_cv < new_cv
                                    then accu
                                    else new_accu
        ) no_nodes node_pairs

    return (nl', il', ops)
946

    
947
-- | Inner fold function for changing secondary of a DRBD instance.
--
-- The running solution is either a @Left String@, which means we
-- don't have yet a working solution, or a @Right (...)@, which
-- represents a valid solution; it holds the modified node list, the
-- modified instance (after evacuation), the score of that solution,
-- and the new secondary node index.
evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list
                       -> Instance.Instance -- ^ Instance being evacuated
                       -> Gdx -- ^ The group index of the instance
                       -> Either String ( Node.List
                                        , Instance.Instance
                                        , Score
                                        , Ndx)  -- ^ Current best solution
                       -> Ndx  -- ^ Node we're evaluating as new secondary
                       -> Either String ( Node.List
                                        , Instance.Instance
                                        , Score
                                        , Ndx) -- ^ New best solution
evacDrbdSecondaryInner nl inst gdx accu ndx =
    case applyMove nl inst (ReplaceSecondary ndx) of
      OpFail fm ->
          -- a failed move only replaces the accumulator when we have
          -- no valid solution yet (it provides the error message)
          case accu of
            Right _ -> accu
            Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
                      " failed: " ++ show fm
      OpGood (nl', inst', _, _) ->
          let nodes = Container.elems nl'
              -- The fromJust below is ugly (it can fail nastily), but
              -- at this point we should not have any internal mismatches,
              -- and adding a monad here would be quite involved
              grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
              new_cv = compCVNodes grpnodes
              new_accu = Right (nl', inst', new_cv, ndx)
          in case accu of
               Left _ -> new_accu
               -- keep whichever solution has the lower (better) score
               Right (_, _, old_cv, _) ->
                   if old_cv < new_cv
                   then accu
                   else new_accu
987

    
988
-- | Compute result of changing all nodes of a DRBD instance.
--
-- Given the target primary and secondary node (which might be in a
-- different group or not), this function will 'execute' all the
-- required steps and assuming all operations succceed, will return
-- the modified node and instance lists, the opcodes needed for this
-- and the new group score.
evacDrbdAllInner :: Node.List         -- ^ Cluster node list
                 -> Instance.List     -- ^ Cluster instance list
                 -> Instance.Instance -- ^ The instance to be moved
                 -> Gdx               -- ^ The target group index
                                      -- (which can differ from the
                                      -- current group of the
                                      -- instance)
                 -> (Ndx, Ndx)        -- ^ Tuple of new
                                      -- primary\/secondary nodes
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) =
  do
    let primary = Container.find (Instance.pNode inst) nl
        idx = Instance.idx inst
    -- if the primary is offline, then we first failover
    (nl1, inst1, ops1) <-
        if Node.offline primary
        then do
          (nl', inst', _, _) <-
              annotateResult "Failing over to the secondary" $
              opToResult $ applyMove nl inst Failover
          return (nl', inst', [Failover])
        else return (nl, inst, [])
    -- the three remaining moves: r:target_primary, f, r:target_secondary
    let (o1, o2, o3) = (ReplaceSecondary t_pdx,
                        Failover,
                        ReplaceSecondary t_sdx)
    -- we now need to execute a replace secondary to the future
    -- primary node
    (nl2, inst2, _, _) <-
        annotateResult "Changing secondary to new primary" $
        opToResult $
        applyMove nl1 inst1 o1
    let ops2 = o1:ops1
    -- we now execute another failover, the primary stays fixed now
    (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
                          opToResult $ applyMove nl2 inst2 o2
    let ops3 = o2:ops2
    -- and finally another replace secondary, to the final secondary
    (nl4, inst4, _, _) <-
        annotateResult "Changing secondary to final secondary" $
        opToResult $
        applyMove nl3 inst3 o3
    let ops4 = o3:ops3
        il' = Container.add idx inst4 il
        -- moves were accumulated in reverse order, so flip them back
        -- before converting to job opcodes
        ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
    let nodes = Container.elems nl4
        -- The fromJust below is ugly (it can fail nastily), but
        -- at this point we should not have any internal mismatches,
        -- and adding a monad here would be quite involved
        grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
        new_cv = compCVNodes grpnodes
    return (nl4, il', ops, new_cv)
1047

    
1048
-- | Computes the nodes in a given group which are available for
-- allocation.
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
                    -> Gdx            -- ^ The group for which we
                                      -- query the nodes
                    -> Result [Ndx]   -- ^ List of available node indices
availableGroupNodes group_nodes excl_ndx gdx =
  case lookup gdx group_nodes of
    Nothing -> Bad $ "Can't find group with index " ++ show gdx
    Just local_nodes ->
        -- keep only the group's nodes not present in the exclusion set
        return $ filter (not . flip IntSet.member excl_ndx) local_nodes
1060

    
1061
-- | Updates the evac solution with the results of an instance
-- evacuation.
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
                   -> Idx
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
                   -> (Node.List, Instance.List, EvacSolution)
updateEvacSolution (nl, il, es) idx (Bad msg) =
    -- failure: state is unchanged, only the failure list grows
    (nl, il, es { esFailed = (idx, msg):esFailed es})
updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
    -- success: adopt the new cluster state and record the move
    let inst = Container.find idx il
        moved_elem = (idx,
                      instancePriGroup nl inst,
                      Instance.allNodes inst)
    in (nl, il, es { esMoved = moved_elem:esMoved es
                   , esOpCodes = opcodes:esOpCodes es })
1076

    
1077
-- | Node-evacuation IAllocator mode main function.
tryNodeEvac :: Group.List    -- ^ The cluster groups
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
            -> Instance.List -- ^ Instance list (cluster-wide)
            -> EvacMode      -- ^ The evacuation mode
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
            -> Result (Node.List, Instance.List, EvacSolution)
tryNodeEvac _ ini_nl ini_il mode idxs =
    let evac_ndx = nodesToEvacuate ini_il mode idxs
        -- offline nodes can never be allocation targets
        offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
        excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
        -- per-group association list of node indices
        group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
                                             (Container.elems nl))) $
                      splitCluster ini_nl ini_il
        -- evacuate each instance in turn, threading the evolving
        -- cluster state through the fold
        (fin_nl, fin_il, esol) =
            foldl' (\state@(nl, il, _) inst ->
                        let gdx = instancePriGroup nl inst
                            pdx = Instance.pNode inst in
                        updateEvacSolution state (Instance.idx inst) $
                        availableGroupNodes group_ndx
                          (IntSet.insert pdx excl_ndx) gdx >>=
                        nodeEvacInstance nl il mode inst gdx
                   )
            (ini_nl, ini_il, emptyEvacSolution)
            (map (`Container.find` ini_il) idxs)
    in return (fin_nl, fin_il, reverseEvacSolution esol)
1103

    
1104
-- | Change-group IAllocator mode main function.
--
-- This is very similar to 'tryNodeEvac', the only difference is that
-- we don't choose as target group the current instance group, but
-- instead:
--
--   1. at the start of the function, we compute which are the target
--   groups; either no groups were passed in, in which case we choose
--   all groups out of which we don't evacuate instance, or there were
--   some groups passed, in which case we use those
--
--   2. for each instance, we use 'findBestAllocGroup' to choose the
--   best group to hold the instance, and then we do what
--   'tryNodeEvac' does, except for this group instead of the current
--   instance group.
--
-- Note that the correct behaviour of this function relies on the
-- function 'nodeEvacInstance' to be able to do correctly both
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
tryChangeGroup :: Group.List    -- ^ The cluster groups
               -> Node.List     -- ^ The node list (cluster-wide)
               -> Instance.List -- ^ Instance list (cluster-wide)
               -> [Gdx]         -- ^ Target groups; if empty, any
                                -- groups not being evacuated
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
               -> Result (Node.List, Instance.List, EvacSolution)
tryChangeGroup gl ini_nl ini_il gdxs idxs =
    let -- groups currently holding the instances being evacuated
        evac_gdxs = nub $ map (instancePriGroup ini_nl .
                               flip Container.find ini_il) idxs
        -- candidate targets: requested (or all) groups, minus sources
        target_gdxs = (if null gdxs
                       then Container.keys gl
                       else gdxs) \\ evac_gdxs
        offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
        excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
        group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
                                             (Container.elems nl))) $
                      splitCluster ini_nl ini_il
        -- per instance: find the best target group, then run a
        -- ChangeAll evacuation into that group
        (fin_nl, fin_il, esol) =
            foldl' (\state@(nl, il, _) inst ->
                        let solution = do
                              let ncnt = Instance.requiredNodes $
                                         Instance.diskTemplate inst
                              (gdx, _, _) <- findBestAllocGroup gl nl il
                                             (Just target_gdxs) inst ncnt
                              av_nodes <- availableGroupNodes group_ndx
                                          excl_ndx gdx
                              nodeEvacInstance nl il ChangeAll inst
                                       gdx av_nodes
                        in updateEvacSolution state
                               (Instance.idx inst) solution
                   )
            (ini_nl, ini_il, emptyEvacSolution)
            (map (`Container.find` ini_il) idxs)
    in return (fin_nl, fin_il, reverseEvacSolution esol)
1158

    
1159
-- | Standard-sized allocation method.
--
-- This places instances of the same size on the cluster until we're
-- out of space. The result will be a list of identically-sized
-- instances.
iterateAlloc :: AllocMethod
iterateAlloc nl il limit newinst allocnodes ixes cstats =
      let depth = length ixes
          newname = printf "new-%d" depth::String
          -- each new instance gets a unique synthetic name and index
          newidx = length (Container.elems il) + depth
          newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
          -- decrement the optional remaining-allocations counter
          newlimit = fmap (flip (-) 1) limit
      in case tryAlloc nl il newi2 allocnodes of
           Bad s -> Bad s
           Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
               let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
               case sols3 of
                 Nothing -> newsol
                 Just (xnl, xi, _, _) ->
                     -- stop when the limit is exhausted, otherwise
                     -- recurse with the updated cluster state
                     if limit == Just 0
                     then newsol
                     else iterateAlloc xnl (Container.add newidx xi il)
                          newlimit newinst allocnodes (xi:ixes)
                          (totalResources xnl:cstats)
1183

    
1184
-- | Tiered allocation method.
--
-- This places instances on the cluster, and decreases the spec until
-- we can allocate again. The result will be a list of decreasing
-- instance specs.
tieredAlloc :: AllocMethod
tieredAlloc nl il limit newinst allocnodes ixes cstats =
    case iterateAlloc nl il limit newinst allocnodes ixes cstats of
      Bad s -> Bad s
      Ok (errs, nl', il', ixes', cstats') ->
          let newsol = Ok (errs, nl', il', ixes', cstats')
              ixes_cnt = length ixes'
              -- with a limit, stop once we've placed that many
              -- instances; otherwise keep shrinking and retrying
              (stop, newlimit) = case limit of
                                   Nothing -> (False, Nothing)
                                   Just n -> (n <= ixes_cnt,
                                              Just (n - ixes_cnt)) in
          if stop then newsol else
          -- shrink along the most common failure dimension; if the
          -- spec cannot shrink further, return what we have
          case Instance.shrinkByType newinst . fst . last $
               sortBy (comparing snd) errs of
            Bad _ -> newsol
            Ok newinst' -> tieredAlloc nl' il' newlimit
                           newinst' allocnodes ixes' cstats'
1206

    
1207
-- * Formatting functions
1208

    
1209
-- | Given the original and final nodes, computes the relocation description.
computeMoves :: Instance.Instance -- ^ The instance to be moved
             -> String -- ^ The instance name
             -> IMove  -- ^ The move being performed
             -> String -- ^ New primary
             -> String -- ^ New secondary
             -> (String, [String])
                -- ^ Tuple of moves and commands list; moves is containing
                -- either @/f/@ for failover or @/r:name/@ for replace
                -- secondary, while the command list holds gnt-instance
                -- commands (without that prefix), e.g \"@failover instance1@\"
computeMoves i inam mv c d =
    case mv of
      Failover -> ("f", [mig])
      FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
      ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
      ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
      ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
    -- running instances are migrated, stopped ones are failed over
    where morf = if Instance.running i then "migrate" else "failover"
          mig = printf "%s -f %s" morf inam::String
          rep n = printf "replace-disks -n %s %s" n inam
1230

    
1231
-- | Converts a placement to string format.
printSolutionLine :: Node.List     -- ^ The node list
                  -> Instance.List -- ^ The instance list
                  -> Int           -- ^ Maximum node name length
                  -> Int           -- ^ Maximum instance name length
                  -> Placement     -- ^ The current placement
                  -> Int           -- ^ The index of the placement in
                                   -- the solution
                  -> (String, [String])
printSolutionLine nl il nmlen imlen plc pos =
    let
        -- width of a "primary:secondary" column
        pmlen = (2*nmlen + 1)
        (i, p, s, mv, c) = plc
        inst = Container.find i il
        inam = Instance.alias inst
        npri = Node.alias $ Container.find p nl
        nsec = Node.alias $ Container.find s nl
        opri = Node.alias $ Container.find (Instance.pNode inst) nl
        osec = Node.alias $ Container.find (Instance.sNode inst) nl
        (moves, cmds) =  computeMoves inst inam mv npri nsec
        -- old and new "primary:secondary" location strings
        ostr = printf "%s:%s" opri osec::String
        nstr = printf "%s:%s" npri nsec::String
    in
      (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
       pos imlen inam pmlen ostr
       pmlen nstr c moves,
       cmds)
1258

    
1259
-- | Return the instance and involved nodes in an instance move.
--
-- Note that the output list length can vary, and is not required nor
-- guaranteed to be of any specific length.
involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
                               -- the instance from its index; note
                               -- that this /must/ be the original
                               -- instance list, so that we can
                               -- retrieve the old nodes
              -> Placement     -- ^ The placement we're investigating,
                               -- containing the new nodes and
                               -- instance index
              -> [Ndx]         -- ^ Resulting list of node indices
involvedNodes il (iidx, new_pri, new_sec, _, _) =
    -- both the new and the instance's old nodes are involved
    let inst = Container.find iidx il
    in nub $ [new_pri, new_sec] ++ Instance.allNodes inst
1276

    
1277
-- | Inner function for splitJobs, that either appends the next job to
-- the current jobset, or starts a new jobset.
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
-- first job: start a fresh jobset, remembering the touched nodes
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
    -- no node overlap with the running jobset: jobs are independent,
    -- so add this job to the current set and extend the node buffer
    | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
    -- overlap: open a new jobset and reset the node buffer
    | otherwise = ([n]:cjs, ndx)
1284

    
1285
-- | Break a list of moves into independent groups. Note that this
-- will reverse the order of jobs.
splitJobs :: [MoveJob] -> [JobSet]
-- use the strict foldl' (consistent with the rest of this module) to
-- avoid building a thunk chain on long move lists
splitJobs = fst . foldl' mergeJobs ([], [])
1289

    
1290
-- | Format a single job of a jobset: an echo/check header plus the
-- job's gnt-instance commands, indented for shell output.
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
formatJob jsn jsl (sn, (_, _, _, cmds)) =
    let out =
            printf "  echo job %d/%d" jsn sn:
            printf "  check":
            map ("  gnt-instance " ++) cmds
    in if sn == 1
       -- the first job of a set also emits the jobset banner
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
       else out
1301

    
1302
-- | Given a list of commands, prefix them with @gnt-instance@ and
-- also beautify the display a little.
formatCmds :: [JobSet] -> String
formatCmds jsets =
    let numbered = zip [1..] jsets
        expand (jsn, js) =
            concatMap (formatJob jsn (length js)) (zip [1..] js)
    in unlines (concatMap expand numbered)

-- | Print the node list.
printNodes :: Node.List -> [String] -> String
printNodes nl fs =
    let selected = case fs of
                     []       -> Node.defaultFields
                     -- a leading "+" means "defaults plus these"
                     "+":more -> Node.defaultFields ++ more
                     custom   -> custom
        sorted_nodes = sortBy (comparing Node.idx) (Container.elems nl)
        (header, isnum) = unzip (map Node.showHeader selected)
        rows = header:map (Node.list selected) sorted_nodes
    in unlines . map ((' ':) . intercalate " ") $ formatTable rows isnum

-- | Print the instance list.
printInsts :: Node.List -> Instance.List -> String
printInsts nl il =
    let sorted = sortBy (comparing Instance.idx) (Container.elems il)
        -- secondary node name, or empty for non-mirrored instances
        secName inst =
            let sdx = Instance.sNode inst
            in if sdx == Node.noSecondary
               then ""
               else Container.nameOf nl sdx
        -- one table row per instance
        row inst =
            let DynUtil lC lM lD lN = Instance.util inst
            in [ if Instance.running inst then "R" else " "
               , Instance.name inst
               , Container.nameOf nl (Instance.pNode inst)
               , secName inst
               , if Instance.autoBalance inst then "Y" else "N"
               , printf "%3d" $ Instance.vcpus inst
               , printf "%5d" $ Instance.mem inst
               , printf "%5d" $ Instance.dsk inst `div` 1024
               , printf "%5.3f" lC
               , printf "%5.3f" lM
               , printf "%5.3f" lD
               , printf "%5.3f" lN
               ]
        header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
                 , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
        -- first five columns are textual, the rest numeric
        isnum = False:False:False:False:False:repeat True
    in unlines . map ((' ':) . intercalate " ") $
       formatTable (header:map row sorted) isnum

-- | Shows statistics for a given node list.
printStats :: Node.List -> String
printStats nl =
    let dcvs = compDetailedCV (Container.elems nl)
        (weights, names) = unzip detailedCVInfo
        -- pad with defaults in case the metric list is longer than
        -- the known info
        padded = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
        render (w, header, val) =
            printf "%s=%.8f(x%.2f)" header val w::String
    in intercalate ", " (map render padded)

-- | Convert a placement into a list of OpCodes (basically a job).
iMoveToJob :: Node.List        -- ^ The node list; only used for node
                               -- names, so any version is good
                               -- (before or after the operation)
           -> Instance.List    -- ^ The instance list; also used for
                               -- names only
           -> Idx              -- ^ The index of the instance being
                               -- moved
           -> IMove            -- ^ The actual move to be described
           -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
                               -- the given move
iMoveToJob nl il idx move =
    let inst = Container.find idx il
        iname = Instance.name inst
        lookNode = Just . Container.nameOf nl
        -- the two opcode building bricks: a live migration, and a
        -- replace-disks changing the secondary to the given node
        migrate = OpCodes.OpInstanceMigrate iname True False True Nothing
        replaceDisks n =
            OpCodes.OpInstanceReplaceDisks iname (lookNode n)
            OpCodes.ReplaceNewSecondary [] Nothing
    in case move of
         Failover              -> [ migrate ]
         ReplacePrimary np     -> [ migrate, replaceDisks np, migrate ]
         ReplaceSecondary ns   -> [ replaceDisks ns ]
         ReplaceAndFailover np -> [ replaceDisks np, migrate ]
         FailoverAndReplace ns -> [ migrate, replaceDisks ns ]

-- * Node group functions

-- | Computes the group of an instance.
--
-- An instance is expected to live entirely within one node group; if
-- its primary and secondary nodes belong to different groups, the
-- placement is inconsistent and a failure is returned instead.
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
instanceGroup nl i =
  let sidx = Instance.sNode i
      pnode = Container.find (Instance.pNode i) nl
      -- an instance without a secondary trivially lives in the
      -- primary's group
      snode = if sidx == Node.noSecondary
              then pnode
              else Container.find sidx nl
      pgroup = Node.group pnode
      sgroup = Node.group snode
  in if pgroup /= sgroup
     -- fix spelling in the user-visible error ("accross" -> "across")
     then fail ("Instance placed across two node groups, primary " ++
                show pgroup ++ ", secondary " ++ show sgroup)
     else return pgroup

-- | Computes the group of an instance per the primary node.
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
instancePriGroup nl i = Node.group (Container.find (Instance.pNode i) nl)

-- | Compute the list of badly allocated instances (split across node
-- groups).
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
findSplitInstances nl il =
  filter (not . isOk . instanceGroup nl) (Container.elems il)

-- | Splits a cluster into the component node groups.
splitCluster :: Node.List -> Instance.List ->
                [(Gdx, (Node.List, Instance.List))]
splitCluster nl il =
  let buildGroup (guuid, nodes) =
        let nidxs = map Node.idx nodes
            gnodes = Container.fromList (zip nidxs nodes)
            -- instances belong to the group of their primary node
            ginsts = Container.filter ((`elem` nidxs) . Instance.pNode) il
        in (guuid, (gnodes, ginsts))
  in map buildGroup (Node.computeGroups (Container.elems nl))

-- | Compute the list of nodes that are to be evacuated, given a list
-- of instances and an evacuation mode.
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
                -> EvacMode      -- ^ The evacuation mode we're using
                -> [Idx]         -- ^ List of instance indices being evacuated
                -> IntSet.IntSet -- ^ Set of node indices
nodesToEvacuate il mode =
    -- 'noSecondary' can sneak in via non-mirrored instances; it is
    -- never a real node, so drop it from the final set
    IntSet.delete Node.noSecondary . foldl' addInstance IntSet.empty
    where addInstance ns idx =
              let i = Container.find idx il
                  pdx = Instance.pNode i
                  sdx = Instance.sNode i
                  -- only DRBD instances have a meaningful secondary
                  withSecondary = case Instance.diskTemplate i of
                                    DTDrbd8 -> IntSet.insert sdx ns
                                    _ -> ns
              in case mode of
                   ChangePrimary   -> IntSet.insert pdx ns
                   ChangeSecondary -> withSecondary
                   ChangeAll       -> IntSet.insert pdx withSecondary
