Statistics
| Branch: | Tag: | Revision:

root / htools / Ganeti / HTools / Cluster.hs @ 16670b57

History | View | Annotate | Download (60.9 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011, 2012 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
  (
31
    -- * Types
32
    AllocSolution(..)
33
  , EvacSolution(..)
34
  , Table(..)
35
  , CStats(..)
36
  , AllocStats
37
  , AllocResult
38
  , AllocMethod
39
  -- * Generic functions
40
  , totalResources
41
  , computeAllocationDelta
42
  -- * First phase functions
43
  , computeBadItems
44
  -- * Second phase functions
45
  , printSolutionLine
46
  , formatCmds
47
  , involvedNodes
48
  , splitJobs
49
  -- * Display functions
50
  , printNodes
51
  , printInsts
52
  -- * Balacing functions
53
  , checkMove
54
  , doNextBalance
55
  , tryBalance
56
  , compCV
57
  , compCVNodes
58
  , compDetailedCV
59
  , printStats
60
  , iMoveToJob
61
  -- * IAllocator functions
62
  , genAllocNodes
63
  , tryAlloc
64
  , tryMGAlloc
65
  , tryNodeEvac
66
  , tryChangeGroup
67
  , collapseFailures
68
  -- * Allocation functions
69
  , iterateAlloc
70
  , tieredAlloc
71
  -- * Node group functions
72
  , instanceGroup
73
  , findSplitInstances
74
  , splitCluster
75
  ) where
76

    
77
import qualified Data.IntSet as IntSet
78
import Data.List
79
import Data.Maybe (fromJust, isNothing)
80
import Data.Ord (comparing)
81
import Text.Printf (printf)
82

    
83
import qualified Ganeti.HTools.Container as Container
84
import qualified Ganeti.HTools.Instance as Instance
85
import qualified Ganeti.HTools.Node as Node
86
import qualified Ganeti.HTools.Group as Group
87
import Ganeti.HTools.Types
88
import Ganeti.HTools.Utils
89
import Ganeti.HTools.Compat
90
import qualified Ganeti.OpCodes as OpCodes
91

    
92
-- * Types
93

    
94
-- | Allocation\/relocation solution.
95
data AllocSolution = AllocSolution
96
  { asFailures :: [FailMode]              -- ^ Failure counts
97
  , asAllocs   :: Int                     -- ^ Good allocation count
98
  , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
99
  , asLog      :: [String]                -- ^ Informational messages
100
  }
101

    
102
-- | Node evacuation/group change iallocator result type. This result
103
-- type consists of actual opcodes (a restricted subset) that are
104
-- transmitted back to Ganeti.
105
data EvacSolution = EvacSolution
106
  { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
107
  , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
108
                                      -- relocated
109
  , esOpCodes :: [[OpCodes.OpCode]]   -- ^ List of jobs
110
  } deriving (Show)
111

    
112
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
113
type AllocResult = (FailStats, Node.List, Instance.List,
114
                    [Instance.Instance], [CStats])
115

    
116
-- | A type denoting the valid allocation mode/pairs.
117
--
118
-- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
119
-- for a two-node allocation, this will be a @Right [('Ndx',
120
-- ['Ndx'])]@. In the latter case, the list is basically an
121
-- association list, grouped by primary node and holding the potential
122
-- secondary nodes in the sub-list.
123
type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
124

    
125
-- | The empty solution we start with when computing allocations.
126
emptyAllocSolution :: AllocSolution
127
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
128
                                   , asSolution = Nothing, asLog = [] }
129

    
130
-- | The empty evac solution.
131
emptyEvacSolution :: EvacSolution
132
emptyEvacSolution = EvacSolution { esMoved = []
133
                                 , esFailed = []
134
                                 , esOpCodes = []
135
                                 }
136

    
137
-- | The complete state for the balancing solution.
138
data Table = Table Node.List Instance.List Score [Placement]
139
             deriving (Show, Read)
140

    
141
-- | Cluster statistics data type.
142
data CStats = CStats { csFmem :: Integer -- ^ Cluster free mem
143
                     , csFdsk :: Integer -- ^ Cluster free disk
144
                     , csAmem :: Integer -- ^ Cluster allocatable mem
145
                     , csAdsk :: Integer -- ^ Cluster allocatable disk
146
                     , csAcpu :: Integer -- ^ Cluster allocatable cpus
147
                     , csMmem :: Integer -- ^ Max node allocatable mem
148
                     , csMdsk :: Integer -- ^ Max node allocatable disk
149
                     , csMcpu :: Integer -- ^ Max node allocatable cpu
150
                     , csImem :: Integer -- ^ Instance used mem
151
                     , csIdsk :: Integer -- ^ Instance used disk
152
                     , csIcpu :: Integer -- ^ Instance used cpu
153
                     , csTmem :: Double  -- ^ Cluster total mem
154
                     , csTdsk :: Double  -- ^ Cluster total disk
155
                     , csTcpu :: Double  -- ^ Cluster total cpus
156
                     , csVcpu :: Integer -- ^ Cluster virtual cpus (if
157
                                         -- node pCpu has been set,
158
                                         -- otherwise -1)
159
                     , csXmem :: Integer -- ^ Unnacounted for mem
160
                     , csNmem :: Integer -- ^ Node own memory
161
                     , csScore :: Score  -- ^ The cluster score
162
                     , csNinst :: Int    -- ^ The total number of instances
163
                     }
164
            deriving (Show, Read)
165

    
166
-- | Currently used, possibly to allocate, unallocable.
167
type AllocStats = (RSpec, RSpec, RSpec)
168

    
169
-- | A simple type for allocation functions.
170
type AllocMethod =  Node.List           -- ^ Node list
171
                 -> Instance.List       -- ^ Instance list
172
                 -> Maybe Int           -- ^ Optional allocation limit
173
                 -> Instance.Instance   -- ^ Instance spec for allocation
174
                 -> AllocNodes          -- ^ Which nodes we should allocate on
175
                 -> [Instance.Instance] -- ^ Allocated instances
176
                 -> [CStats]            -- ^ Running cluster stats
177
                 -> Result AllocResult  -- ^ Allocation result
178

    
179
-- * Utility functions
180

    
181
-- | Verifies the N+1 status and return the affected nodes.
182
verifyN1 :: [Node.Node] -> [Node.Node]
183
verifyN1 = filter Node.failN1
184

    
185
{-| Computes the pair of bad nodes and instances.
186

    
187
The bad node list is computed via a simple 'verifyN1' check, and the
188
bad instance list is the list of primary and secondary instances of
189
those nodes.
190

    
191
-}
192
computeBadItems :: Node.List -> Instance.List ->
193
                   ([Node.Node], [Instance.Instance])
194
computeBadItems nl il =
195
  let bad_nodes = verifyN1 $ getOnline nl
196
      bad_instances = map (`Container.find` il) .
197
                      sort . nub $
198
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
199
  in
200
    (bad_nodes, bad_instances)
201

    
202
-- | Extracts the node pairs for an instance. This can fail if the
203
-- instance is single-homed. FIXME: this needs to be improved,
204
-- together with the general enhancement for handling non-DRBD moves.
205
instanceNodes :: Node.List -> Instance.Instance ->
206
                 (Ndx, Ndx, Node.Node, Node.Node)
207
instanceNodes nl inst =
208
  let old_pdx = Instance.pNode inst
209
      old_sdx = Instance.sNode inst
210
      old_p = Container.find old_pdx nl
211
      old_s = Container.find old_sdx nl
212
  in (old_pdx, old_sdx, old_p, old_s)
213

    
214
-- | Zero-initializer for the CStats type.
215
emptyCStats :: CStats
216
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
217

    
218
-- | Update stats with data from a new node.
219
updateCStats :: CStats -> Node.Node -> CStats
220
updateCStats cs node =
221
  let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
222
               csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
223
               csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
224
               csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
225
               csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
226
               csVcpu = x_vcpu,
227
               csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
228
             }
229
        = cs
230
      inc_amem = Node.fMem node - Node.rMem node
231
      inc_amem' = if inc_amem > 0 then inc_amem else 0
232
      inc_adsk = Node.availDisk node
233
      inc_imem = truncate (Node.tMem node) - Node.nMem node
234
                 - Node.xMem node - Node.fMem node
235
      inc_icpu = Node.uCpu node
236
      inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
237
      inc_vcpu = Node.hiCpu node
238
      inc_acpu = Node.availCpu node
239
  in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
240
        , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
241
        , csAmem = x_amem + fromIntegral inc_amem'
242
        , csAdsk = x_adsk + fromIntegral inc_adsk
243
        , csAcpu = x_acpu + fromIntegral inc_acpu
244
        , csMmem = max x_mmem (fromIntegral inc_amem')
245
        , csMdsk = max x_mdsk (fromIntegral inc_adsk)
246
        , csMcpu = max x_mcpu (fromIntegral inc_acpu)
247
        , csImem = x_imem + fromIntegral inc_imem
248
        , csIdsk = x_idsk + fromIntegral inc_idsk
249
        , csIcpu = x_icpu + fromIntegral inc_icpu
250
        , csTmem = x_tmem + Node.tMem node
251
        , csTdsk = x_tdsk + Node.tDsk node
252
        , csTcpu = x_tcpu + Node.tCpu node
253
        , csVcpu = x_vcpu + fromIntegral inc_vcpu
254
        , csXmem = x_xmem + fromIntegral (Node.xMem node)
255
        , csNmem = x_nmem + fromIntegral (Node.nMem node)
256
        , csNinst = x_ninst + length (Node.pList node)
257
        }
258

    
259
-- | Compute the total free disk and memory in the cluster.
260
totalResources :: Node.List -> CStats
261
totalResources nl =
262
  let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
263
  in cs { csScore = compCV nl }
264

    
265
-- | Compute the delta between two cluster state.
266
--
267
-- This is used when doing allocations, to understand better the
268
-- available cluster resources. The return value is a triple of the
269
-- current used values, the delta that was still allocated, and what
270
-- was left unallocated.
271
computeAllocationDelta :: CStats -> CStats -> AllocStats
272
computeAllocationDelta cini cfin =
273
  let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
274
      CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
275
              csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
276
      rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem)
277
             (fromIntegral i_idsk)
278
      rfin = RSpec (fromIntegral (f_icpu - i_icpu))
279
             (fromIntegral (f_imem - i_imem))
280
             (fromIntegral (f_idsk - i_idsk))
281
      un_cpu = fromIntegral (v_cpu - f_icpu)::Int
282
      runa = RSpec un_cpu (truncate t_mem - fromIntegral f_imem)
283
             (truncate t_dsk - fromIntegral f_idsk)
284
  in (rini, rfin, runa)
285

    
286
-- | The names and weights of the individual elements in the CV list.
287
detailedCVInfo :: [(Double, String)]
288
detailedCVInfo = [ (1,  "free_mem_cv")
289
                 , (1,  "free_disk_cv")
290
                 , (1,  "n1_cnt")
291
                 , (1,  "reserved_mem_cv")
292
                 , (4,  "offline_all_cnt")
293
                 , (16, "offline_pri_cnt")
294
                 , (1,  "vcpu_ratio_cv")
295
                 , (1,  "cpu_load_cv")
296
                 , (1,  "mem_load_cv")
297
                 , (1,  "disk_load_cv")
298
                 , (1,  "net_load_cv")
299
                 , (2,  "pri_tags_score")
300
                 ]
301

    
302
-- | Holds the weights used by 'compCVNodes' for each metric.
303
detailedCVWeights :: [Double]
304
detailedCVWeights = map fst detailedCVInfo
305

    
306
-- | Compute the mem and disk covariance.
307
compDetailedCV :: [Node.Node] -> [Double]
308
compDetailedCV all_nodes =
309
  let (offline, nodes) = partition Node.offline all_nodes
310
      mem_l = map Node.pMem nodes
311
      dsk_l = map Node.pDsk nodes
312
      -- metric: memory covariance
313
      mem_cv = stdDev mem_l
314
      -- metric: disk covariance
315
      dsk_cv = stdDev dsk_l
316
      -- metric: count of instances living on N1 failing nodes
317
      n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
318
                                                 length (Node.pList n)) .
319
                 filter Node.failN1 $ nodes :: Double
320
      res_l = map Node.pRem nodes
321
      -- metric: reserved memory covariance
322
      res_cv = stdDev res_l
323
      -- offline instances metrics
324
      offline_ipri = sum . map (length . Node.pList) $ offline
325
      offline_isec = sum . map (length . Node.sList) $ offline
326
      -- metric: count of instances on offline nodes
327
      off_score = fromIntegral (offline_ipri + offline_isec)::Double
328
      -- metric: count of primary instances on offline nodes (this
329
      -- helps with evacuation/failover of primary instances on
330
      -- 2-node clusters with one node offline)
331
      off_pri_score = fromIntegral offline_ipri::Double
332
      cpu_l = map Node.pCpu nodes
333
      -- metric: covariance of vcpu/pcpu ratio
334
      cpu_cv = stdDev cpu_l
335
      -- metrics: covariance of cpu, memory, disk and network load
336
      (c_load, m_load, d_load, n_load) =
337
        unzip4 $ map (\n ->
338
                      let DynUtil c1 m1 d1 n1 = Node.utilLoad n
339
                          DynUtil c2 m2 d2 n2 = Node.utilPool n
340
                      in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes
341
      -- metric: conflicting instance count
342
      pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
343
      pri_tags_score = fromIntegral pri_tags_inst::Double
344
  in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
345
     , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
346
     , pri_tags_score ]
347

    
348
-- | Compute the /total/ variance.
349
compCVNodes :: [Node.Node] -> Double
350
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
351

    
352
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
353
compCV :: Node.List -> Double
354
compCV = compCVNodes . Container.elems
355

    
356
-- | Compute online nodes from a 'Node.List'.
357
getOnline :: Node.List -> [Node.Node]
358
getOnline = filter (not . Node.offline) . Container.elems
359

    
360
-- * Balancing functions
361

    
362
-- | Compute best table. Note that the ordering of the arguments is important.
363
compareTables :: Table -> Table -> Table
364
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
365
  if a_cv > b_cv then b else a
366

    
367
-- | Applies an instance move to a given node list and instance.
368
applyMove :: Node.List -> Instance.Instance
369
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
370
-- Failover (f)
371
applyMove nl inst Failover =
372
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
373
      int_p = Node.removePri old_p inst
374
      int_s = Node.removeSec old_s inst
375
      new_nl = do -- Maybe monad
376
        new_p <- Node.addPriEx (Node.offline old_p) int_s inst
377
        new_s <- Node.addSec int_p inst old_sdx
378
        let new_inst = Instance.setBoth inst old_sdx old_pdx
379
        return (Container.addTwo old_pdx new_s old_sdx new_p nl,
380
                new_inst, old_sdx, old_pdx)
381
  in new_nl
382

    
383
-- Replace the primary (f:, r:np, f)
384
applyMove nl inst (ReplacePrimary new_pdx) =
385
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
386
      tgt_n = Container.find new_pdx nl
387
      int_p = Node.removePri old_p inst
388
      int_s = Node.removeSec old_s inst
389
      force_p = Node.offline old_p
390
      new_nl = do -- Maybe monad
391
                  -- check that the current secondary can host the instance
392
                  -- during the migration
393
        tmp_s <- Node.addPriEx force_p int_s inst
394
        let tmp_s' = Node.removePri tmp_s inst
395
        new_p <- Node.addPriEx force_p tgt_n inst
396
        new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
397
        let new_inst = Instance.setPri inst new_pdx
398
        return (Container.add new_pdx new_p $
399
                Container.addTwo old_pdx int_p old_sdx new_s nl,
400
                new_inst, new_pdx, old_sdx)
401
  in new_nl
402

    
403
-- Replace the secondary (r:ns)
404
applyMove nl inst (ReplaceSecondary new_sdx) =
405
  let old_pdx = Instance.pNode inst
406
      old_sdx = Instance.sNode inst
407
      old_s = Container.find old_sdx nl
408
      tgt_n = Container.find new_sdx nl
409
      int_s = Node.removeSec old_s inst
410
      force_s = Node.offline old_s
411
      new_inst = Instance.setSec inst new_sdx
412
      new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
413
               \new_s -> return (Container.addTwo new_sdx
414
                                 new_s old_sdx int_s nl,
415
                                 new_inst, old_pdx, new_sdx)
416
  in new_nl
417

    
418
-- Replace the secondary and failover (r:np, f)
419
applyMove nl inst (ReplaceAndFailover new_pdx) =
420
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
421
      tgt_n = Container.find new_pdx nl
422
      int_p = Node.removePri old_p inst
423
      int_s = Node.removeSec old_s inst
424
      force_s = Node.offline old_s
425
      new_nl = do -- Maybe monad
426
        new_p <- Node.addPri tgt_n inst
427
        new_s <- Node.addSecEx force_s int_p inst new_pdx
428
        let new_inst = Instance.setBoth inst new_pdx old_pdx
429
        return (Container.add new_pdx new_p $
430
                Container.addTwo old_pdx new_s old_sdx int_s nl,
431
                new_inst, new_pdx, old_pdx)
432
  in new_nl
433

    
434
-- Failver and replace the secondary (f, r:ns)
435
applyMove nl inst (FailoverAndReplace new_sdx) =
436
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
437
      tgt_n = Container.find new_sdx nl
438
      int_p = Node.removePri old_p inst
439
      int_s = Node.removeSec old_s inst
440
      force_p = Node.offline old_p
441
      new_nl = do -- Maybe monad
442
        new_p <- Node.addPriEx force_p int_s inst
443
        new_s <- Node.addSecEx force_p tgt_n inst old_sdx
444
        let new_inst = Instance.setBoth inst old_sdx new_sdx
445
        return (Container.add new_sdx new_s $
446
                Container.addTwo old_sdx new_p old_pdx int_p nl,
447
                new_inst, old_sdx, new_sdx)
448
  in new_nl
449

    
450
-- | Tries to allocate an instance on one given node.
451
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
452
                 -> OpResult Node.AllocElement
453
allocateOnSingle nl inst new_pdx =
454
  let p = Container.find new_pdx nl
455
      new_inst = Instance.setBoth inst new_pdx Node.noSecondary
456
  in do
457
    Instance.instMatchesPolicy inst (Node.iPolicy p)
458
    new_p <- Node.addPri p inst
459
    let new_nl = Container.add new_pdx new_p nl
460
        new_score = compCV nl
461
    return (new_nl, new_inst, [new_p], new_score)
462

    
463
-- | Tries to allocate an instance on a given pair of nodes.
464
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
465
               -> OpResult Node.AllocElement
466
allocateOnPair nl inst new_pdx new_sdx =
467
  let tgt_p = Container.find new_pdx nl
468
      tgt_s = Container.find new_sdx nl
469
  in do
470
    Instance.instMatchesPolicy inst (Node.iPolicy tgt_p)
471
    new_p <- Node.addPri tgt_p inst
472
    new_s <- Node.addSec tgt_s inst new_pdx
473
    let new_inst = Instance.setBoth inst new_pdx new_sdx
474
        new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
475
    return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
476

    
477
-- | Tries to perform an instance move and returns the best table
478
-- between the original one and the new one.
479
checkSingleStep :: Table -- ^ The original table
480
                -> Instance.Instance -- ^ The instance to move
481
                -> Table -- ^ The current best table
482
                -> IMove -- ^ The move to apply
483
                -> Table -- ^ The final best table
484
checkSingleStep ini_tbl target cur_tbl move =
485
  let Table ini_nl ini_il _ ini_plc = ini_tbl
486
      tmp_resu = applyMove ini_nl target move
487
  in case tmp_resu of
488
       OpFail _ -> cur_tbl
489
       OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
490
         let tgt_idx = Instance.idx target
491
             upd_cvar = compCV upd_nl
492
             upd_il = Container.add tgt_idx new_inst ini_il
493
             upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
494
             upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
495
         in compareTables cur_tbl upd_tbl
496

    
497
-- | Given the status of the current secondary as a valid new node and
498
-- the current candidate target node, generate the possible moves for
499
-- a instance.
500
possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
501
              -> Bool      -- ^ Whether we can change the primary node
502
              -> Ndx       -- ^ Target node candidate
503
              -> [IMove]   -- ^ List of valid result moves
504

    
505
possibleMoves _ False tdx =
506
  [ReplaceSecondary tdx]
507

    
508
possibleMoves True True tdx =
509
  [ ReplaceSecondary tdx
510
  , ReplaceAndFailover tdx
511
  , ReplacePrimary tdx
512
  , FailoverAndReplace tdx
513
  ]
514

    
515
possibleMoves False True tdx =
516
  [ ReplaceSecondary tdx
517
  , ReplaceAndFailover tdx
518
  ]
519

    
520
-- | Compute the best move for a given instance.
521
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
522
                  -> Bool              -- ^ Whether disk moves are allowed
523
                  -> Bool              -- ^ Whether instance moves are allowed
524
                  -> Table             -- ^ Original table
525
                  -> Instance.Instance -- ^ Instance to move
526
                  -> Table             -- ^ Best new table for this instance
527
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
528
  let opdx = Instance.pNode target
529
      osdx = Instance.sNode target
530
      bad_nodes = [opdx, osdx]
531
      nodes = filter (`notElem` bad_nodes) nodes_idx
532
      use_secondary = elem osdx nodes_idx && inst_moves
533
      aft_failover = if use_secondary -- if allowed to failover
534
                       then checkSingleStep ini_tbl target ini_tbl Failover
535
                       else ini_tbl
536
      all_moves = if disk_moves
537
                    then concatMap
538
                           (possibleMoves use_secondary inst_moves) nodes
539
                    else []
540
    in
541
      -- iterate over the possible nodes for this instance
542
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
543

    
544
-- | Compute the best next move.
545
checkMove :: [Ndx]               -- ^ Allowed target node indices
546
          -> Bool                -- ^ Whether disk moves are allowed
547
          -> Bool                -- ^ Whether instance moves are allowed
548
          -> Table               -- ^ The current solution
549
          -> [Instance.Instance] -- ^ List of instances still to move
550
          -> Table               -- ^ The new solution
551
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
552
  let Table _ _ _ ini_plc = ini_tbl
553
      -- we're using rwhnf from the Control.Parallel.Strategies
554
      -- package; we don't need to use rnf as that would force too
555
      -- much evaluation in single-threaded cases, and in
556
      -- multi-threaded case the weak head normal form is enough to
557
      -- spark the evaluation
558
      tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
559
                             inst_moves ini_tbl)
560
               victims
561
      -- iterate over all instances, computing the best move
562
      best_tbl = foldl' compareTables ini_tbl tables
563
      Table _ _ _ best_plc = best_tbl
564
  in if length best_plc == length ini_plc
565
       then ini_tbl -- no advancement
566
       else best_tbl
567

    
568
-- | Check if we are allowed to go deeper in the balancing.
569
doNextBalance :: Table     -- ^ The starting table
570
              -> Int       -- ^ Remaining length
571
              -> Score     -- ^ Score at which to stop
572
              -> Bool      -- ^ The resulting table and commands
573
doNextBalance ini_tbl max_rounds min_score =
574
  let Table _ _ ini_cv ini_plc = ini_tbl
575
      ini_plc_len = length ini_plc
576
  in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
577

    
578
-- | Run a balance move.
579
tryBalance :: Table       -- ^ The starting table
580
           -> Bool        -- ^ Allow disk moves
581
           -> Bool        -- ^ Allow instance moves
582
           -> Bool        -- ^ Only evacuate moves
583
           -> Score       -- ^ Min gain threshold
584
           -> Score       -- ^ Min gain
585
           -> Maybe Table -- ^ The resulting table and commands
586
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
587
    let Table ini_nl ini_il ini_cv _ = ini_tbl
588
        all_inst = Container.elems ini_il
589
        all_inst' = if evac_mode
590
                    then let bad_nodes = map Node.idx . filter Node.offline $
591
                                         Container.elems ini_nl
592
                         in filter (any (`elem` bad_nodes) . Instance.allNodes)
593
                            all_inst
594
                    else all_inst
595
        reloc_inst = filter Instance.movable all_inst'
596
        node_idx = map Node.idx . filter (not . Node.offline) $
597
                   Container.elems ini_nl
598
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
599
        (Table _ _ fin_cv _) = fin_tbl
600
    in
601
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
602
      then Just fin_tbl -- this round made success, return the new table
603
      else Nothing
604

    
605
-- * Allocation functions
606

    
607
-- | Build failure stats out of a list of failures.
608
collapseFailures :: [FailMode] -> FailStats
609
collapseFailures flst =
610
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
611
            [minBound..maxBound]
612

    
613
-- | Compares two Maybe AllocElement and chooses the besst score.
614
bestAllocElement :: Maybe Node.AllocElement
615
                 -> Maybe Node.AllocElement
616
                 -> Maybe Node.AllocElement
617
bestAllocElement a Nothing = a
618
bestAllocElement Nothing b = b
619
bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
620
  if ascore < bscore then a else b
621

    
622
-- | Update current Allocation solution and failure stats with new
623
-- elements.
624
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
625
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
626

    
627
concatAllocs as (OpGood ns) =
628
  let -- Choose the old or new solution, based on the cluster score
629
    cntok = asAllocs as
630
    osols = asSolution as
631
    nsols = bestAllocElement osols (Just ns)
632
    nsuc = cntok + 1
633
    -- Note: we force evaluation of nsols here in order to keep the
634
    -- memory profile low - we know that we will need nsols for sure
635
    -- in the next cycle, so we force evaluation of nsols, since the
636
    -- foldl' in the caller will only evaluate the tuple, but not the
637
    -- elements of the tuple
638
  in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
639

    
640
-- | Sums two 'AllocSolution' structures.
641
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
642
sumAllocs (AllocSolution aFails aAllocs aSols aLog)
643
          (AllocSolution bFails bAllocs bSols bLog) =
644
  -- note: we add b first, since usually it will be smaller; when
645
  -- fold'ing, a will grow and grow whereas b is the per-group
646
  -- result, hence smaller
647
  let nFails  = bFails ++ aFails
648
      nAllocs = aAllocs + bAllocs
649
      nSols   = bestAllocElement aSols bSols
650
      nLog    = bLog ++ aLog
651
  in AllocSolution nFails nAllocs nSols nLog
652

    
653
-- | Given a solution, generates a reasonable description for it.
654
describeSolution :: AllocSolution -> String
655
describeSolution as =
656
  let fcnt = asFailures as
657
      sols = asSolution as
658
      freasons =
659
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
660
        filter ((> 0) . snd) . collapseFailures $ fcnt
661
  in case sols of
662
     Nothing -> "No valid allocation solutions, failure reasons: " ++
663
                (if null fcnt then "unknown reasons" else freasons)
664
     Just (_, _, nodes, cv) ->
665
         printf ("score: %.8f, successes %d, failures %d (%s)" ++
666
                 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
667
               (intercalate "/" . map Node.name $ nodes)
668

    
669
-- | Annotates a solution with the appropriate string.
670
annotateSolution :: AllocSolution -> AllocSolution
671
annotateSolution as = as { asLog = describeSolution as : asLog as }
672

    
673
-- | Reverses an evacuation solution.
674
--
675
-- Rationale: we always concat the results to the top of the lists, so
676
-- for proper jobset execution, we should reverse all lists.
677
reverseEvacSolution :: EvacSolution -> EvacSolution
678
reverseEvacSolution (EvacSolution f m o) =
679
  EvacSolution (reverse f) (reverse m) (reverse o)
680

    
681
-- | Generate the valid node allocation singles or pairs for a new instance.
682
genAllocNodes :: Group.List        -- ^ Group list
683
              -> Node.List         -- ^ The node map
684
              -> Int               -- ^ The number of nodes required
685
              -> Bool              -- ^ Whether to drop or not
686
                                   -- unallocable nodes
687
              -> Result AllocNodes -- ^ The (monadic) result
688
genAllocNodes gl nl count drop_unalloc =
689
  let filter_fn = if drop_unalloc
690
                    then filter (Group.isAllocable .
691
                                 flip Container.find gl . Node.group)
692
                    else id
693
      all_nodes = filter_fn $ getOnline nl
694
      all_pairs = [(Node.idx p,
695
                    [Node.idx s | s <- all_nodes,
696
                                       Node.idx p /= Node.idx s,
697
                                       Node.group p == Node.group s]) |
698
                   p <- all_nodes]
699
  in case count of
700
       1 -> Ok (Left (map Node.idx all_nodes))
701
       2 -> Ok (Right (filter (not . null . snd) all_pairs))
702
       _ -> Bad "Unsupported number of nodes, only one or two  supported"
703

    
704
-- | Try to allocate an instance on the cluster.
705
tryAlloc :: (Monad m) =>
706
            Node.List         -- ^ The node list
707
         -> Instance.List     -- ^ The instance list
708
         -> Instance.Instance -- ^ The instance to allocate
709
         -> AllocNodes        -- ^ The allocation targets
710
         -> m AllocSolution   -- ^ Possible solution list
711
tryAlloc _  _ _    (Right []) = fail "Not enough online nodes"
712
tryAlloc nl _ inst (Right ok_pairs) =
713
  let psols = parMap rwhnf (\(p, ss) ->
714
                              foldl' (\cstate ->
715
                                        concatAllocs cstate .
716
                                        allocateOnPair nl inst p)
717
                              emptyAllocSolution ss) ok_pairs
718
      sols = foldl' sumAllocs emptyAllocSolution psols
719
  in return $ annotateSolution sols
720

    
721
tryAlloc _  _ _    (Left []) = fail "No online nodes"
722
tryAlloc nl _ inst (Left all_nodes) =
723
  let sols = foldl' (\cstate ->
724
                       concatAllocs cstate . allocateOnSingle nl inst
725
                    ) emptyAllocSolution all_nodes
726
  in return $ annotateSolution sols
727

    
728
-- | Given a group/result, describe it as a nice (list of) messages.
729
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
730
solutionDescription gl (groupId, result) =
731
  case result of
732
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
733
    Bad message -> [printf "Group %s: error %s" gname message]
734
  where grp = Container.find groupId gl
735
        gname = Group.name grp
736
        pol = allocPolicyToRaw (Group.allocPolicy grp)
737

    
738
-- | From a list of possibly bad and possibly empty solutions, filter
739
-- only the groups with a valid result. Note that the result will be
740
-- reversed compared to the original list.
741
filterMGResults :: Group.List
742
                -> [(Gdx, Result AllocSolution)]
743
                -> [(Gdx, AllocSolution)]
744
filterMGResults gl = foldl' fn []
745
  where unallocable = not . Group.isAllocable . flip Container.find gl
746
        fn accu (gdx, rasol) =
747
          case rasol of
748
            Bad _ -> accu
749
            Ok sol | isNothing (asSolution sol) -> accu
750
                   | unallocable gdx -> accu
751
                   | otherwise -> (gdx, sol):accu
752

    
753
-- | Sort multigroup results based on policy and score.
754
sortMGResults :: Group.List
755
             -> [(Gdx, AllocSolution)]
756
             -> [(Gdx, AllocSolution)]
757
sortMGResults gl sols =
758
  let extractScore (_, _, _, x) = x
759
      solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
760
                             (extractScore . fromJust . asSolution) sol)
761
  in sortBy (comparing solScore) sols
762

    
763
-- | Finds the best group for an instance on a multi-group cluster.
764
--
765
-- Only solutions in @preferred@ and @last_resort@ groups will be
766
-- accepted as valid, and additionally if the allowed groups parameter
767
-- is not null then allocation will only be run for those group
768
-- indices.
769
findBestAllocGroup :: Group.List           -- ^ The group list
770
                   -> Node.List            -- ^ The node list
771
                   -> Instance.List        -- ^ The instance list
772
                   -> Maybe [Gdx]          -- ^ The allowed groups
773
                   -> Instance.Instance    -- ^ The instance to allocate
774
                   -> Int                  -- ^ Required number of nodes
775
                   -> Result (Gdx, AllocSolution, [String])
776
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
777
  let groups = splitCluster mgnl mgil
778
      groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
779
                allowed_gdxs
780
      sols = map (\(gid, (nl, il)) ->
781
                   (gid, genAllocNodes mggl nl cnt False >>=
782
                       tryAlloc nl il inst))
783
             groups'::[(Gdx, Result AllocSolution)]
784
      all_msgs = concatMap (solutionDescription mggl) sols
785
      goodSols = filterMGResults mggl sols
786
      sortedSols = sortMGResults mggl goodSols
787
  in if null sortedSols
788
       then if null groups'
789
              then Bad $ "no groups for evacuation: allowed groups was" ++
790
                     show allowed_gdxs ++ ", all groups: " ++
791
                     show (map fst groups)
792
              else Bad $ intercalate ", " all_msgs
793
       else let (final_group, final_sol) = head sortedSols
794
            in return (final_group, final_sol, all_msgs)
795

    
796
-- | Try to allocate an instance on a multi-group cluster.
797
tryMGAlloc :: Group.List           -- ^ The group list
798
           -> Node.List            -- ^ The node list
799
           -> Instance.List        -- ^ The instance list
800
           -> Instance.Instance    -- ^ The instance to allocate
801
           -> Int                  -- ^ Required number of nodes
802
           -> Result AllocSolution -- ^ Possible solution list
803
tryMGAlloc mggl mgnl mgil inst cnt = do
804
  (best_group, solution, all_msgs) <-
805
      findBestAllocGroup mggl mgnl mgil Nothing inst cnt
806
  let group_name = Group.name $ Container.find best_group mggl
807
      selmsg = "Selected group: " ++ group_name
808
  return $ solution { asLog = selmsg:all_msgs }
809

    
810
-- | Function which fails if the requested mode is change secondary.
811
--
812
-- This is useful since except DRBD, no other disk template can
813
-- execute change secondary; thus, we can just call this function
814
-- instead of always checking for secondary mode. After the call to
815
-- this function, whatever mode we have is just a primary change.
816
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
817
failOnSecondaryChange ChangeSecondary dt =
818
  fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++
819
         "' can't execute change secondary"
820
failOnSecondaryChange _ _ = return ()
821

    
822
-- | Run evacuation for a single instance.
823
--
824
-- /Note:/ this function should correctly execute both intra-group
825
-- evacuations (in all modes) and inter-group evacuations (in the
826
-- 'ChangeAll' mode). Of course, this requires that the correct list
827
-- of target nodes is passed.
828
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
829
                 -> Instance.List     -- ^ Instance list (cluster-wide)
830
                 -> EvacMode          -- ^ The evacuation mode
831
                 -> Instance.Instance -- ^ The instance to be evacuated
832
                 -> Gdx               -- ^ The group we're targetting
833
                 -> [Ndx]             -- ^ The list of available nodes
834
                                      -- for allocation
835
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
836
nodeEvacInstance _ _ mode (Instance.Instance
837
                           {Instance.diskTemplate = dt@DTDiskless}) _ _ =
838
                  failOnSecondaryChange mode dt >>
839
                  fail "Diskless relocations not implemented yet"
840

    
841
nodeEvacInstance _ _ _ (Instance.Instance
842
                        {Instance.diskTemplate = DTPlain}) _ _ =
843
                  fail "Instances of type plain cannot be relocated"
844

    
845
nodeEvacInstance _ _ _ (Instance.Instance
846
                        {Instance.diskTemplate = DTFile}) _ _ =
847
                  fail "Instances of type file cannot be relocated"
848

    
849
nodeEvacInstance _ _ mode  (Instance.Instance
850
                            {Instance.diskTemplate = dt@DTSharedFile}) _ _ =
851
                  failOnSecondaryChange mode dt >>
852
                  fail "Shared file relocations not implemented yet"
853

    
854
nodeEvacInstance _ _ mode (Instance.Instance
855
                           {Instance.diskTemplate = dt@DTBlock}) _ _ =
856
                  failOnSecondaryChange mode dt >>
857
                  fail "Block device relocations not implemented yet"
858

    
859
nodeEvacInstance nl il ChangePrimary
860
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
861
                 _ _ =
862
  do
863
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
864
    let idx = Instance.idx inst
865
        il' = Container.add idx inst' il
866
        ops = iMoveToJob nl' il' idx Failover
867
    return (nl', il', ops)
868

    
869
nodeEvacInstance nl il ChangeSecondary
870
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
871
                 gdx avail_nodes =
872
  do
873
    (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
874
                            eitherToResult $
875
                            foldl' (evacDrbdSecondaryInner nl inst gdx)
876
                            (Left "no nodes available") avail_nodes
877
    let idx = Instance.idx inst
878
        il' = Container.add idx inst' il
879
        ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx)
880
    return (nl', il', ops)
881

    
882
-- The algorithm for ChangeAll is as follows:
883
--
884
-- * generate all (primary, secondary) node pairs for the target groups
885
-- * for each pair, execute the needed moves (r:s, f, r:s) and compute
886
--   the final node list state and group score
887
-- * select the best choice via a foldl that uses the same Either
888
--   String solution as the ChangeSecondary mode
889
nodeEvacInstance nl il ChangeAll
890
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
891
                 gdx avail_nodes =
892
  do
893
    let no_nodes = Left "no nodes available"
894
        node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
895
    (nl', il', ops, _) <-
896
        annotateResult "Can't find any good nodes for relocation" $
897
        eitherToResult $
898
        foldl'
899
        (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
900
                          Bad msg ->
901
                              case accu of
902
                                Right _ -> accu
903
                                -- we don't need more details (which
904
                                -- nodes, etc.) as we only selected
905
                                -- this group if we can allocate on
906
                                -- it, hence failures will not
907
                                -- propagate out of this fold loop
908
                                Left _ -> Left $ "Allocation failed: " ++ msg
909
                          Ok result@(_, _, _, new_cv) ->
910
                              let new_accu = Right result in
911
                              case accu of
912
                                Left _ -> new_accu
913
                                Right (_, _, _, old_cv) ->
914
                                    if old_cv < new_cv
915
                                    then accu
916
                                    else new_accu
917
        ) no_nodes node_pairs
918

    
919
    return (nl', il', ops)
920

    
921
-- | Inner fold function for changing secondary of a DRBD instance.
922
--
923
-- The running solution is either a @Left String@, which means we
924
-- don't have yet a working solution, or a @Right (...)@, which
925
-- represents a valid solution; it holds the modified node list, the
926
-- modified instance (after evacuation), the score of that solution,
927
-- and the new secondary node index.
928
evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list
929
                       -> Instance.Instance -- ^ Instance being evacuated
930
                       -> Gdx -- ^ The group index of the instance
931
                       -> Either String ( Node.List
932
                                        , Instance.Instance
933
                                        , Score
934
                                        , Ndx)  -- ^ Current best solution
935
                       -> Ndx  -- ^ Node we're evaluating as new secondary
936
                       -> Either String ( Node.List
937
                                        , Instance.Instance
938
                                        , Score
939
                                        , Ndx) -- ^ New best solution
940
evacDrbdSecondaryInner nl inst gdx accu ndx =
941
  case applyMove nl inst (ReplaceSecondary ndx) of
942
    OpFail fm ->
943
      case accu of
944
        Right _ -> accu
945
        Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
946
                  " failed: " ++ show fm
947
    OpGood (nl', inst', _, _) ->
948
      let nodes = Container.elems nl'
949
          -- The fromJust below is ugly (it can fail nastily), but
950
          -- at this point we should have any internal mismatches,
951
          -- and adding a monad here would be quite involved
952
          grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
953
          new_cv = compCVNodes grpnodes
954
          new_accu = Right (nl', inst', new_cv, ndx)
955
      in case accu of
956
           Left _ -> new_accu
957
           Right (_, _, old_cv, _) ->
958
             if old_cv < new_cv
959
               then accu
960
               else new_accu
961

    
962
-- | Compute result of changing all nodes of a DRBD instance.
963
--
964
-- Given the target primary and secondary node (which might be in a
965
-- different group or not), this function will 'execute' all the
966
-- required steps and assuming all operations succceed, will return
967
-- the modified node and instance lists, the opcodes needed for this
968
-- and the new group score.
969
evacDrbdAllInner :: Node.List         -- ^ Cluster node list
970
                 -> Instance.List     -- ^ Cluster instance list
971
                 -> Instance.Instance -- ^ The instance to be moved
972
                 -> Gdx               -- ^ The target group index
973
                                      -- (which can differ from the
974
                                      -- current group of the
975
                                      -- instance)
976
                 -> (Ndx, Ndx)        -- ^ Tuple of new
977
                                      -- primary\/secondary nodes
978
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
979
evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
980
  let primary = Container.find (Instance.pNode inst) nl
981
      idx = Instance.idx inst
982
  -- if the primary is offline, then we first failover
983
  (nl1, inst1, ops1) <-
984
    if Node.offline primary
985
      then do
986
        (nl', inst', _, _) <-
987
          annotateResult "Failing over to the secondary" $
988
          opToResult $ applyMove nl inst Failover
989
        return (nl', inst', [Failover])
990
      else return (nl, inst, [])
991
  let (o1, o2, o3) = (ReplaceSecondary t_pdx,
992
                      Failover,
993
                      ReplaceSecondary t_sdx)
994
  -- we now need to execute a replace secondary to the future
995
  -- primary node
996
  (nl2, inst2, _, _) <-
997
    annotateResult "Changing secondary to new primary" $
998
    opToResult $
999
    applyMove nl1 inst1 o1
1000
  let ops2 = o1:ops1
1001
  -- we now execute another failover, the primary stays fixed now
1002
  (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
1003
                        opToResult $ applyMove nl2 inst2 o2
1004
  let ops3 = o2:ops2
1005
  -- and finally another replace secondary, to the final secondary
1006
  (nl4, inst4, _, _) <-
1007
    annotateResult "Changing secondary to final secondary" $
1008
    opToResult $
1009
    applyMove nl3 inst3 o3
1010
  let ops4 = o3:ops3
1011
      il' = Container.add idx inst4 il
1012
      ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1013
  let nodes = Container.elems nl4
1014
      -- The fromJust below is ugly (it can fail nastily), but
1015
      -- at this point we should have any internal mismatches,
1016
      -- and adding a monad here would be quite involved
1017
      grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1018
      new_cv = compCVNodes grpnodes
1019
  return (nl4, il', ops, new_cv)
1020

    
1021
-- | Computes the nodes in a given group which are available for
1022
-- allocation.
1023
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1024
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
1025
                    -> Gdx            -- ^ The group for which we
1026
                                      -- query the nodes
1027
                    -> Result [Ndx]   -- ^ List of available node indices
1028
availableGroupNodes group_nodes excl_ndx gdx = do
1029
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1030
                 Ok (lookup gdx group_nodes)
1031
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1032
  return avail_nodes
1033

    
1034
-- | Updates the evac solution with the results of an instance
1035
-- evacuation.
1036
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1037
                   -> Idx
1038
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1039
                   -> (Node.List, Instance.List, EvacSolution)
1040
updateEvacSolution (nl, il, es) idx (Bad msg) =
1041
  (nl, il, es { esFailed = (idx, msg):esFailed es})
1042
updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1043
  (nl, il, es { esMoved = new_elem:esMoved es
1044
              , esOpCodes = opcodes:esOpCodes es })
1045
    where inst = Container.find idx il
1046
          new_elem = (idx,
1047
                      instancePriGroup nl inst,
1048
                      Instance.allNodes inst)
1049

    
1050
-- | Node-evacuation IAllocator mode main function.
1051
tryNodeEvac :: Group.List    -- ^ The cluster groups
1052
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
1053
            -> Instance.List -- ^ Instance list (cluster-wide)
1054
            -> EvacMode      -- ^ The evacuation mode
1055
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
1056
            -> Result (Node.List, Instance.List, EvacSolution)
1057
tryNodeEvac _ ini_nl ini_il mode idxs =
1058
  let evac_ndx = nodesToEvacuate ini_il mode idxs
1059
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1060
      excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1061
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1062
                                           (Container.elems nl))) $
1063
                  splitCluster ini_nl ini_il
1064
      (fin_nl, fin_il, esol) =
1065
        foldl' (\state@(nl, il, _) inst ->
1066
                  let gdx = instancePriGroup nl inst
1067
                      pdx = Instance.pNode inst in
1068
                  updateEvacSolution state (Instance.idx inst) $
1069
                  availableGroupNodes group_ndx
1070
                    (IntSet.insert pdx excl_ndx) gdx >>=
1071
                      nodeEvacInstance nl il mode inst gdx
1072
               )
1073
        (ini_nl, ini_il, emptyEvacSolution)
1074
        (map (`Container.find` ini_il) idxs)
1075
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1076

    
1077
-- | Change-group IAllocator mode main function.
1078
--
1079
-- This is very similar to 'tryNodeEvac', the only difference is that
1080
-- we don't choose as target group the current instance group, but
1081
-- instead:
1082
--
1083
--   1. at the start of the function, we compute which are the target
1084
--   groups; either no groups were passed in, in which case we choose
1085
--   all groups out of which we don't evacuate instance, or there were
1086
--   some groups passed, in which case we use those
1087
--
1088
--   2. for each instance, we use 'findBestAllocGroup' to choose the
1089
--   best group to hold the instance, and then we do what
1090
--   'tryNodeEvac' does, except for this group instead of the current
1091
--   instance group.
1092
--
1093
-- Note that the correct behaviour of this function relies on the
1094
-- function 'nodeEvacInstance' to be able to do correctly both
1095
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1096
tryChangeGroup :: Group.List    -- ^ The cluster groups
1097
               -> Node.List     -- ^ The node list (cluster-wide)
1098
               -> Instance.List -- ^ Instance list (cluster-wide)
1099
               -> [Gdx]         -- ^ Target groups; if empty, any
1100
                                -- groups not being evacuated
1101
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
1102
               -> Result (Node.List, Instance.List, EvacSolution)
1103
tryChangeGroup gl ini_nl ini_il gdxs idxs =
1104
  let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1105
                             flip Container.find ini_il) idxs
1106
      target_gdxs = (if null gdxs
1107
                       then Container.keys gl
1108
                       else gdxs) \\ evac_gdxs
1109
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1110
      excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1111
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1112
                                           (Container.elems nl))) $
1113
                  splitCluster ini_nl ini_il
1114
      (fin_nl, fin_il, esol) =
1115
        foldl' (\state@(nl, il, _) inst ->
1116
                  let solution = do
1117
                        let ncnt = Instance.requiredNodes $
1118
                                   Instance.diskTemplate inst
1119
                        (gdx, _, _) <- findBestAllocGroup gl nl il
1120
                                       (Just target_gdxs) inst ncnt
1121
                        av_nodes <- availableGroupNodes group_ndx
1122
                                    excl_ndx gdx
1123
                        nodeEvacInstance nl il ChangeAll inst gdx av_nodes
1124
                  in updateEvacSolution state (Instance.idx inst) solution
1125
               )
1126
        (ini_nl, ini_il, emptyEvacSolution)
1127
        (map (`Container.find` ini_il) idxs)
1128
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1129

    
1130
-- | Standard-sized allocation method.
1131
--
1132
-- This places instances of the same size on the cluster until we're
1133
-- out of space. The result will be a list of identically-sized
1134
-- instances.
1135
iterateAlloc :: AllocMethod
1136
iterateAlloc nl il limit newinst allocnodes ixes cstats =
1137
  let depth = length ixes
1138
      newname = printf "new-%d" depth::String
1139
      newidx = Container.size il
1140
      newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1141
      newlimit = fmap (flip (-) 1) limit
1142
  in case tryAlloc nl il newi2 allocnodes of
1143
       Bad s -> Bad s
1144
       Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
1145
         let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1146
         case sols3 of
1147
           Nothing -> newsol
1148
           Just (xnl, xi, _, _) ->
1149
             if limit == Just 0
1150
               then newsol
1151
               else iterateAlloc xnl (Container.add newidx xi il)
1152
                      newlimit newinst allocnodes (xi:ixes)
1153
                      (totalResources xnl:cstats)
1154

    
1155
-- | Tiered allocation method.
1156
--
1157
-- This places instances on the cluster, and decreases the spec until
1158
-- we can allocate again. The result will be a list of decreasing
1159
-- instance specs.
1160
tieredAlloc :: AllocMethod
1161
tieredAlloc nl il limit newinst allocnodes ixes cstats =
1162
  case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1163
    Bad s -> Bad s
1164
    Ok (errs, nl', il', ixes', cstats') ->
1165
      let newsol = Ok (errs, nl', il', ixes', cstats')
1166
          ixes_cnt = length ixes'
1167
          (stop, newlimit) = case limit of
1168
                               Nothing -> (False, Nothing)
1169
                               Just n -> (n <= ixes_cnt,
1170
                                            Just (n - ixes_cnt)) in
1171
      if stop then newsol else
1172
          case Instance.shrinkByType newinst . fst . last $
1173
               sortBy (comparing snd) errs of
1174
            Bad _ -> newsol
1175
            Ok newinst' -> tieredAlloc nl' il' newlimit
1176
                           newinst' allocnodes ixes' cstats'
1177

    
1178
-- * Formatting functions
1179

    
1180
-- | Given the original and final nodes, computes the relocation description.
1181
computeMoves :: Instance.Instance -- ^ The instance to be moved
1182
             -> String -- ^ The instance name
1183
             -> IMove  -- ^ The move being performed
1184
             -> String -- ^ New primary
1185
             -> String -- ^ New secondary
1186
             -> (String, [String])
1187
                -- ^ Tuple of moves and commands list; moves is containing
1188
                -- either @/f/@ for failover or @/r:name/@ for replace
1189
                -- secondary, while the command list holds gnt-instance
1190
                -- commands (without that prefix), e.g \"@failover instance1@\"
1191
computeMoves i inam mv c d =
1192
  case mv of
1193
    Failover -> ("f", [mig])
1194
    FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1195
    ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1196
    ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1197
    ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1198
  where morf = if Instance.instanceRunning i then "migrate" else "failover"
1199
        mig = printf "%s -f %s" morf inam::String
1200
        rep n = printf "replace-disks -n %s %s" n inam
1201

    
1202
-- | Converts a placement to string format.
1203
printSolutionLine :: Node.List     -- ^ The node list
1204
                  -> Instance.List -- ^ The instance list
1205
                  -> Int           -- ^ Maximum node name length
1206
                  -> Int           -- ^ Maximum instance name length
1207
                  -> Placement     -- ^ The current placement
1208
                  -> Int           -- ^ The index of the placement in
1209
                                   -- the solution
1210
                  -> (String, [String])
1211
printSolutionLine nl il nmlen imlen plc pos =
1212
  let pmlen = (2*nmlen + 1)
1213
      (i, p, s, mv, c) = plc
1214
      inst = Container.find i il
1215
      inam = Instance.alias inst
1216
      npri = Node.alias $ Container.find p nl
1217
      nsec = Node.alias $ Container.find s nl
1218
      opri = Node.alias $ Container.find (Instance.pNode inst) nl
1219
      osec = Node.alias $ Container.find (Instance.sNode inst) nl
1220
      (moves, cmds) =  computeMoves inst inam mv npri nsec
1221
      ostr = printf "%s:%s" opri osec::String
1222
      nstr = printf "%s:%s" npri nsec::String
1223
  in (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
1224
      pos imlen inam pmlen ostr
1225
      pmlen nstr c moves,
1226
      cmds)
1227

    
1228
-- | Return the instance and involved nodes in an instance move.
1229
--
1230
-- Note that the output list length can vary, and is not required nor
1231
-- guaranteed to be of any specific length.
1232
involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
1233
                               -- the instance from its index; note
1234
                               -- that this /must/ be the original
1235
                               -- instance list, so that we can
1236
                               -- retrieve the old nodes
1237
              -> Placement     -- ^ The placement we're investigating,
1238
                               -- containing the new nodes and
1239
                               -- instance index
1240
              -> [Ndx]         -- ^ Resulting list of node indices
1241
involvedNodes il plc =
1242
  let (i, np, ns, _, _) = plc
1243
      inst = Container.find i il
1244
  in nub $ [np, ns] ++ Instance.allNodes inst
1245

    
1246
-- | Inner function for splitJobs, that either appends the next job to
1247
-- the current jobset, or starts a new jobset.
1248
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1249
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1250
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1251
  | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1252
  | otherwise = ([n]:cjs, ndx)
1253

    
1254
-- | Break a list of moves into independent groups. Note that this
1255
-- will reverse the order of jobs.
1256
splitJobs :: [MoveJob] -> [JobSet]
1257
splitJobs = fst . foldl mergeJobs ([], [])
1258

    
1259
-- | Given a list of commands, prefix them with @gnt-instance@ and
1260
-- also beautify the display a little.
1261
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1262
formatJob jsn jsl (sn, (_, _, _, cmds)) =
1263
  let out =
1264
        printf "  echo job %d/%d" jsn sn:
1265
        printf "  check":
1266
        map ("  gnt-instance " ++) cmds
1267
  in if sn == 1
1268
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1269
       else out
1270

    
1271
-- | Given a list of commands, prefix them with @gnt-instance@ and
1272
-- also beautify the display a little.
1273
formatCmds :: [JobSet] -> String
1274
formatCmds =
1275
  unlines .
1276
  concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1277
                           (zip [1..] js)) .
1278
  zip [1..]
1279

    
1280
-- | Print the node list.
1281
printNodes :: Node.List -> [String] -> String
1282
printNodes nl fs =
1283
  let fields = case fs of
1284
                 [] -> Node.defaultFields
1285
                 "+":rest -> Node.defaultFields ++ rest
1286
                 _ -> fs
1287
      snl = sortBy (comparing Node.idx) (Container.elems nl)
1288
      (header, isnum) = unzip $ map Node.showHeader fields
1289
  in unlines . map ((:) ' ' .  unwords) $
1290
     formatTable (header:map (Node.list fields) snl) isnum
1291

    
1292
-- | Print the instance list.
1293
printInsts :: Node.List -> Instance.List -> String
1294
printInsts nl il =
1295
  let sil = sortBy (comparing Instance.idx) (Container.elems il)
1296
      helper inst = [ if Instance.instanceRunning inst then "R" else " "
1297
                    , Instance.name inst
1298
                    , Container.nameOf nl (Instance.pNode inst)
1299
                    , let sdx = Instance.sNode inst
1300
                      in if sdx == Node.noSecondary
1301
                           then  ""
1302
                           else Container.nameOf nl sdx
1303
                    , if Instance.autoBalance inst then "Y" else "N"
1304
                    , printf "%3d" $ Instance.vcpus inst
1305
                    , printf "%5d" $ Instance.mem inst
1306
                    , printf "%5d" $ Instance.dsk inst `div` 1024
1307
                    , printf "%5.3f" lC
1308
                    , printf "%5.3f" lM
1309
                    , printf "%5.3f" lD
1310
                    , printf "%5.3f" lN
1311
                    ]
1312
          where DynUtil lC lM lD lN = Instance.util inst
1313
      header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1314
               , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1315
      isnum = False:False:False:False:False:repeat True
1316
  in unlines . map ((:) ' ' . unwords) $
1317
     formatTable (header:map helper sil) isnum
1318

    
1319
-- | Shows statistics for a given node list.
1320
printStats :: Node.List -> String
1321
printStats nl =
1322
  let dcvs = compDetailedCV $ Container.elems nl
1323
      (weights, names) = unzip detailedCVInfo
1324
      hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1325
      formatted = map (\(w, header, val) ->
1326
                         printf "%s=%.8f(x%.2f)" header val w::String) hd
1327
  in intercalate ", " formatted
1328

    
1329
-- | Convert a placement into a list of OpCodes (basically a job).
1330
iMoveToJob :: Node.List        -- ^ The node list; only used for node
1331
                               -- names, so any version is good
1332
                               -- (before or after the operation)
1333
           -> Instance.List    -- ^ The instance list; also used for
1334
                               -- names only
1335
           -> Idx              -- ^ The index of the instance being
1336
                               -- moved
1337
           -> IMove            -- ^ The actual move to be described
1338
           -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
1339
                               -- the given move
1340
iMoveToJob nl il idx move =
1341
  let inst = Container.find idx il
1342
      iname = Instance.name inst
1343
      lookNode  = Just . Container.nameOf nl
1344
      opF = OpCodes.OpInstanceMigrate iname True False True Nothing
1345
      opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1346
              OpCodes.ReplaceNewSecondary [] Nothing
1347
  in case move of
1348
       Failover -> [ opF ]
1349
       ReplacePrimary np -> [ opF, opR np, opF ]
1350
       ReplaceSecondary ns -> [ opR ns ]
1351
       ReplaceAndFailover np -> [ opR np, opF ]
1352
       FailoverAndReplace ns -> [ opF, opR ns ]
1353

    
1354
-- * Node group functions
1355

    
1356
-- | Computes the group of an instance.
1357
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1358
instanceGroup nl i =
1359
  let sidx = Instance.sNode i
1360
      pnode = Container.find (Instance.pNode i) nl
1361
      snode = if sidx == Node.noSecondary
1362
              then pnode
1363
              else Container.find sidx nl
1364
      pgroup = Node.group pnode
1365
      sgroup = Node.group snode
1366
  in if pgroup /= sgroup
1367
       then fail ("Instance placed accross two node groups, primary " ++
1368
                  show pgroup ++ ", secondary " ++ show sgroup)
1369
       else return pgroup
1370

    
1371
-- | Computes the group of an instance per the primary node.
1372
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1373
instancePriGroup nl i =
1374
  let pnode = Container.find (Instance.pNode i) nl
1375
  in  Node.group pnode
1376

    
1377
-- | Compute the list of badly allocated instances (split across node
1378
-- groups).
1379
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1380
findSplitInstances nl =
1381
  filter (not . isOk . instanceGroup nl) . Container.elems
1382

    
1383
-- | Splits a cluster into the component node groups.
1384
splitCluster :: Node.List -> Instance.List ->
1385
                [(Gdx, (Node.List, Instance.List))]
1386
splitCluster nl il =
1387
  let ngroups = Node.computeGroups (Container.elems nl)
1388
  in map (\(guuid, nodes) ->
1389
           let nidxs = map Node.idx nodes
1390
               nodes' = zip nidxs nodes
1391
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1392
           in (guuid, (Container.fromList nodes', instances))) ngroups
1393

    
1394
-- | Compute the list of nodes that are to be evacuated, given a list
1395
-- of instances and an evacuation mode.
1396
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1397
                -> EvacMode      -- ^ The evacuation mode we're using
1398
                -> [Idx]         -- ^ List of instance indices being evacuated
1399
                -> IntSet.IntSet -- ^ Set of node indices
1400
nodesToEvacuate il mode =
1401
  IntSet.delete Node.noSecondary .
1402
  foldl' (\ns idx ->
1403
            let i = Container.find idx il
1404
                pdx = Instance.pNode i
1405
                sdx = Instance.sNode i
1406
                dt = Instance.diskTemplate i
1407
                withSecondary = case dt of
1408
                                  DTDrbd8 -> IntSet.insert sdx ns
1409
                                  _ -> ns
1410
            in case mode of
1411
                 ChangePrimary   -> IntSet.insert pdx ns
1412
                 ChangeSecondary -> withSecondary
1413
                 ChangeAll       -> IntSet.insert pdx withSecondary
1414
         ) IntSet.empty