Statistics
| Branch: | Tag: | Revision:

root / htools / Ganeti / HTools / Cluster.hs @ 90c2f1e8

History | View | Annotate | Download (60.7 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011, 2012 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
  (
31
    -- * Types
32
    AllocSolution(..)
33
  , EvacSolution(..)
34
  , Table(..)
35
  , CStats(..)
36
  , AllocStats
37
  , AllocResult
38
  , AllocMethod
39
  -- * Generic functions
40
  , totalResources
41
  , computeAllocationDelta
42
  -- * First phase functions
43
  , computeBadItems
44
  -- * Second phase functions
45
  , printSolutionLine
46
  , formatCmds
47
  , involvedNodes
48
  , splitJobs
49
  -- * Display functions
50
  , printNodes
51
  , printInsts
52
  -- * Balacing functions
53
  , checkMove
54
  , doNextBalance
55
  , tryBalance
56
  , compCV
57
  , compCVNodes
58
  , compDetailedCV
59
  , printStats
60
  , iMoveToJob
61
  -- * IAllocator functions
62
  , genAllocNodes
63
  , tryAlloc
64
  , tryMGAlloc
65
  , tryNodeEvac
66
  , tryChangeGroup
67
  , collapseFailures
68
  -- * Allocation functions
69
  , iterateAlloc
70
  , tieredAlloc
71
  -- * Node group functions
72
  , instanceGroup
73
  , findSplitInstances
74
  , splitCluster
75
  ) where
76

    
77
import qualified Data.IntSet as IntSet
78
import Data.List
79
import Data.Maybe (fromJust, isNothing)
80
import Data.Ord (comparing)
81
import Text.Printf (printf)
82

    
83
import qualified Ganeti.HTools.Container as Container
84
import qualified Ganeti.HTools.Instance as Instance
85
import qualified Ganeti.HTools.Node as Node
86
import qualified Ganeti.HTools.Group as Group
87
import Ganeti.HTools.Types
88
import Ganeti.HTools.Utils
89
import Ganeti.HTools.Compat
90
import qualified Ganeti.OpCodes as OpCodes
91

    
92
-- * Types
93

    
94
-- | Allocation\/relocation solution.
95
data AllocSolution = AllocSolution
96
  { asFailures :: [FailMode]              -- ^ Failure counts
97
  , asAllocs   :: Int                     -- ^ Good allocation count
98
  , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
99
  , asLog      :: [String]                -- ^ Informational messages
100
  }
101

    
102
-- | Node evacuation/group change iallocator result type. This result
103
-- type consists of actual opcodes (a restricted subset) that are
104
-- transmitted back to Ganeti.
105
data EvacSolution = EvacSolution
106
  { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
107
  , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
108
                                      -- relocated
109
  , esOpCodes :: [[OpCodes.OpCode]]   -- ^ List of jobs
110
  } deriving (Show)
111

    
112
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
113
type AllocResult = (FailStats, Node.List, Instance.List,
114
                    [Instance.Instance], [CStats])
115

    
116
-- | A type denoting the valid allocation mode/pairs.
117
--
118
-- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
119
-- for a two-node allocation, this will be a @Right [('Ndx',
120
-- ['Ndx'])]@. In the latter case, the list is basically an
121
-- association list, grouped by primary node and holding the potential
122
-- secondary nodes in the sub-list.
123
type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
124

    
125
-- | The empty solution we start with when computing allocations.
126
emptyAllocSolution :: AllocSolution
127
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
128
                                   , asSolution = Nothing, asLog = [] }
129

    
130
-- | The empty evac solution.
131
emptyEvacSolution :: EvacSolution
132
emptyEvacSolution = EvacSolution { esMoved = []
133
                                 , esFailed = []
134
                                 , esOpCodes = []
135
                                 }
136

    
137
-- | The complete state for the balancing solution.
138
data Table = Table Node.List Instance.List Score [Placement]
139
             deriving (Show, Read)
140

    
141
-- | Cluster statistics data type.
142
data CStats = CStats
143
  { csFmem :: Integer -- ^ Cluster free mem
144
  , csFdsk :: Integer -- ^ Cluster free disk
145
  , csAmem :: Integer -- ^ Cluster allocatable mem
146
  , csAdsk :: Integer -- ^ Cluster allocatable disk
147
  , csAcpu :: Integer -- ^ Cluster allocatable cpus
148
  , csMmem :: Integer -- ^ Max node allocatable mem
149
  , csMdsk :: Integer -- ^ Max node allocatable disk
150
  , csMcpu :: Integer -- ^ Max node allocatable cpu
151
  , csImem :: Integer -- ^ Instance used mem
152
  , csIdsk :: Integer -- ^ Instance used disk
153
  , csIcpu :: Integer -- ^ Instance used cpu
154
  , csTmem :: Double  -- ^ Cluster total mem
155
  , csTdsk :: Double  -- ^ Cluster total disk
156
  , csTcpu :: Double  -- ^ Cluster total cpus
157
  , csVcpu :: Integer -- ^ Cluster total virtual cpus
158
  , csNcpu :: Double  -- ^ Equivalent to 'csIcpu' but in terms of
159
                      -- physical CPUs, i.e. normalised used phys CPUs
160
  , csXmem :: Integer -- ^ Unnacounted for mem
161
  , csNmem :: Integer -- ^ Node own memory
162
  , csScore :: Score  -- ^ The cluster score
163
  , csNinst :: Int    -- ^ The total number of instances
164
  } deriving (Show, Read)
165

    
166
-- | Currently used, possibly to allocate, unallocable.
167
type AllocStats = (RSpec, RSpec, RSpec)
168

    
169
-- | A simple type for allocation functions.
170
type AllocMethod =  Node.List           -- ^ Node list
171
                 -> Instance.List       -- ^ Instance list
172
                 -> Maybe Int           -- ^ Optional allocation limit
173
                 -> Instance.Instance   -- ^ Instance spec for allocation
174
                 -> AllocNodes          -- ^ Which nodes we should allocate on
175
                 -> [Instance.Instance] -- ^ Allocated instances
176
                 -> [CStats]            -- ^ Running cluster stats
177
                 -> Result AllocResult  -- ^ Allocation result
178

    
179
-- * Utility functions
180

    
181
-- | Verifies the N+1 status and return the affected nodes.
182
verifyN1 :: [Node.Node] -> [Node.Node]
183
verifyN1 = filter Node.failN1
184

    
185
{-| Computes the pair of bad nodes and instances.
186

    
187
The bad node list is computed via a simple 'verifyN1' check, and the
188
bad instance list is the list of primary and secondary instances of
189
those nodes.
190

    
191
-}
192
computeBadItems :: Node.List -> Instance.List ->
193
                   ([Node.Node], [Instance.Instance])
194
computeBadItems nl il =
195
  let bad_nodes = verifyN1 $ getOnline nl
196
      bad_instances = map (`Container.find` il) .
197
                      sort . nub $
198
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
199
  in
200
    (bad_nodes, bad_instances)
201

    
202
-- | Extracts the node pairs for an instance. This can fail if the
203
-- instance is single-homed. FIXME: this needs to be improved,
204
-- together with the general enhancement for handling non-DRBD moves.
205
instanceNodes :: Node.List -> Instance.Instance ->
206
                 (Ndx, Ndx, Node.Node, Node.Node)
207
instanceNodes nl inst =
208
  let old_pdx = Instance.pNode inst
209
      old_sdx = Instance.sNode inst
210
      old_p = Container.find old_pdx nl
211
      old_s = Container.find old_sdx nl
212
  in (old_pdx, old_sdx, old_p, old_s)
213

    
214
-- | Zero-initializer for the CStats type.
215
emptyCStats :: CStats
216
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
217

    
218
-- | Update stats with data from a new node.
219
updateCStats :: CStats -> Node.Node -> CStats
220
updateCStats cs node =
221
  let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
222
               csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
223
               csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
224
               csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
225
               csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
226
               csVcpu = x_vcpu, csNcpu = x_ncpu,
227
               csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
228
             }
229
        = cs
230
      inc_amem = Node.fMem node - Node.rMem node
231
      inc_amem' = if inc_amem > 0 then inc_amem else 0
232
      inc_adsk = Node.availDisk node
233
      inc_imem = truncate (Node.tMem node) - Node.nMem node
234
                 - Node.xMem node - Node.fMem node
235
      inc_icpu = Node.uCpu node
236
      inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
237
      inc_vcpu = Node.hiCpu node
238
      inc_acpu = Node.availCpu node
239
      inc_ncpu = fromIntegral (Node.uCpu node) /
240
                 iPolicyVcpuRatio (Node.iPolicy node)
241
  in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
242
        , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
243
        , csAmem = x_amem + fromIntegral inc_amem'
244
        , csAdsk = x_adsk + fromIntegral inc_adsk
245
        , csAcpu = x_acpu + fromIntegral inc_acpu
246
        , csMmem = max x_mmem (fromIntegral inc_amem')
247
        , csMdsk = max x_mdsk (fromIntegral inc_adsk)
248
        , csMcpu = max x_mcpu (fromIntegral inc_acpu)
249
        , csImem = x_imem + fromIntegral inc_imem
250
        , csIdsk = x_idsk + fromIntegral inc_idsk
251
        , csIcpu = x_icpu + fromIntegral inc_icpu
252
        , csTmem = x_tmem + Node.tMem node
253
        , csTdsk = x_tdsk + Node.tDsk node
254
        , csTcpu = x_tcpu + Node.tCpu node
255
        , csVcpu = x_vcpu + fromIntegral inc_vcpu
256
        , csNcpu = x_ncpu + inc_ncpu
257
        , csXmem = x_xmem + fromIntegral (Node.xMem node)
258
        , csNmem = x_nmem + fromIntegral (Node.nMem node)
259
        , csNinst = x_ninst + length (Node.pList node)
260
        }
261

    
262
-- | Compute the total free disk and memory in the cluster.
263
totalResources :: Node.List -> CStats
264
totalResources nl =
265
  let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
266
  in cs { csScore = compCV nl }
267

    
268
-- | Compute the delta between two cluster state.
269
--
270
-- This is used when doing allocations, to understand better the
271
-- available cluster resources. The return value is a triple of the
272
-- current used values, the delta that was still allocated, and what
273
-- was left unallocated.
274
computeAllocationDelta :: CStats -> CStats -> AllocStats
275
computeAllocationDelta cini cfin =
276
  let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
277
      CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
278
              csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
279
      rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem)
280
             (fromIntegral i_idsk)
281
      rfin = RSpec (fromIntegral (f_icpu - i_icpu))
282
             (fromIntegral (f_imem - i_imem))
283
             (fromIntegral (f_idsk - i_idsk))
284
      un_cpu = fromIntegral (v_cpu - f_icpu)::Int
285
      runa = RSpec un_cpu (truncate t_mem - fromIntegral f_imem)
286
             (truncate t_dsk - fromIntegral f_idsk)
287
  in (rini, rfin, runa)
288

    
289
-- | The names and weights of the individual elements in the CV list.
290
detailedCVInfo :: [(Double, String)]
291
detailedCVInfo = [ (1,  "free_mem_cv")
292
                 , (1,  "free_disk_cv")
293
                 , (1,  "n1_cnt")
294
                 , (1,  "reserved_mem_cv")
295
                 , (4,  "offline_all_cnt")
296
                 , (16, "offline_pri_cnt")
297
                 , (1,  "vcpu_ratio_cv")
298
                 , (1,  "cpu_load_cv")
299
                 , (1,  "mem_load_cv")
300
                 , (1,  "disk_load_cv")
301
                 , (1,  "net_load_cv")
302
                 , (2,  "pri_tags_score")
303
                 ]
304

    
305
-- | Holds the weights used by 'compCVNodes' for each metric.
306
detailedCVWeights :: [Double]
307
detailedCVWeights = map fst detailedCVInfo
308

    
309
-- | Compute the mem and disk covariance.
310
compDetailedCV :: [Node.Node] -> [Double]
311
compDetailedCV all_nodes =
312
  let (offline, nodes) = partition Node.offline all_nodes
313
      mem_l = map Node.pMem nodes
314
      dsk_l = map Node.pDsk nodes
315
      -- metric: memory covariance
316
      mem_cv = stdDev mem_l
317
      -- metric: disk covariance
318
      dsk_cv = stdDev dsk_l
319
      -- metric: count of instances living on N1 failing nodes
320
      n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
321
                                                 length (Node.pList n)) .
322
                 filter Node.failN1 $ nodes :: Double
323
      res_l = map Node.pRem nodes
324
      -- metric: reserved memory covariance
325
      res_cv = stdDev res_l
326
      -- offline instances metrics
327
      offline_ipri = sum . map (length . Node.pList) $ offline
328
      offline_isec = sum . map (length . Node.sList) $ offline
329
      -- metric: count of instances on offline nodes
330
      off_score = fromIntegral (offline_ipri + offline_isec)::Double
331
      -- metric: count of primary instances on offline nodes (this
332
      -- helps with evacuation/failover of primary instances on
333
      -- 2-node clusters with one node offline)
334
      off_pri_score = fromIntegral offline_ipri::Double
335
      cpu_l = map Node.pCpu nodes
336
      -- metric: covariance of vcpu/pcpu ratio
337
      cpu_cv = stdDev cpu_l
338
      -- metrics: covariance of cpu, memory, disk and network load
339
      (c_load, m_load, d_load, n_load) =
340
        unzip4 $ map (\n ->
341
                      let DynUtil c1 m1 d1 n1 = Node.utilLoad n
342
                          DynUtil c2 m2 d2 n2 = Node.utilPool n
343
                      in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes
344
      -- metric: conflicting instance count
345
      pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
346
      pri_tags_score = fromIntegral pri_tags_inst::Double
347
  in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
348
     , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
349
     , pri_tags_score ]
350

    
351
-- | Compute the /total/ variance.
352
compCVNodes :: [Node.Node] -> Double
353
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
354

    
355
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
356
compCV :: Node.List -> Double
357
compCV = compCVNodes . Container.elems
358

    
359
-- | Compute online nodes from a 'Node.List'.
360
getOnline :: Node.List -> [Node.Node]
361
getOnline = filter (not . Node.offline) . Container.elems
362

    
363
-- * Balancing functions
364

    
365
-- | Compute best table. Note that the ordering of the arguments is important.
366
compareTables :: Table -> Table -> Table
367
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
368
  if a_cv > b_cv then b else a
369

    
370
-- | Applies an instance move to a given node list and instance.
371
applyMove :: Node.List -> Instance.Instance
372
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
373
-- Failover (f)
374
applyMove nl inst Failover =
375
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
376
      int_p = Node.removePri old_p inst
377
      int_s = Node.removeSec old_s inst
378
      new_nl = do -- Maybe monad
379
        new_p <- Node.addPriEx (Node.offline old_p) int_s inst
380
        new_s <- Node.addSec int_p inst old_sdx
381
        let new_inst = Instance.setBoth inst old_sdx old_pdx
382
        return (Container.addTwo old_pdx new_s old_sdx new_p nl,
383
                new_inst, old_sdx, old_pdx)
384
  in new_nl
385

    
386
-- Replace the primary (f:, r:np, f)
387
applyMove nl inst (ReplacePrimary new_pdx) =
388
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
389
      tgt_n = Container.find new_pdx nl
390
      int_p = Node.removePri old_p inst
391
      int_s = Node.removeSec old_s inst
392
      force_p = Node.offline old_p
393
      new_nl = do -- Maybe monad
394
                  -- check that the current secondary can host the instance
395
                  -- during the migration
396
        tmp_s <- Node.addPriEx force_p int_s inst
397
        let tmp_s' = Node.removePri tmp_s inst
398
        new_p <- Node.addPriEx force_p tgt_n inst
399
        new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
400
        let new_inst = Instance.setPri inst new_pdx
401
        return (Container.add new_pdx new_p $
402
                Container.addTwo old_pdx int_p old_sdx new_s nl,
403
                new_inst, new_pdx, old_sdx)
404
  in new_nl
405

    
406
-- Replace the secondary (r:ns)
407
applyMove nl inst (ReplaceSecondary new_sdx) =
408
  let old_pdx = Instance.pNode inst
409
      old_sdx = Instance.sNode inst
410
      old_s = Container.find old_sdx nl
411
      tgt_n = Container.find new_sdx nl
412
      int_s = Node.removeSec old_s inst
413
      force_s = Node.offline old_s
414
      new_inst = Instance.setSec inst new_sdx
415
      new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
416
               \new_s -> return (Container.addTwo new_sdx
417
                                 new_s old_sdx int_s nl,
418
                                 new_inst, old_pdx, new_sdx)
419
  in new_nl
420

    
421
-- Replace the secondary and failover (r:np, f)
422
applyMove nl inst (ReplaceAndFailover new_pdx) =
423
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
424
      tgt_n = Container.find new_pdx nl
425
      int_p = Node.removePri old_p inst
426
      int_s = Node.removeSec old_s inst
427
      force_s = Node.offline old_s
428
      new_nl = do -- Maybe monad
429
        new_p <- Node.addPri tgt_n inst
430
        new_s <- Node.addSecEx force_s int_p inst new_pdx
431
        let new_inst = Instance.setBoth inst new_pdx old_pdx
432
        return (Container.add new_pdx new_p $
433
                Container.addTwo old_pdx new_s old_sdx int_s nl,
434
                new_inst, new_pdx, old_pdx)
435
  in new_nl
436

    
437
-- Failver and replace the secondary (f, r:ns)
438
applyMove nl inst (FailoverAndReplace new_sdx) =
439
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
440
      tgt_n = Container.find new_sdx nl
441
      int_p = Node.removePri old_p inst
442
      int_s = Node.removeSec old_s inst
443
      force_p = Node.offline old_p
444
      new_nl = do -- Maybe monad
445
        new_p <- Node.addPriEx force_p int_s inst
446
        new_s <- Node.addSecEx force_p tgt_n inst old_sdx
447
        let new_inst = Instance.setBoth inst old_sdx new_sdx
448
        return (Container.add new_sdx new_s $
449
                Container.addTwo old_sdx new_p old_pdx int_p nl,
450
                new_inst, old_sdx, new_sdx)
451
  in new_nl
452

    
453
-- | Tries to allocate an instance on one given node.
454
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
455
                 -> OpResult Node.AllocElement
456
allocateOnSingle nl inst new_pdx =
457
  let p = Container.find new_pdx nl
458
      new_inst = Instance.setBoth inst new_pdx Node.noSecondary
459
  in do
460
    Instance.instMatchesPolicy inst (Node.iPolicy p)
461
    new_p <- Node.addPri p inst
462
    let new_nl = Container.add new_pdx new_p nl
463
        new_score = compCV nl
464
    return (new_nl, new_inst, [new_p], new_score)
465

    
466
-- | Tries to allocate an instance on a given pair of nodes.
467
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
468
               -> OpResult Node.AllocElement
469
allocateOnPair nl inst new_pdx new_sdx =
470
  let tgt_p = Container.find new_pdx nl
471
      tgt_s = Container.find new_sdx nl
472
  in do
473
    Instance.instMatchesPolicy inst (Node.iPolicy tgt_p)
474
    new_p <- Node.addPri tgt_p inst
475
    new_s <- Node.addSec tgt_s inst new_pdx
476
    let new_inst = Instance.setBoth inst new_pdx new_sdx
477
        new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
478
    return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
479

    
480
-- | Tries to perform an instance move and returns the best table
481
-- between the original one and the new one.
482
checkSingleStep :: Table -- ^ The original table
483
                -> Instance.Instance -- ^ The instance to move
484
                -> Table -- ^ The current best table
485
                -> IMove -- ^ The move to apply
486
                -> Table -- ^ The final best table
487
checkSingleStep ini_tbl target cur_tbl move =
488
  let Table ini_nl ini_il _ ini_plc = ini_tbl
489
      tmp_resu = applyMove ini_nl target move
490
  in case tmp_resu of
491
       OpFail _ -> cur_tbl
492
       OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
493
         let tgt_idx = Instance.idx target
494
             upd_cvar = compCV upd_nl
495
             upd_il = Container.add tgt_idx new_inst ini_il
496
             upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
497
             upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
498
         in compareTables cur_tbl upd_tbl
499

    
500
-- | Given the status of the current secondary as a valid new node and
501
-- the current candidate target node, generate the possible moves for
502
-- a instance.
503
possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
504
              -> Bool      -- ^ Whether we can change the primary node
505
              -> Ndx       -- ^ Target node candidate
506
              -> [IMove]   -- ^ List of valid result moves
507

    
508
possibleMoves _ False tdx =
509
  [ReplaceSecondary tdx]
510

    
511
possibleMoves True True tdx =
512
  [ ReplaceSecondary tdx
513
  , ReplaceAndFailover tdx
514
  , ReplacePrimary tdx
515
  , FailoverAndReplace tdx
516
  ]
517

    
518
possibleMoves False True tdx =
519
  [ ReplaceSecondary tdx
520
  , ReplaceAndFailover tdx
521
  ]
522

    
523
-- | Compute the best move for a given instance.
524
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
525
                  -> Bool              -- ^ Whether disk moves are allowed
526
                  -> Bool              -- ^ Whether instance moves are allowed
527
                  -> Table             -- ^ Original table
528
                  -> Instance.Instance -- ^ Instance to move
529
                  -> Table             -- ^ Best new table for this instance
530
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
531
  let opdx = Instance.pNode target
532
      osdx = Instance.sNode target
533
      bad_nodes = [opdx, osdx]
534
      nodes = filter (`notElem` bad_nodes) nodes_idx
535
      use_secondary = elem osdx nodes_idx && inst_moves
536
      aft_failover = if use_secondary -- if allowed to failover
537
                       then checkSingleStep ini_tbl target ini_tbl Failover
538
                       else ini_tbl
539
      all_moves = if disk_moves
540
                    then concatMap
541
                           (possibleMoves use_secondary inst_moves) nodes
542
                    else []
543
    in
544
      -- iterate over the possible nodes for this instance
545
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
546

    
547
-- | Compute the best next move.
548
checkMove :: [Ndx]               -- ^ Allowed target node indices
549
          -> Bool                -- ^ Whether disk moves are allowed
550
          -> Bool                -- ^ Whether instance moves are allowed
551
          -> Table               -- ^ The current solution
552
          -> [Instance.Instance] -- ^ List of instances still to move
553
          -> Table               -- ^ The new solution
554
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
555
  let Table _ _ _ ini_plc = ini_tbl
556
      -- we're using rwhnf from the Control.Parallel.Strategies
557
      -- package; we don't need to use rnf as that would force too
558
      -- much evaluation in single-threaded cases, and in
559
      -- multi-threaded case the weak head normal form is enough to
560
      -- spark the evaluation
561
      tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
562
                             inst_moves ini_tbl)
563
               victims
564
      -- iterate over all instances, computing the best move
565
      best_tbl = foldl' compareTables ini_tbl tables
566
      Table _ _ _ best_plc = best_tbl
567
  in if length best_plc == length ini_plc
568
       then ini_tbl -- no advancement
569
       else best_tbl
570

    
571
-- | Check if we are allowed to go deeper in the balancing.
572
doNextBalance :: Table     -- ^ The starting table
573
              -> Int       -- ^ Remaining length
574
              -> Score     -- ^ Score at which to stop
575
              -> Bool      -- ^ The resulting table and commands
576
doNextBalance ini_tbl max_rounds min_score =
577
  let Table _ _ ini_cv ini_plc = ini_tbl
578
      ini_plc_len = length ini_plc
579
  in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
580

    
581
-- | Run a balance move.
582
tryBalance :: Table       -- ^ The starting table
583
           -> Bool        -- ^ Allow disk moves
584
           -> Bool        -- ^ Allow instance moves
585
           -> Bool        -- ^ Only evacuate moves
586
           -> Score       -- ^ Min gain threshold
587
           -> Score       -- ^ Min gain
588
           -> Maybe Table -- ^ The resulting table and commands
589
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
590
    let Table ini_nl ini_il ini_cv _ = ini_tbl
591
        all_inst = Container.elems ini_il
592
        all_inst' = if evac_mode
593
                    then let bad_nodes = map Node.idx . filter Node.offline $
594
                                         Container.elems ini_nl
595
                         in filter (any (`elem` bad_nodes) . Instance.allNodes)
596
                            all_inst
597
                    else all_inst
598
        reloc_inst = filter Instance.movable all_inst'
599
        node_idx = map Node.idx . filter (not . Node.offline) $
600
                   Container.elems ini_nl
601
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
602
        (Table _ _ fin_cv _) = fin_tbl
603
    in
604
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
605
      then Just fin_tbl -- this round made success, return the new table
606
      else Nothing
607

    
608
-- * Allocation functions
609

    
610
-- | Build failure stats out of a list of failures.
611
collapseFailures :: [FailMode] -> FailStats
612
collapseFailures flst =
613
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
614
            [minBound..maxBound]
615

    
616
-- | Compares two Maybe AllocElement and chooses the besst score.
617
bestAllocElement :: Maybe Node.AllocElement
618
                 -> Maybe Node.AllocElement
619
                 -> Maybe Node.AllocElement
620
bestAllocElement a Nothing = a
621
bestAllocElement Nothing b = b
622
bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
623
  if ascore < bscore then a else b
624

    
625
-- | Update current Allocation solution and failure stats with new
626
-- elements.
627
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
628
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
629

    
630
concatAllocs as (OpGood ns) =
631
  let -- Choose the old or new solution, based on the cluster score
632
    cntok = asAllocs as
633
    osols = asSolution as
634
    nsols = bestAllocElement osols (Just ns)
635
    nsuc = cntok + 1
636
    -- Note: we force evaluation of nsols here in order to keep the
637
    -- memory profile low - we know that we will need nsols for sure
638
    -- in the next cycle, so we force evaluation of nsols, since the
639
    -- foldl' in the caller will only evaluate the tuple, but not the
640
    -- elements of the tuple
641
  in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
642

    
643
-- | Sums two 'AllocSolution' structures.
644
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
645
sumAllocs (AllocSolution aFails aAllocs aSols aLog)
646
          (AllocSolution bFails bAllocs bSols bLog) =
647
  -- note: we add b first, since usually it will be smaller; when
648
  -- fold'ing, a will grow and grow whereas b is the per-group
649
  -- result, hence smaller
650
  let nFails  = bFails ++ aFails
651
      nAllocs = aAllocs + bAllocs
652
      nSols   = bestAllocElement aSols bSols
653
      nLog    = bLog ++ aLog
654
  in AllocSolution nFails nAllocs nSols nLog
655

    
656
-- | Given a solution, generates a reasonable description for it.
657
describeSolution :: AllocSolution -> String
658
describeSolution as =
659
  let fcnt = asFailures as
660
      sols = asSolution as
661
      freasons =
662
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
663
        filter ((> 0) . snd) . collapseFailures $ fcnt
664
  in case sols of
665
     Nothing -> "No valid allocation solutions, failure reasons: " ++
666
                (if null fcnt then "unknown reasons" else freasons)
667
     Just (_, _, nodes, cv) ->
668
         printf ("score: %.8f, successes %d, failures %d (%s)" ++
669
                 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
670
               (intercalate "/" . map Node.name $ nodes)
671

    
672
-- | Annotates a solution with the appropriate string.
673
annotateSolution :: AllocSolution -> AllocSolution
674
annotateSolution as = as { asLog = describeSolution as : asLog as }
675

    
676
-- | Reverses an evacuation solution.
677
--
678
-- Rationale: we always concat the results to the top of the lists, so
679
-- for proper jobset execution, we should reverse all lists.
680
reverseEvacSolution :: EvacSolution -> EvacSolution
681
reverseEvacSolution (EvacSolution f m o) =
682
  EvacSolution (reverse f) (reverse m) (reverse o)
683

    
684
-- | Generate the valid node allocation singles or pairs for a new instance.
685
genAllocNodes :: Group.List        -- ^ Group list
686
              -> Node.List         -- ^ The node map
687
              -> Int               -- ^ The number of nodes required
688
              -> Bool              -- ^ Whether to drop or not
689
                                   -- unallocable nodes
690
              -> Result AllocNodes -- ^ The (monadic) result
691
genAllocNodes gl nl count drop_unalloc =
692
  let filter_fn = if drop_unalloc
693
                    then filter (Group.isAllocable .
694
                                 flip Container.find gl . Node.group)
695
                    else id
696
      all_nodes = filter_fn $ getOnline nl
697
      all_pairs = [(Node.idx p,
698
                    [Node.idx s | s <- all_nodes,
699
                                       Node.idx p /= Node.idx s,
700
                                       Node.group p == Node.group s]) |
701
                   p <- all_nodes]
702
  in case count of
703
       1 -> Ok (Left (map Node.idx all_nodes))
704
       2 -> Ok (Right (filter (not . null . snd) all_pairs))
705
       _ -> Bad "Unsupported number of nodes, only one or two  supported"
706

    
707
-- | Try to allocate an instance on the cluster.
708
tryAlloc :: (Monad m) =>
709
            Node.List         -- ^ The node list
710
         -> Instance.List     -- ^ The instance list
711
         -> Instance.Instance -- ^ The instance to allocate
712
         -> AllocNodes        -- ^ The allocation targets
713
         -> m AllocSolution   -- ^ Possible solution list
714
tryAlloc _  _ _    (Right []) = fail "Not enough online nodes"
715
tryAlloc nl _ inst (Right ok_pairs) =
716
  let psols = parMap rwhnf (\(p, ss) ->
717
                              foldl' (\cstate ->
718
                                        concatAllocs cstate .
719
                                        allocateOnPair nl inst p)
720
                              emptyAllocSolution ss) ok_pairs
721
      sols = foldl' sumAllocs emptyAllocSolution psols
722
  in return $ annotateSolution sols
723

    
724
tryAlloc _  _ _    (Left []) = fail "No online nodes"
725
tryAlloc nl _ inst (Left all_nodes) =
726
  let sols = foldl' (\cstate ->
727
                       concatAllocs cstate . allocateOnSingle nl inst
728
                    ) emptyAllocSolution all_nodes
729
  in return $ annotateSolution sols
730

    
731
-- | Given a group/result, describe it as a nice (list of) messages.
732
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
733
solutionDescription gl (groupId, result) =
734
  case result of
735
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
736
    Bad message -> [printf "Group %s: error %s" gname message]
737
  where grp = Container.find groupId gl
738
        gname = Group.name grp
739
        pol = allocPolicyToRaw (Group.allocPolicy grp)
740

    
741
-- | From a list of possibly bad and possibly empty solutions, filter
742
-- only the groups with a valid result. Note that the result will be
743
-- reversed compared to the original list.
744
filterMGResults :: Group.List
745
                -> [(Gdx, Result AllocSolution)]
746
                -> [(Gdx, AllocSolution)]
747
filterMGResults gl = foldl' fn []
748
  where unallocable = not . Group.isAllocable . flip Container.find gl
749
        fn accu (gdx, rasol) =
750
          case rasol of
751
            Bad _ -> accu
752
            Ok sol | isNothing (asSolution sol) -> accu
753
                   | unallocable gdx -> accu
754
                   | otherwise -> (gdx, sol):accu
755

    
756
-- | Sort multigroup results based on policy and score.
757
sortMGResults :: Group.List
758
             -> [(Gdx, AllocSolution)]
759
             -> [(Gdx, AllocSolution)]
760
sortMGResults gl sols =
761
  let extractScore (_, _, _, x) = x
762
      solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
763
                             (extractScore . fromJust . asSolution) sol)
764
  in sortBy (comparing solScore) sols
765

    
766
-- | Finds the best group for an instance on a multi-group cluster.
767
--
768
-- Only solutions in @preferred@ and @last_resort@ groups will be
769
-- accepted as valid, and additionally if the allowed groups parameter
770
-- is not null then allocation will only be run for those group
771
-- indices.
772
findBestAllocGroup :: Group.List           -- ^ The group list
773
                   -> Node.List            -- ^ The node list
774
                   -> Instance.List        -- ^ The instance list
775
                   -> Maybe [Gdx]          -- ^ The allowed groups
776
                   -> Instance.Instance    -- ^ The instance to allocate
777
                   -> Int                  -- ^ Required number of nodes
778
                   -> Result (Gdx, AllocSolution, [String])
779
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
780
  let groups = splitCluster mgnl mgil
781
      groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
782
                allowed_gdxs
783
      sols = map (\(gid, (nl, il)) ->
784
                   (gid, genAllocNodes mggl nl cnt False >>=
785
                       tryAlloc nl il inst))
786
             groups'::[(Gdx, Result AllocSolution)]
787
      all_msgs = concatMap (solutionDescription mggl) sols
788
      goodSols = filterMGResults mggl sols
789
      sortedSols = sortMGResults mggl goodSols
790
  in if null sortedSols
791
       then if null groups'
792
              then Bad $ "no groups for evacuation: allowed groups was" ++
793
                     show allowed_gdxs ++ ", all groups: " ++
794
                     show (map fst groups)
795
              else Bad $ intercalate ", " all_msgs
796
       else let (final_group, final_sol) = head sortedSols
797
            in return (final_group, final_sol, all_msgs)
798

    
799
-- | Try to allocate an instance on a multi-group cluster.
800
tryMGAlloc :: Group.List           -- ^ The group list
801
           -> Node.List            -- ^ The node list
802
           -> Instance.List        -- ^ The instance list
803
           -> Instance.Instance    -- ^ The instance to allocate
804
           -> Int                  -- ^ Required number of nodes
805
           -> Result AllocSolution -- ^ Possible solution list
806
tryMGAlloc mggl mgnl mgil inst cnt = do
807
  (best_group, solution, all_msgs) <-
808
      findBestAllocGroup mggl mgnl mgil Nothing inst cnt
809
  let group_name = Group.name $ Container.find best_group mggl
810
      selmsg = "Selected group: " ++ group_name
811
  return $ solution { asLog = selmsg:all_msgs }
812

    
813
-- | Function which fails if the requested mode is change secondary.
814
--
815
-- This is useful since except DRBD, no other disk template can
816
-- execute change secondary; thus, we can just call this function
817
-- instead of always checking for secondary mode. After the call to
818
-- this function, whatever mode we have is just a primary change.
819
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
820
failOnSecondaryChange ChangeSecondary dt =
821
  fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++
822
         "' can't execute change secondary"
823
failOnSecondaryChange _ _ = return ()
824

    
825
-- | Run evacuation for a single instance.
826
--
827
-- /Note:/ this function should correctly execute both intra-group
828
-- evacuations (in all modes) and inter-group evacuations (in the
829
-- 'ChangeAll' mode). Of course, this requires that the correct list
830
-- of target nodes is passed.
831
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
832
                 -> Instance.List     -- ^ Instance list (cluster-wide)
833
                 -> EvacMode          -- ^ The evacuation mode
834
                 -> Instance.Instance -- ^ The instance to be evacuated
835
                 -> Gdx               -- ^ The group we're targetting
836
                 -> [Ndx]             -- ^ The list of available nodes
837
                                      -- for allocation
838
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
839
nodeEvacInstance _ _ mode (Instance.Instance
840
                           {Instance.diskTemplate = dt@DTDiskless}) _ _ =
841
                  failOnSecondaryChange mode dt >>
842
                  fail "Diskless relocations not implemented yet"
843

    
844
nodeEvacInstance _ _ _ (Instance.Instance
845
                        {Instance.diskTemplate = DTPlain}) _ _ =
846
                  fail "Instances of type plain cannot be relocated"
847

    
848
nodeEvacInstance _ _ _ (Instance.Instance
849
                        {Instance.diskTemplate = DTFile}) _ _ =
850
                  fail "Instances of type file cannot be relocated"
851

    
852
nodeEvacInstance _ _ mode  (Instance.Instance
853
                            {Instance.diskTemplate = dt@DTSharedFile}) _ _ =
854
                  failOnSecondaryChange mode dt >>
855
                  fail "Shared file relocations not implemented yet"
856

    
857
nodeEvacInstance _ _ mode (Instance.Instance
858
                           {Instance.diskTemplate = dt@DTBlock}) _ _ =
859
                  failOnSecondaryChange mode dt >>
860
                  fail "Block device relocations not implemented yet"
861

    
862
nodeEvacInstance nl il ChangePrimary
863
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
864
                 _ _ =
865
  do
866
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
867
    let idx = Instance.idx inst
868
        il' = Container.add idx inst' il
869
        ops = iMoveToJob nl' il' idx Failover
870
    return (nl', il', ops)
871

    
872
nodeEvacInstance nl il ChangeSecondary
873
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
874
                 gdx avail_nodes =
875
  do
876
    (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
877
                            eitherToResult $
878
                            foldl' (evacDrbdSecondaryInner nl inst gdx)
879
                            (Left "no nodes available") avail_nodes
880
    let idx = Instance.idx inst
881
        il' = Container.add idx inst' il
882
        ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx)
883
    return (nl', il', ops)
884

    
885
-- The algorithm for ChangeAll is as follows:
886
--
887
-- * generate all (primary, secondary) node pairs for the target groups
888
-- * for each pair, execute the needed moves (r:s, f, r:s) and compute
889
--   the final node list state and group score
890
-- * select the best choice via a foldl that uses the same Either
891
--   String solution as the ChangeSecondary mode
892
nodeEvacInstance nl il ChangeAll
893
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
894
                 gdx avail_nodes =
895
  do
896
    let no_nodes = Left "no nodes available"
897
        node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
898
    (nl', il', ops, _) <-
899
        annotateResult "Can't find any good nodes for relocation" $
900
        eitherToResult $
901
        foldl'
902
        (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
903
                          Bad msg ->
904
                              case accu of
905
                                Right _ -> accu
906
                                -- we don't need more details (which
907
                                -- nodes, etc.) as we only selected
908
                                -- this group if we can allocate on
909
                                -- it, hence failures will not
910
                                -- propagate out of this fold loop
911
                                Left _ -> Left $ "Allocation failed: " ++ msg
912
                          Ok result@(_, _, _, new_cv) ->
913
                              let new_accu = Right result in
914
                              case accu of
915
                                Left _ -> new_accu
916
                                Right (_, _, _, old_cv) ->
917
                                    if old_cv < new_cv
918
                                    then accu
919
                                    else new_accu
920
        ) no_nodes node_pairs
921

    
922
    return (nl', il', ops)
923

    
924
-- | Inner fold function for changing secondary of a DRBD instance.
925
--
926
-- The running solution is either a @Left String@, which means we
927
-- don't have yet a working solution, or a @Right (...)@, which
928
-- represents a valid solution; it holds the modified node list, the
929
-- modified instance (after evacuation), the score of that solution,
930
-- and the new secondary node index.
931
evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list
932
                       -> Instance.Instance -- ^ Instance being evacuated
933
                       -> Gdx -- ^ The group index of the instance
934
                       -> Either String ( Node.List
935
                                        , Instance.Instance
936
                                        , Score
937
                                        , Ndx)  -- ^ Current best solution
938
                       -> Ndx  -- ^ Node we're evaluating as new secondary
939
                       -> Either String ( Node.List
940
                                        , Instance.Instance
941
                                        , Score
942
                                        , Ndx) -- ^ New best solution
943
evacDrbdSecondaryInner nl inst gdx accu ndx =
944
  case applyMove nl inst (ReplaceSecondary ndx) of
945
    OpFail fm ->
946
      case accu of
947
        Right _ -> accu
948
        Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
949
                  " failed: " ++ show fm
950
    OpGood (nl', inst', _, _) ->
951
      let nodes = Container.elems nl'
952
          -- The fromJust below is ugly (it can fail nastily), but
953
          -- at this point we should have any internal mismatches,
954
          -- and adding a monad here would be quite involved
955
          grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
956
          new_cv = compCVNodes grpnodes
957
          new_accu = Right (nl', inst', new_cv, ndx)
958
      in case accu of
959
           Left _ -> new_accu
960
           Right (_, _, old_cv, _) ->
961
             if old_cv < new_cv
962
               then accu
963
               else new_accu
964

    
965
-- | Compute result of changing all nodes of a DRBD instance.
966
--
967
-- Given the target primary and secondary node (which might be in a
968
-- different group or not), this function will 'execute' all the
969
-- required steps and assuming all operations succceed, will return
970
-- the modified node and instance lists, the opcodes needed for this
971
-- and the new group score.
972
evacDrbdAllInner :: Node.List         -- ^ Cluster node list
973
                 -> Instance.List     -- ^ Cluster instance list
974
                 -> Instance.Instance -- ^ The instance to be moved
975
                 -> Gdx               -- ^ The target group index
976
                                      -- (which can differ from the
977
                                      -- current group of the
978
                                      -- instance)
979
                 -> (Ndx, Ndx)        -- ^ Tuple of new
980
                                      -- primary\/secondary nodes
981
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
982
evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
983
  let primary = Container.find (Instance.pNode inst) nl
984
      idx = Instance.idx inst
985
  -- if the primary is offline, then we first failover
986
  (nl1, inst1, ops1) <-
987
    if Node.offline primary
988
      then do
989
        (nl', inst', _, _) <-
990
          annotateResult "Failing over to the secondary" $
991
          opToResult $ applyMove nl inst Failover
992
        return (nl', inst', [Failover])
993
      else return (nl, inst, [])
994
  let (o1, o2, o3) = (ReplaceSecondary t_pdx,
995
                      Failover,
996
                      ReplaceSecondary t_sdx)
997
  -- we now need to execute a replace secondary to the future
998
  -- primary node
999
  (nl2, inst2, _, _) <-
1000
    annotateResult "Changing secondary to new primary" $
1001
    opToResult $
1002
    applyMove nl1 inst1 o1
1003
  let ops2 = o1:ops1
1004
  -- we now execute another failover, the primary stays fixed now
1005
  (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
1006
                        opToResult $ applyMove nl2 inst2 o2
1007
  let ops3 = o2:ops2
1008
  -- and finally another replace secondary, to the final secondary
1009
  (nl4, inst4, _, _) <-
1010
    annotateResult "Changing secondary to final secondary" $
1011
    opToResult $
1012
    applyMove nl3 inst3 o3
1013
  let ops4 = o3:ops3
1014
      il' = Container.add idx inst4 il
1015
      ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1016
  let nodes = Container.elems nl4
1017
      -- The fromJust below is ugly (it can fail nastily), but
1018
      -- at this point we should have any internal mismatches,
1019
      -- and adding a monad here would be quite involved
1020
      grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1021
      new_cv = compCVNodes grpnodes
1022
  return (nl4, il', ops, new_cv)
1023

    
1024
-- | Computes the nodes in a given group which are available for
1025
-- allocation.
1026
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1027
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
1028
                    -> Gdx            -- ^ The group for which we
1029
                                      -- query the nodes
1030
                    -> Result [Ndx]   -- ^ List of available node indices
1031
availableGroupNodes group_nodes excl_ndx gdx = do
1032
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1033
                 Ok (lookup gdx group_nodes)
1034
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1035
  return avail_nodes
1036

    
1037
-- | Updates the evac solution with the results of an instance
1038
-- evacuation.
1039
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1040
                   -> Idx
1041
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1042
                   -> (Node.List, Instance.List, EvacSolution)
1043
updateEvacSolution (nl, il, es) idx (Bad msg) =
1044
  (nl, il, es { esFailed = (idx, msg):esFailed es})
1045
updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1046
  (nl, il, es { esMoved = new_elem:esMoved es
1047
              , esOpCodes = opcodes:esOpCodes es })
1048
    where inst = Container.find idx il
1049
          new_elem = (idx,
1050
                      instancePriGroup nl inst,
1051
                      Instance.allNodes inst)
1052

    
1053
-- | Node-evacuation IAllocator mode main function.
1054
tryNodeEvac :: Group.List    -- ^ The cluster groups
1055
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
1056
            -> Instance.List -- ^ Instance list (cluster-wide)
1057
            -> EvacMode      -- ^ The evacuation mode
1058
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
1059
            -> Result (Node.List, Instance.List, EvacSolution)
1060
tryNodeEvac _ ini_nl ini_il mode idxs =
1061
  let evac_ndx = nodesToEvacuate ini_il mode idxs
1062
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1063
      excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1064
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1065
                                           (Container.elems nl))) $
1066
                  splitCluster ini_nl ini_il
1067
      (fin_nl, fin_il, esol) =
1068
        foldl' (\state@(nl, il, _) inst ->
1069
                  let gdx = instancePriGroup nl inst
1070
                      pdx = Instance.pNode inst in
1071
                  updateEvacSolution state (Instance.idx inst) $
1072
                  availableGroupNodes group_ndx
1073
                    (IntSet.insert pdx excl_ndx) gdx >>=
1074
                      nodeEvacInstance nl il mode inst gdx
1075
               )
1076
        (ini_nl, ini_il, emptyEvacSolution)
1077
        (map (`Container.find` ini_il) idxs)
1078
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1079

    
1080
-- | Change-group IAllocator mode main function.
1081
--
1082
-- This is very similar to 'tryNodeEvac', the only difference is that
1083
-- we don't choose as target group the current instance group, but
1084
-- instead:
1085
--
1086
--   1. at the start of the function, we compute which are the target
1087
--   groups; either no groups were passed in, in which case we choose
1088
--   all groups out of which we don't evacuate instance, or there were
1089
--   some groups passed, in which case we use those
1090
--
1091
--   2. for each instance, we use 'findBestAllocGroup' to choose the
1092
--   best group to hold the instance, and then we do what
1093
--   'tryNodeEvac' does, except for this group instead of the current
1094
--   instance group.
1095
--
1096
-- Note that the correct behaviour of this function relies on the
1097
-- function 'nodeEvacInstance' to be able to do correctly both
1098
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1099
tryChangeGroup :: Group.List    -- ^ The cluster groups
1100
               -> Node.List     -- ^ The node list (cluster-wide)
1101
               -> Instance.List -- ^ Instance list (cluster-wide)
1102
               -> [Gdx]         -- ^ Target groups; if empty, any
1103
                                -- groups not being evacuated
1104
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
1105
               -> Result (Node.List, Instance.List, EvacSolution)
1106
tryChangeGroup gl ini_nl ini_il gdxs idxs =
1107
  let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1108
                             flip Container.find ini_il) idxs
1109
      target_gdxs = (if null gdxs
1110
                       then Container.keys gl
1111
                       else gdxs) \\ evac_gdxs
1112
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1113
      excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1114
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1115
                                           (Container.elems nl))) $
1116
                  splitCluster ini_nl ini_il
1117
      (fin_nl, fin_il, esol) =
1118
        foldl' (\state@(nl, il, _) inst ->
1119
                  let solution = do
1120
                        let ncnt = Instance.requiredNodes $
1121
                                   Instance.diskTemplate inst
1122
                        (gdx, _, _) <- findBestAllocGroup gl nl il
1123
                                       (Just target_gdxs) inst ncnt
1124
                        av_nodes <- availableGroupNodes group_ndx
1125
                                    excl_ndx gdx
1126
                        nodeEvacInstance nl il ChangeAll inst gdx av_nodes
1127
                  in updateEvacSolution state (Instance.idx inst) solution
1128
               )
1129
        (ini_nl, ini_il, emptyEvacSolution)
1130
        (map (`Container.find` ini_il) idxs)
1131
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1132

    
1133
-- | Standard-sized allocation method.
1134
--
1135
-- This places instances of the same size on the cluster until we're
1136
-- out of space. The result will be a list of identically-sized
1137
-- instances.
1138
iterateAlloc :: AllocMethod
1139
iterateAlloc nl il limit newinst allocnodes ixes cstats =
1140
  let depth = length ixes
1141
      newname = printf "new-%d" depth::String
1142
      newidx = Container.size il
1143
      newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1144
      newlimit = fmap (flip (-) 1) limit
1145
  in case tryAlloc nl il newi2 allocnodes of
1146
       Bad s -> Bad s
1147
       Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
1148
         let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1149
         case sols3 of
1150
           Nothing -> newsol
1151
           Just (xnl, xi, _, _) ->
1152
             if limit == Just 0
1153
               then newsol
1154
               else iterateAlloc xnl (Container.add newidx xi il)
1155
                      newlimit newinst allocnodes (xi:ixes)
1156
                      (totalResources xnl:cstats)
1157

    
1158
-- | Tiered allocation method.
1159
--
1160
-- This places instances on the cluster, and decreases the spec until
1161
-- we can allocate again. The result will be a list of decreasing
1162
-- instance specs.
1163
tieredAlloc :: AllocMethod
1164
tieredAlloc nl il limit newinst allocnodes ixes cstats =
1165
  case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1166
    Bad s -> Bad s
1167
    Ok (errs, nl', il', ixes', cstats') ->
1168
      let newsol = Ok (errs, nl', il', ixes', cstats')
1169
          ixes_cnt = length ixes'
1170
          (stop, newlimit) = case limit of
1171
                               Nothing -> (False, Nothing)
1172
                               Just n -> (n <= ixes_cnt,
1173
                                            Just (n - ixes_cnt)) in
1174
      if stop then newsol else
1175
          case Instance.shrinkByType newinst . fst . last $
1176
               sortBy (comparing snd) errs of
1177
            Bad _ -> newsol
1178
            Ok newinst' -> tieredAlloc nl' il' newlimit
1179
                           newinst' allocnodes ixes' cstats'
1180

    
1181
-- * Formatting functions
1182

    
1183
-- | Given the original and final nodes, computes the relocation description.
1184
computeMoves :: Instance.Instance -- ^ The instance to be moved
1185
             -> String -- ^ The instance name
1186
             -> IMove  -- ^ The move being performed
1187
             -> String -- ^ New primary
1188
             -> String -- ^ New secondary
1189
             -> (String, [String])
1190
                -- ^ Tuple of moves and commands list; moves is containing
1191
                -- either @/f/@ for failover or @/r:name/@ for replace
1192
                -- secondary, while the command list holds gnt-instance
1193
                -- commands (without that prefix), e.g \"@failover instance1@\"
1194
computeMoves i inam mv c d =
1195
  case mv of
1196
    Failover -> ("f", [mig])
1197
    FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1198
    ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1199
    ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1200
    ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1201
  where morf = if Instance.instanceRunning i then "migrate" else "failover"
1202
        mig = printf "%s -f %s" morf inam::String
1203
        rep n = printf "replace-disks -n %s %s" n inam
1204

    
1205
-- | Converts a placement to string format.
1206
printSolutionLine :: Node.List     -- ^ The node list
1207
                  -> Instance.List -- ^ The instance list
1208
                  -> Int           -- ^ Maximum node name length
1209
                  -> Int           -- ^ Maximum instance name length
1210
                  -> Placement     -- ^ The current placement
1211
                  -> Int           -- ^ The index of the placement in
1212
                                   -- the solution
1213
                  -> (String, [String])
1214
printSolutionLine nl il nmlen imlen plc pos =
1215
  let pmlen = (2*nmlen + 1)
1216
      (i, p, s, mv, c) = plc
1217
      inst = Container.find i il
1218
      inam = Instance.alias inst
1219
      npri = Node.alias $ Container.find p nl
1220
      nsec = Node.alias $ Container.find s nl
1221
      opri = Node.alias $ Container.find (Instance.pNode inst) nl
1222
      osec = Node.alias $ Container.find (Instance.sNode inst) nl
1223
      (moves, cmds) =  computeMoves inst inam mv npri nsec
1224
      ostr = printf "%s:%s" opri osec::String
1225
      nstr = printf "%s:%s" npri nsec::String
1226
  in (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
1227
      pos imlen inam pmlen ostr
1228
      pmlen nstr c moves,
1229
      cmds)
1230

    
1231
-- | Return the instance and involved nodes in an instance move.
1232
--
1233
-- Note that the output list length can vary, and is not required nor
1234
-- guaranteed to be of any specific length.
1235
involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
1236
                               -- the instance from its index; note
1237
                               -- that this /must/ be the original
1238
                               -- instance list, so that we can
1239
                               -- retrieve the old nodes
1240
              -> Placement     -- ^ The placement we're investigating,
1241
                               -- containing the new nodes and
1242
                               -- instance index
1243
              -> [Ndx]         -- ^ Resulting list of node indices
1244
involvedNodes il plc =
1245
  let (i, np, ns, _, _) = plc
1246
      inst = Container.find i il
1247
  in nub $ [np, ns] ++ Instance.allNodes inst
1248

    
1249
-- | Inner function for splitJobs, that either appends the next job to
1250
-- the current jobset, or starts a new jobset.
1251
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1252
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1253
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1254
  | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1255
  | otherwise = ([n]:cjs, ndx)
1256

    
1257
-- | Break a list of moves into independent groups. Note that this
1258
-- will reverse the order of jobs.
1259
splitJobs :: [MoveJob] -> [JobSet]
1260
splitJobs = fst . foldl mergeJobs ([], [])
1261

    
1262
-- | Given a list of commands, prefix them with @gnt-instance@ and
1263
-- also beautify the display a little.
1264
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1265
formatJob jsn jsl (sn, (_, _, _, cmds)) =
1266
  let out =
1267
        printf "  echo job %d/%d" jsn sn:
1268
        printf "  check":
1269
        map ("  gnt-instance " ++) cmds
1270
  in if sn == 1
1271
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1272
       else out
1273

    
1274
-- | Given a list of commands, prefix them with @gnt-instance@ and
1275
-- also beautify the display a little.
1276
formatCmds :: [JobSet] -> String
1277
formatCmds =
1278
  unlines .
1279
  concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1280
                           (zip [1..] js)) .
1281
  zip [1..]
1282

    
1283
-- | Print the node list.
1284
printNodes :: Node.List -> [String] -> String
1285
printNodes nl fs =
1286
  let fields = case fs of
1287
                 [] -> Node.defaultFields
1288
                 "+":rest -> Node.defaultFields ++ rest
1289
                 _ -> fs
1290
      snl = sortBy (comparing Node.idx) (Container.elems nl)
1291
      (header, isnum) = unzip $ map Node.showHeader fields
1292
  in unlines . map ((:) ' ' .  unwords) $
1293
     formatTable (header:map (Node.list fields) snl) isnum
1294

    
1295
-- | Print the instance list.
1296
printInsts :: Node.List -> Instance.List -> String
1297
printInsts nl il =
1298
  let sil = sortBy (comparing Instance.idx) (Container.elems il)
1299
      helper inst = [ if Instance.instanceRunning inst then "R" else " "
1300
                    , Instance.name inst
1301
                    , Container.nameOf nl (Instance.pNode inst)
1302
                    , let sdx = Instance.sNode inst
1303
                      in if sdx == Node.noSecondary
1304
                           then  ""
1305
                           else Container.nameOf nl sdx
1306
                    , if Instance.autoBalance inst then "Y" else "N"
1307
                    , printf "%3d" $ Instance.vcpus inst
1308
                    , printf "%5d" $ Instance.mem inst
1309
                    , printf "%5d" $ Instance.dsk inst `div` 1024
1310
                    , printf "%5.3f" lC
1311
                    , printf "%5.3f" lM
1312
                    , printf "%5.3f" lD
1313
                    , printf "%5.3f" lN
1314
                    ]
1315
          where DynUtil lC lM lD lN = Instance.util inst
1316
      header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1317
               , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1318
      isnum = False:False:False:False:False:repeat True
1319
  in unlines . map ((:) ' ' . unwords) $
1320
     formatTable (header:map helper sil) isnum
1321

    
1322
-- | Shows statistics for a given node list.
1323
printStats :: Node.List -> String
1324
printStats nl =
1325
  let dcvs = compDetailedCV $ Container.elems nl
1326
      (weights, names) = unzip detailedCVInfo
1327
      hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1328
      formatted = map (\(w, header, val) ->
1329
                         printf "%s=%.8f(x%.2f)" header val w::String) hd
1330
  in intercalate ", " formatted
1331

    
1332
-- | Convert a placement into a list of OpCodes (basically a job).
1333
iMoveToJob :: Node.List        -- ^ The node list; only used for node
1334
                               -- names, so any version is good
1335
                               -- (before or after the operation)
1336
           -> Instance.List    -- ^ The instance list; also used for
1337
                               -- names only
1338
           -> Idx              -- ^ The index of the instance being
1339
                               -- moved
1340
           -> IMove            -- ^ The actual move to be described
1341
           -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
1342
                               -- the given move
1343
iMoveToJob nl il idx move =
1344
  let inst = Container.find idx il
1345
      iname = Instance.name inst
1346
      lookNode  = Just . Container.nameOf nl
1347
      opF = OpCodes.OpInstanceMigrate iname True False True Nothing
1348
      opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1349
              OpCodes.ReplaceNewSecondary [] Nothing
1350
  in case move of
1351
       Failover -> [ opF ]
1352
       ReplacePrimary np -> [ opF, opR np, opF ]
1353
       ReplaceSecondary ns -> [ opR ns ]
1354
       ReplaceAndFailover np -> [ opR np, opF ]
1355
       FailoverAndReplace ns -> [ opF, opR ns ]
1356

    
1357
-- * Node group functions
1358

    
1359
-- | Computes the group of an instance.
1360
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1361
instanceGroup nl i =
1362
  let sidx = Instance.sNode i
1363
      pnode = Container.find (Instance.pNode i) nl
1364
      snode = if sidx == Node.noSecondary
1365
              then pnode
1366
              else Container.find sidx nl
1367
      pgroup = Node.group pnode
1368
      sgroup = Node.group snode
1369
  in if pgroup /= sgroup
1370
       then fail ("Instance placed accross two node groups, primary " ++
1371
                  show pgroup ++ ", secondary " ++ show sgroup)
1372
       else return pgroup
1373

    
1374
-- | Computes the group of an instance per the primary node.
1375
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1376
instancePriGroup nl i =
1377
  let pnode = Container.find (Instance.pNode i) nl
1378
  in  Node.group pnode
1379

    
1380
-- | Compute the list of badly allocated instances (split across node
1381
-- groups).
1382
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1383
findSplitInstances nl =
1384
  filter (not . isOk . instanceGroup nl) . Container.elems
1385

    
1386
-- | Splits a cluster into the component node groups.
1387
splitCluster :: Node.List -> Instance.List ->
1388
                [(Gdx, (Node.List, Instance.List))]
1389
splitCluster nl il =
1390
  let ngroups = Node.computeGroups (Container.elems nl)
1391
  in map (\(guuid, nodes) ->
1392
           let nidxs = map Node.idx nodes
1393
               nodes' = zip nidxs nodes
1394
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1395
           in (guuid, (Container.fromList nodes', instances))) ngroups
1396

    
1397
-- | Compute the list of nodes that are to be evacuated, given a list
1398
-- of instances and an evacuation mode.
1399
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1400
                -> EvacMode      -- ^ The evacuation mode we're using
1401
                -> [Idx]         -- ^ List of instance indices being evacuated
1402
                -> IntSet.IntSet -- ^ Set of node indices
1403
nodesToEvacuate il mode =
1404
  IntSet.delete Node.noSecondary .
1405
  foldl' (\ns idx ->
1406
            let i = Container.find idx il
1407
                pdx = Instance.pNode i
1408
                sdx = Instance.sNode i
1409
                dt = Instance.diskTemplate i
1410
                withSecondary = case dt of
1411
                                  DTDrbd8 -> IntSet.insert sdx ns
1412
                                  _ -> ns
1413
            in case mode of
1414
                 ChangePrimary   -> IntSet.insert pdx ns
1415
                 ChangeSecondary -> withSecondary
1416
                 ChangeAll       -> IntSet.insert pdx withSecondary
1417
         ) IntSet.empty