Statistics
| Branch: | Tag: | Revision:

root / htools / Ganeti / HTools / Cluster.hs @ 2922d2c5

History | View | Annotate | Download (61.6 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011, 2012 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
  (
31
    -- * Types
32
    AllocSolution(..)
33
  , EvacSolution(..)
34
  , Table(..)
35
  , CStats(..)
36
  , AllocResult
37
  , AllocMethod
38
  -- * Generic functions
39
  , totalResources
40
  , computeAllocationDelta
41
  -- * First phase functions
42
  , computeBadItems
43
  -- * Second phase functions
44
  , printSolutionLine
45
  , formatCmds
46
  , involvedNodes
47
  , splitJobs
48
  -- * Display functions
49
  , printNodes
50
  , printInsts
51
  -- * Balacing functions
52
  , checkMove
53
  , doNextBalance
54
  , tryBalance
55
  , compCV
56
  , compCVNodes
57
  , compDetailedCV
58
  , printStats
59
  , iMoveToJob
60
  -- * IAllocator functions
61
  , genAllocNodes
62
  , tryAlloc
63
  , tryMGAlloc
64
  , tryNodeEvac
65
  , tryChangeGroup
66
  , collapseFailures
67
  -- * Allocation functions
68
  , iterateAlloc
69
  , tieredAlloc
70
  -- * Node group functions
71
  , instanceGroup
72
  , findSplitInstances
73
  , splitCluster
74
  ) where
75

    
76
import qualified Data.IntSet as IntSet
77
import Data.List
78
import Data.Maybe (fromJust, isNothing)
79
import Data.Ord (comparing)
80
import Text.Printf (printf)
81

    
82
import qualified Ganeti.HTools.Container as Container
83
import qualified Ganeti.HTools.Instance as Instance
84
import qualified Ganeti.HTools.Node as Node
85
import qualified Ganeti.HTools.Group as Group
86
import Ganeti.HTools.Types
87
import Ganeti.HTools.Utils
88
import Ganeti.HTools.Compat
89
import qualified Ganeti.OpCodes as OpCodes
90

    
91
-- * Types
92

    
93
-- | Allocation\/relocation solution.
94
data AllocSolution = AllocSolution
95
  { asFailures :: [FailMode]              -- ^ Failure counts
96
  , asAllocs   :: Int                     -- ^ Good allocation count
97
  , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
98
  , asLog      :: [String]                -- ^ Informational messages
99
  }
100

    
101
-- | Node evacuation/group change iallocator result type. This result
102
-- type consists of actual opcodes (a restricted subset) that are
103
-- transmitted back to Ganeti.
104
data EvacSolution = EvacSolution
105
  { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
106
  , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
107
                                      -- relocated
108
  , esOpCodes :: [[OpCodes.OpCode]]   -- ^ List of jobs
109
  } deriving (Show)
110

    
111
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
112
type AllocResult = (FailStats, Node.List, Instance.List,
113
                    [Instance.Instance], [CStats])
114

    
115
-- | A type denoting the valid allocation mode/pairs.
116
--
117
-- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
118
-- for a two-node allocation, this will be a @Right [('Ndx',
119
-- ['Ndx'])]@. In the latter case, the list is basically an
120
-- association list, grouped by primary node and holding the potential
121
-- secondary nodes in the sub-list.
122
type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
123

    
124
-- | The empty solution we start with when computing allocations.
125
emptyAllocSolution :: AllocSolution
126
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
127
                                   , asSolution = Nothing, asLog = [] }
128

    
129
-- | The empty evac solution.
130
emptyEvacSolution :: EvacSolution
131
emptyEvacSolution = EvacSolution { esMoved = []
132
                                 , esFailed = []
133
                                 , esOpCodes = []
134
                                 }
135

    
136
-- | The complete state for the balancing solution.
137
data Table = Table Node.List Instance.List Score [Placement]
138
             deriving (Show, Read)
139

    
140
-- | Cluster statistics data type.
141
data CStats = CStats
142
  { csFmem :: Integer -- ^ Cluster free mem
143
  , csFdsk :: Integer -- ^ Cluster free disk
144
  , csAmem :: Integer -- ^ Cluster allocatable mem
145
  , csAdsk :: Integer -- ^ Cluster allocatable disk
146
  , csAcpu :: Integer -- ^ Cluster allocatable cpus
147
  , csMmem :: Integer -- ^ Max node allocatable mem
148
  , csMdsk :: Integer -- ^ Max node allocatable disk
149
  , csMcpu :: Integer -- ^ Max node allocatable cpu
150
  , csImem :: Integer -- ^ Instance used mem
151
  , csIdsk :: Integer -- ^ Instance used disk
152
  , csIcpu :: Integer -- ^ Instance used cpu
153
  , csTmem :: Double  -- ^ Cluster total mem
154
  , csTdsk :: Double  -- ^ Cluster total disk
155
  , csTcpu :: Double  -- ^ Cluster total cpus
156
  , csVcpu :: Integer -- ^ Cluster total virtual cpus
157
  , csNcpu :: Double  -- ^ Equivalent to 'csIcpu' but in terms of
158
                      -- physical CPUs, i.e. normalised used phys CPUs
159
  , csXmem :: Integer -- ^ Unnacounted for mem
160
  , csNmem :: Integer -- ^ Node own memory
161
  , csScore :: Score  -- ^ The cluster score
162
  , csNinst :: Int    -- ^ The total number of instances
163
  } deriving (Show, Read)
164

    
165
-- | A simple type for allocation functions.
166
type AllocMethod =  Node.List           -- ^ Node list
167
                 -> Instance.List       -- ^ Instance list
168
                 -> Maybe Int           -- ^ Optional allocation limit
169
                 -> Instance.Instance   -- ^ Instance spec for allocation
170
                 -> AllocNodes          -- ^ Which nodes we should allocate on
171
                 -> [Instance.Instance] -- ^ Allocated instances
172
                 -> [CStats]            -- ^ Running cluster stats
173
                 -> Result AllocResult  -- ^ Allocation result
174

    
175
-- * Utility functions
176

    
177
-- | Verifies the N+1 status and return the affected nodes.
178
verifyN1 :: [Node.Node] -> [Node.Node]
179
verifyN1 = filter Node.failN1
180

    
181
{-| Computes the pair of bad nodes and instances.
182

    
183
The bad node list is computed via a simple 'verifyN1' check, and the
184
bad instance list is the list of primary and secondary instances of
185
those nodes.
186

    
187
-}
188
computeBadItems :: Node.List -> Instance.List ->
189
                   ([Node.Node], [Instance.Instance])
190
computeBadItems nl il =
191
  let bad_nodes = verifyN1 $ getOnline nl
192
      bad_instances = map (`Container.find` il) .
193
                      sort . nub $
194
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
195
  in
196
    (bad_nodes, bad_instances)
197

    
198
-- | Extracts the node pairs for an instance. This can fail if the
199
-- instance is single-homed. FIXME: this needs to be improved,
200
-- together with the general enhancement for handling non-DRBD moves.
201
instanceNodes :: Node.List -> Instance.Instance ->
202
                 (Ndx, Ndx, Node.Node, Node.Node)
203
instanceNodes nl inst =
204
  let old_pdx = Instance.pNode inst
205
      old_sdx = Instance.sNode inst
206
      old_p = Container.find old_pdx nl
207
      old_s = Container.find old_sdx nl
208
  in (old_pdx, old_sdx, old_p, old_s)
209

    
210
-- | Zero-initializer for the CStats type.
211
emptyCStats :: CStats
212
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
213

    
214
-- | Update stats with data from a new node.
215
updateCStats :: CStats -> Node.Node -> CStats
216
updateCStats cs node =
217
  let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
218
               csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
219
               csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
220
               csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
221
               csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
222
               csVcpu = x_vcpu, csNcpu = x_ncpu,
223
               csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
224
             }
225
        = cs
226
      inc_amem = Node.fMem node - Node.rMem node
227
      inc_amem' = if inc_amem > 0 then inc_amem else 0
228
      inc_adsk = Node.availDisk node
229
      inc_imem = truncate (Node.tMem node) - Node.nMem node
230
                 - Node.xMem node - Node.fMem node
231
      inc_icpu = Node.uCpu node
232
      inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
233
      inc_vcpu = Node.hiCpu node
234
      inc_acpu = Node.availCpu node
235
      inc_ncpu = fromIntegral (Node.uCpu node) /
236
                 iPolicyVcpuRatio (Node.iPolicy node)
237
  in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
238
        , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
239
        , csAmem = x_amem + fromIntegral inc_amem'
240
        , csAdsk = x_adsk + fromIntegral inc_adsk
241
        , csAcpu = x_acpu + fromIntegral inc_acpu
242
        , csMmem = max x_mmem (fromIntegral inc_amem')
243
        , csMdsk = max x_mdsk (fromIntegral inc_adsk)
244
        , csMcpu = max x_mcpu (fromIntegral inc_acpu)
245
        , csImem = x_imem + fromIntegral inc_imem
246
        , csIdsk = x_idsk + fromIntegral inc_idsk
247
        , csIcpu = x_icpu + fromIntegral inc_icpu
248
        , csTmem = x_tmem + Node.tMem node
249
        , csTdsk = x_tdsk + Node.tDsk node
250
        , csTcpu = x_tcpu + Node.tCpu node
251
        , csVcpu = x_vcpu + fromIntegral inc_vcpu
252
        , csNcpu = x_ncpu + inc_ncpu
253
        , csXmem = x_xmem + fromIntegral (Node.xMem node)
254
        , csNmem = x_nmem + fromIntegral (Node.nMem node)
255
        , csNinst = x_ninst + length (Node.pList node)
256
        }
257

    
258
-- | Compute the total free disk and memory in the cluster.
259
totalResources :: Node.List -> CStats
260
totalResources nl =
261
  let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
262
  in cs { csScore = compCV nl }
263

    
264
-- | Compute the delta between two cluster state.
265
--
266
-- This is used when doing allocations, to understand better the
267
-- available cluster resources. The return value is a triple of the
268
-- current used values, the delta that was still allocated, and what
269
-- was left unallocated.
270
computeAllocationDelta :: CStats -> CStats -> AllocStats
271
computeAllocationDelta cini cfin =
272
  let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu,
273
              csNcpu = i_ncpu } = cini
274
      CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
275
              csTmem = t_mem, csTdsk = t_dsk, csVcpu = f_vcpu,
276
              csNcpu = f_ncpu, csTcpu = f_tcpu } = cfin
277
      rini = AllocInfo { allocInfoVCpus = fromIntegral i_icpu
278
                       , allocInfoNCpus = i_ncpu
279
                       , allocInfoMem   = fromIntegral i_imem
280
                       , allocInfoDisk  = fromIntegral i_idsk
281
                       }
282
      rfin = AllocInfo { allocInfoVCpus = fromIntegral (f_icpu - i_icpu)
283
                       , allocInfoNCpus = f_ncpu - i_ncpu
284
                       , allocInfoMem   = fromIntegral (f_imem - i_imem)
285
                       , allocInfoDisk  = fromIntegral (f_idsk - i_idsk)
286
                       }
287
      runa = AllocInfo { allocInfoVCpus = fromIntegral (f_vcpu - f_icpu)
288
                       , allocInfoNCpus = f_tcpu - f_ncpu
289
                       , allocInfoMem   = truncate t_mem - fromIntegral f_imem
290
                       , allocInfoDisk  = truncate t_dsk - fromIntegral f_idsk
291
                       }
292
  in (rini, rfin, runa)
293

    
294
-- | The names and weights of the individual elements in the CV list.
295
detailedCVInfo :: [(Double, String)]
296
detailedCVInfo = [ (1,  "free_mem_cv")
297
                 , (1,  "free_disk_cv")
298
                 , (1,  "n1_cnt")
299
                 , (1,  "reserved_mem_cv")
300
                 , (4,  "offline_all_cnt")
301
                 , (16, "offline_pri_cnt")
302
                 , (1,  "vcpu_ratio_cv")
303
                 , (1,  "cpu_load_cv")
304
                 , (1,  "mem_load_cv")
305
                 , (1,  "disk_load_cv")
306
                 , (1,  "net_load_cv")
307
                 , (2,  "pri_tags_score")
308
                 ]
309

    
310
-- | Holds the weights used by 'compCVNodes' for each metric.
311
detailedCVWeights :: [Double]
312
detailedCVWeights = map fst detailedCVInfo
313

    
314
-- | Compute the mem and disk covariance.
315
compDetailedCV :: [Node.Node] -> [Double]
316
compDetailedCV all_nodes =
317
  let (offline, nodes) = partition Node.offline all_nodes
318
      mem_l = map Node.pMem nodes
319
      dsk_l = map Node.pDsk nodes
320
      -- metric: memory covariance
321
      mem_cv = stdDev mem_l
322
      -- metric: disk covariance
323
      dsk_cv = stdDev dsk_l
324
      -- metric: count of instances living on N1 failing nodes
325
      n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
326
                                                 length (Node.pList n)) .
327
                 filter Node.failN1 $ nodes :: Double
328
      res_l = map Node.pRem nodes
329
      -- metric: reserved memory covariance
330
      res_cv = stdDev res_l
331
      -- offline instances metrics
332
      offline_ipri = sum . map (length . Node.pList) $ offline
333
      offline_isec = sum . map (length . Node.sList) $ offline
334
      -- metric: count of instances on offline nodes
335
      off_score = fromIntegral (offline_ipri + offline_isec)::Double
336
      -- metric: count of primary instances on offline nodes (this
337
      -- helps with evacuation/failover of primary instances on
338
      -- 2-node clusters with one node offline)
339
      off_pri_score = fromIntegral offline_ipri::Double
340
      cpu_l = map Node.pCpu nodes
341
      -- metric: covariance of vcpu/pcpu ratio
342
      cpu_cv = stdDev cpu_l
343
      -- metrics: covariance of cpu, memory, disk and network load
344
      (c_load, m_load, d_load, n_load) =
345
        unzip4 $ map (\n ->
346
                      let DynUtil c1 m1 d1 n1 = Node.utilLoad n
347
                          DynUtil c2 m2 d2 n2 = Node.utilPool n
348
                      in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes
349
      -- metric: conflicting instance count
350
      pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
351
      pri_tags_score = fromIntegral pri_tags_inst::Double
352
  in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
353
     , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
354
     , pri_tags_score ]
355

    
356
-- | Compute the /total/ variance.
357
compCVNodes :: [Node.Node] -> Double
358
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
359

    
360
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
361
compCV :: Node.List -> Double
362
compCV = compCVNodes . Container.elems
363

    
364
-- | Compute online nodes from a 'Node.List'.
365
getOnline :: Node.List -> [Node.Node]
366
getOnline = filter (not . Node.offline) . Container.elems
367

    
368
-- * Balancing functions
369

    
370
-- | Compute best table. Note that the ordering of the arguments is important.
371
compareTables :: Table -> Table -> Table
372
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
373
  if a_cv > b_cv then b else a
374

    
375
-- | Applies an instance move to a given node list and instance.
376
applyMove :: Node.List -> Instance.Instance
377
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
378
-- Failover (f)
379
applyMove nl inst Failover =
380
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
381
      int_p = Node.removePri old_p inst
382
      int_s = Node.removeSec old_s inst
383
      new_nl = do -- Maybe monad
384
        new_p <- Node.addPriEx (Node.offline old_p) int_s inst
385
        new_s <- Node.addSec int_p inst old_sdx
386
        let new_inst = Instance.setBoth inst old_sdx old_pdx
387
        return (Container.addTwo old_pdx new_s old_sdx new_p nl,
388
                new_inst, old_sdx, old_pdx)
389
  in new_nl
390

    
391
-- Replace the primary (f:, r:np, f)
392
applyMove nl inst (ReplacePrimary new_pdx) =
393
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
394
      tgt_n = Container.find new_pdx nl
395
      int_p = Node.removePri old_p inst
396
      int_s = Node.removeSec old_s inst
397
      force_p = Node.offline old_p
398
      new_nl = do -- Maybe monad
399
                  -- check that the current secondary can host the instance
400
                  -- during the migration
401
        tmp_s <- Node.addPriEx force_p int_s inst
402
        let tmp_s' = Node.removePri tmp_s inst
403
        new_p <- Node.addPriEx force_p tgt_n inst
404
        new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
405
        let new_inst = Instance.setPri inst new_pdx
406
        return (Container.add new_pdx new_p $
407
                Container.addTwo old_pdx int_p old_sdx new_s nl,
408
                new_inst, new_pdx, old_sdx)
409
  in new_nl
410

    
411
-- Replace the secondary (r:ns)
412
applyMove nl inst (ReplaceSecondary new_sdx) =
413
  let old_pdx = Instance.pNode inst
414
      old_sdx = Instance.sNode inst
415
      old_s = Container.find old_sdx nl
416
      tgt_n = Container.find new_sdx nl
417
      int_s = Node.removeSec old_s inst
418
      force_s = Node.offline old_s
419
      new_inst = Instance.setSec inst new_sdx
420
      new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
421
               \new_s -> return (Container.addTwo new_sdx
422
                                 new_s old_sdx int_s nl,
423
                                 new_inst, old_pdx, new_sdx)
424
  in new_nl
425

    
426
-- Replace the secondary and failover (r:np, f)
427
applyMove nl inst (ReplaceAndFailover new_pdx) =
428
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
429
      tgt_n = Container.find new_pdx nl
430
      int_p = Node.removePri old_p inst
431
      int_s = Node.removeSec old_s inst
432
      force_s = Node.offline old_s
433
      new_nl = do -- Maybe monad
434
        new_p <- Node.addPri tgt_n inst
435
        new_s <- Node.addSecEx force_s int_p inst new_pdx
436
        let new_inst = Instance.setBoth inst new_pdx old_pdx
437
        return (Container.add new_pdx new_p $
438
                Container.addTwo old_pdx new_s old_sdx int_s nl,
439
                new_inst, new_pdx, old_pdx)
440
  in new_nl
441

    
442
-- Failver and replace the secondary (f, r:ns)
443
applyMove nl inst (FailoverAndReplace new_sdx) =
444
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
445
      tgt_n = Container.find new_sdx nl
446
      int_p = Node.removePri old_p inst
447
      int_s = Node.removeSec old_s inst
448
      force_p = Node.offline old_p
449
      new_nl = do -- Maybe monad
450
        new_p <- Node.addPriEx force_p int_s inst
451
        new_s <- Node.addSecEx force_p tgt_n inst old_sdx
452
        let new_inst = Instance.setBoth inst old_sdx new_sdx
453
        return (Container.add new_sdx new_s $
454
                Container.addTwo old_sdx new_p old_pdx int_p nl,
455
                new_inst, old_sdx, new_sdx)
456
  in new_nl
457

    
458
-- | Tries to allocate an instance on one given node.
459
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
460
                 -> OpResult Node.AllocElement
461
allocateOnSingle nl inst new_pdx =
462
  let p = Container.find new_pdx nl
463
      new_inst = Instance.setBoth inst new_pdx Node.noSecondary
464
  in do
465
    Instance.instMatchesPolicy inst (Node.iPolicy p)
466
    new_p <- Node.addPri p inst
467
    let new_nl = Container.add new_pdx new_p nl
468
        new_score = compCV nl
469
    return (new_nl, new_inst, [new_p], new_score)
470

    
471
-- | Tries to allocate an instance on a given pair of nodes.
472
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
473
               -> OpResult Node.AllocElement
474
allocateOnPair nl inst new_pdx new_sdx =
475
  let tgt_p = Container.find new_pdx nl
476
      tgt_s = Container.find new_sdx nl
477
  in do
478
    Instance.instMatchesPolicy inst (Node.iPolicy tgt_p)
479
    new_p <- Node.addPri tgt_p inst
480
    new_s <- Node.addSec tgt_s inst new_pdx
481
    let new_inst = Instance.setBoth inst new_pdx new_sdx
482
        new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
483
    return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
484

    
485
-- | Tries to perform an instance move and returns the best table
486
-- between the original one and the new one.
487
checkSingleStep :: Table -- ^ The original table
488
                -> Instance.Instance -- ^ The instance to move
489
                -> Table -- ^ The current best table
490
                -> IMove -- ^ The move to apply
491
                -> Table -- ^ The final best table
492
checkSingleStep ini_tbl target cur_tbl move =
493
  let Table ini_nl ini_il _ ini_plc = ini_tbl
494
      tmp_resu = applyMove ini_nl target move
495
  in case tmp_resu of
496
       OpFail _ -> cur_tbl
497
       OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
498
         let tgt_idx = Instance.idx target
499
             upd_cvar = compCV upd_nl
500
             upd_il = Container.add tgt_idx new_inst ini_il
501
             upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
502
             upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
503
         in compareTables cur_tbl upd_tbl
504

    
505
-- | Given the status of the current secondary as a valid new node and
506
-- the current candidate target node, generate the possible moves for
507
-- a instance.
508
possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
509
              -> Bool      -- ^ Whether we can change the primary node
510
              -> Ndx       -- ^ Target node candidate
511
              -> [IMove]   -- ^ List of valid result moves
512

    
513
possibleMoves _ False tdx =
514
  [ReplaceSecondary tdx]
515

    
516
possibleMoves True True tdx =
517
  [ ReplaceSecondary tdx
518
  , ReplaceAndFailover tdx
519
  , ReplacePrimary tdx
520
  , FailoverAndReplace tdx
521
  ]
522

    
523
possibleMoves False True tdx =
524
  [ ReplaceSecondary tdx
525
  , ReplaceAndFailover tdx
526
  ]
527

    
528
-- | Compute the best move for a given instance.
529
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
530
                  -> Bool              -- ^ Whether disk moves are allowed
531
                  -> Bool              -- ^ Whether instance moves are allowed
532
                  -> Table             -- ^ Original table
533
                  -> Instance.Instance -- ^ Instance to move
534
                  -> Table             -- ^ Best new table for this instance
535
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
536
  let opdx = Instance.pNode target
537
      osdx = Instance.sNode target
538
      bad_nodes = [opdx, osdx]
539
      nodes = filter (`notElem` bad_nodes) nodes_idx
540
      use_secondary = elem osdx nodes_idx && inst_moves
541
      aft_failover = if use_secondary -- if allowed to failover
542
                       then checkSingleStep ini_tbl target ini_tbl Failover
543
                       else ini_tbl
544
      all_moves = if disk_moves
545
                    then concatMap
546
                           (possibleMoves use_secondary inst_moves) nodes
547
                    else []
548
    in
549
      -- iterate over the possible nodes for this instance
550
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
551

    
552
-- | Compute the best next move.
553
checkMove :: [Ndx]               -- ^ Allowed target node indices
554
          -> Bool                -- ^ Whether disk moves are allowed
555
          -> Bool                -- ^ Whether instance moves are allowed
556
          -> Table               -- ^ The current solution
557
          -> [Instance.Instance] -- ^ List of instances still to move
558
          -> Table               -- ^ The new solution
559
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
560
  let Table _ _ _ ini_plc = ini_tbl
561
      -- we're using rwhnf from the Control.Parallel.Strategies
562
      -- package; we don't need to use rnf as that would force too
563
      -- much evaluation in single-threaded cases, and in
564
      -- multi-threaded case the weak head normal form is enough to
565
      -- spark the evaluation
566
      tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
567
                             inst_moves ini_tbl)
568
               victims
569
      -- iterate over all instances, computing the best move
570
      best_tbl = foldl' compareTables ini_tbl tables
571
      Table _ _ _ best_plc = best_tbl
572
  in if length best_plc == length ini_plc
573
       then ini_tbl -- no advancement
574
       else best_tbl
575

    
576
-- | Check if we are allowed to go deeper in the balancing.
577
doNextBalance :: Table     -- ^ The starting table
578
              -> Int       -- ^ Remaining length
579
              -> Score     -- ^ Score at which to stop
580
              -> Bool      -- ^ The resulting table and commands
581
doNextBalance ini_tbl max_rounds min_score =
582
  let Table _ _ ini_cv ini_plc = ini_tbl
583
      ini_plc_len = length ini_plc
584
  in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
585

    
586
-- | Run a balance move.
587
tryBalance :: Table       -- ^ The starting table
588
           -> Bool        -- ^ Allow disk moves
589
           -> Bool        -- ^ Allow instance moves
590
           -> Bool        -- ^ Only evacuate moves
591
           -> Score       -- ^ Min gain threshold
592
           -> Score       -- ^ Min gain
593
           -> Maybe Table -- ^ The resulting table and commands
594
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
595
    let Table ini_nl ini_il ini_cv _ = ini_tbl
596
        all_inst = Container.elems ini_il
597
        all_inst' = if evac_mode
598
                    then let bad_nodes = map Node.idx . filter Node.offline $
599
                                         Container.elems ini_nl
600
                         in filter (any (`elem` bad_nodes) . Instance.allNodes)
601
                            all_inst
602
                    else all_inst
603
        reloc_inst = filter Instance.movable all_inst'
604
        node_idx = map Node.idx . filter (not . Node.offline) $
605
                   Container.elems ini_nl
606
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
607
        (Table _ _ fin_cv _) = fin_tbl
608
    in
609
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
610
      then Just fin_tbl -- this round made success, return the new table
611
      else Nothing
612

    
613
-- * Allocation functions
614

    
615
-- | Build failure stats out of a list of failures.
616
collapseFailures :: [FailMode] -> FailStats
617
collapseFailures flst =
618
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
619
            [minBound..maxBound]
620

    
621
-- | Compares two Maybe AllocElement and chooses the besst score.
622
bestAllocElement :: Maybe Node.AllocElement
623
                 -> Maybe Node.AllocElement
624
                 -> Maybe Node.AllocElement
625
bestAllocElement a Nothing = a
626
bestAllocElement Nothing b = b
627
bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
628
  if ascore < bscore then a else b
629

    
630
-- | Update current Allocation solution and failure stats with new
631
-- elements.
632
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
633
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
634

    
635
concatAllocs as (OpGood ns) =
636
  let -- Choose the old or new solution, based on the cluster score
637
    cntok = asAllocs as
638
    osols = asSolution as
639
    nsols = bestAllocElement osols (Just ns)
640
    nsuc = cntok + 1
641
    -- Note: we force evaluation of nsols here in order to keep the
642
    -- memory profile low - we know that we will need nsols for sure
643
    -- in the next cycle, so we force evaluation of nsols, since the
644
    -- foldl' in the caller will only evaluate the tuple, but not the
645
    -- elements of the tuple
646
  in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
647

    
648
-- | Sums two 'AllocSolution' structures.
649
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
650
sumAllocs (AllocSolution aFails aAllocs aSols aLog)
651
          (AllocSolution bFails bAllocs bSols bLog) =
652
  -- note: we add b first, since usually it will be smaller; when
653
  -- fold'ing, a will grow and grow whereas b is the per-group
654
  -- result, hence smaller
655
  let nFails  = bFails ++ aFails
656
      nAllocs = aAllocs + bAllocs
657
      nSols   = bestAllocElement aSols bSols
658
      nLog    = bLog ++ aLog
659
  in AllocSolution nFails nAllocs nSols nLog
660

    
661
-- | Given a solution, generates a reasonable description for it.
662
describeSolution :: AllocSolution -> String
663
describeSolution as =
664
  let fcnt = asFailures as
665
      sols = asSolution as
666
      freasons =
667
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
668
        filter ((> 0) . snd) . collapseFailures $ fcnt
669
  in case sols of
670
     Nothing -> "No valid allocation solutions, failure reasons: " ++
671
                (if null fcnt then "unknown reasons" else freasons)
672
     Just (_, _, nodes, cv) ->
673
         printf ("score: %.8f, successes %d, failures %d (%s)" ++
674
                 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
675
               (intercalate "/" . map Node.name $ nodes)
676

    
677
-- | Annotates a solution with the appropriate string.
678
annotateSolution :: AllocSolution -> AllocSolution
679
annotateSolution as = as { asLog = describeSolution as : asLog as }
680

    
681
-- | Reverses an evacuation solution.
682
--
683
-- Rationale: we always concat the results to the top of the lists, so
684
-- for proper jobset execution, we should reverse all lists.
685
reverseEvacSolution :: EvacSolution -> EvacSolution
686
reverseEvacSolution (EvacSolution f m o) =
687
  EvacSolution (reverse f) (reverse m) (reverse o)
688

    
689
-- | Generate the valid node allocation singles or pairs for a new instance.
690
genAllocNodes :: Group.List        -- ^ Group list
691
              -> Node.List         -- ^ The node map
692
              -> Int               -- ^ The number of nodes required
693
              -> Bool              -- ^ Whether to drop or not
694
                                   -- unallocable nodes
695
              -> Result AllocNodes -- ^ The (monadic) result
696
genAllocNodes gl nl count drop_unalloc =
697
  let filter_fn = if drop_unalloc
698
                    then filter (Group.isAllocable .
699
                                 flip Container.find gl . Node.group)
700
                    else id
701
      all_nodes = filter_fn $ getOnline nl
702
      all_pairs = [(Node.idx p,
703
                    [Node.idx s | s <- all_nodes,
704
                                       Node.idx p /= Node.idx s,
705
                                       Node.group p == Node.group s]) |
706
                   p <- all_nodes]
707
  in case count of
708
       1 -> Ok (Left (map Node.idx all_nodes))
709
       2 -> Ok (Right (filter (not . null . snd) all_pairs))
710
       _ -> Bad "Unsupported number of nodes, only one or two  supported"
711

    
712
-- | Try to allocate an instance on the cluster.
713
tryAlloc :: (Monad m) =>
714
            Node.List         -- ^ The node list
715
         -> Instance.List     -- ^ The instance list
716
         -> Instance.Instance -- ^ The instance to allocate
717
         -> AllocNodes        -- ^ The allocation targets
718
         -> m AllocSolution   -- ^ Possible solution list
719
tryAlloc _  _ _    (Right []) = fail "Not enough online nodes"
720
tryAlloc nl _ inst (Right ok_pairs) =
721
  let psols = parMap rwhnf (\(p, ss) ->
722
                              foldl' (\cstate ->
723
                                        concatAllocs cstate .
724
                                        allocateOnPair nl inst p)
725
                              emptyAllocSolution ss) ok_pairs
726
      sols = foldl' sumAllocs emptyAllocSolution psols
727
  in return $ annotateSolution sols
728

    
729
tryAlloc _  _ _    (Left []) = fail "No online nodes"
730
tryAlloc nl _ inst (Left all_nodes) =
731
  let sols = foldl' (\cstate ->
732
                       concatAllocs cstate . allocateOnSingle nl inst
733
                    ) emptyAllocSolution all_nodes
734
  in return $ annotateSolution sols
735

    
736
-- | Given a group/result, describe it as a nice (list of) messages.
737
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
738
solutionDescription gl (groupId, result) =
739
  case result of
740
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
741
    Bad message -> [printf "Group %s: error %s" gname message]
742
  where grp = Container.find groupId gl
743
        gname = Group.name grp
744
        pol = allocPolicyToRaw (Group.allocPolicy grp)
745

    
746
-- | From a list of possibly bad and possibly empty solutions, filter
747
-- only the groups with a valid result. Note that the result will be
748
-- reversed compared to the original list.
749
filterMGResults :: Group.List
750
                -> [(Gdx, Result AllocSolution)]
751
                -> [(Gdx, AllocSolution)]
752
filterMGResults gl = foldl' fn []
753
  where unallocable = not . Group.isAllocable . flip Container.find gl
754
        fn accu (gdx, rasol) =
755
          case rasol of
756
            Bad _ -> accu
757
            Ok sol | isNothing (asSolution sol) -> accu
758
                   | unallocable gdx -> accu
759
                   | otherwise -> (gdx, sol):accu
760

    
761
-- | Sort multigroup results based on policy and score.
762
sortMGResults :: Group.List
763
             -> [(Gdx, AllocSolution)]
764
             -> [(Gdx, AllocSolution)]
765
sortMGResults gl sols =
766
  let extractScore (_, _, _, x) = x
767
      solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
768
                             (extractScore . fromJust . asSolution) sol)
769
  in sortBy (comparing solScore) sols
770

    
771
-- | Finds the best group for an instance on a multi-group cluster.
772
--
773
-- Only solutions in @preferred@ and @last_resort@ groups will be
774
-- accepted as valid, and additionally if the allowed groups parameter
775
-- is not null then allocation will only be run for those group
776
-- indices.
777
findBestAllocGroup :: Group.List           -- ^ The group list
778
                   -> Node.List            -- ^ The node list
779
                   -> Instance.List        -- ^ The instance list
780
                   -> Maybe [Gdx]          -- ^ The allowed groups
781
                   -> Instance.Instance    -- ^ The instance to allocate
782
                   -> Int                  -- ^ Required number of nodes
783
                   -> Result (Gdx, AllocSolution, [String])
784
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
785
  let groups = splitCluster mgnl mgil
786
      groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
787
                allowed_gdxs
788
      sols = map (\(gid, (nl, il)) ->
789
                   (gid, genAllocNodes mggl nl cnt False >>=
790
                       tryAlloc nl il inst))
791
             groups'::[(Gdx, Result AllocSolution)]
792
      all_msgs = concatMap (solutionDescription mggl) sols
793
      goodSols = filterMGResults mggl sols
794
      sortedSols = sortMGResults mggl goodSols
795
  in if null sortedSols
796
       then if null groups'
797
              then Bad $ "no groups for evacuation: allowed groups was" ++
798
                     show allowed_gdxs ++ ", all groups: " ++
799
                     show (map fst groups)
800
              else Bad $ intercalate ", " all_msgs
801
       else let (final_group, final_sol) = head sortedSols
802
            in return (final_group, final_sol, all_msgs)
803

    
804
-- | Try to allocate an instance on a multi-group cluster.
805
tryMGAlloc :: Group.List           -- ^ The group list
806
           -> Node.List            -- ^ The node list
807
           -> Instance.List        -- ^ The instance list
808
           -> Instance.Instance    -- ^ The instance to allocate
809
           -> Int                  -- ^ Required number of nodes
810
           -> Result AllocSolution -- ^ Possible solution list
811
tryMGAlloc mggl mgnl mgil inst cnt = do
812
  (best_group, solution, all_msgs) <-
813
      findBestAllocGroup mggl mgnl mgil Nothing inst cnt
814
  let group_name = Group.name $ Container.find best_group mggl
815
      selmsg = "Selected group: " ++ group_name
816
  return $ solution { asLog = selmsg:all_msgs }
817

    
818
-- | Function which fails if the requested mode is change secondary.
819
--
820
-- This is useful since except DRBD, no other disk template can
821
-- execute change secondary; thus, we can just call this function
822
-- instead of always checking for secondary mode. After the call to
823
-- this function, whatever mode we have is just a primary change.
824
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
825
failOnSecondaryChange ChangeSecondary dt =
826
  fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++
827
         "' can't execute change secondary"
828
failOnSecondaryChange _ _ = return ()
829

    
830
-- | Run evacuation for a single instance.
831
--
832
-- /Note:/ this function should correctly execute both intra-group
833
-- evacuations (in all modes) and inter-group evacuations (in the
834
-- 'ChangeAll' mode). Of course, this requires that the correct list
835
-- of target nodes is passed.
836
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
837
                 -> Instance.List     -- ^ Instance list (cluster-wide)
838
                 -> EvacMode          -- ^ The evacuation mode
839
                 -> Instance.Instance -- ^ The instance to be evacuated
840
                 -> Gdx               -- ^ The group we're targetting
841
                 -> [Ndx]             -- ^ The list of available nodes
842
                                      -- for allocation
843
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
844
nodeEvacInstance _ _ mode (Instance.Instance
845
                           {Instance.diskTemplate = dt@DTDiskless}) _ _ =
846
                  failOnSecondaryChange mode dt >>
847
                  fail "Diskless relocations not implemented yet"
848

    
849
nodeEvacInstance _ _ _ (Instance.Instance
850
                        {Instance.diskTemplate = DTPlain}) _ _ =
851
                  fail "Instances of type plain cannot be relocated"
852

    
853
nodeEvacInstance _ _ _ (Instance.Instance
854
                        {Instance.diskTemplate = DTFile}) _ _ =
855
                  fail "Instances of type file cannot be relocated"
856

    
857
nodeEvacInstance _ _ mode  (Instance.Instance
858
                            {Instance.diskTemplate = dt@DTSharedFile}) _ _ =
859
                  failOnSecondaryChange mode dt >>
860
                  fail "Shared file relocations not implemented yet"
861

    
862
nodeEvacInstance _ _ mode (Instance.Instance
863
                           {Instance.diskTemplate = dt@DTBlock}) _ _ =
864
                  failOnSecondaryChange mode dt >>
865
                  fail "Block device relocations not implemented yet"
866

    
867
nodeEvacInstance _ _ mode  (Instance.Instance
868
                            {Instance.diskTemplate = dt@DTRbd}) _ _ =
869
                  failOnSecondaryChange mode dt >>
870
                  fail "Rbd relocations not implemented yet"
871

    
872
nodeEvacInstance nl il ChangePrimary
873
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
874
                 _ _ =
875
  do
876
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
877
    let idx = Instance.idx inst
878
        il' = Container.add idx inst' il
879
        ops = iMoveToJob nl' il' idx Failover
880
    return (nl', il', ops)
881

    
882
nodeEvacInstance nl il ChangeSecondary
883
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
884
                 gdx avail_nodes =
885
  do
886
    (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
887
                            eitherToResult $
888
                            foldl' (evacDrbdSecondaryInner nl inst gdx)
889
                            (Left "no nodes available") avail_nodes
890
    let idx = Instance.idx inst
891
        il' = Container.add idx inst' il
892
        ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx)
893
    return (nl', il', ops)
894

    
895
-- The algorithm for ChangeAll is as follows:
896
--
897
-- * generate all (primary, secondary) node pairs for the target groups
898
-- * for each pair, execute the needed moves (r:s, f, r:s) and compute
899
--   the final node list state and group score
900
-- * select the best choice via a foldl that uses the same Either
901
--   String solution as the ChangeSecondary mode
902
nodeEvacInstance nl il ChangeAll
903
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
904
                 gdx avail_nodes =
905
  do
906
    let no_nodes = Left "no nodes available"
907
        node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
908
    (nl', il', ops, _) <-
909
        annotateResult "Can't find any good nodes for relocation" $
910
        eitherToResult $
911
        foldl'
912
        (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
913
                          Bad msg ->
914
                              case accu of
915
                                Right _ -> accu
916
                                -- we don't need more details (which
917
                                -- nodes, etc.) as we only selected
918
                                -- this group if we can allocate on
919
                                -- it, hence failures will not
920
                                -- propagate out of this fold loop
921
                                Left _ -> Left $ "Allocation failed: " ++ msg
922
                          Ok result@(_, _, _, new_cv) ->
923
                              let new_accu = Right result in
924
                              case accu of
925
                                Left _ -> new_accu
926
                                Right (_, _, _, old_cv) ->
927
                                    if old_cv < new_cv
928
                                    then accu
929
                                    else new_accu
930
        ) no_nodes node_pairs
931

    
932
    return (nl', il', ops)
933

    
934
-- | Inner fold function for changing secondary of a DRBD instance.
935
--
936
-- The running solution is either a @Left String@, which means we
937
-- don't have yet a working solution, or a @Right (...)@, which
938
-- represents a valid solution; it holds the modified node list, the
939
-- modified instance (after evacuation), the score of that solution,
940
-- and the new secondary node index.
941
evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list
942
                       -> Instance.Instance -- ^ Instance being evacuated
943
                       -> Gdx -- ^ The group index of the instance
944
                       -> Either String ( Node.List
945
                                        , Instance.Instance
946
                                        , Score
947
                                        , Ndx)  -- ^ Current best solution
948
                       -> Ndx  -- ^ Node we're evaluating as new secondary
949
                       -> Either String ( Node.List
950
                                        , Instance.Instance
951
                                        , Score
952
                                        , Ndx) -- ^ New best solution
953
evacDrbdSecondaryInner nl inst gdx accu ndx =
954
  case applyMove nl inst (ReplaceSecondary ndx) of
955
    OpFail fm ->
956
      case accu of
957
        Right _ -> accu
958
        Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
959
                  " failed: " ++ show fm
960
    OpGood (nl', inst', _, _) ->
961
      let nodes = Container.elems nl'
962
          -- The fromJust below is ugly (it can fail nastily), but
963
          -- at this point we should have any internal mismatches,
964
          -- and adding a monad here would be quite involved
965
          grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
966
          new_cv = compCVNodes grpnodes
967
          new_accu = Right (nl', inst', new_cv, ndx)
968
      in case accu of
969
           Left _ -> new_accu
970
           Right (_, _, old_cv, _) ->
971
             if old_cv < new_cv
972
               then accu
973
               else new_accu
974

    
975
-- | Compute result of changing all nodes of a DRBD instance.
976
--
977
-- Given the target primary and secondary node (which might be in a
978
-- different group or not), this function will 'execute' all the
979
-- required steps and assuming all operations succceed, will return
980
-- the modified node and instance lists, the opcodes needed for this
981
-- and the new group score.
982
evacDrbdAllInner :: Node.List         -- ^ Cluster node list
983
                 -> Instance.List     -- ^ Cluster instance list
984
                 -> Instance.Instance -- ^ The instance to be moved
985
                 -> Gdx               -- ^ The target group index
986
                                      -- (which can differ from the
987
                                      -- current group of the
988
                                      -- instance)
989
                 -> (Ndx, Ndx)        -- ^ Tuple of new
990
                                      -- primary\/secondary nodes
991
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
992
evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
993
  let primary = Container.find (Instance.pNode inst) nl
994
      idx = Instance.idx inst
995
  -- if the primary is offline, then we first failover
996
  (nl1, inst1, ops1) <-
997
    if Node.offline primary
998
      then do
999
        (nl', inst', _, _) <-
1000
          annotateResult "Failing over to the secondary" $
1001
          opToResult $ applyMove nl inst Failover
1002
        return (nl', inst', [Failover])
1003
      else return (nl, inst, [])
1004
  let (o1, o2, o3) = (ReplaceSecondary t_pdx,
1005
                      Failover,
1006
                      ReplaceSecondary t_sdx)
1007
  -- we now need to execute a replace secondary to the future
1008
  -- primary node
1009
  (nl2, inst2, _, _) <-
1010
    annotateResult "Changing secondary to new primary" $
1011
    opToResult $
1012
    applyMove nl1 inst1 o1
1013
  let ops2 = o1:ops1
1014
  -- we now execute another failover, the primary stays fixed now
1015
  (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
1016
                        opToResult $ applyMove nl2 inst2 o2
1017
  let ops3 = o2:ops2
1018
  -- and finally another replace secondary, to the final secondary
1019
  (nl4, inst4, _, _) <-
1020
    annotateResult "Changing secondary to final secondary" $
1021
    opToResult $
1022
    applyMove nl3 inst3 o3
1023
  let ops4 = o3:ops3
1024
      il' = Container.add idx inst4 il
1025
      ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1026
  let nodes = Container.elems nl4
1027
      -- The fromJust below is ugly (it can fail nastily), but
1028
      -- at this point we should have any internal mismatches,
1029
      -- and adding a monad here would be quite involved
1030
      grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1031
      new_cv = compCVNodes grpnodes
1032
  return (nl4, il', ops, new_cv)
1033

    
1034
-- | Computes the nodes in a given group which are available for
1035
-- allocation.
1036
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1037
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
1038
                    -> Gdx            -- ^ The group for which we
1039
                                      -- query the nodes
1040
                    -> Result [Ndx]   -- ^ List of available node indices
1041
availableGroupNodes group_nodes excl_ndx gdx = do
1042
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1043
                 Ok (lookup gdx group_nodes)
1044
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1045
  return avail_nodes
1046

    
1047
-- | Updates the evac solution with the results of an instance
1048
-- evacuation.
1049
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1050
                   -> Idx
1051
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1052
                   -> (Node.List, Instance.List, EvacSolution)
1053
updateEvacSolution (nl, il, es) idx (Bad msg) =
1054
  (nl, il, es { esFailed = (idx, msg):esFailed es})
1055
updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1056
  (nl, il, es { esMoved = new_elem:esMoved es
1057
              , esOpCodes = opcodes:esOpCodes es })
1058
    where inst = Container.find idx il
1059
          new_elem = (idx,
1060
                      instancePriGroup nl inst,
1061
                      Instance.allNodes inst)
1062

    
1063
-- | Node-evacuation IAllocator mode main function.
1064
tryNodeEvac :: Group.List    -- ^ The cluster groups
1065
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
1066
            -> Instance.List -- ^ Instance list (cluster-wide)
1067
            -> EvacMode      -- ^ The evacuation mode
1068
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
1069
            -> Result (Node.List, Instance.List, EvacSolution)
1070
tryNodeEvac _ ini_nl ini_il mode idxs =
1071
  let evac_ndx = nodesToEvacuate ini_il mode idxs
1072
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1073
      excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1074
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1075
                                           (Container.elems nl))) $
1076
                  splitCluster ini_nl ini_il
1077
      (fin_nl, fin_il, esol) =
1078
        foldl' (\state@(nl, il, _) inst ->
1079
                  let gdx = instancePriGroup nl inst
1080
                      pdx = Instance.pNode inst in
1081
                  updateEvacSolution state (Instance.idx inst) $
1082
                  availableGroupNodes group_ndx
1083
                    (IntSet.insert pdx excl_ndx) gdx >>=
1084
                      nodeEvacInstance nl il mode inst gdx
1085
               )
1086
        (ini_nl, ini_il, emptyEvacSolution)
1087
        (map (`Container.find` ini_il) idxs)
1088
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1089

    
1090
-- | Change-group IAllocator mode main function.
1091
--
1092
-- This is very similar to 'tryNodeEvac', the only difference is that
1093
-- we don't choose as target group the current instance group, but
1094
-- instead:
1095
--
1096
--   1. at the start of the function, we compute which are the target
1097
--   groups; either no groups were passed in, in which case we choose
1098
--   all groups out of which we don't evacuate instance, or there were
1099
--   some groups passed, in which case we use those
1100
--
1101
--   2. for each instance, we use 'findBestAllocGroup' to choose the
1102
--   best group to hold the instance, and then we do what
1103
--   'tryNodeEvac' does, except for this group instead of the current
1104
--   instance group.
1105
--
1106
-- Note that the correct behaviour of this function relies on the
1107
-- function 'nodeEvacInstance' to be able to do correctly both
1108
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1109
tryChangeGroup :: Group.List    -- ^ The cluster groups
1110
               -> Node.List     -- ^ The node list (cluster-wide)
1111
               -> Instance.List -- ^ Instance list (cluster-wide)
1112
               -> [Gdx]         -- ^ Target groups; if empty, any
1113
                                -- groups not being evacuated
1114
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
1115
               -> Result (Node.List, Instance.List, EvacSolution)
1116
tryChangeGroup gl ini_nl ini_il gdxs idxs =
1117
  let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1118
                             flip Container.find ini_il) idxs
1119
      target_gdxs = (if null gdxs
1120
                       then Container.keys gl
1121
                       else gdxs) \\ evac_gdxs
1122
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1123
      excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1124
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1125
                                           (Container.elems nl))) $
1126
                  splitCluster ini_nl ini_il
1127
      (fin_nl, fin_il, esol) =
1128
        foldl' (\state@(nl, il, _) inst ->
1129
                  let solution = do
1130
                        let ncnt = Instance.requiredNodes $
1131
                                   Instance.diskTemplate inst
1132
                        (gdx, _, _) <- findBestAllocGroup gl nl il
1133
                                       (Just target_gdxs) inst ncnt
1134
                        av_nodes <- availableGroupNodes group_ndx
1135
                                    excl_ndx gdx
1136
                        nodeEvacInstance nl il ChangeAll inst gdx av_nodes
1137
                  in updateEvacSolution state (Instance.idx inst) solution
1138
               )
1139
        (ini_nl, ini_il, emptyEvacSolution)
1140
        (map (`Container.find` ini_il) idxs)
1141
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1142

    
1143
-- | Standard-sized allocation method.
1144
--
1145
-- This places instances of the same size on the cluster until we're
1146
-- out of space. The result will be a list of identically-sized
1147
-- instances.
1148
iterateAlloc :: AllocMethod
1149
iterateAlloc nl il limit newinst allocnodes ixes cstats =
1150
  let depth = length ixes
1151
      newname = printf "new-%d" depth::String
1152
      newidx = Container.size il
1153
      newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1154
      newlimit = fmap (flip (-) 1) limit
1155
  in case tryAlloc nl il newi2 allocnodes of
1156
       Bad s -> Bad s
1157
       Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
1158
         let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1159
         case sols3 of
1160
           Nothing -> newsol
1161
           Just (xnl, xi, _, _) ->
1162
             if limit == Just 0
1163
               then newsol
1164
               else iterateAlloc xnl (Container.add newidx xi il)
1165
                      newlimit newinst allocnodes (xi:ixes)
1166
                      (totalResources xnl:cstats)
1167

    
1168
-- | Tiered allocation method.
1169
--
1170
-- This places instances on the cluster, and decreases the spec until
1171
-- we can allocate again. The result will be a list of decreasing
1172
-- instance specs.
1173
tieredAlloc :: AllocMethod
1174
tieredAlloc nl il limit newinst allocnodes ixes cstats =
1175
  case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1176
    Bad s -> Bad s
1177
    Ok (errs, nl', il', ixes', cstats') ->
1178
      let newsol = Ok (errs, nl', il', ixes', cstats')
1179
          ixes_cnt = length ixes'
1180
          (stop, newlimit) = case limit of
1181
                               Nothing -> (False, Nothing)
1182
                               Just n -> (n <= ixes_cnt,
1183
                                            Just (n - ixes_cnt)) in
1184
      if stop then newsol else
1185
          case Instance.shrinkByType newinst . fst . last $
1186
               sortBy (comparing snd) errs of
1187
            Bad _ -> newsol
1188
            Ok newinst' -> tieredAlloc nl' il' newlimit
1189
                           newinst' allocnodes ixes' cstats'
1190

    
1191
-- * Formatting functions
1192

    
1193
-- | Given the original and final nodes, computes the relocation description.
1194
computeMoves :: Instance.Instance -- ^ The instance to be moved
1195
             -> String -- ^ The instance name
1196
             -> IMove  -- ^ The move being performed
1197
             -> String -- ^ New primary
1198
             -> String -- ^ New secondary
1199
             -> (String, [String])
1200
                -- ^ Tuple of moves and commands list; moves is containing
1201
                -- either @/f/@ for failover or @/r:name/@ for replace
1202
                -- secondary, while the command list holds gnt-instance
1203
                -- commands (without that prefix), e.g \"@failover instance1@\"
1204
computeMoves i inam mv c d =
1205
  case mv of
1206
    Failover -> ("f", [mig])
1207
    FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1208
    ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1209
    ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1210
    ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1211
  where morf = if Instance.instanceRunning i then "migrate" else "failover"
1212
        mig = printf "%s -f %s" morf inam::String
1213
        rep n = printf "replace-disks -n %s %s" n inam
1214

    
1215
-- | Converts a placement to string format.
1216
printSolutionLine :: Node.List     -- ^ The node list
1217
                  -> Instance.List -- ^ The instance list
1218
                  -> Int           -- ^ Maximum node name length
1219
                  -> Int           -- ^ Maximum instance name length
1220
                  -> Placement     -- ^ The current placement
1221
                  -> Int           -- ^ The index of the placement in
1222
                                   -- the solution
1223
                  -> (String, [String])
1224
printSolutionLine nl il nmlen imlen plc pos =
1225
  let pmlen = (2*nmlen + 1)
1226
      (i, p, s, mv, c) = plc
1227
      inst = Container.find i il
1228
      inam = Instance.alias inst
1229
      npri = Node.alias $ Container.find p nl
1230
      nsec = Node.alias $ Container.find s nl
1231
      opri = Node.alias $ Container.find (Instance.pNode inst) nl
1232
      osec = Node.alias $ Container.find (Instance.sNode inst) nl
1233
      (moves, cmds) =  computeMoves inst inam mv npri nsec
1234
      ostr = printf "%s:%s" opri osec::String
1235
      nstr = printf "%s:%s" npri nsec::String
1236
  in (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
1237
      pos imlen inam pmlen ostr
1238
      pmlen nstr c moves,
1239
      cmds)
1240

    
1241
-- | Return the instance and involved nodes in an instance move.
1242
--
1243
-- Note that the output list length can vary, and is not required nor
1244
-- guaranteed to be of any specific length.
1245
involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
1246
                               -- the instance from its index; note
1247
                               -- that this /must/ be the original
1248
                               -- instance list, so that we can
1249
                               -- retrieve the old nodes
1250
              -> Placement     -- ^ The placement we're investigating,
1251
                               -- containing the new nodes and
1252
                               -- instance index
1253
              -> [Ndx]         -- ^ Resulting list of node indices
1254
involvedNodes il plc =
1255
  let (i, np, ns, _, _) = plc
1256
      inst = Container.find i il
1257
  in nub $ [np, ns] ++ Instance.allNodes inst
1258

    
1259
-- | Inner function for splitJobs, that either appends the next job to
1260
-- the current jobset, or starts a new jobset.
1261
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1262
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1263
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1264
  | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1265
  | otherwise = ([n]:cjs, ndx)
1266

    
1267
-- | Break a list of moves into independent groups. Note that this
1268
-- will reverse the order of jobs.
1269
splitJobs :: [MoveJob] -> [JobSet]
1270
splitJobs = fst . foldl mergeJobs ([], [])
1271

    
1272
-- | Given a list of commands, prefix them with @gnt-instance@ and
1273
-- also beautify the display a little.
1274
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1275
formatJob jsn jsl (sn, (_, _, _, cmds)) =
1276
  let out =
1277
        printf "  echo job %d/%d" jsn sn:
1278
        printf "  check":
1279
        map ("  gnt-instance " ++) cmds
1280
  in if sn == 1
1281
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1282
       else out
1283

    
1284
-- | Given a list of commands, prefix them with @gnt-instance@ and
1285
-- also beautify the display a little.
1286
formatCmds :: [JobSet] -> String
1287
formatCmds =
1288
  unlines .
1289
  concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1290
                           (zip [1..] js)) .
1291
  zip [1..]
1292

    
1293
-- | Print the node list.
1294
printNodes :: Node.List -> [String] -> String
1295
printNodes nl fs =
1296
  let fields = case fs of
1297
                 [] -> Node.defaultFields
1298
                 "+":rest -> Node.defaultFields ++ rest
1299
                 _ -> fs
1300
      snl = sortBy (comparing Node.idx) (Container.elems nl)
1301
      (header, isnum) = unzip $ map Node.showHeader fields
1302
  in unlines . map ((:) ' ' .  unwords) $
1303
     formatTable (header:map (Node.list fields) snl) isnum
1304

    
1305
-- | Print the instance list.
1306
printInsts :: Node.List -> Instance.List -> String
1307
printInsts nl il =
1308
  let sil = sortBy (comparing Instance.idx) (Container.elems il)
1309
      helper inst = [ if Instance.instanceRunning inst then "R" else " "
1310
                    , Instance.name inst
1311
                    , Container.nameOf nl (Instance.pNode inst)
1312
                    , let sdx = Instance.sNode inst
1313
                      in if sdx == Node.noSecondary
1314
                           then  ""
1315
                           else Container.nameOf nl sdx
1316
                    , if Instance.autoBalance inst then "Y" else "N"
1317
                    , printf "%3d" $ Instance.vcpus inst
1318
                    , printf "%5d" $ Instance.mem inst
1319
                    , printf "%5d" $ Instance.dsk inst `div` 1024
1320
                    , printf "%5.3f" lC
1321
                    , printf "%5.3f" lM
1322
                    , printf "%5.3f" lD
1323
                    , printf "%5.3f" lN
1324
                    ]
1325
          where DynUtil lC lM lD lN = Instance.util inst
1326
      header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1327
               , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1328
      isnum = False:False:False:False:False:repeat True
1329
  in unlines . map ((:) ' ' . unwords) $
1330
     formatTable (header:map helper sil) isnum
1331

    
1332
-- | Shows statistics for a given node list.
1333
printStats :: String -> Node.List -> String
1334
printStats lp nl =
1335
  let dcvs = compDetailedCV $ Container.elems nl
1336
      (weights, names) = unzip detailedCVInfo
1337
      hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1338
      header = [ "Field", "Value", "Weight" ]
1339
      formatted = map (\(w, h, val) ->
1340
                         [ h
1341
                         , printf "%.8f" val
1342
                         , printf "x%.2f" w
1343
                         ]) hd
1344
  in unlines . map ((++) lp) . map ((:) ' ' . unwords) $
1345
     formatTable (header:formatted) $ False:repeat True
1346

    
1347
-- | Convert a placement into a list of OpCodes (basically a job).
1348
iMoveToJob :: Node.List        -- ^ The node list; only used for node
1349
                               -- names, so any version is good
1350
                               -- (before or after the operation)
1351
           -> Instance.List    -- ^ The instance list; also used for
1352
                               -- names only
1353
           -> Idx              -- ^ The index of the instance being
1354
                               -- moved
1355
           -> IMove            -- ^ The actual move to be described
1356
           -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
1357
                               -- the given move
1358
iMoveToJob nl il idx move =
1359
  let inst = Container.find idx il
1360
      iname = Instance.name inst
1361
      lookNode  = Just . Container.nameOf nl
1362
      opF = OpCodes.OpInstanceMigrate iname True False True Nothing
1363
      opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1364
              OpCodes.ReplaceNewSecondary [] Nothing
1365
  in case move of
1366
       Failover -> [ opF ]
1367
       ReplacePrimary np -> [ opF, opR np, opF ]
1368
       ReplaceSecondary ns -> [ opR ns ]
1369
       ReplaceAndFailover np -> [ opR np, opF ]
1370
       FailoverAndReplace ns -> [ opF, opR ns ]
1371

    
1372
-- * Node group functions
1373

    
1374
-- | Computes the group of an instance.
1375
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1376
instanceGroup nl i =
1377
  let sidx = Instance.sNode i
1378
      pnode = Container.find (Instance.pNode i) nl
1379
      snode = if sidx == Node.noSecondary
1380
              then pnode
1381
              else Container.find sidx nl
1382
      pgroup = Node.group pnode
1383
      sgroup = Node.group snode
1384
  in if pgroup /= sgroup
1385
       then fail ("Instance placed accross two node groups, primary " ++
1386
                  show pgroup ++ ", secondary " ++ show sgroup)
1387
       else return pgroup
1388

    
1389
-- | Computes the group of an instance per the primary node.
1390
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1391
instancePriGroup nl i =
1392
  let pnode = Container.find (Instance.pNode i) nl
1393
  in  Node.group pnode
1394

    
1395
-- | Compute the list of badly allocated instances (split across node
1396
-- groups).
1397
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1398
findSplitInstances nl =
1399
  filter (not . isOk . instanceGroup nl) . Container.elems
1400

    
1401
-- | Splits a cluster into the component node groups.
1402
splitCluster :: Node.List -> Instance.List ->
1403
                [(Gdx, (Node.List, Instance.List))]
1404
splitCluster nl il =
1405
  let ngroups = Node.computeGroups (Container.elems nl)
1406
  in map (\(guuid, nodes) ->
1407
           let nidxs = map Node.idx nodes
1408
               nodes' = zip nidxs nodes
1409
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1410
           in (guuid, (Container.fromList nodes', instances))) ngroups
1411

    
1412
-- | Compute the list of nodes that are to be evacuated, given a list
1413
-- of instances and an evacuation mode.
1414
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1415
                -> EvacMode      -- ^ The evacuation mode we're using
1416
                -> [Idx]         -- ^ List of instance indices being evacuated
1417
                -> IntSet.IntSet -- ^ Set of node indices
1418
nodesToEvacuate il mode =
1419
  IntSet.delete Node.noSecondary .
1420
  foldl' (\ns idx ->
1421
            let i = Container.find idx il
1422
                pdx = Instance.pNode i
1423
                sdx = Instance.sNode i
1424
                dt = Instance.diskTemplate i
1425
                withSecondary = case dt of
1426
                                  DTDrbd8 -> IntSet.insert sdx ns
1427
                                  _ -> ns
1428
            in case mode of
1429
                 ChangePrimary   -> IntSet.insert pdx ns
1430
                 ChangeSecondary -> withSecondary
1431
                 ChangeAll       -> IntSet.insert pdx withSecondary
1432
         ) IntSet.empty