Statistics
| Branch: | Tag: | Revision:

root / htools / Ganeti / HTools / Cluster.hs @ c3024b7e

History | View | Annotate | Download (61.6 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011, 2012 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
  (
31
    -- * Types
32
    AllocSolution(..)
33
  , EvacSolution(..)
34
  , Table(..)
35
  , CStats(..)
36
  , AllocResult
37
  , AllocMethod
38
  -- * Generic functions
39
  , totalResources
40
  , computeAllocationDelta
41
  -- * First phase functions
42
  , computeBadItems
43
  -- * Second phase functions
44
  , printSolutionLine
45
  , formatCmds
46
  , involvedNodes
47
  , splitJobs
48
  -- * Display functions
49
  , printNodes
50
  , printInsts
51
  -- * Balacing functions
52
  , checkMove
53
  , doNextBalance
54
  , tryBalance
55
  , compCV
56
  , compCVNodes
57
  , compDetailedCV
58
  , printStats
59
  , iMoveToJob
60
  -- * IAllocator functions
61
  , genAllocNodes
62
  , tryAlloc
63
  , tryMGAlloc
64
  , tryNodeEvac
65
  , tryChangeGroup
66
  , collapseFailures
67
  -- * Allocation functions
68
  , iterateAlloc
69
  , tieredAlloc
70
  -- * Node group functions
71
  , instanceGroup
72
  , findSplitInstances
73
  , splitCluster
74
  ) where
75

    
76
import qualified Data.IntSet as IntSet
77
import Data.List
78
import Data.Maybe (fromJust, isNothing)
79
import Data.Ord (comparing)
80
import Text.Printf (printf)
81

    
82
import qualified Ganeti.HTools.Container as Container
83
import qualified Ganeti.HTools.Instance as Instance
84
import qualified Ganeti.HTools.Node as Node
85
import qualified Ganeti.HTools.Group as Group
86
import Ganeti.HTools.Types
87
import Ganeti.HTools.Utils
88
import Ganeti.HTools.Compat
89
import qualified Ganeti.OpCodes as OpCodes
90

    
91
-- * Types
92

    
93
-- | Allocation\/relocation solution.
94
data AllocSolution = AllocSolution
95
  { asFailures :: [FailMode]              -- ^ Failure counts
96
  , asAllocs   :: Int                     -- ^ Good allocation count
97
  , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
98
  , asLog      :: [String]                -- ^ Informational messages
99
  }
100

    
101
-- | Node evacuation/group change iallocator result type. This result
102
-- type consists of actual opcodes (a restricted subset) that are
103
-- transmitted back to Ganeti.
104
data EvacSolution = EvacSolution
105
  { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
106
  , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
107
                                      -- relocated
108
  , esOpCodes :: [[OpCodes.OpCode]]   -- ^ List of jobs
109
  } deriving (Show)
110

    
111
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
112
type AllocResult = (FailStats, Node.List, Instance.List,
113
                    [Instance.Instance], [CStats])
114

    
115
-- | A type denoting the valid allocation mode/pairs.
116
--
117
-- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
118
-- for a two-node allocation, this will be a @Right [('Ndx',
119
-- ['Ndx'])]@. In the latter case, the list is basically an
120
-- association list, grouped by primary node and holding the potential
121
-- secondary nodes in the sub-list.
122
type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
123

    
124
-- | The empty solution we start with when computing allocations.
125
emptyAllocSolution :: AllocSolution
126
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
127
                                   , asSolution = Nothing, asLog = [] }
128

    
129
-- | The empty evac solution.
130
emptyEvacSolution :: EvacSolution
131
emptyEvacSolution = EvacSolution { esMoved = []
132
                                 , esFailed = []
133
                                 , esOpCodes = []
134
                                 }
135

    
136
-- | The complete state for the balancing solution.
137
data Table = Table Node.List Instance.List Score [Placement]
138
             deriving (Show, Read)
139

    
140
-- | Cluster statistics data type.
141
data CStats = CStats
142
  { csFmem :: Integer -- ^ Cluster free mem
143
  , csFdsk :: Integer -- ^ Cluster free disk
144
  , csAmem :: Integer -- ^ Cluster allocatable mem
145
  , csAdsk :: Integer -- ^ Cluster allocatable disk
146
  , csAcpu :: Integer -- ^ Cluster allocatable cpus
147
  , csMmem :: Integer -- ^ Max node allocatable mem
148
  , csMdsk :: Integer -- ^ Max node allocatable disk
149
  , csMcpu :: Integer -- ^ Max node allocatable cpu
150
  , csImem :: Integer -- ^ Instance used mem
151
  , csIdsk :: Integer -- ^ Instance used disk
152
  , csIcpu :: Integer -- ^ Instance used cpu
153
  , csTmem :: Double  -- ^ Cluster total mem
154
  , csTdsk :: Double  -- ^ Cluster total disk
155
  , csTcpu :: Double  -- ^ Cluster total cpus
156
  , csVcpu :: Integer -- ^ Cluster total virtual cpus
157
  , csNcpu :: Double  -- ^ Equivalent to 'csIcpu' but in terms of
158
                      -- physical CPUs, i.e. normalised used phys CPUs
159
  , csXmem :: Integer -- ^ Unnacounted for mem
160
  , csNmem :: Integer -- ^ Node own memory
161
  , csScore :: Score  -- ^ The cluster score
162
  , csNinst :: Int    -- ^ The total number of instances
163
  } deriving (Show, Read)
164

    
165
-- | A simple type for allocation functions.
166
type AllocMethod =  Node.List           -- ^ Node list
167
                 -> Instance.List       -- ^ Instance list
168
                 -> Maybe Int           -- ^ Optional allocation limit
169
                 -> Instance.Instance   -- ^ Instance spec for allocation
170
                 -> AllocNodes          -- ^ Which nodes we should allocate on
171
                 -> [Instance.Instance] -- ^ Allocated instances
172
                 -> [CStats]            -- ^ Running cluster stats
173
                 -> Result AllocResult  -- ^ Allocation result
174

    
175
-- * Utility functions
176

    
177
-- | Verifies the N+1 status and return the affected nodes.
178
verifyN1 :: [Node.Node] -> [Node.Node]
179
verifyN1 = filter Node.failN1
180

    
181
{-| Computes the pair of bad nodes and instances.
182

    
183
The bad node list is computed via a simple 'verifyN1' check, and the
184
bad instance list is the list of primary and secondary instances of
185
those nodes.
186

    
187
-}
188
computeBadItems :: Node.List -> Instance.List ->
189
                   ([Node.Node], [Instance.Instance])
190
computeBadItems nl il =
191
  let bad_nodes = verifyN1 $ getOnline nl
192
      bad_instances = map (`Container.find` il) .
193
                      sort . nub $
194
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
195
  in
196
    (bad_nodes, bad_instances)
197

    
198
-- | Extracts the node pairs for an instance. This can fail if the
199
-- instance is single-homed. FIXME: this needs to be improved,
200
-- together with the general enhancement for handling non-DRBD moves.
201
instanceNodes :: Node.List -> Instance.Instance ->
202
                 (Ndx, Ndx, Node.Node, Node.Node)
203
instanceNodes nl inst =
204
  let old_pdx = Instance.pNode inst
205
      old_sdx = Instance.sNode inst
206
      old_p = Container.find old_pdx nl
207
      old_s = Container.find old_sdx nl
208
  in (old_pdx, old_sdx, old_p, old_s)
209

    
210
-- | Zero-initializer for the CStats type.
211
emptyCStats :: CStats
212
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
213

    
214
-- | Update stats with data from a new node.
215
updateCStats :: CStats -> Node.Node -> CStats
216
updateCStats cs node =
217
  let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
218
               csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
219
               csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
220
               csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
221
               csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
222
               csVcpu = x_vcpu, csNcpu = x_ncpu,
223
               csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
224
             }
225
        = cs
226
      inc_amem = Node.fMem node - Node.rMem node
227
      inc_amem' = if inc_amem > 0 then inc_amem else 0
228
      inc_adsk = Node.availDisk node
229
      inc_imem = truncate (Node.tMem node) - Node.nMem node
230
                 - Node.xMem node - Node.fMem node
231
      inc_icpu = Node.uCpu node
232
      inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
233
      inc_vcpu = Node.hiCpu node
234
      inc_acpu = Node.availCpu node
235
      inc_ncpu = fromIntegral (Node.uCpu node) /
236
                 iPolicyVcpuRatio (Node.iPolicy node)
237
  in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
238
        , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
239
        , csAmem = x_amem + fromIntegral inc_amem'
240
        , csAdsk = x_adsk + fromIntegral inc_adsk
241
        , csAcpu = x_acpu + fromIntegral inc_acpu
242
        , csMmem = max x_mmem (fromIntegral inc_amem')
243
        , csMdsk = max x_mdsk (fromIntegral inc_adsk)
244
        , csMcpu = max x_mcpu (fromIntegral inc_acpu)
245
        , csImem = x_imem + fromIntegral inc_imem
246
        , csIdsk = x_idsk + fromIntegral inc_idsk
247
        , csIcpu = x_icpu + fromIntegral inc_icpu
248
        , csTmem = x_tmem + Node.tMem node
249
        , csTdsk = x_tdsk + Node.tDsk node
250
        , csTcpu = x_tcpu + Node.tCpu node
251
        , csVcpu = x_vcpu + fromIntegral inc_vcpu
252
        , csNcpu = x_ncpu + inc_ncpu
253
        , csXmem = x_xmem + fromIntegral (Node.xMem node)
254
        , csNmem = x_nmem + fromIntegral (Node.nMem node)
255
        , csNinst = x_ninst + length (Node.pList node)
256
        }
257

    
258
-- | Compute the total free disk and memory in the cluster.
259
totalResources :: Node.List -> CStats
260
totalResources nl =
261
  let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
262
  in cs { csScore = compCV nl }
263

    
264
-- | Compute the delta between two cluster state.
265
--
266
-- This is used when doing allocations, to understand better the
267
-- available cluster resources. The return value is a triple of the
268
-- current used values, the delta that was still allocated, and what
269
-- was left unallocated.
270
computeAllocationDelta :: CStats -> CStats -> AllocStats
271
computeAllocationDelta cini cfin =
272
  let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu,
273
              csNcpu = i_ncpu } = cini
274
      CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
275
              csTmem = t_mem, csTdsk = t_dsk, csVcpu = f_vcpu,
276
              csNcpu = f_ncpu, csTcpu = f_tcpu } = cfin
277
      rini = AllocInfo { allocInfoVCpus = fromIntegral i_icpu
278
                       , allocInfoNCpus = i_ncpu
279
                       , allocInfoMem   = fromIntegral i_imem
280
                       , allocInfoDisk  = fromIntegral i_idsk
281
                       }
282
      rfin = AllocInfo { allocInfoVCpus = fromIntegral (f_icpu - i_icpu)
283
                       , allocInfoNCpus = f_ncpu - i_ncpu
284
                       , allocInfoMem   = fromIntegral (f_imem - i_imem)
285
                       , allocInfoDisk  = fromIntegral (f_idsk - i_idsk)
286
                       }
287
      runa = AllocInfo { allocInfoVCpus = fromIntegral (f_vcpu - f_icpu)
288
                       , allocInfoNCpus = f_tcpu - f_ncpu
289
                       , allocInfoMem   = truncate t_mem - fromIntegral f_imem
290
                       , allocInfoDisk  = truncate t_dsk - fromIntegral f_idsk
291
                       }
292
  in (rini, rfin, runa)
293

    
294
-- | The names and weights of the individual elements in the CV list.
295
detailedCVInfo :: [(Double, String)]
296
detailedCVInfo = [ (1,  "free_mem_cv")
297
                 , (1,  "free_disk_cv")
298
                 , (1,  "n1_cnt")
299
                 , (1,  "reserved_mem_cv")
300
                 , (4,  "offline_all_cnt")
301
                 , (16, "offline_pri_cnt")
302
                 , (1,  "vcpu_ratio_cv")
303
                 , (1,  "cpu_load_cv")
304
                 , (1,  "mem_load_cv")
305
                 , (1,  "disk_load_cv")
306
                 , (1,  "net_load_cv")
307
                 , (2,  "pri_tags_score")
308
                 , (1,  "spindles_cv")
309
                 ]
310

    
311
-- | Holds the weights used by 'compCVNodes' for each metric.
312
detailedCVWeights :: [Double]
313
detailedCVWeights = map fst detailedCVInfo
314

    
315
-- | Compute the mem and disk covariance.
316
compDetailedCV :: [Node.Node] -> [Double]
317
compDetailedCV all_nodes =
318
  let (offline, nodes) = partition Node.offline all_nodes
319
      mem_l = map Node.pMem nodes
320
      dsk_l = map Node.pDsk nodes
321
      -- metric: memory covariance
322
      mem_cv = stdDev mem_l
323
      -- metric: disk covariance
324
      dsk_cv = stdDev dsk_l
325
      -- metric: count of instances living on N1 failing nodes
326
      n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
327
                                                 length (Node.pList n)) .
328
                 filter Node.failN1 $ nodes :: Double
329
      res_l = map Node.pRem nodes
330
      -- metric: reserved memory covariance
331
      res_cv = stdDev res_l
332
      -- offline instances metrics
333
      offline_ipri = sum . map (length . Node.pList) $ offline
334
      offline_isec = sum . map (length . Node.sList) $ offline
335
      -- metric: count of instances on offline nodes
336
      off_score = fromIntegral (offline_ipri + offline_isec)::Double
337
      -- metric: count of primary instances on offline nodes (this
338
      -- helps with evacuation/failover of primary instances on
339
      -- 2-node clusters with one node offline)
340
      off_pri_score = fromIntegral offline_ipri::Double
341
      cpu_l = map Node.pCpu nodes
342
      -- metric: covariance of vcpu/pcpu ratio
343
      cpu_cv = stdDev cpu_l
344
      -- metrics: covariance of cpu, memory, disk and network load
345
      (c_load, m_load, d_load, n_load) =
346
        unzip4 $ map (\n ->
347
                      let DynUtil c1 m1 d1 n1 = Node.utilLoad n
348
                          DynUtil c2 m2 d2 n2 = Node.utilPool n
349
                      in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes
350
      -- metric: conflicting instance count
351
      pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
352
      pri_tags_score = fromIntegral pri_tags_inst::Double
353
      -- metric: spindles %
354
      spindles_cv = map (\n -> Node.instSpindles n / Node.hiSpindles n) nodes
355
  in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
356
     , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
357
     , pri_tags_score, stdDev spindles_cv ]
358

    
359
-- | Compute the /total/ variance.
360
compCVNodes :: [Node.Node] -> Double
361
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
362

    
363
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
364
compCV :: Node.List -> Double
365
compCV = compCVNodes . Container.elems
366

    
367
-- | Compute online nodes from a 'Node.List'.
368
getOnline :: Node.List -> [Node.Node]
369
getOnline = filter (not . Node.offline) . Container.elems
370

    
371
-- * Balancing functions
372

    
373
-- | Compute best table. Note that the ordering of the arguments is important.
374
compareTables :: Table -> Table -> Table
375
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
376
  if a_cv > b_cv then b else a
377

    
378
-- | Applies an instance move to a given node list and instance.
379
applyMove :: Node.List -> Instance.Instance
380
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
381
-- Failover (f)
382
applyMove nl inst Failover =
383
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
384
      int_p = Node.removePri old_p inst
385
      int_s = Node.removeSec old_s inst
386
      new_nl = do -- Maybe monad
387
        new_p <- Node.addPriEx (Node.offline old_p) int_s inst
388
        new_s <- Node.addSec int_p inst old_sdx
389
        let new_inst = Instance.setBoth inst old_sdx old_pdx
390
        return (Container.addTwo old_pdx new_s old_sdx new_p nl,
391
                new_inst, old_sdx, old_pdx)
392
  in new_nl
393

    
394
-- Replace the primary (f:, r:np, f)
395
applyMove nl inst (ReplacePrimary new_pdx) =
396
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
397
      tgt_n = Container.find new_pdx nl
398
      int_p = Node.removePri old_p inst
399
      int_s = Node.removeSec old_s inst
400
      force_p = Node.offline old_p
401
      new_nl = do -- Maybe monad
402
                  -- check that the current secondary can host the instance
403
                  -- during the migration
404
        tmp_s <- Node.addPriEx force_p int_s inst
405
        let tmp_s' = Node.removePri tmp_s inst
406
        new_p <- Node.addPriEx force_p tgt_n inst
407
        new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
408
        let new_inst = Instance.setPri inst new_pdx
409
        return (Container.add new_pdx new_p $
410
                Container.addTwo old_pdx int_p old_sdx new_s nl,
411
                new_inst, new_pdx, old_sdx)
412
  in new_nl
413

    
414
-- Replace the secondary (r:ns)
415
applyMove nl inst (ReplaceSecondary new_sdx) =
416
  let old_pdx = Instance.pNode inst
417
      old_sdx = Instance.sNode inst
418
      old_s = Container.find old_sdx nl
419
      tgt_n = Container.find new_sdx nl
420
      int_s = Node.removeSec old_s inst
421
      force_s = Node.offline old_s
422
      new_inst = Instance.setSec inst new_sdx
423
      new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
424
               \new_s -> return (Container.addTwo new_sdx
425
                                 new_s old_sdx int_s nl,
426
                                 new_inst, old_pdx, new_sdx)
427
  in new_nl
428

    
429
-- Replace the secondary and failover (r:np, f)
430
applyMove nl inst (ReplaceAndFailover new_pdx) =
431
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
432
      tgt_n = Container.find new_pdx nl
433
      int_p = Node.removePri old_p inst
434
      int_s = Node.removeSec old_s inst
435
      force_s = Node.offline old_s
436
      new_nl = do -- Maybe monad
437
        new_p <- Node.addPri tgt_n inst
438
        new_s <- Node.addSecEx force_s int_p inst new_pdx
439
        let new_inst = Instance.setBoth inst new_pdx old_pdx
440
        return (Container.add new_pdx new_p $
441
                Container.addTwo old_pdx new_s old_sdx int_s nl,
442
                new_inst, new_pdx, old_pdx)
443
  in new_nl
444

    
445
-- Failver and replace the secondary (f, r:ns)
446
applyMove nl inst (FailoverAndReplace new_sdx) =
447
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
448
      tgt_n = Container.find new_sdx nl
449
      int_p = Node.removePri old_p inst
450
      int_s = Node.removeSec old_s inst
451
      force_p = Node.offline old_p
452
      new_nl = do -- Maybe monad
453
        new_p <- Node.addPriEx force_p int_s inst
454
        new_s <- Node.addSecEx force_p tgt_n inst old_sdx
455
        let new_inst = Instance.setBoth inst old_sdx new_sdx
456
        return (Container.add new_sdx new_s $
457
                Container.addTwo old_sdx new_p old_pdx int_p nl,
458
                new_inst, old_sdx, new_sdx)
459
  in new_nl
460

    
461
-- | Tries to allocate an instance on one given node.
462
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
463
                 -> OpResult Node.AllocElement
464
allocateOnSingle nl inst new_pdx =
465
  let p = Container.find new_pdx nl
466
      new_inst = Instance.setBoth inst new_pdx Node.noSecondary
467
  in do
468
    Instance.instMatchesPolicy inst (Node.iPolicy p)
469
    new_p <- Node.addPri p inst
470
    let new_nl = Container.add new_pdx new_p nl
471
        new_score = compCV nl
472
    return (new_nl, new_inst, [new_p], new_score)
473

    
474
-- | Tries to allocate an instance on a given pair of nodes.
475
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
476
               -> OpResult Node.AllocElement
477
allocateOnPair nl inst new_pdx new_sdx =
478
  let tgt_p = Container.find new_pdx nl
479
      tgt_s = Container.find new_sdx nl
480
  in do
481
    Instance.instMatchesPolicy inst (Node.iPolicy tgt_p)
482
    new_p <- Node.addPri tgt_p inst
483
    new_s <- Node.addSec tgt_s inst new_pdx
484
    let new_inst = Instance.setBoth inst new_pdx new_sdx
485
        new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
486
    return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
487

    
488
-- | Tries to perform an instance move and returns the best table
489
-- between the original one and the new one.
490
checkSingleStep :: Table -- ^ The original table
491
                -> Instance.Instance -- ^ The instance to move
492
                -> Table -- ^ The current best table
493
                -> IMove -- ^ The move to apply
494
                -> Table -- ^ The final best table
495
checkSingleStep ini_tbl target cur_tbl move =
496
  let Table ini_nl ini_il _ ini_plc = ini_tbl
497
      tmp_resu = applyMove ini_nl target move
498
  in case tmp_resu of
499
       OpFail _ -> cur_tbl
500
       OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
501
         let tgt_idx = Instance.idx target
502
             upd_cvar = compCV upd_nl
503
             upd_il = Container.add tgt_idx new_inst ini_il
504
             upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
505
             upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
506
         in compareTables cur_tbl upd_tbl
507

    
508
-- | Given the status of the current secondary as a valid new node and
509
-- the current candidate target node, generate the possible moves for
510
-- a instance.
511
possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
512
              -> Bool      -- ^ Whether we can change the primary node
513
              -> Ndx       -- ^ Target node candidate
514
              -> [IMove]   -- ^ List of valid result moves
515

    
516
possibleMoves _ False tdx =
517
  [ReplaceSecondary tdx]
518

    
519
possibleMoves True True tdx =
520
  [ ReplaceSecondary tdx
521
  , ReplaceAndFailover tdx
522
  , ReplacePrimary tdx
523
  , FailoverAndReplace tdx
524
  ]
525

    
526
possibleMoves False True tdx =
527
  [ ReplaceSecondary tdx
528
  , ReplaceAndFailover tdx
529
  ]
530

    
531
-- | Compute the best move for a given instance.
532
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
533
                  -> Bool              -- ^ Whether disk moves are allowed
534
                  -> Bool              -- ^ Whether instance moves are allowed
535
                  -> Table             -- ^ Original table
536
                  -> Instance.Instance -- ^ Instance to move
537
                  -> Table             -- ^ Best new table for this instance
538
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
539
  let opdx = Instance.pNode target
540
      osdx = Instance.sNode target
541
      bad_nodes = [opdx, osdx]
542
      nodes = filter (`notElem` bad_nodes) nodes_idx
543
      use_secondary = elem osdx nodes_idx && inst_moves
544
      aft_failover = if use_secondary -- if allowed to failover
545
                       then checkSingleStep ini_tbl target ini_tbl Failover
546
                       else ini_tbl
547
      all_moves = if disk_moves
548
                    then concatMap
549
                           (possibleMoves use_secondary inst_moves) nodes
550
                    else []
551
    in
552
      -- iterate over the possible nodes for this instance
553
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
554

    
555
-- | Compute the best next move.
556
checkMove :: [Ndx]               -- ^ Allowed target node indices
557
          -> Bool                -- ^ Whether disk moves are allowed
558
          -> Bool                -- ^ Whether instance moves are allowed
559
          -> Table               -- ^ The current solution
560
          -> [Instance.Instance] -- ^ List of instances still to move
561
          -> Table               -- ^ The new solution
562
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
563
  let Table _ _ _ ini_plc = ini_tbl
564
      -- we're using rwhnf from the Control.Parallel.Strategies
565
      -- package; we don't need to use rnf as that would force too
566
      -- much evaluation in single-threaded cases, and in
567
      -- multi-threaded case the weak head normal form is enough to
568
      -- spark the evaluation
569
      tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
570
                             inst_moves ini_tbl)
571
               victims
572
      -- iterate over all instances, computing the best move
573
      best_tbl = foldl' compareTables ini_tbl tables
574
      Table _ _ _ best_plc = best_tbl
575
  in if length best_plc == length ini_plc
576
       then ini_tbl -- no advancement
577
       else best_tbl
578

    
579
-- | Check if we are allowed to go deeper in the balancing.
580
doNextBalance :: Table     -- ^ The starting table
581
              -> Int       -- ^ Remaining length
582
              -> Score     -- ^ Score at which to stop
583
              -> Bool      -- ^ The resulting table and commands
584
doNextBalance ini_tbl max_rounds min_score =
585
  let Table _ _ ini_cv ini_plc = ini_tbl
586
      ini_plc_len = length ini_plc
587
  in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
588

    
589
-- | Run a balance move.
590
tryBalance :: Table       -- ^ The starting table
591
           -> Bool        -- ^ Allow disk moves
592
           -> Bool        -- ^ Allow instance moves
593
           -> Bool        -- ^ Only evacuate moves
594
           -> Score       -- ^ Min gain threshold
595
           -> Score       -- ^ Min gain
596
           -> Maybe Table -- ^ The resulting table and commands
597
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
598
    let Table ini_nl ini_il ini_cv _ = ini_tbl
599
        all_inst = Container.elems ini_il
600
        all_inst' = if evac_mode
601
                    then let bad_nodes = map Node.idx . filter Node.offline $
602
                                         Container.elems ini_nl
603
                         in filter (any (`elem` bad_nodes) . Instance.allNodes)
604
                            all_inst
605
                    else all_inst
606
        reloc_inst = filter Instance.movable all_inst'
607
        node_idx = map Node.idx . filter (not . Node.offline) $
608
                   Container.elems ini_nl
609
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
610
        (Table _ _ fin_cv _) = fin_tbl
611
    in
612
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
613
      then Just fin_tbl -- this round made success, return the new table
614
      else Nothing
615

    
616
-- * Allocation functions
617

    
618
-- | Build failure stats out of a list of failures.
619
collapseFailures :: [FailMode] -> FailStats
620
collapseFailures flst =
621
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
622
            [minBound..maxBound]
623

    
624
-- | Compares two Maybe AllocElement and chooses the besst score.
625
bestAllocElement :: Maybe Node.AllocElement
626
                 -> Maybe Node.AllocElement
627
                 -> Maybe Node.AllocElement
628
bestAllocElement a Nothing = a
629
bestAllocElement Nothing b = b
630
bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
631
  if ascore < bscore then a else b
632

    
633
-- | Update current Allocation solution and failure stats with new
634
-- elements.
635
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
636
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
637

    
638
concatAllocs as (OpGood ns) =
639
  let -- Choose the old or new solution, based on the cluster score
640
    cntok = asAllocs as
641
    osols = asSolution as
642
    nsols = bestAllocElement osols (Just ns)
643
    nsuc = cntok + 1
644
    -- Note: we force evaluation of nsols here in order to keep the
645
    -- memory profile low - we know that we will need nsols for sure
646
    -- in the next cycle, so we force evaluation of nsols, since the
647
    -- foldl' in the caller will only evaluate the tuple, but not the
648
    -- elements of the tuple
649
  in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
650

    
651
-- | Sums two 'AllocSolution' structures.
652
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
653
sumAllocs (AllocSolution aFails aAllocs aSols aLog)
654
          (AllocSolution bFails bAllocs bSols bLog) =
655
  -- note: we add b first, since usually it will be smaller; when
656
  -- fold'ing, a will grow and grow whereas b is the per-group
657
  -- result, hence smaller
658
  let nFails  = bFails ++ aFails
659
      nAllocs = aAllocs + bAllocs
660
      nSols   = bestAllocElement aSols bSols
661
      nLog    = bLog ++ aLog
662
  in AllocSolution nFails nAllocs nSols nLog
663

    
664
-- | Given a solution, generates a reasonable description for it.
665
describeSolution :: AllocSolution -> String
666
describeSolution as =
667
  let fcnt = asFailures as
668
      sols = asSolution as
669
      freasons =
670
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
671
        filter ((> 0) . snd) . collapseFailures $ fcnt
672
  in case sols of
673
     Nothing -> "No valid allocation solutions, failure reasons: " ++
674
                (if null fcnt then "unknown reasons" else freasons)
675
     Just (_, _, nodes, cv) ->
676
         printf ("score: %.8f, successes %d, failures %d (%s)" ++
677
                 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
678
               (intercalate "/" . map Node.name $ nodes)
679

    
680
-- | Annotates a solution with the appropriate string.
681
annotateSolution :: AllocSolution -> AllocSolution
682
annotateSolution as = as { asLog = describeSolution as : asLog as }
683

    
684
-- | Reverses an evacuation solution.
685
--
686
-- Rationale: we always concat the results to the top of the lists, so
687
-- for proper jobset execution, we should reverse all lists.
688
reverseEvacSolution :: EvacSolution -> EvacSolution
689
reverseEvacSolution (EvacSolution f m o) =
690
  EvacSolution (reverse f) (reverse m) (reverse o)
691

    
692
-- | Generate the valid node allocation singles or pairs for a new instance.
693
genAllocNodes :: Group.List        -- ^ Group list
694
              -> Node.List         -- ^ The node map
695
              -> Int               -- ^ The number of nodes required
696
              -> Bool              -- ^ Whether to drop or not
697
                                   -- unallocable nodes
698
              -> Result AllocNodes -- ^ The (monadic) result
699
genAllocNodes gl nl count drop_unalloc =
700
  let filter_fn = if drop_unalloc
701
                    then filter (Group.isAllocable .
702
                                 flip Container.find gl . Node.group)
703
                    else id
704
      all_nodes = filter_fn $ getOnline nl
705
      all_pairs = [(Node.idx p,
706
                    [Node.idx s | s <- all_nodes,
707
                                       Node.idx p /= Node.idx s,
708
                                       Node.group p == Node.group s]) |
709
                   p <- all_nodes]
710
  in case count of
711
       1 -> Ok (Left (map Node.idx all_nodes))
712
       2 -> Ok (Right (filter (not . null . snd) all_pairs))
713
       _ -> Bad "Unsupported number of nodes, only one or two  supported"
714

    
715
-- | Try to allocate an instance on the cluster.
716
tryAlloc :: (Monad m) =>
717
            Node.List         -- ^ The node list
718
         -> Instance.List     -- ^ The instance list
719
         -> Instance.Instance -- ^ The instance to allocate
720
         -> AllocNodes        -- ^ The allocation targets
721
         -> m AllocSolution   -- ^ Possible solution list
722
tryAlloc _  _ _    (Right []) = fail "Not enough online nodes"
723
tryAlloc nl _ inst (Right ok_pairs) =
724
  let psols = parMap rwhnf (\(p, ss) ->
725
                              foldl' (\cstate ->
726
                                        concatAllocs cstate .
727
                                        allocateOnPair nl inst p)
728
                              emptyAllocSolution ss) ok_pairs
729
      sols = foldl' sumAllocs emptyAllocSolution psols
730
  in return $ annotateSolution sols
731

    
732
tryAlloc _  _ _    (Left []) = fail "No online nodes"
733
tryAlloc nl _ inst (Left all_nodes) =
734
  let sols = foldl' (\cstate ->
735
                       concatAllocs cstate . allocateOnSingle nl inst
736
                    ) emptyAllocSolution all_nodes
737
  in return $ annotateSolution sols
738

    
739
-- | Given a group/result, describe it as a nice (list of) messages.
740
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
741
solutionDescription gl (groupId, result) =
742
  case result of
743
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
744
    Bad message -> [printf "Group %s: error %s" gname message]
745
  where grp = Container.find groupId gl
746
        gname = Group.name grp
747
        pol = allocPolicyToRaw (Group.allocPolicy grp)
748

    
749
-- | From a list of possibly bad and possibly empty solutions, filter
750
-- only the groups with a valid result. Note that the result will be
751
-- reversed compared to the original list.
752
filterMGResults :: Group.List
753
                -> [(Gdx, Result AllocSolution)]
754
                -> [(Gdx, AllocSolution)]
755
filterMGResults gl = foldl' fn []
756
  where unallocable = not . Group.isAllocable . flip Container.find gl
757
        fn accu (gdx, rasol) =
758
          case rasol of
759
            Bad _ -> accu
760
            Ok sol | isNothing (asSolution sol) -> accu
761
                   | unallocable gdx -> accu
762
                   | otherwise -> (gdx, sol):accu
763

    
764
-- | Sort multigroup results based on policy and score.
765
sortMGResults :: Group.List
766
             -> [(Gdx, AllocSolution)]
767
             -> [(Gdx, AllocSolution)]
768
sortMGResults gl sols =
769
  let extractScore (_, _, _, x) = x
770
      solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
771
                             (extractScore . fromJust . asSolution) sol)
772
  in sortBy (comparing solScore) sols
773

    
774
-- | Finds the best group for an instance on a multi-group cluster.
775
--
776
-- Only solutions in @preferred@ and @last_resort@ groups will be
777
-- accepted as valid, and additionally if the allowed groups parameter
778
-- is not null then allocation will only be run for those group
779
-- indices.
780
findBestAllocGroup :: Group.List           -- ^ The group list
781
                   -> Node.List            -- ^ The node list
782
                   -> Instance.List        -- ^ The instance list
783
                   -> Maybe [Gdx]          -- ^ The allowed groups
784
                   -> Instance.Instance    -- ^ The instance to allocate
785
                   -> Int                  -- ^ Required number of nodes
786
                   -> Result (Gdx, AllocSolution, [String])
787
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
788
  let groups = splitCluster mgnl mgil
789
      groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
790
                allowed_gdxs
791
      sols = map (\(gid, (nl, il)) ->
792
                   (gid, genAllocNodes mggl nl cnt False >>=
793
                       tryAlloc nl il inst))
794
             groups'::[(Gdx, Result AllocSolution)]
795
      all_msgs = concatMap (solutionDescription mggl) sols
796
      goodSols = filterMGResults mggl sols
797
      sortedSols = sortMGResults mggl goodSols
798
  in if null sortedSols
799
       then if null groups'
800
              then Bad $ "no groups for evacuation: allowed groups was" ++
801
                     show allowed_gdxs ++ ", all groups: " ++
802
                     show (map fst groups)
803
              else Bad $ intercalate ", " all_msgs
804
       else let (final_group, final_sol) = head sortedSols
805
            in return (final_group, final_sol, all_msgs)
806

    
807
-- | Try to allocate an instance on a multi-group cluster.
808
tryMGAlloc :: Group.List           -- ^ The group list
809
           -> Node.List            -- ^ The node list
810
           -> Instance.List        -- ^ The instance list
811
           -> Instance.Instance    -- ^ The instance to allocate
812
           -> Int                  -- ^ Required number of nodes
813
           -> Result AllocSolution -- ^ Possible solution list
814
tryMGAlloc mggl mgnl mgil inst cnt = do
815
  (best_group, solution, all_msgs) <-
816
      findBestAllocGroup mggl mgnl mgil Nothing inst cnt
817
  let group_name = Group.name $ Container.find best_group mggl
818
      selmsg = "Selected group: " ++ group_name
819
  return $ solution { asLog = selmsg:all_msgs }
820

    
821
-- | Function which fails if the requested mode is change secondary.
822
--
823
-- This is useful since except DRBD, no other disk template can
824
-- execute change secondary; thus, we can just call this function
825
-- instead of always checking for secondary mode. After the call to
826
-- this function, whatever mode we have is just a primary change.
827
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
828
failOnSecondaryChange ChangeSecondary dt =
829
  fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++
830
         "' can't execute change secondary"
831
failOnSecondaryChange _ _ = return ()
832

    
833
-- | Run evacuation for a single instance.
834
--
835
-- /Note:/ this function should correctly execute both intra-group
836
-- evacuations (in all modes) and inter-group evacuations (in the
837
-- 'ChangeAll' mode). Of course, this requires that the correct list
838
-- of target nodes is passed.
839
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
840
                 -> Instance.List     -- ^ Instance list (cluster-wide)
841
                 -> EvacMode          -- ^ The evacuation mode
842
                 -> Instance.Instance -- ^ The instance to be evacuated
843
                 -> Gdx               -- ^ The group we're targetting
844
                 -> [Ndx]             -- ^ The list of available nodes
845
                                      -- for allocation
846
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
847
nodeEvacInstance _ _ mode (Instance.Instance
848
                           {Instance.diskTemplate = dt@DTDiskless}) _ _ =
849
                  failOnSecondaryChange mode dt >>
850
                  fail "Diskless relocations not implemented yet"
851

    
852
nodeEvacInstance _ _ _ (Instance.Instance
853
                        {Instance.diskTemplate = DTPlain}) _ _ =
854
                  fail "Instances of type plain cannot be relocated"
855

    
856
nodeEvacInstance _ _ _ (Instance.Instance
857
                        {Instance.diskTemplate = DTFile}) _ _ =
858
                  fail "Instances of type file cannot be relocated"
859

    
860
nodeEvacInstance _ _ mode  (Instance.Instance
861
                            {Instance.diskTemplate = dt@DTSharedFile}) _ _ =
862
                  failOnSecondaryChange mode dt >>
863
                  fail "Shared file relocations not implemented yet"
864

    
865
nodeEvacInstance _ _ mode (Instance.Instance
866
                           {Instance.diskTemplate = dt@DTBlock}) _ _ =
867
                  failOnSecondaryChange mode dt >>
868
                  fail "Block device relocations not implemented yet"
869

    
870
nodeEvacInstance _ _ mode  (Instance.Instance
871
                            {Instance.diskTemplate = dt@DTRbd}) _ _ =
872
                  failOnSecondaryChange mode dt >>
873
                  fail "Rbd relocations not implemented yet"
874

    
875
nodeEvacInstance nl il ChangePrimary
876
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
877
                 _ _ =
878
  do
879
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
880
    let idx = Instance.idx inst
881
        il' = Container.add idx inst' il
882
        ops = iMoveToJob nl' il' idx Failover
883
    return (nl', il', ops)
884

    
885
nodeEvacInstance nl il ChangeSecondary
886
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
887
                 gdx avail_nodes =
888
  do
889
    (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
890
                            eitherToResult $
891
                            foldl' (evacDrbdSecondaryInner nl inst gdx)
892
                            (Left "no nodes available") avail_nodes
893
    let idx = Instance.idx inst
894
        il' = Container.add idx inst' il
895
        ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx)
896
    return (nl', il', ops)
897

    
898
-- The algorithm for ChangeAll is as follows:
899
--
900
-- * generate all (primary, secondary) node pairs for the target groups
901
-- * for each pair, execute the needed moves (r:s, f, r:s) and compute
902
--   the final node list state and group score
903
-- * select the best choice via a foldl that uses the same Either
904
--   String solution as the ChangeSecondary mode
905
nodeEvacInstance nl il ChangeAll
906
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
907
                 gdx avail_nodes =
908
  do
909
    let no_nodes = Left "no nodes available"
910
        node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
911
    (nl', il', ops, _) <-
912
        annotateResult "Can't find any good nodes for relocation" $
913
        eitherToResult $
914
        foldl'
915
        (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
916
                          Bad msg ->
917
                              case accu of
918
                                Right _ -> accu
919
                                -- we don't need more details (which
920
                                -- nodes, etc.) as we only selected
921
                                -- this group if we can allocate on
922
                                -- it, hence failures will not
923
                                -- propagate out of this fold loop
924
                                Left _ -> Left $ "Allocation failed: " ++ msg
925
                          Ok result@(_, _, _, new_cv) ->
926
                              let new_accu = Right result in
927
                              case accu of
928
                                Left _ -> new_accu
929
                                Right (_, _, _, old_cv) ->
930
                                    if old_cv < new_cv
931
                                    then accu
932
                                    else new_accu
933
        ) no_nodes node_pairs
934

    
935
    return (nl', il', ops)
936

    
937
-- | Inner fold function for changing secondary of a DRBD instance.
938
--
939
-- The running solution is either a @Left String@, which means we
940
-- don't have yet a working solution, or a @Right (...)@, which
941
-- represents a valid solution; it holds the modified node list, the
942
-- modified instance (after evacuation), the score of that solution,
943
-- and the new secondary node index.
944
evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list
945
                       -> Instance.Instance -- ^ Instance being evacuated
946
                       -> Gdx -- ^ The group index of the instance
947
                       -> Either String ( Node.List
948
                                        , Instance.Instance
949
                                        , Score
950
                                        , Ndx)  -- ^ Current best solution
951
                       -> Ndx  -- ^ Node we're evaluating as new secondary
952
                       -> Either String ( Node.List
953
                                        , Instance.Instance
954
                                        , Score
955
                                        , Ndx) -- ^ New best solution
956
evacDrbdSecondaryInner nl inst gdx accu ndx =
957
  case applyMove nl inst (ReplaceSecondary ndx) of
958
    OpFail fm ->
959
      case accu of
960
        Right _ -> accu
961
        Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
962
                  " failed: " ++ show fm
963
    OpGood (nl', inst', _, _) ->
964
      let nodes = Container.elems nl'
965
          -- The fromJust below is ugly (it can fail nastily), but
966
          -- at this point we should have any internal mismatches,
967
          -- and adding a monad here would be quite involved
968
          grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
969
          new_cv = compCVNodes grpnodes
970
          new_accu = Right (nl', inst', new_cv, ndx)
971
      in case accu of
972
           Left _ -> new_accu
973
           Right (_, _, old_cv, _) ->
974
             if old_cv < new_cv
975
               then accu
976
               else new_accu
977

    
978
-- | Compute result of changing all nodes of a DRBD instance.
979
--
980
-- Given the target primary and secondary node (which might be in a
981
-- different group or not), this function will 'execute' all the
982
-- required steps and assuming all operations succceed, will return
983
-- the modified node and instance lists, the opcodes needed for this
984
-- and the new group score.
985
evacDrbdAllInner :: Node.List         -- ^ Cluster node list
986
                 -> Instance.List     -- ^ Cluster instance list
987
                 -> Instance.Instance -- ^ The instance to be moved
988
                 -> Gdx               -- ^ The target group index
989
                                      -- (which can differ from the
990
                                      -- current group of the
991
                                      -- instance)
992
                 -> (Ndx, Ndx)        -- ^ Tuple of new
993
                                      -- primary\/secondary nodes
994
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
995
evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
996
  let primary = Container.find (Instance.pNode inst) nl
997
      idx = Instance.idx inst
998
  -- if the primary is offline, then we first failover
999
  (nl1, inst1, ops1) <-
1000
    if Node.offline primary
1001
      then do
1002
        (nl', inst', _, _) <-
1003
          annotateResult "Failing over to the secondary" $
1004
          opToResult $ applyMove nl inst Failover
1005
        return (nl', inst', [Failover])
1006
      else return (nl, inst, [])
1007
  let (o1, o2, o3) = (ReplaceSecondary t_pdx,
1008
                      Failover,
1009
                      ReplaceSecondary t_sdx)
1010
  -- we now need to execute a replace secondary to the future
1011
  -- primary node
1012
  (nl2, inst2, _, _) <-
1013
    annotateResult "Changing secondary to new primary" $
1014
    opToResult $
1015
    applyMove nl1 inst1 o1
1016
  let ops2 = o1:ops1
1017
  -- we now execute another failover, the primary stays fixed now
1018
  (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
1019
                        opToResult $ applyMove nl2 inst2 o2
1020
  let ops3 = o2:ops2
1021
  -- and finally another replace secondary, to the final secondary
1022
  (nl4, inst4, _, _) <-
1023
    annotateResult "Changing secondary to final secondary" $
1024
    opToResult $
1025
    applyMove nl3 inst3 o3
1026
  let ops4 = o3:ops3
1027
      il' = Container.add idx inst4 il
1028
      ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1029
  let nodes = Container.elems nl4
1030
      -- The fromJust below is ugly (it can fail nastily), but
1031
      -- at this point we should have any internal mismatches,
1032
      -- and adding a monad here would be quite involved
1033
      grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1034
      new_cv = compCVNodes grpnodes
1035
  return (nl4, il', ops, new_cv)
1036

    
1037
-- | Computes the nodes in a given group which are available for
1038
-- allocation.
1039
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1040
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
1041
                    -> Gdx            -- ^ The group for which we
1042
                                      -- query the nodes
1043
                    -> Result [Ndx]   -- ^ List of available node indices
1044
availableGroupNodes group_nodes excl_ndx gdx = do
1045
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1046
                 Ok (lookup gdx group_nodes)
1047
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1048
  return avail_nodes
1049

    
1050
-- | Updates the evac solution with the results of an instance
1051
-- evacuation.
1052
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1053
                   -> Idx
1054
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1055
                   -> (Node.List, Instance.List, EvacSolution)
1056
updateEvacSolution (nl, il, es) idx (Bad msg) =
1057
  (nl, il, es { esFailed = (idx, msg):esFailed es})
1058
updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1059
  (nl, il, es { esMoved = new_elem:esMoved es
1060
              , esOpCodes = opcodes:esOpCodes es })
1061
    where inst = Container.find idx il
1062
          new_elem = (idx,
1063
                      instancePriGroup nl inst,
1064
                      Instance.allNodes inst)
1065

    
1066
-- | Node-evacuation IAllocator mode main function.
1067
tryNodeEvac :: Group.List    -- ^ The cluster groups
1068
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
1069
            -> Instance.List -- ^ Instance list (cluster-wide)
1070
            -> EvacMode      -- ^ The evacuation mode
1071
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
1072
            -> Result (Node.List, Instance.List, EvacSolution)
1073
tryNodeEvac _ ini_nl ini_il mode idxs =
1074
  let evac_ndx = nodesToEvacuate ini_il mode idxs
1075
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1076
      excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1077
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1078
                                           (Container.elems nl))) $
1079
                  splitCluster ini_nl ini_il
1080
      (fin_nl, fin_il, esol) =
1081
        foldl' (\state@(nl, il, _) inst ->
1082
                  let gdx = instancePriGroup nl inst
1083
                      pdx = Instance.pNode inst in
1084
                  updateEvacSolution state (Instance.idx inst) $
1085
                  availableGroupNodes group_ndx
1086
                    (IntSet.insert pdx excl_ndx) gdx >>=
1087
                      nodeEvacInstance nl il mode inst gdx
1088
               )
1089
        (ini_nl, ini_il, emptyEvacSolution)
1090
        (map (`Container.find` ini_il) idxs)
1091
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1092

    
1093
-- | Change-group IAllocator mode main function.
1094
--
1095
-- This is very similar to 'tryNodeEvac', the only difference is that
1096
-- we don't choose as target group the current instance group, but
1097
-- instead:
1098
--
1099
--   1. at the start of the function, we compute which are the target
1100
--   groups; either no groups were passed in, in which case we choose
1101
--   all groups out of which we don't evacuate instance, or there were
1102
--   some groups passed, in which case we use those
1103
--
1104
--   2. for each instance, we use 'findBestAllocGroup' to choose the
1105
--   best group to hold the instance, and then we do what
1106
--   'tryNodeEvac' does, except for this group instead of the current
1107
--   instance group.
1108
--
1109
-- Note that the correct behaviour of this function relies on the
1110
-- function 'nodeEvacInstance' to be able to do correctly both
1111
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1112
tryChangeGroup :: Group.List    -- ^ The cluster groups
1113
               -> Node.List     -- ^ The node list (cluster-wide)
1114
               -> Instance.List -- ^ Instance list (cluster-wide)
1115
               -> [Gdx]         -- ^ Target groups; if empty, any
1116
                                -- groups not being evacuated
1117
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
1118
               -> Result (Node.List, Instance.List, EvacSolution)
1119
tryChangeGroup gl ini_nl ini_il gdxs idxs =
1120
  let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1121
                             flip Container.find ini_il) idxs
1122
      target_gdxs = (if null gdxs
1123
                       then Container.keys gl
1124
                       else gdxs) \\ evac_gdxs
1125
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1126
      excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1127
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1128
                                           (Container.elems nl))) $
1129
                  splitCluster ini_nl ini_il
1130
      (fin_nl, fin_il, esol) =
1131
        foldl' (\state@(nl, il, _) inst ->
1132
                  let solution = do
1133
                        let ncnt = Instance.requiredNodes $
1134
                                   Instance.diskTemplate inst
1135
                        (gdx, _, _) <- findBestAllocGroup gl nl il
1136
                                       (Just target_gdxs) inst ncnt
1137
                        av_nodes <- availableGroupNodes group_ndx
1138
                                    excl_ndx gdx
1139
                        nodeEvacInstance nl il ChangeAll inst gdx av_nodes
1140
                  in updateEvacSolution state (Instance.idx inst) solution
1141
               )
1142
        (ini_nl, ini_il, emptyEvacSolution)
1143
        (map (`Container.find` ini_il) idxs)
1144
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1145

    
1146
-- | Standard-sized allocation method.
1147
--
1148
-- This places instances of the same size on the cluster until we're
1149
-- out of space. The result will be a list of identically-sized
1150
-- instances.
1151
iterateAlloc :: AllocMethod
1152
iterateAlloc nl il limit newinst allocnodes ixes cstats =
1153
  let depth = length ixes
1154
      newname = printf "new-%d" depth::String
1155
      newidx = Container.size il
1156
      newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1157
      newlimit = fmap (flip (-) 1) limit
1158
  in case tryAlloc nl il newi2 allocnodes of
1159
       Bad s -> Bad s
1160
       Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
1161
         let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1162
         case sols3 of
1163
           Nothing -> newsol
1164
           Just (xnl, xi, _, _) ->
1165
             if limit == Just 0
1166
               then newsol
1167
               else iterateAlloc xnl (Container.add newidx xi il)
1168
                      newlimit newinst allocnodes (xi:ixes)
1169
                      (totalResources xnl:cstats)
1170

    
1171
-- | Tiered allocation method.
1172
--
1173
-- This places instances on the cluster, and decreases the spec until
1174
-- we can allocate again. The result will be a list of decreasing
1175
-- instance specs.
1176
tieredAlloc :: AllocMethod
1177
tieredAlloc nl il limit newinst allocnodes ixes cstats =
1178
  case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1179
    Bad s -> Bad s
1180
    Ok (errs, nl', il', ixes', cstats') ->
1181
      let newsol = Ok (errs, nl', il', ixes', cstats')
1182
          ixes_cnt = length ixes'
1183
          (stop, newlimit) = case limit of
1184
                               Nothing -> (False, Nothing)
1185
                               Just n -> (n <= ixes_cnt,
1186
                                            Just (n - ixes_cnt)) in
1187
      if stop then newsol else
1188
          case Instance.shrinkByType newinst . fst . last $
1189
               sortBy (comparing snd) errs of
1190
            Bad _ -> newsol
1191
            Ok newinst' -> tieredAlloc nl' il' newlimit
1192
                           newinst' allocnodes ixes' cstats'
1193

    
1194
-- * Formatting functions
1195

    
1196
-- | Given the original and final nodes, computes the relocation description.
1197
computeMoves :: Instance.Instance -- ^ The instance to be moved
1198
             -> String -- ^ The instance name
1199
             -> IMove  -- ^ The move being performed
1200
             -> String -- ^ New primary
1201
             -> String -- ^ New secondary
1202
             -> (String, [String])
1203
                -- ^ Tuple of moves and commands list; moves is containing
1204
                -- either @/f/@ for failover or @/r:name/@ for replace
1205
                -- secondary, while the command list holds gnt-instance
1206
                -- commands (without that prefix), e.g \"@failover instance1@\"
1207
computeMoves i inam mv c d =
1208
  case mv of
1209
    Failover -> ("f", [mig])
1210
    FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1211
    ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1212
    ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1213
    ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1214
  where morf = if Instance.isRunning i then "migrate" else "failover"
1215
        mig = printf "%s -f %s" morf inam::String
1216
        rep n = printf "replace-disks -n %s %s" n inam
1217

    
1218
-- | Converts a placement to string format.
1219
printSolutionLine :: Node.List     -- ^ The node list
1220
                  -> Instance.List -- ^ The instance list
1221
                  -> Int           -- ^ Maximum node name length
1222
                  -> Int           -- ^ Maximum instance name length
1223
                  -> Placement     -- ^ The current placement
1224
                  -> Int           -- ^ The index of the placement in
1225
                                   -- the solution
1226
                  -> (String, [String])
1227
printSolutionLine nl il nmlen imlen plc pos =
1228
  let pmlen = (2*nmlen + 1)
1229
      (i, p, s, mv, c) = plc
1230
      inst = Container.find i il
1231
      inam = Instance.alias inst
1232
      npri = Node.alias $ Container.find p nl
1233
      nsec = Node.alias $ Container.find s nl
1234
      opri = Node.alias $ Container.find (Instance.pNode inst) nl
1235
      osec = Node.alias $ Container.find (Instance.sNode inst) nl
1236
      (moves, cmds) =  computeMoves inst inam mv npri nsec
1237
      ostr = printf "%s:%s" opri osec::String
1238
      nstr = printf "%s:%s" npri nsec::String
1239
  in (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
1240
      pos imlen inam pmlen ostr
1241
      pmlen nstr c moves,
1242
      cmds)
1243

    
1244
-- | Return the instance and involved nodes in an instance move.
1245
--
1246
-- Note that the output list length can vary, and is not required nor
1247
-- guaranteed to be of any specific length.
1248
involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
1249
                               -- the instance from its index; note
1250
                               -- that this /must/ be the original
1251
                               -- instance list, so that we can
1252
                               -- retrieve the old nodes
1253
              -> Placement     -- ^ The placement we're investigating,
1254
                               -- containing the new nodes and
1255
                               -- instance index
1256
              -> [Ndx]         -- ^ Resulting list of node indices
1257
involvedNodes il plc =
1258
  let (i, np, ns, _, _) = plc
1259
      inst = Container.find i il
1260
  in nub $ [np, ns] ++ Instance.allNodes inst
1261

    
1262
-- | Inner function for splitJobs, that either appends the next job to
1263
-- the current jobset, or starts a new jobset.
1264
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1265
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1266
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1267
  | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1268
  | otherwise = ([n]:cjs, ndx)
1269

    
1270
-- | Break a list of moves into independent groups. Note that this
1271
-- will reverse the order of jobs.
1272
splitJobs :: [MoveJob] -> [JobSet]
1273
splitJobs = fst . foldl mergeJobs ([], [])
1274

    
1275
-- | Given a list of commands, prefix them with @gnt-instance@ and
1276
-- also beautify the display a little.
1277
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1278
formatJob jsn jsl (sn, (_, _, _, cmds)) =
1279
  let out =
1280
        printf "  echo job %d/%d" jsn sn:
1281
        printf "  check":
1282
        map ("  gnt-instance " ++) cmds
1283
  in if sn == 1
1284
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1285
       else out
1286

    
1287
-- | Given a list of commands, prefix them with @gnt-instance@ and
1288
-- also beautify the display a little.
1289
formatCmds :: [JobSet] -> String
1290
formatCmds =
1291
  unlines .
1292
  concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1293
                           (zip [1..] js)) .
1294
  zip [1..]
1295

    
1296
-- | Print the node list.
1297
printNodes :: Node.List -> [String] -> String
1298
printNodes nl fs =
1299
  let fields = case fs of
1300
                 [] -> Node.defaultFields
1301
                 "+":rest -> Node.defaultFields ++ rest
1302
                 _ -> fs
1303
      snl = sortBy (comparing Node.idx) (Container.elems nl)
1304
      (header, isnum) = unzip $ map Node.showHeader fields
1305
  in printTable "" header (map (Node.list fields) snl) isnum
1306

    
1307
-- | Print the instance list.
1308
printInsts :: Node.List -> Instance.List -> String
1309
printInsts nl il =
1310
  let sil = sortBy (comparing Instance.idx) (Container.elems il)
1311
      helper inst = [ if Instance.isRunning inst then "R" else " "
1312
                    , Instance.name inst
1313
                    , Container.nameOf nl (Instance.pNode inst)
1314
                    , let sdx = Instance.sNode inst
1315
                      in if sdx == Node.noSecondary
1316
                           then  ""
1317
                           else Container.nameOf nl sdx
1318
                    , if Instance.autoBalance inst then "Y" else "N"
1319
                    , printf "%3d" $ Instance.vcpus inst
1320
                    , printf "%5d" $ Instance.mem inst
1321
                    , printf "%5d" $ Instance.dsk inst `div` 1024
1322
                    , printf "%5.3f" lC
1323
                    , printf "%5.3f" lM
1324
                    , printf "%5.3f" lD
1325
                    , printf "%5.3f" lN
1326
                    ]
1327
          where DynUtil lC lM lD lN = Instance.util inst
1328
      header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1329
               , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1330
      isnum = False:False:False:False:False:repeat True
1331
  in printTable "" header (map helper sil) isnum
1332

    
1333
-- | Shows statistics for a given node list.
1334
printStats :: String -> Node.List -> String
1335
printStats lp nl =
1336
  let dcvs = compDetailedCV $ Container.elems nl
1337
      (weights, names) = unzip detailedCVInfo
1338
      hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1339
      header = [ "Field", "Value", "Weight" ]
1340
      formatted = map (\(w, h, val) ->
1341
                         [ h
1342
                         , printf "%.8f" val
1343
                         , printf "x%.2f" w
1344
                         ]) hd
1345
  in printTable lp header formatted $ False:repeat True
1346

    
1347
-- | Convert a placement into a list of OpCodes (basically a job).
1348
iMoveToJob :: Node.List        -- ^ The node list; only used for node
1349
                               -- names, so any version is good
1350
                               -- (before or after the operation)
1351
           -> Instance.List    -- ^ The instance list; also used for
1352
                               -- names only
1353
           -> Idx              -- ^ The index of the instance being
1354
                               -- moved
1355
           -> IMove            -- ^ The actual move to be described
1356
           -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
1357
                               -- the given move
1358
iMoveToJob nl il idx move =
1359
  let inst = Container.find idx il
1360
      iname = Instance.name inst
1361
      lookNode  = Just . Container.nameOf nl
1362
      opF = OpCodes.OpInstanceMigrate iname True False True Nothing
1363
      opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1364
              OpCodes.ReplaceNewSecondary [] Nothing
1365
  in case move of
1366
       Failover -> [ opF ]
1367
       ReplacePrimary np -> [ opF, opR np, opF ]
1368
       ReplaceSecondary ns -> [ opR ns ]
1369
       ReplaceAndFailover np -> [ opR np, opF ]
1370
       FailoverAndReplace ns -> [ opF, opR ns ]
1371

    
1372
-- * Node group functions
1373

    
1374
-- | Computes the group of an instance.
1375
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1376
instanceGroup nl i =
1377
  let sidx = Instance.sNode i
1378
      pnode = Container.find (Instance.pNode i) nl
1379
      snode = if sidx == Node.noSecondary
1380
              then pnode
1381
              else Container.find sidx nl
1382
      pgroup = Node.group pnode
1383
      sgroup = Node.group snode
1384
  in if pgroup /= sgroup
1385
       then fail ("Instance placed accross two node groups, primary " ++
1386
                  show pgroup ++ ", secondary " ++ show sgroup)
1387
       else return pgroup
1388

    
1389
-- | Computes the group of an instance per the primary node.
1390
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1391
instancePriGroup nl i =
1392
  let pnode = Container.find (Instance.pNode i) nl
1393
  in  Node.group pnode
1394

    
1395
-- | Compute the list of badly allocated instances (split across node
1396
-- groups).
1397
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1398
findSplitInstances nl =
1399
  filter (not . isOk . instanceGroup nl) . Container.elems
1400

    
1401
-- | Splits a cluster into the component node groups.
1402
splitCluster :: Node.List -> Instance.List ->
1403
                [(Gdx, (Node.List, Instance.List))]
1404
splitCluster nl il =
1405
  let ngroups = Node.computeGroups (Container.elems nl)
1406
  in map (\(guuid, nodes) ->
1407
           let nidxs = map Node.idx nodes
1408
               nodes' = zip nidxs nodes
1409
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1410
           in (guuid, (Container.fromList nodes', instances))) ngroups
1411

    
1412
-- | Compute the list of nodes that are to be evacuated, given a list
1413
-- of instances and an evacuation mode.
1414
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1415
                -> EvacMode      -- ^ The evacuation mode we're using
1416
                -> [Idx]         -- ^ List of instance indices being evacuated
1417
                -> IntSet.IntSet -- ^ Set of node indices
1418
nodesToEvacuate il mode =
1419
  IntSet.delete Node.noSecondary .
1420
  foldl' (\ns idx ->
1421
            let i = Container.find idx il
1422
                pdx = Instance.pNode i
1423
                sdx = Instance.sNode i
1424
                dt = Instance.diskTemplate i
1425
                withSecondary = case dt of
1426
                                  DTDrbd8 -> IntSet.insert sdx ns
1427
                                  _ -> ns
1428
            in case mode of
1429
                 ChangePrimary   -> IntSet.insert pdx ns
1430
                 ChangeSecondary -> withSecondary
1431
                 ChangeAll       -> IntSet.insert pdx withSecondary
1432
         ) IntSet.empty