Statistics
| Branch: | Tag: | Revision:

root / htools / Ganeti / HTools / Cluster.hs @ bebe7a73

History | View | Annotate | Download (62 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011, 2012 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
  (
31
    -- * Types
32
    AllocSolution(..)
33
  , EvacSolution(..)
34
  , Table(..)
35
  , CStats(..)
36
  , AllocResult
37
  , AllocMethod
38
  -- * Generic functions
39
  , totalResources
40
  , computeAllocationDelta
41
  -- * First phase functions
42
  , computeBadItems
43
  -- * Second phase functions
44
  , printSolutionLine
45
  , formatCmds
46
  , involvedNodes
47
  , splitJobs
48
  -- * Display functions
49
  , printNodes
50
  , printInsts
51
  -- * Balacing functions
52
  , checkMove
53
  , doNextBalance
54
  , tryBalance
55
  , compCV
56
  , compCVNodes
57
  , compDetailedCV
58
  , printStats
59
  , iMoveToJob
60
  -- * IAllocator functions
61
  , genAllocNodes
62
  , tryAlloc
63
  , tryMGAlloc
64
  , tryNodeEvac
65
  , tryChangeGroup
66
  , collapseFailures
67
  -- * Allocation functions
68
  , iterateAlloc
69
  , tieredAlloc
70
  -- * Node group functions
71
  , instanceGroup
72
  , findSplitInstances
73
  , splitCluster
74
  ) where
75

    
76
import qualified Data.IntSet as IntSet
77
import Data.List
78
import Data.Maybe (fromJust, isNothing)
79
import Data.Ord (comparing)
80
import Text.Printf (printf)
81

    
82
import qualified Ganeti.HTools.Container as Container
83
import qualified Ganeti.HTools.Instance as Instance
84
import qualified Ganeti.HTools.Node as Node
85
import qualified Ganeti.HTools.Group as Group
86
import Ganeti.HTools.Types
87
import Ganeti.HTools.Utils
88
import Ganeti.HTools.Compat
89
import qualified Ganeti.OpCodes as OpCodes
90

    
91
-- * Types
92

    
93
-- | Allocation\/relocation solution.
94
data AllocSolution = AllocSolution
95
  { asFailures :: [FailMode]              -- ^ Failure counts
96
  , asAllocs   :: Int                     -- ^ Good allocation count
97
  , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
98
  , asLog      :: [String]                -- ^ Informational messages
99
  }
100

    
101
-- | Node evacuation/group change iallocator result type. This result
102
-- type consists of actual opcodes (a restricted subset) that are
103
-- transmitted back to Ganeti.
104
data EvacSolution = EvacSolution
105
  { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
106
  , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
107
                                      -- relocated
108
  , esOpCodes :: [[OpCodes.OpCode]]   -- ^ List of jobs
109
  } deriving (Show)
110

    
111
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
112
type AllocResult = (FailStats, Node.List, Instance.List,
113
                    [Instance.Instance], [CStats])
114

    
115
-- | A type denoting the valid allocation mode/pairs.
116
--
117
-- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
118
-- for a two-node allocation, this will be a @Right [('Ndx',
119
-- ['Ndx'])]@. In the latter case, the list is basically an
120
-- association list, grouped by primary node and holding the potential
121
-- secondary nodes in the sub-list.
122
type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
123

    
124
-- | The empty solution we start with when computing allocations.
125
emptyAllocSolution :: AllocSolution
126
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
127
                                   , asSolution = Nothing, asLog = [] }
128

    
129
-- | The empty evac solution.
130
emptyEvacSolution :: EvacSolution
131
emptyEvacSolution = EvacSolution { esMoved = []
132
                                 , esFailed = []
133
                                 , esOpCodes = []
134
                                 }
135

    
136
-- | The complete state for the balancing solution.
137
data Table = Table Node.List Instance.List Score [Placement]
138
             deriving (Show, Read)
139

    
140
-- | Cluster statistics data type.
141
data CStats = CStats
142
  { csFmem :: Integer -- ^ Cluster free mem
143
  , csFdsk :: Integer -- ^ Cluster free disk
144
  , csAmem :: Integer -- ^ Cluster allocatable mem
145
  , csAdsk :: Integer -- ^ Cluster allocatable disk
146
  , csAcpu :: Integer -- ^ Cluster allocatable cpus
147
  , csMmem :: Integer -- ^ Max node allocatable mem
148
  , csMdsk :: Integer -- ^ Max node allocatable disk
149
  , csMcpu :: Integer -- ^ Max node allocatable cpu
150
  , csImem :: Integer -- ^ Instance used mem
151
  , csIdsk :: Integer -- ^ Instance used disk
152
  , csIcpu :: Integer -- ^ Instance used cpu
153
  , csTmem :: Double  -- ^ Cluster total mem
154
  , csTdsk :: Double  -- ^ Cluster total disk
155
  , csTcpu :: Double  -- ^ Cluster total cpus
156
  , csVcpu :: Integer -- ^ Cluster total virtual cpus
157
  , csNcpu :: Double  -- ^ Equivalent to 'csIcpu' but in terms of
158
                      -- physical CPUs, i.e. normalised used phys CPUs
159
  , csXmem :: Integer -- ^ Unnacounted for mem
160
  , csNmem :: Integer -- ^ Node own memory
161
  , csScore :: Score  -- ^ The cluster score
162
  , csNinst :: Int    -- ^ The total number of instances
163
  } deriving (Show, Read)
164

    
165
-- | A simple type for allocation functions.
166
type AllocMethod =  Node.List           -- ^ Node list
167
                 -> Instance.List       -- ^ Instance list
168
                 -> Maybe Int           -- ^ Optional allocation limit
169
                 -> Instance.Instance   -- ^ Instance spec for allocation
170
                 -> AllocNodes          -- ^ Which nodes we should allocate on
171
                 -> [Instance.Instance] -- ^ Allocated instances
172
                 -> [CStats]            -- ^ Running cluster stats
173
                 -> Result AllocResult  -- ^ Allocation result
174

    
175
-- | A simple type for the running solution of evacuations.
176
type EvacInnerState =
177
  Either String (Node.List, Instance.Instance, Score, Ndx)
178

    
179
-- * Utility functions
180

    
181
-- | Verifies the N+1 status and return the affected nodes.
182
verifyN1 :: [Node.Node] -> [Node.Node]
183
verifyN1 = filter Node.failN1
184

    
185
{-| Computes the pair of bad nodes and instances.
186

    
187
The bad node list is computed via a simple 'verifyN1' check, and the
188
bad instance list is the list of primary and secondary instances of
189
those nodes.
190

    
191
-}
192
computeBadItems :: Node.List -> Instance.List ->
193
                   ([Node.Node], [Instance.Instance])
194
computeBadItems nl il =
195
  let bad_nodes = verifyN1 $ getOnline nl
196
      bad_instances = map (`Container.find` il) .
197
                      sort . nub $
198
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
199
  in
200
    (bad_nodes, bad_instances)
201

    
202
-- | Extracts the node pairs for an instance. This can fail if the
203
-- instance is single-homed. FIXME: this needs to be improved,
204
-- together with the general enhancement for handling non-DRBD moves.
205
instanceNodes :: Node.List -> Instance.Instance ->
206
                 (Ndx, Ndx, Node.Node, Node.Node)
207
instanceNodes nl inst =
208
  let old_pdx = Instance.pNode inst
209
      old_sdx = Instance.sNode inst
210
      old_p = Container.find old_pdx nl
211
      old_s = Container.find old_sdx nl
212
  in (old_pdx, old_sdx, old_p, old_s)
213

    
214
-- | Zero-initializer for the CStats type.
215
emptyCStats :: CStats
216
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
217

    
218
-- | Update stats with data from a new node.
219
updateCStats :: CStats -> Node.Node -> CStats
220
updateCStats cs node =
221
  let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
222
               csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
223
               csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
224
               csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
225
               csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
226
               csVcpu = x_vcpu, csNcpu = x_ncpu,
227
               csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
228
             }
229
        = cs
230
      inc_amem = Node.fMem node - Node.rMem node
231
      inc_amem' = if inc_amem > 0 then inc_amem else 0
232
      inc_adsk = Node.availDisk node
233
      inc_imem = truncate (Node.tMem node) - Node.nMem node
234
                 - Node.xMem node - Node.fMem node
235
      inc_icpu = Node.uCpu node
236
      inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
237
      inc_vcpu = Node.hiCpu node
238
      inc_acpu = Node.availCpu node
239
      inc_ncpu = fromIntegral (Node.uCpu node) /
240
                 iPolicyVcpuRatio (Node.iPolicy node)
241
  in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
242
        , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
243
        , csAmem = x_amem + fromIntegral inc_amem'
244
        , csAdsk = x_adsk + fromIntegral inc_adsk
245
        , csAcpu = x_acpu + fromIntegral inc_acpu
246
        , csMmem = max x_mmem (fromIntegral inc_amem')
247
        , csMdsk = max x_mdsk (fromIntegral inc_adsk)
248
        , csMcpu = max x_mcpu (fromIntegral inc_acpu)
249
        , csImem = x_imem + fromIntegral inc_imem
250
        , csIdsk = x_idsk + fromIntegral inc_idsk
251
        , csIcpu = x_icpu + fromIntegral inc_icpu
252
        , csTmem = x_tmem + Node.tMem node
253
        , csTdsk = x_tdsk + Node.tDsk node
254
        , csTcpu = x_tcpu + Node.tCpu node
255
        , csVcpu = x_vcpu + fromIntegral inc_vcpu
256
        , csNcpu = x_ncpu + inc_ncpu
257
        , csXmem = x_xmem + fromIntegral (Node.xMem node)
258
        , csNmem = x_nmem + fromIntegral (Node.nMem node)
259
        , csNinst = x_ninst + length (Node.pList node)
260
        }
261

    
262
-- | Compute the total free disk and memory in the cluster.
263
totalResources :: Node.List -> CStats
264
totalResources nl =
265
  let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
266
  in cs { csScore = compCV nl }
267

    
268
-- | Compute the delta between two cluster state.
269
--
270
-- This is used when doing allocations, to understand better the
271
-- available cluster resources. The return value is a triple of the
272
-- current used values, the delta that was still allocated, and what
273
-- was left unallocated.
274
computeAllocationDelta :: CStats -> CStats -> AllocStats
275
computeAllocationDelta cini cfin =
276
  let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu,
277
              csNcpu = i_ncpu } = cini
278
      CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
279
              csTmem = t_mem, csTdsk = t_dsk, csVcpu = f_vcpu,
280
              csNcpu = f_ncpu, csTcpu = f_tcpu } = cfin
281
      rini = AllocInfo { allocInfoVCpus = fromIntegral i_icpu
282
                       , allocInfoNCpus = i_ncpu
283
                       , allocInfoMem   = fromIntegral i_imem
284
                       , allocInfoDisk  = fromIntegral i_idsk
285
                       }
286
      rfin = AllocInfo { allocInfoVCpus = fromIntegral (f_icpu - i_icpu)
287
                       , allocInfoNCpus = f_ncpu - i_ncpu
288
                       , allocInfoMem   = fromIntegral (f_imem - i_imem)
289
                       , allocInfoDisk  = fromIntegral (f_idsk - i_idsk)
290
                       }
291
      runa = AllocInfo { allocInfoVCpus = fromIntegral (f_vcpu - f_icpu)
292
                       , allocInfoNCpus = f_tcpu - f_ncpu
293
                       , allocInfoMem   = truncate t_mem - fromIntegral f_imem
294
                       , allocInfoDisk  = truncate t_dsk - fromIntegral f_idsk
295
                       }
296
  in (rini, rfin, runa)
297

    
298
-- | The names and weights of the individual elements in the CV list.
299
detailedCVInfo :: [(Double, String)]
300
detailedCVInfo = [ (1,  "free_mem_cv")
301
                 , (1,  "free_disk_cv")
302
                 , (1,  "n1_cnt")
303
                 , (1,  "reserved_mem_cv")
304
                 , (4,  "offline_all_cnt")
305
                 , (16, "offline_pri_cnt")
306
                 , (1,  "vcpu_ratio_cv")
307
                 , (1,  "cpu_load_cv")
308
                 , (1,  "mem_load_cv")
309
                 , (1,  "disk_load_cv")
310
                 , (1,  "net_load_cv")
311
                 , (2,  "pri_tags_score")
312
                 , (1,  "spindles_cv")
313
                 ]
314

    
315
-- | Holds the weights used by 'compCVNodes' for each metric.
316
detailedCVWeights :: [Double]
317
detailedCVWeights = map fst detailedCVInfo
318

    
319
-- | Compute the mem and disk covariance.
320
compDetailedCV :: [Node.Node] -> [Double]
321
compDetailedCV all_nodes =
322
  let (offline, nodes) = partition Node.offline all_nodes
323
      mem_l = map Node.pMem nodes
324
      dsk_l = map Node.pDsk nodes
325
      -- metric: memory covariance
326
      mem_cv = stdDev mem_l
327
      -- metric: disk covariance
328
      dsk_cv = stdDev dsk_l
329
      -- metric: count of instances living on N1 failing nodes
330
      n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
331
                                                 length (Node.pList n)) .
332
                 filter Node.failN1 $ nodes :: Double
333
      res_l = map Node.pRem nodes
334
      -- metric: reserved memory covariance
335
      res_cv = stdDev res_l
336
      -- offline instances metrics
337
      offline_ipri = sum . map (length . Node.pList) $ offline
338
      offline_isec = sum . map (length . Node.sList) $ offline
339
      -- metric: count of instances on offline nodes
340
      off_score = fromIntegral (offline_ipri + offline_isec)::Double
341
      -- metric: count of primary instances on offline nodes (this
342
      -- helps with evacuation/failover of primary instances on
343
      -- 2-node clusters with one node offline)
344
      off_pri_score = fromIntegral offline_ipri::Double
345
      cpu_l = map Node.pCpu nodes
346
      -- metric: covariance of vcpu/pcpu ratio
347
      cpu_cv = stdDev cpu_l
348
      -- metrics: covariance of cpu, memory, disk and network load
349
      (c_load, m_load, d_load, n_load) =
350
        unzip4 $ map (\n ->
351
                      let DynUtil c1 m1 d1 n1 = Node.utilLoad n
352
                          DynUtil c2 m2 d2 n2 = Node.utilPool n
353
                      in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes
354
      -- metric: conflicting instance count
355
      pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
356
      pri_tags_score = fromIntegral pri_tags_inst::Double
357
      -- metric: spindles %
358
      spindles_cv = map (\n -> Node.instSpindles n / Node.hiSpindles n) nodes
359
  in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
360
     , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
361
     , pri_tags_score, stdDev spindles_cv ]
362

    
363
-- | Compute the /total/ variance.
364
compCVNodes :: [Node.Node] -> Double
365
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
366

    
367
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
368
compCV :: Node.List -> Double
369
compCV = compCVNodes . Container.elems
370

    
371
-- | Compute online nodes from a 'Node.List'.
372
getOnline :: Node.List -> [Node.Node]
373
getOnline = filter (not . Node.offline) . Container.elems
374

    
375
-- * Balancing functions
376

    
377
-- | Compute best table. Note that the ordering of the arguments is important.
378
compareTables :: Table -> Table -> Table
379
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
380
  if a_cv > b_cv then b else a
381

    
382
-- | Applies an instance move to a given node list and instance.
383
applyMove :: Node.List -> Instance.Instance
384
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
385
-- Failover (f)
386
applyMove nl inst Failover =
387
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
388
      int_p = Node.removePri old_p inst
389
      int_s = Node.removeSec old_s inst
390
      new_nl = do -- Maybe monad
391
        new_p <- Node.addPriEx (Node.offline old_p) int_s inst
392
        new_s <- Node.addSec int_p inst old_sdx
393
        let new_inst = Instance.setBoth inst old_sdx old_pdx
394
        return (Container.addTwo old_pdx new_s old_sdx new_p nl,
395
                new_inst, old_sdx, old_pdx)
396
  in new_nl
397

    
398
-- Replace the primary (f:, r:np, f)
399
applyMove nl inst (ReplacePrimary new_pdx) =
400
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
401
      tgt_n = Container.find new_pdx nl
402
      int_p = Node.removePri old_p inst
403
      int_s = Node.removeSec old_s inst
404
      force_p = Node.offline old_p
405
      new_nl = do -- Maybe monad
406
                  -- check that the current secondary can host the instance
407
                  -- during the migration
408
        tmp_s <- Node.addPriEx force_p int_s inst
409
        let tmp_s' = Node.removePri tmp_s inst
410
        new_p <- Node.addPriEx force_p tgt_n inst
411
        new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
412
        let new_inst = Instance.setPri inst new_pdx
413
        return (Container.add new_pdx new_p $
414
                Container.addTwo old_pdx int_p old_sdx new_s nl,
415
                new_inst, new_pdx, old_sdx)
416
  in new_nl
417

    
418
-- Replace the secondary (r:ns)
419
applyMove nl inst (ReplaceSecondary new_sdx) =
420
  let old_pdx = Instance.pNode inst
421
      old_sdx = Instance.sNode inst
422
      old_s = Container.find old_sdx nl
423
      tgt_n = Container.find new_sdx nl
424
      int_s = Node.removeSec old_s inst
425
      force_s = Node.offline old_s
426
      new_inst = Instance.setSec inst new_sdx
427
      new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
428
               \new_s -> return (Container.addTwo new_sdx
429
                                 new_s old_sdx int_s nl,
430
                                 new_inst, old_pdx, new_sdx)
431
  in new_nl
432

    
433
-- Replace the secondary and failover (r:np, f)
434
applyMove nl inst (ReplaceAndFailover new_pdx) =
435
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
436
      tgt_n = Container.find new_pdx nl
437
      int_p = Node.removePri old_p inst
438
      int_s = Node.removeSec old_s inst
439
      force_s = Node.offline old_s
440
      new_nl = do -- Maybe monad
441
        new_p <- Node.addPri tgt_n inst
442
        new_s <- Node.addSecEx force_s int_p inst new_pdx
443
        let new_inst = Instance.setBoth inst new_pdx old_pdx
444
        return (Container.add new_pdx new_p $
445
                Container.addTwo old_pdx new_s old_sdx int_s nl,
446
                new_inst, new_pdx, old_pdx)
447
  in new_nl
448

    
449
-- Failver and replace the secondary (f, r:ns)
450
applyMove nl inst (FailoverAndReplace new_sdx) =
451
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
452
      tgt_n = Container.find new_sdx nl
453
      int_p = Node.removePri old_p inst
454
      int_s = Node.removeSec old_s inst
455
      force_p = Node.offline old_p
456
      new_nl = do -- Maybe monad
457
        new_p <- Node.addPriEx force_p int_s inst
458
        new_s <- Node.addSecEx force_p tgt_n inst old_sdx
459
        let new_inst = Instance.setBoth inst old_sdx new_sdx
460
        return (Container.add new_sdx new_s $
461
                Container.addTwo old_sdx new_p old_pdx int_p nl,
462
                new_inst, old_sdx, new_sdx)
463
  in new_nl
464

    
465
-- | Tries to allocate an instance on one given node.
466
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
467
                 -> OpResult Node.AllocElement
468
allocateOnSingle nl inst new_pdx =
469
  let p = Container.find new_pdx nl
470
      new_inst = Instance.setBoth inst new_pdx Node.noSecondary
471
  in do
472
    Instance.instMatchesPolicy inst (Node.iPolicy p)
473
    new_p <- Node.addPri p inst
474
    let new_nl = Container.add new_pdx new_p nl
475
        new_score = compCV nl
476
    return (new_nl, new_inst, [new_p], new_score)
477

    
478
-- | Tries to allocate an instance on a given pair of nodes.
479
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
480
               -> OpResult Node.AllocElement
481
allocateOnPair nl inst new_pdx new_sdx =
482
  let tgt_p = Container.find new_pdx nl
483
      tgt_s = Container.find new_sdx nl
484
  in do
485
    Instance.instMatchesPolicy inst (Node.iPolicy tgt_p)
486
    new_p <- Node.addPri tgt_p inst
487
    new_s <- Node.addSec tgt_s inst new_pdx
488
    let new_inst = Instance.setBoth inst new_pdx new_sdx
489
        new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
490
    return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
491

    
492
-- | Tries to perform an instance move and returns the best table
493
-- between the original one and the new one.
494
checkSingleStep :: Table -- ^ The original table
495
                -> Instance.Instance -- ^ The instance to move
496
                -> Table -- ^ The current best table
497
                -> IMove -- ^ The move to apply
498
                -> Table -- ^ The final best table
499
checkSingleStep ini_tbl target cur_tbl move =
500
  let Table ini_nl ini_il _ ini_plc = ini_tbl
501
      tmp_resu = applyMove ini_nl target move
502
  in case tmp_resu of
503
       OpFail _ -> cur_tbl
504
       OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
505
         let tgt_idx = Instance.idx target
506
             upd_cvar = compCV upd_nl
507
             upd_il = Container.add tgt_idx new_inst ini_il
508
             upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
509
             upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
510
         in compareTables cur_tbl upd_tbl
511

    
512
-- | Given the status of the current secondary as a valid new node and
513
-- the current candidate target node, generate the possible moves for
514
-- a instance.
515
possibleMoves :: MirrorType -- ^ The mirroring type of the instance
516
              -> Bool       -- ^ Whether the secondary node is a valid new node
517
              -> Bool       -- ^ Whether we can change the primary node
518
              -> Ndx        -- ^ Target node candidate
519
              -> [IMove]    -- ^ List of valid result moves
520

    
521
possibleMoves MirrorNone _ _ _ = []
522

    
523
possibleMoves MirrorExternal _ _ _ = []
524

    
525
possibleMoves MirrorInternal _ False tdx =
526
  [ ReplaceSecondary tdx ]
527

    
528
possibleMoves MirrorInternal True True tdx =
529
  [ ReplaceSecondary tdx
530
  , ReplaceAndFailover tdx
531
  , ReplacePrimary tdx
532
  , FailoverAndReplace tdx
533
  ]
534

    
535
possibleMoves MirrorInternal False True tdx =
536
  [ ReplaceSecondary tdx
537
  , ReplaceAndFailover tdx
538
  ]
539

    
540
-- | Compute the best move for a given instance.
541
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
542
                  -> Bool              -- ^ Whether disk moves are allowed
543
                  -> Bool              -- ^ Whether instance moves are allowed
544
                  -> Table             -- ^ Original table
545
                  -> Instance.Instance -- ^ Instance to move
546
                  -> Table             -- ^ Best new table for this instance
547
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
548
  let opdx = Instance.pNode target
549
      osdx = Instance.sNode target
550
      bad_nodes = [opdx, osdx]
551
      nodes = filter (`notElem` bad_nodes) nodes_idx
552
      mir_type = templateMirrorType $ Instance.diskTemplate target
553
      use_secondary = elem osdx nodes_idx && inst_moves
554
      aft_failover = if mir_type == MirrorInternal && use_secondary
555
                       -- if drbd and allowed to failover
556
                       then checkSingleStep ini_tbl target ini_tbl Failover
557
                       else ini_tbl
558
      all_moves =
559
        if disk_moves
560
          then concatMap (possibleMoves mir_type use_secondary inst_moves)
561
               nodes
562
          else []
563
    in
564
      -- iterate over the possible nodes for this instance
565
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
566

    
567
-- | Compute the best next move.
568
checkMove :: [Ndx]               -- ^ Allowed target node indices
569
          -> Bool                -- ^ Whether disk moves are allowed
570
          -> Bool                -- ^ Whether instance moves are allowed
571
          -> Table               -- ^ The current solution
572
          -> [Instance.Instance] -- ^ List of instances still to move
573
          -> Table               -- ^ The new solution
574
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
575
  let Table _ _ _ ini_plc = ini_tbl
576
      -- we're using rwhnf from the Control.Parallel.Strategies
577
      -- package; we don't need to use rnf as that would force too
578
      -- much evaluation in single-threaded cases, and in
579
      -- multi-threaded case the weak head normal form is enough to
580
      -- spark the evaluation
581
      tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
582
                             inst_moves ini_tbl)
583
               victims
584
      -- iterate over all instances, computing the best move
585
      best_tbl = foldl' compareTables ini_tbl tables
586
      Table _ _ _ best_plc = best_tbl
587
  in if length best_plc == length ini_plc
588
       then ini_tbl -- no advancement
589
       else best_tbl
590

    
591
-- | Check if we are allowed to go deeper in the balancing.
592
doNextBalance :: Table     -- ^ The starting table
593
              -> Int       -- ^ Remaining length
594
              -> Score     -- ^ Score at which to stop
595
              -> Bool      -- ^ The resulting table and commands
596
doNextBalance ini_tbl max_rounds min_score =
597
  let Table _ _ ini_cv ini_plc = ini_tbl
598
      ini_plc_len = length ini_plc
599
  in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
600

    
601
-- | Run a balance move.
602
tryBalance :: Table       -- ^ The starting table
603
           -> Bool        -- ^ Allow disk moves
604
           -> Bool        -- ^ Allow instance moves
605
           -> Bool        -- ^ Only evacuate moves
606
           -> Score       -- ^ Min gain threshold
607
           -> Score       -- ^ Min gain
608
           -> Maybe Table -- ^ The resulting table and commands
609
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
610
    let Table ini_nl ini_il ini_cv _ = ini_tbl
611
        all_inst = Container.elems ini_il
612
        all_nodes = Container.elems ini_nl
613
        (offline_nodes, online_nodes) = partition Node.offline all_nodes
614
        all_inst' = if evac_mode
615
                      then let bad_nodes = map Node.idx offline_nodes
616
                           in filter (any (`elem` bad_nodes) .
617
                                          Instance.allNodes) all_inst
618
                      else all_inst
619
        reloc_inst = filter Instance.movable all_inst'
620
        node_idx = map Node.idx online_nodes
621
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
622
        (Table _ _ fin_cv _) = fin_tbl
623
    in
624
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
625
      then Just fin_tbl -- this round made success, return the new table
626
      else Nothing
627

    
628
-- * Allocation functions
629

    
630
-- | Build failure stats out of a list of failures.
631
collapseFailures :: [FailMode] -> FailStats
632
collapseFailures flst =
633
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
634
            [minBound..maxBound]
635

    
636
-- | Compares two Maybe AllocElement and chooses the besst score.
637
bestAllocElement :: Maybe Node.AllocElement
638
                 -> Maybe Node.AllocElement
639
                 -> Maybe Node.AllocElement
640
bestAllocElement a Nothing = a
641
bestAllocElement Nothing b = b
642
bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
643
  if ascore < bscore then a else b
644

    
645
-- | Update current Allocation solution and failure stats with new
646
-- elements.
647
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
648
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
649

    
650
concatAllocs as (OpGood ns) =
651
  let -- Choose the old or new solution, based on the cluster score
652
    cntok = asAllocs as
653
    osols = asSolution as
654
    nsols = bestAllocElement osols (Just ns)
655
    nsuc = cntok + 1
656
    -- Note: we force evaluation of nsols here in order to keep the
657
    -- memory profile low - we know that we will need nsols for sure
658
    -- in the next cycle, so we force evaluation of nsols, since the
659
    -- foldl' in the caller will only evaluate the tuple, but not the
660
    -- elements of the tuple
661
  in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
662

    
663
-- | Sums two 'AllocSolution' structures.
664
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
665
sumAllocs (AllocSolution aFails aAllocs aSols aLog)
666
          (AllocSolution bFails bAllocs bSols bLog) =
667
  -- note: we add b first, since usually it will be smaller; when
668
  -- fold'ing, a will grow and grow whereas b is the per-group
669
  -- result, hence smaller
670
  let nFails  = bFails ++ aFails
671
      nAllocs = aAllocs + bAllocs
672
      nSols   = bestAllocElement aSols bSols
673
      nLog    = bLog ++ aLog
674
  in AllocSolution nFails nAllocs nSols nLog
675

    
676
-- | Given a solution, generates a reasonable description for it.
677
describeSolution :: AllocSolution -> String
678
describeSolution as =
679
  let fcnt = asFailures as
680
      sols = asSolution as
681
      freasons =
682
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
683
        filter ((> 0) . snd) . collapseFailures $ fcnt
684
  in case sols of
685
     Nothing -> "No valid allocation solutions, failure reasons: " ++
686
                (if null fcnt then "unknown reasons" else freasons)
687
     Just (_, _, nodes, cv) ->
688
         printf ("score: %.8f, successes %d, failures %d (%s)" ++
689
                 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
690
               (intercalate "/" . map Node.name $ nodes)
691

    
692
-- | Annotates a solution with the appropriate string.
693
annotateSolution :: AllocSolution -> AllocSolution
694
annotateSolution as = as { asLog = describeSolution as : asLog as }
695

    
696
-- | Reverses an evacuation solution.
697
--
698
-- Rationale: we always concat the results to the top of the lists, so
699
-- for proper jobset execution, we should reverse all lists.
700
reverseEvacSolution :: EvacSolution -> EvacSolution
701
reverseEvacSolution (EvacSolution f m o) =
702
  EvacSolution (reverse f) (reverse m) (reverse o)
703

    
704
-- | Generate the valid node allocation singles or pairs for a new instance.
705
genAllocNodes :: Group.List        -- ^ Group list
706
              -> Node.List         -- ^ The node map
707
              -> Int               -- ^ The number of nodes required
708
              -> Bool              -- ^ Whether to drop or not
709
                                   -- unallocable nodes
710
              -> Result AllocNodes -- ^ The (monadic) result
711
genAllocNodes gl nl count drop_unalloc =
712
  let filter_fn = if drop_unalloc
713
                    then filter (Group.isAllocable .
714
                                 flip Container.find gl . Node.group)
715
                    else id
716
      all_nodes = filter_fn $ getOnline nl
717
      all_pairs = [(Node.idx p,
718
                    [Node.idx s | s <- all_nodes,
719
                                       Node.idx p /= Node.idx s,
720
                                       Node.group p == Node.group s]) |
721
                   p <- all_nodes]
722
  in case count of
723
       1 -> Ok (Left (map Node.idx all_nodes))
724
       2 -> Ok (Right (filter (not . null . snd) all_pairs))
725
       _ -> Bad "Unsupported number of nodes, only one or two  supported"
726

    
727
-- | Try to allocate an instance on the cluster.
728
tryAlloc :: (Monad m) =>
729
            Node.List         -- ^ The node list
730
         -> Instance.List     -- ^ The instance list
731
         -> Instance.Instance -- ^ The instance to allocate
732
         -> AllocNodes        -- ^ The allocation targets
733
         -> m AllocSolution   -- ^ Possible solution list
734
tryAlloc _  _ _    (Right []) = fail "Not enough online nodes"
735
tryAlloc nl _ inst (Right ok_pairs) =
736
  let psols = parMap rwhnf (\(p, ss) ->
737
                              foldl' (\cstate ->
738
                                        concatAllocs cstate .
739
                                        allocateOnPair nl inst p)
740
                              emptyAllocSolution ss) ok_pairs
741
      sols = foldl' sumAllocs emptyAllocSolution psols
742
  in return $ annotateSolution sols
743

    
744
tryAlloc _  _ _    (Left []) = fail "No online nodes"
745
tryAlloc nl _ inst (Left all_nodes) =
746
  let sols = foldl' (\cstate ->
747
                       concatAllocs cstate . allocateOnSingle nl inst
748
                    ) emptyAllocSolution all_nodes
749
  in return $ annotateSolution sols
750

    
751
-- | Given a group/result, describe it as a nice (list of) messages.
752
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
753
solutionDescription gl (groupId, result) =
754
  case result of
755
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
756
    Bad message -> [printf "Group %s: error %s" gname message]
757
  where grp = Container.find groupId gl
758
        gname = Group.name grp
759
        pol = allocPolicyToRaw (Group.allocPolicy grp)
760

    
761
-- | From a list of possibly bad and possibly empty solutions, filter
762
-- only the groups with a valid result. Note that the result will be
763
-- reversed compared to the original list.
764
filterMGResults :: Group.List
765
                -> [(Gdx, Result AllocSolution)]
766
                -> [(Gdx, AllocSolution)]
767
filterMGResults gl = foldl' fn []
768
  where unallocable = not . Group.isAllocable . flip Container.find gl
769
        fn accu (gdx, rasol) =
770
          case rasol of
771
            Bad _ -> accu
772
            Ok sol | isNothing (asSolution sol) -> accu
773
                   | unallocable gdx -> accu
774
                   | otherwise -> (gdx, sol):accu
775

    
776
-- | Sort multigroup results based on policy and score.
777
sortMGResults :: Group.List
778
             -> [(Gdx, AllocSolution)]
779
             -> [(Gdx, AllocSolution)]
780
sortMGResults gl sols =
781
  let extractScore (_, _, _, x) = x
782
      solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
783
                             (extractScore . fromJust . asSolution) sol)
784
  in sortBy (comparing solScore) sols
785

    
786
-- | Finds the best group for an instance on a multi-group cluster.
787
--
788
-- Only solutions in @preferred@ and @last_resort@ groups will be
789
-- accepted as valid, and additionally if the allowed groups parameter
790
-- is not null then allocation will only be run for those group
791
-- indices.
792
findBestAllocGroup :: Group.List           -- ^ The group list
793
                   -> Node.List            -- ^ The node list
794
                   -> Instance.List        -- ^ The instance list
795
                   -> Maybe [Gdx]          -- ^ The allowed groups
796
                   -> Instance.Instance    -- ^ The instance to allocate
797
                   -> Int                  -- ^ Required number of nodes
798
                   -> Result (Gdx, AllocSolution, [String])
799
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
800
  let groups = splitCluster mgnl mgil
801
      groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
802
                allowed_gdxs
803
      sols = map (\(gid, (nl, il)) ->
804
                   (gid, genAllocNodes mggl nl cnt False >>=
805
                       tryAlloc nl il inst))
806
             groups'::[(Gdx, Result AllocSolution)]
807
      all_msgs = concatMap (solutionDescription mggl) sols
808
      goodSols = filterMGResults mggl sols
809
      sortedSols = sortMGResults mggl goodSols
810
  in if null sortedSols
811
       then if null groups'
812
              then Bad $ "no groups for evacuation: allowed groups was" ++
813
                     show allowed_gdxs ++ ", all groups: " ++
814
                     show (map fst groups)
815
              else Bad $ intercalate ", " all_msgs
816
       else let (final_group, final_sol) = head sortedSols
817
            in return (final_group, final_sol, all_msgs)
818

    
819
-- | Try to allocate an instance on a multi-group cluster.
820
tryMGAlloc :: Group.List           -- ^ The group list
821
           -> Node.List            -- ^ The node list
822
           -> Instance.List        -- ^ The instance list
823
           -> Instance.Instance    -- ^ The instance to allocate
824
           -> Int                  -- ^ Required number of nodes
825
           -> Result AllocSolution -- ^ Possible solution list
826
tryMGAlloc mggl mgnl mgil inst cnt = do
827
  (best_group, solution, all_msgs) <-
828
      findBestAllocGroup mggl mgnl mgil Nothing inst cnt
829
  let group_name = Group.name $ Container.find best_group mggl
830
      selmsg = "Selected group: " ++ group_name
831
  return $ solution { asLog = selmsg:all_msgs }
832

    
833
-- | Function which fails if the requested mode is change secondary.
834
--
835
-- This is useful since except DRBD, no other disk template can
836
-- execute change secondary; thus, we can just call this function
837
-- instead of always checking for secondary mode. After the call to
838
-- this function, whatever mode we have is just a primary change.
839
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
840
failOnSecondaryChange ChangeSecondary dt =
841
  fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++
842
         "' can't execute change secondary"
843
failOnSecondaryChange _ _ = return ()
844

    
845
-- | Run evacuation for a single instance.
846
--
847
-- /Note:/ this function should correctly execute both intra-group
848
-- evacuations (in all modes) and inter-group evacuations (in the
849
-- 'ChangeAll' mode). Of course, this requires that the correct list
850
-- of target nodes is passed.
851
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
852
                 -> Instance.List     -- ^ Instance list (cluster-wide)
853
                 -> EvacMode          -- ^ The evacuation mode
854
                 -> Instance.Instance -- ^ The instance to be evacuated
855
                 -> Gdx               -- ^ The group we're targetting
856
                 -> [Ndx]             -- ^ The list of available nodes
857
                                      -- for allocation
858
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
859
nodeEvacInstance _ _ mode (Instance.Instance
860
                           {Instance.diskTemplate = dt@DTDiskless}) _ _ =
861
                  failOnSecondaryChange mode dt >>
862
                  fail "Diskless relocations not implemented yet"
863

    
864
nodeEvacInstance _ _ _ (Instance.Instance
865
                        {Instance.diskTemplate = DTPlain}) _ _ =
866
                  fail "Instances of type plain cannot be relocated"
867

    
868
nodeEvacInstance _ _ _ (Instance.Instance
869
                        {Instance.diskTemplate = DTFile}) _ _ =
870
                  fail "Instances of type file cannot be relocated"
871

    
872
nodeEvacInstance _ _ mode  (Instance.Instance
873
                            {Instance.diskTemplate = dt@DTSharedFile}) _ _ =
874
                  failOnSecondaryChange mode dt >>
875
                  fail "Shared file relocations not implemented yet"
876

    
877
nodeEvacInstance _ _ mode (Instance.Instance
878
                           {Instance.diskTemplate = dt@DTBlock}) _ _ =
879
                  failOnSecondaryChange mode dt >>
880
                  fail "Block device relocations not implemented yet"
881

    
882
nodeEvacInstance _ _ mode  (Instance.Instance
883
                            {Instance.diskTemplate = dt@DTRbd}) _ _ =
884
                  failOnSecondaryChange mode dt >>
885
                  fail "Rbd relocations not implemented yet"
886

    
887
nodeEvacInstance nl il ChangePrimary
888
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
889
                 _ _ =
890
  do
891
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
892
    let idx = Instance.idx inst
893
        il' = Container.add idx inst' il
894
        ops = iMoveToJob nl' il' idx Failover
895
    return (nl', il', ops)
896

    
897
nodeEvacInstance nl il ChangeSecondary
898
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
899
                 gdx avail_nodes =
900
  do
901
    (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
902
                            eitherToResult $
903
                            foldl' (evacDrbdSecondaryInner nl inst gdx)
904
                            (Left "no nodes available") avail_nodes
905
    let idx = Instance.idx inst
906
        il' = Container.add idx inst' il
907
        ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx)
908
    return (nl', il', ops)
909

    
910
-- The algorithm for ChangeAll is as follows:
911
--
912
-- * generate all (primary, secondary) node pairs for the target groups
913
-- * for each pair, execute the needed moves (r:s, f, r:s) and compute
914
--   the final node list state and group score
915
-- * select the best choice via a foldl that uses the same Either
916
--   String solution as the ChangeSecondary mode
917
nodeEvacInstance nl il ChangeAll
918
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
919
                 gdx avail_nodes =
920
  do
921
    let no_nodes = Left "no nodes available"
922
        node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
923
    (nl', il', ops, _) <-
924
        annotateResult "Can't find any good nodes for relocation" $
925
        eitherToResult $
926
        foldl'
927
        (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
928
                          Bad msg ->
929
                              case accu of
930
                                Right _ -> accu
931
                                -- we don't need more details (which
932
                                -- nodes, etc.) as we only selected
933
                                -- this group if we can allocate on
934
                                -- it, hence failures will not
935
                                -- propagate out of this fold loop
936
                                Left _ -> Left $ "Allocation failed: " ++ msg
937
                          Ok result@(_, _, _, new_cv) ->
938
                              let new_accu = Right result in
939
                              case accu of
940
                                Left _ -> new_accu
941
                                Right (_, _, _, old_cv) ->
942
                                    if old_cv < new_cv
943
                                    then accu
944
                                    else new_accu
945
        ) no_nodes node_pairs
946

    
947
    return (nl', il', ops)
948

    
949
-- | Inner fold function for changing secondary of a DRBD instance.
950
--
951
-- The running solution is either a @Left String@, which means we
952
-- don't have yet a working solution, or a @Right (...)@, which
953
-- represents a valid solution; it holds the modified node list, the
954
-- modified instance (after evacuation), the score of that solution,
955
-- and the new secondary node index.
956
evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list
957
                       -> Instance.Instance -- ^ Instance being evacuated
958
                       -> Gdx -- ^ The group index of the instance
959
                       -> EvacInnerState  -- ^ Current best solution
960
                       -> Ndx  -- ^ Node we're evaluating as new secondary
961
                       -> EvacInnerState -- ^ New best solution
962
evacDrbdSecondaryInner nl inst gdx accu ndx =
963
  case applyMove nl inst (ReplaceSecondary ndx) of
964
    OpFail fm ->
965
      case accu of
966
        Right _ -> accu
967
        Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
968
                  " failed: " ++ show fm
969
    OpGood (nl', inst', _, _) ->
970
      let nodes = Container.elems nl'
971
          -- The fromJust below is ugly (it can fail nastily), but
972
          -- at this point we should have any internal mismatches,
973
          -- and adding a monad here would be quite involved
974
          grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
975
          new_cv = compCVNodes grpnodes
976
          new_accu = Right (nl', inst', new_cv, ndx)
977
      in case accu of
978
           Left _ -> new_accu
979
           Right (_, _, old_cv, _) ->
980
             if old_cv < new_cv
981
               then accu
982
               else new_accu
983

    
984
-- | Compute result of changing all nodes of a DRBD instance.
985
--
986
-- Given the target primary and secondary node (which might be in a
987
-- different group or not), this function will 'execute' all the
988
-- required steps and assuming all operations succceed, will return
989
-- the modified node and instance lists, the opcodes needed for this
990
-- and the new group score.
991
evacDrbdAllInner :: Node.List         -- ^ Cluster node list
992
                 -> Instance.List     -- ^ Cluster instance list
993
                 -> Instance.Instance -- ^ The instance to be moved
994
                 -> Gdx               -- ^ The target group index
995
                                      -- (which can differ from the
996
                                      -- current group of the
997
                                      -- instance)
998
                 -> (Ndx, Ndx)        -- ^ Tuple of new
999
                                      -- primary\/secondary nodes
1000
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
1001
evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
1002
  let primary = Container.find (Instance.pNode inst) nl
1003
      idx = Instance.idx inst
1004
  -- if the primary is offline, then we first failover
1005
  (nl1, inst1, ops1) <-
1006
    if Node.offline primary
1007
      then do
1008
        (nl', inst', _, _) <-
1009
          annotateResult "Failing over to the secondary" $
1010
          opToResult $ applyMove nl inst Failover
1011
        return (nl', inst', [Failover])
1012
      else return (nl, inst, [])
1013
  let (o1, o2, o3) = (ReplaceSecondary t_pdx,
1014
                      Failover,
1015
                      ReplaceSecondary t_sdx)
1016
  -- we now need to execute a replace secondary to the future
1017
  -- primary node
1018
  (nl2, inst2, _, _) <-
1019
    annotateResult "Changing secondary to new primary" $
1020
    opToResult $
1021
    applyMove nl1 inst1 o1
1022
  let ops2 = o1:ops1
1023
  -- we now execute another failover, the primary stays fixed now
1024
  (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
1025
                        opToResult $ applyMove nl2 inst2 o2
1026
  let ops3 = o2:ops2
1027
  -- and finally another replace secondary, to the final secondary
1028
  (nl4, inst4, _, _) <-
1029
    annotateResult "Changing secondary to final secondary" $
1030
    opToResult $
1031
    applyMove nl3 inst3 o3
1032
  let ops4 = o3:ops3
1033
      il' = Container.add idx inst4 il
1034
      ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1035
  let nodes = Container.elems nl4
1036
      -- The fromJust below is ugly (it can fail nastily), but
1037
      -- at this point we should have any internal mismatches,
1038
      -- and adding a monad here would be quite involved
1039
      grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1040
      new_cv = compCVNodes grpnodes
1041
  return (nl4, il', ops, new_cv)
1042

    
1043
-- | Computes the nodes in a given group which are available for
1044
-- allocation.
1045
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1046
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
1047
                    -> Gdx            -- ^ The group for which we
1048
                                      -- query the nodes
1049
                    -> Result [Ndx]   -- ^ List of available node indices
1050
availableGroupNodes group_nodes excl_ndx gdx = do
1051
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1052
                 Ok (lookup gdx group_nodes)
1053
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1054
  return avail_nodes
1055

    
1056
-- | Updates the evac solution with the results of an instance
1057
-- evacuation.
1058
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1059
                   -> Idx
1060
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1061
                   -> (Node.List, Instance.List, EvacSolution)
1062
updateEvacSolution (nl, il, es) idx (Bad msg) =
1063
  (nl, il, es { esFailed = (idx, msg):esFailed es})
1064
updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1065
  (nl, il, es { esMoved = new_elem:esMoved es
1066
              , esOpCodes = opcodes:esOpCodes es })
1067
    where inst = Container.find idx il
1068
          new_elem = (idx,
1069
                      instancePriGroup nl inst,
1070
                      Instance.allNodes inst)
1071

    
1072
-- | Node-evacuation IAllocator mode main function.
1073
tryNodeEvac :: Group.List    -- ^ The cluster groups
1074
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
1075
            -> Instance.List -- ^ Instance list (cluster-wide)
1076
            -> EvacMode      -- ^ The evacuation mode
1077
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
1078
            -> Result (Node.List, Instance.List, EvacSolution)
1079
tryNodeEvac _ ini_nl ini_il mode idxs =
1080
  let evac_ndx = nodesToEvacuate ini_il mode idxs
1081
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1082
      excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1083
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1084
                                           (Container.elems nl))) $
1085
                  splitCluster ini_nl ini_il
1086
      (fin_nl, fin_il, esol) =
1087
        foldl' (\state@(nl, il, _) inst ->
1088
                  let gdx = instancePriGroup nl inst
1089
                      pdx = Instance.pNode inst in
1090
                  updateEvacSolution state (Instance.idx inst) $
1091
                  availableGroupNodes group_ndx
1092
                    (IntSet.insert pdx excl_ndx) gdx >>=
1093
                      nodeEvacInstance nl il mode inst gdx
1094
               )
1095
        (ini_nl, ini_il, emptyEvacSolution)
1096
        (map (`Container.find` ini_il) idxs)
1097
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1098

    
1099
-- | Change-group IAllocator mode main function.
1100
--
1101
-- This is very similar to 'tryNodeEvac', the only difference is that
1102
-- we don't choose as target group the current instance group, but
1103
-- instead:
1104
--
1105
--   1. at the start of the function, we compute which are the target
1106
--   groups; either no groups were passed in, in which case we choose
1107
--   all groups out of which we don't evacuate instance, or there were
1108
--   some groups passed, in which case we use those
1109
--
1110
--   2. for each instance, we use 'findBestAllocGroup' to choose the
1111
--   best group to hold the instance, and then we do what
1112
--   'tryNodeEvac' does, except for this group instead of the current
1113
--   instance group.
1114
--
1115
-- Note that the correct behaviour of this function relies on the
1116
-- function 'nodeEvacInstance' to be able to do correctly both
1117
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1118
tryChangeGroup :: Group.List    -- ^ The cluster groups
1119
               -> Node.List     -- ^ The node list (cluster-wide)
1120
               -> Instance.List -- ^ Instance list (cluster-wide)
1121
               -> [Gdx]         -- ^ Target groups; if empty, any
1122
                                -- groups not being evacuated
1123
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
1124
               -> Result (Node.List, Instance.List, EvacSolution)
1125
tryChangeGroup gl ini_nl ini_il gdxs idxs =
1126
  let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1127
                             flip Container.find ini_il) idxs
1128
      target_gdxs = (if null gdxs
1129
                       then Container.keys gl
1130
                       else gdxs) \\ evac_gdxs
1131
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1132
      excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1133
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1134
                                           (Container.elems nl))) $
1135
                  splitCluster ini_nl ini_il
1136
      (fin_nl, fin_il, esol) =
1137
        foldl' (\state@(nl, il, _) inst ->
1138
                  let solution = do
1139
                        let ncnt = Instance.requiredNodes $
1140
                                   Instance.diskTemplate inst
1141
                        (gdx, _, _) <- findBestAllocGroup gl nl il
1142
                                       (Just target_gdxs) inst ncnt
1143
                        av_nodes <- availableGroupNodes group_ndx
1144
                                    excl_ndx gdx
1145
                        nodeEvacInstance nl il ChangeAll inst gdx av_nodes
1146
                  in updateEvacSolution state (Instance.idx inst) solution
1147
               )
1148
        (ini_nl, ini_il, emptyEvacSolution)
1149
        (map (`Container.find` ini_il) idxs)
1150
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1151

    
1152
-- | Standard-sized allocation method.
1153
--
1154
-- This places instances of the same size on the cluster until we're
1155
-- out of space. The result will be a list of identically-sized
1156
-- instances.
1157
iterateAlloc :: AllocMethod
1158
iterateAlloc nl il limit newinst allocnodes ixes cstats =
1159
  let depth = length ixes
1160
      newname = printf "new-%d" depth::String
1161
      newidx = Container.size il
1162
      newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1163
      newlimit = fmap (flip (-) 1) limit
1164
  in case tryAlloc nl il newi2 allocnodes of
1165
       Bad s -> Bad s
1166
       Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
1167
         let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1168
         case sols3 of
1169
           Nothing -> newsol
1170
           Just (xnl, xi, _, _) ->
1171
             if limit == Just 0
1172
               then newsol
1173
               else iterateAlloc xnl (Container.add newidx xi il)
1174
                      newlimit newinst allocnodes (xi:ixes)
1175
                      (totalResources xnl:cstats)
1176

    
1177
-- | Tiered allocation method.
1178
--
1179
-- This places instances on the cluster, and decreases the spec until
1180
-- we can allocate again. The result will be a list of decreasing
1181
-- instance specs.
1182
tieredAlloc :: AllocMethod
1183
tieredAlloc nl il limit newinst allocnodes ixes cstats =
1184
  case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1185
    Bad s -> Bad s
1186
    Ok (errs, nl', il', ixes', cstats') ->
1187
      let newsol = Ok (errs, nl', il', ixes', cstats')
1188
          ixes_cnt = length ixes'
1189
          (stop, newlimit) = case limit of
1190
                               Nothing -> (False, Nothing)
1191
                               Just n -> (n <= ixes_cnt,
1192
                                            Just (n - ixes_cnt)) in
1193
      if stop then newsol else
1194
          case Instance.shrinkByType newinst . fst . last $
1195
               sortBy (comparing snd) errs of
1196
            Bad _ -> newsol
1197
            Ok newinst' -> tieredAlloc nl' il' newlimit
1198
                           newinst' allocnodes ixes' cstats'
1199

    
1200
-- * Formatting functions
1201

    
1202
-- | Given the original and final nodes, computes the relocation description.
1203
computeMoves :: Instance.Instance -- ^ The instance to be moved
1204
             -> String -- ^ The instance name
1205
             -> IMove  -- ^ The move being performed
1206
             -> String -- ^ New primary
1207
             -> String -- ^ New secondary
1208
             -> (String, [String])
1209
                -- ^ Tuple of moves and commands list; moves is containing
1210
                -- either @/f/@ for failover or @/r:name/@ for replace
1211
                -- secondary, while the command list holds gnt-instance
1212
                -- commands (without that prefix), e.g \"@failover instance1@\"
1213
computeMoves i inam mv c d =
1214
  case mv of
1215
    Failover -> ("f", [mig])
1216
    FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1217
    ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1218
    ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1219
    ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1220
  where morf = if Instance.isRunning i then "migrate" else "failover"
1221
        mig = printf "%s -f %s" morf inam::String
1222
        rep n = printf "replace-disks -n %s %s" n inam
1223

    
1224
-- | Converts a placement to string format.
1225
printSolutionLine :: Node.List     -- ^ The node list
1226
                  -> Instance.List -- ^ The instance list
1227
                  -> Int           -- ^ Maximum node name length
1228
                  -> Int           -- ^ Maximum instance name length
1229
                  -> Placement     -- ^ The current placement
1230
                  -> Int           -- ^ The index of the placement in
1231
                                   -- the solution
1232
                  -> (String, [String])
1233
printSolutionLine nl il nmlen imlen plc pos =
1234
  let pmlen = (2*nmlen + 1)
1235
      (i, p, s, mv, c) = plc
1236
      old_sec = Instance.sNode inst
1237
      inst = Container.find i il
1238
      inam = Instance.alias inst
1239
      npri = Node.alias $ Container.find p nl
1240
      nsec = Node.alias $ Container.find s nl
1241
      opri = Node.alias $ Container.find (Instance.pNode inst) nl
1242
      osec = Node.alias $ Container.find old_sec nl
1243
      (moves, cmds) =  computeMoves inst inam mv npri nsec
1244
      -- FIXME: this should check instead/also the disk template
1245
      ostr = if old_sec == Node.noSecondary
1246
               then printf "%s" opri
1247
               else printf "%s:%s" opri osec
1248
      nstr = if s == Node.noSecondary
1249
               then printf "%s" npri
1250
               else printf "%s:%s" npri nsec
1251
  in (printf "  %3d. %-*s %-*s => %-*s %12.8f a=%s"
1252
      pos imlen inam pmlen (ostr::String)
1253
      pmlen (nstr::String) c moves,
1254
      cmds)
1255

    
1256
-- | Return the instance and involved nodes in an instance move.
1257
--
1258
-- Note that the output list length can vary, and is not required nor
1259
-- guaranteed to be of any specific length.
1260
involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
1261
                               -- the instance from its index; note
1262
                               -- that this /must/ be the original
1263
                               -- instance list, so that we can
1264
                               -- retrieve the old nodes
1265
              -> Placement     -- ^ The placement we're investigating,
1266
                               -- containing the new nodes and
1267
                               -- instance index
1268
              -> [Ndx]         -- ^ Resulting list of node indices
1269
involvedNodes il plc =
1270
  let (i, np, ns, _, _) = plc
1271
      inst = Container.find i il
1272
  in nub $ [np, ns] ++ Instance.allNodes inst
1273

    
1274
-- | Inner function for splitJobs, that either appends the next job to
1275
-- the current jobset, or starts a new jobset.
1276
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1277
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1278
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1279
  | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1280
  | otherwise = ([n]:cjs, ndx)
1281

    
1282
-- | Break a list of moves into independent groups. Note that this
1283
-- will reverse the order of jobs.
1284
splitJobs :: [MoveJob] -> [JobSet]
1285
splitJobs = fst . foldl mergeJobs ([], [])
1286

    
1287
-- | Given a list of commands, prefix them with @gnt-instance@ and
1288
-- also beautify the display a little.
1289
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1290
formatJob jsn jsl (sn, (_, _, _, cmds)) =
1291
  let out =
1292
        printf "  echo job %d/%d" jsn sn:
1293
        printf "  check":
1294
        map ("  gnt-instance " ++) cmds
1295
  in if sn == 1
1296
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1297
       else out
1298

    
1299
-- | Given a list of commands, prefix them with @gnt-instance@ and
1300
-- also beautify the display a little.
1301
formatCmds :: [JobSet] -> String
1302
formatCmds =
1303
  unlines .
1304
  concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1305
                           (zip [1..] js)) .
1306
  zip [1..]
1307

    
1308
-- | Print the node list.
1309
printNodes :: Node.List -> [String] -> String
1310
printNodes nl fs =
1311
  let fields = case fs of
1312
                 [] -> Node.defaultFields
1313
                 "+":rest -> Node.defaultFields ++ rest
1314
                 _ -> fs
1315
      snl = sortBy (comparing Node.idx) (Container.elems nl)
1316
      (header, isnum) = unzip $ map Node.showHeader fields
1317
  in printTable "" header (map (Node.list fields) snl) isnum
1318

    
1319
-- | Print the instance list.
1320
printInsts :: Node.List -> Instance.List -> String
1321
printInsts nl il =
1322
  let sil = sortBy (comparing Instance.idx) (Container.elems il)
1323
      helper inst = [ if Instance.isRunning inst then "R" else " "
1324
                    , Instance.name inst
1325
                    , Container.nameOf nl (Instance.pNode inst)
1326
                    , let sdx = Instance.sNode inst
1327
                      in if sdx == Node.noSecondary
1328
                           then  ""
1329
                           else Container.nameOf nl sdx
1330
                    , if Instance.autoBalance inst then "Y" else "N"
1331
                    , printf "%3d" $ Instance.vcpus inst
1332
                    , printf "%5d" $ Instance.mem inst
1333
                    , printf "%5d" $ Instance.dsk inst `div` 1024
1334
                    , printf "%5.3f" lC
1335
                    , printf "%5.3f" lM
1336
                    , printf "%5.3f" lD
1337
                    , printf "%5.3f" lN
1338
                    ]
1339
          where DynUtil lC lM lD lN = Instance.util inst
1340
      header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1341
               , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1342
      isnum = False:False:False:False:False:repeat True
1343
  in printTable "" header (map helper sil) isnum
1344

    
1345
-- | Shows statistics for a given node list.
1346
printStats :: String -> Node.List -> String
1347
printStats lp nl =
1348
  let dcvs = compDetailedCV $ Container.elems nl
1349
      (weights, names) = unzip detailedCVInfo
1350
      hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1351
      header = [ "Field", "Value", "Weight" ]
1352
      formatted = map (\(w, h, val) ->
1353
                         [ h
1354
                         , printf "%.8f" val
1355
                         , printf "x%.2f" w
1356
                         ]) hd
1357
  in printTable lp header formatted $ False:repeat True
1358

    
1359
-- | Convert a placement into a list of OpCodes (basically a job).
1360
iMoveToJob :: Node.List        -- ^ The node list; only used for node
1361
                               -- names, so any version is good
1362
                               -- (before or after the operation)
1363
           -> Instance.List    -- ^ The instance list; also used for
1364
                               -- names only
1365
           -> Idx              -- ^ The index of the instance being
1366
                               -- moved
1367
           -> IMove            -- ^ The actual move to be described
1368
           -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
1369
                               -- the given move
1370
iMoveToJob nl il idx move =
1371
  let inst = Container.find idx il
1372
      iname = Instance.name inst
1373
      lookNode  = Just . Container.nameOf nl
1374
      opF = OpCodes.OpInstanceMigrate iname True False True Nothing
1375
      opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1376
              OpCodes.ReplaceNewSecondary [] Nothing
1377
  in case move of
1378
       Failover -> [ opF ]
1379
       ReplacePrimary np -> [ opF, opR np, opF ]
1380
       ReplaceSecondary ns -> [ opR ns ]
1381
       ReplaceAndFailover np -> [ opR np, opF ]
1382
       FailoverAndReplace ns -> [ opF, opR ns ]
1383

    
1384
-- * Node group functions
1385

    
1386
-- | Computes the group of an instance.
1387
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1388
instanceGroup nl i =
1389
  let sidx = Instance.sNode i
1390
      pnode = Container.find (Instance.pNode i) nl
1391
      snode = if sidx == Node.noSecondary
1392
              then pnode
1393
              else Container.find sidx nl
1394
      pgroup = Node.group pnode
1395
      sgroup = Node.group snode
1396
  in if pgroup /= sgroup
1397
       then fail ("Instance placed accross two node groups, primary " ++
1398
                  show pgroup ++ ", secondary " ++ show sgroup)
1399
       else return pgroup
1400

    
1401
-- | Computes the group of an instance per the primary node.
1402
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1403
instancePriGroup nl i =
1404
  let pnode = Container.find (Instance.pNode i) nl
1405
  in  Node.group pnode
1406

    
1407
-- | Compute the list of badly allocated instances (split across node
1408
-- groups).
1409
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1410
findSplitInstances nl =
1411
  filter (not . isOk . instanceGroup nl) . Container.elems
1412

    
1413
-- | Splits a cluster into the component node groups.
1414
splitCluster :: Node.List -> Instance.List ->
1415
                [(Gdx, (Node.List, Instance.List))]
1416
splitCluster nl il =
1417
  let ngroups = Node.computeGroups (Container.elems nl)
1418
  in map (\(guuid, nodes) ->
1419
           let nidxs = map Node.idx nodes
1420
               nodes' = zip nidxs nodes
1421
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1422
           in (guuid, (Container.fromList nodes', instances))) ngroups
1423

    
1424
-- | Compute the list of nodes that are to be evacuated, given a list
1425
-- of instances and an evacuation mode.
1426
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1427
                -> EvacMode      -- ^ The evacuation mode we're using
1428
                -> [Idx]         -- ^ List of instance indices being evacuated
1429
                -> IntSet.IntSet -- ^ Set of node indices
1430
nodesToEvacuate il mode =
1431
  IntSet.delete Node.noSecondary .
1432
  foldl' (\ns idx ->
1433
            let i = Container.find idx il
1434
                pdx = Instance.pNode i
1435
                sdx = Instance.sNode i
1436
                dt = Instance.diskTemplate i
1437
                withSecondary = case dt of
1438
                                  DTDrbd8 -> IntSet.insert sdx ns
1439
                                  _ -> ns
1440
            in case mode of
1441
                 ChangePrimary   -> IntSet.insert pdx ns
1442
                 ChangeSecondary -> withSecondary
1443
                 ChangeAll       -> IntSet.insert pdx withSecondary
1444
         ) IntSet.empty