Statistics
| Branch: | Tag: | Revision:

root / htools / Ganeti / HTools / Cluster.hs @ 255f55a9

History | View | Annotate | Download (61.9 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
  (
31
    -- * Types
32
    AllocSolution(..)
33
  , EvacSolution(..)
34
  , Table(..)
35
  , CStats(..)
36
  , AllocStats
37
  , AllocResult
38
  , AllocMethod
39
  -- * Generic functions
40
  , totalResources
41
  , computeAllocationDelta
42
  -- * First phase functions
43
  , computeBadItems
44
  -- * Second phase functions
45
  , printSolutionLine
46
  , formatCmds
47
  , involvedNodes
48
  , splitJobs
49
  -- * Display functions
50
  , printNodes
51
  , printInsts
52
  -- * Balacing functions
53
  , checkMove
54
  , doNextBalance
55
  , tryBalance
56
  , compCV
57
  , compCVNodes
58
  , compDetailedCV
59
  , printStats
60
  , iMoveToJob
61
  -- * IAllocator functions
62
  , genAllocNodes
63
  , tryAlloc
64
  , tryMGAlloc
65
  , tryReloc
66
  , tryNodeEvac
67
  , tryChangeGroup
68
  , collapseFailures
69
  -- * Allocation functions
70
  , iterateAlloc
71
  , tieredAlloc
72
  -- * Node group functions
73
  , instanceGroup
74
  , findSplitInstances
75
  , splitCluster
76
  ) where
77

    
78
import qualified Data.IntSet as IntSet
79
import Data.List
80
import Data.Maybe (fromJust, isNothing)
81
import Data.Ord (comparing)
82
import Text.Printf (printf)
83

    
84
import qualified Ganeti.HTools.Container as Container
85
import qualified Ganeti.HTools.Instance as Instance
86
import qualified Ganeti.HTools.Node as Node
87
import qualified Ganeti.HTools.Group as Group
88
import Ganeti.HTools.Types
89
import Ganeti.HTools.Utils
90
import Ganeti.HTools.Compat
91
import qualified Ganeti.OpCodes as OpCodes
92

    
93
-- * Types
94

    
95
-- | Allocation\/relocation solution.
96
data AllocSolution = AllocSolution
97
  { asFailures :: [FailMode]              -- ^ Failure counts
98
  , asAllocs   :: Int                     -- ^ Good allocation count
99
  , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
100
  , asLog      :: [String]                -- ^ Informational messages
101
  }
102

    
103
-- | Node evacuation/group change iallocator result type. This result
104
-- type consists of actual opcodes (a restricted subset) that are
105
-- transmitted back to Ganeti.
106
data EvacSolution = EvacSolution
107
  { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
108
  , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
109
                                      -- relocated
110
  , esOpCodes :: [[OpCodes.OpCode]]   -- ^ List of jobs
111
  }
112

    
113
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
114
type AllocResult = (FailStats, Node.List, Instance.List,
115
                    [Instance.Instance], [CStats])
116

    
117
-- | A type denoting the valid allocation mode/pairs.
118
--
119
-- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
120
-- for a two-node allocation, this will be a @Right [('Ndx',
121
-- ['Ndx'])]@. In the latter case, the list is basically an
122
-- association list, grouped by primary node and holding the potential
123
-- secondary nodes in the sub-list.
124
type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
125

    
126
-- | The empty solution we start with when computing allocations.
127
emptyAllocSolution :: AllocSolution
128
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
129
                                   , asSolution = Nothing, asLog = [] }
130

    
131
-- | The empty evac solution.
132
emptyEvacSolution :: EvacSolution
133
emptyEvacSolution = EvacSolution { esMoved = []
134
                                 , esFailed = []
135
                                 , esOpCodes = []
136
                                 }
137

    
138
-- | The complete state for the balancing solution.
139
data Table = Table Node.List Instance.List Score [Placement]
140
             deriving (Show, Read)
141

    
142
-- | Cluster statistics data type.
143
data CStats = CStats { csFmem :: Integer -- ^ Cluster free mem
144
                     , csFdsk :: Integer -- ^ Cluster free disk
145
                     , csAmem :: Integer -- ^ Cluster allocatable mem
146
                     , csAdsk :: Integer -- ^ Cluster allocatable disk
147
                     , csAcpu :: Integer -- ^ Cluster allocatable cpus
148
                     , csMmem :: Integer -- ^ Max node allocatable mem
149
                     , csMdsk :: Integer -- ^ Max node allocatable disk
150
                     , csMcpu :: Integer -- ^ Max node allocatable cpu
151
                     , csImem :: Integer -- ^ Instance used mem
152
                     , csIdsk :: Integer -- ^ Instance used disk
153
                     , csIcpu :: Integer -- ^ Instance used cpu
154
                     , csTmem :: Double  -- ^ Cluster total mem
155
                     , csTdsk :: Double  -- ^ Cluster total disk
156
                     , csTcpu :: Double  -- ^ Cluster total cpus
157
                     , csVcpu :: Integer -- ^ Cluster virtual cpus (if
158
                                         -- node pCpu has been set,
159
                                         -- otherwise -1)
160
                     , csXmem :: Integer -- ^ Unnacounted for mem
161
                     , csNmem :: Integer -- ^ Node own memory
162
                     , csScore :: Score  -- ^ The cluster score
163
                     , csNinst :: Int    -- ^ The total number of instances
164
                     }
165
            deriving (Show, Read)
166

    
167
-- | Currently used, possibly to allocate, unallocable.
168
type AllocStats = (RSpec, RSpec, RSpec)
169

    
170
-- | A simple type for allocation functions.
171
type AllocMethod =  Node.List           -- ^ Node list
172
                 -> Instance.List       -- ^ Instance list
173
                 -> Maybe Int           -- ^ Optional allocation limit
174
                 -> Instance.Instance   -- ^ Instance spec for allocation
175
                 -> AllocNodes          -- ^ Which nodes we should allocate on
176
                 -> [Instance.Instance] -- ^ Allocated instances
177
                 -> [CStats]            -- ^ Running cluster stats
178
                 -> Result AllocResult  -- ^ Allocation result
179

    
180
-- * Utility functions
181

    
182
-- | Verifies the N+1 status and return the affected nodes.
183
verifyN1 :: [Node.Node] -> [Node.Node]
184
verifyN1 = filter Node.failN1
185

    
186
{-| Computes the pair of bad nodes and instances.
187

    
188
The bad node list is computed via a simple 'verifyN1' check, and the
189
bad instance list is the list of primary and secondary instances of
190
those nodes.
191

    
192
-}
193
computeBadItems :: Node.List -> Instance.List ->
194
                   ([Node.Node], [Instance.Instance])
195
computeBadItems nl il =
196
  let bad_nodes = verifyN1 $ getOnline nl
197
      bad_instances = map (`Container.find` il) .
198
                      sort . nub $
199
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
200
  in
201
    (bad_nodes, bad_instances)
202

    
203
-- | Extracts the node pairs for an instance. This can fail if the
204
-- instance is single-homed. FIXME: this needs to be improved,
205
-- together with the general enhancement for handling non-DRBD moves.
206
instanceNodes :: Node.List -> Instance.Instance ->
207
                 (Ndx, Ndx, Node.Node, Node.Node)
208
instanceNodes nl inst =
209
  let old_pdx = Instance.pNode inst
210
      old_sdx = Instance.sNode inst
211
      old_p = Container.find old_pdx nl
212
      old_s = Container.find old_sdx nl
213
  in (old_pdx, old_sdx, old_p, old_s)
214

    
215
-- | Zero-initializer for the CStats type.
216
emptyCStats :: CStats
217
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
218

    
219
-- | Update stats with data from a new node.
220
updateCStats :: CStats -> Node.Node -> CStats
221
updateCStats cs node =
222
  let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
223
               csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
224
               csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
225
               csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
226
               csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
227
               csVcpu = x_vcpu,
228
               csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
229
             }
230
        = cs
231
      inc_amem = Node.fMem node - Node.rMem node
232
      inc_amem' = if inc_amem > 0 then inc_amem else 0
233
      inc_adsk = Node.availDisk node
234
      inc_imem = truncate (Node.tMem node) - Node.nMem node
235
                 - Node.xMem node - Node.fMem node
236
      inc_icpu = Node.uCpu node
237
      inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
238
      inc_vcpu = Node.hiCpu node
239
      inc_acpu = Node.availCpu node
240
  in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
241
        , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
242
        , csAmem = x_amem + fromIntegral inc_amem'
243
        , csAdsk = x_adsk + fromIntegral inc_adsk
244
        , csAcpu = x_acpu + fromIntegral inc_acpu
245
        , csMmem = max x_mmem (fromIntegral inc_amem')
246
        , csMdsk = max x_mdsk (fromIntegral inc_adsk)
247
        , csMcpu = max x_mcpu (fromIntegral inc_acpu)
248
        , csImem = x_imem + fromIntegral inc_imem
249
        , csIdsk = x_idsk + fromIntegral inc_idsk
250
        , csIcpu = x_icpu + fromIntegral inc_icpu
251
        , csTmem = x_tmem + Node.tMem node
252
        , csTdsk = x_tdsk + Node.tDsk node
253
        , csTcpu = x_tcpu + Node.tCpu node
254
        , csVcpu = x_vcpu + fromIntegral inc_vcpu
255
        , csXmem = x_xmem + fromIntegral (Node.xMem node)
256
        , csNmem = x_nmem + fromIntegral (Node.nMem node)
257
        , csNinst = x_ninst + length (Node.pList node)
258
        }
259

    
260
-- | Compute the total free disk and memory in the cluster.
261
totalResources :: Node.List -> CStats
262
totalResources nl =
263
  let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
264
  in cs { csScore = compCV nl }
265

    
266
-- | Compute the delta between two cluster state.
267
--
268
-- This is used when doing allocations, to understand better the
269
-- available cluster resources. The return value is a triple of the
270
-- current used values, the delta that was still allocated, and what
271
-- was left unallocated.
272
computeAllocationDelta :: CStats -> CStats -> AllocStats
273
computeAllocationDelta cini cfin =
274
  let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
275
      CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
276
              csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
277
      rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem)
278
             (fromIntegral i_idsk)
279
      rfin = RSpec (fromIntegral (f_icpu - i_icpu))
280
             (fromIntegral (f_imem - i_imem))
281
             (fromIntegral (f_idsk - i_idsk))
282
      un_cpu = fromIntegral (v_cpu - f_icpu)::Int
283
      runa = RSpec un_cpu (truncate t_mem - fromIntegral f_imem)
284
             (truncate t_dsk - fromIntegral f_idsk)
285
  in (rini, rfin, runa)
286

    
287
-- | The names and weights of the individual elements in the CV list.
288
detailedCVInfo :: [(Double, String)]
289
detailedCVInfo = [ (1,  "free_mem_cv")
290
                 , (1,  "free_disk_cv")
291
                 , (1,  "n1_cnt")
292
                 , (1,  "reserved_mem_cv")
293
                 , (4,  "offline_all_cnt")
294
                 , (16, "offline_pri_cnt")
295
                 , (1,  "vcpu_ratio_cv")
296
                 , (1,  "cpu_load_cv")
297
                 , (1,  "mem_load_cv")
298
                 , (1,  "disk_load_cv")
299
                 , (1,  "net_load_cv")
300
                 , (2,  "pri_tags_score")
301
                 ]
302

    
303
-- | Holds the weights used by 'compCVNodes' for each metric.
304
detailedCVWeights :: [Double]
305
detailedCVWeights = map fst detailedCVInfo
306

    
307
-- | Compute the mem and disk covariance.
308
compDetailedCV :: [Node.Node] -> [Double]
309
compDetailedCV all_nodes =
310
  let (offline, nodes) = partition Node.offline all_nodes
311
      mem_l = map Node.pMem nodes
312
      dsk_l = map Node.pDsk nodes
313
      -- metric: memory covariance
314
      mem_cv = stdDev mem_l
315
      -- metric: disk covariance
316
      dsk_cv = stdDev dsk_l
317
      -- metric: count of instances living on N1 failing nodes
318
      n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
319
                                                 length (Node.pList n)) .
320
                 filter Node.failN1 $ nodes :: Double
321
      res_l = map Node.pRem nodes
322
      -- metric: reserved memory covariance
323
      res_cv = stdDev res_l
324
      -- offline instances metrics
325
      offline_ipri = sum . map (length . Node.pList) $ offline
326
      offline_isec = sum . map (length . Node.sList) $ offline
327
      -- metric: count of instances on offline nodes
328
      off_score = fromIntegral (offline_ipri + offline_isec)::Double
329
      -- metric: count of primary instances on offline nodes (this
330
      -- helps with evacuation/failover of primary instances on
331
      -- 2-node clusters with one node offline)
332
      off_pri_score = fromIntegral offline_ipri::Double
333
      cpu_l = map Node.pCpu nodes
334
      -- metric: covariance of vcpu/pcpu ratio
335
      cpu_cv = stdDev cpu_l
336
      -- metrics: covariance of cpu, memory, disk and network load
337
      (c_load, m_load, d_load, n_load) =
338
        unzip4 $ map (\n ->
339
                      let DynUtil c1 m1 d1 n1 = Node.utilLoad n
340
                          DynUtil c2 m2 d2 n2 = Node.utilPool n
341
                      in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes
342
      -- metric: conflicting instance count
343
      pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
344
      pri_tags_score = fromIntegral pri_tags_inst::Double
345
  in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
346
     , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
347
     , pri_tags_score ]
348

    
349
-- | Compute the /total/ variance.
350
compCVNodes :: [Node.Node] -> Double
351
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
352

    
353
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
354
compCV :: Node.List -> Double
355
compCV = compCVNodes . Container.elems
356

    
357
-- | Compute online nodes from a 'Node.List'.
358
getOnline :: Node.List -> [Node.Node]
359
getOnline = filter (not . Node.offline) . Container.elems
360

    
361
-- * Balancing functions
362

    
363
-- | Compute best table. Note that the ordering of the arguments is important.
364
compareTables :: Table -> Table -> Table
365
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
366
  if a_cv > b_cv then b else a
367

    
368
-- | Applies an instance move to a given node list and instance.
369
applyMove :: Node.List -> Instance.Instance
370
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
371
-- Failover (f)
372
applyMove nl inst Failover =
373
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
374
      int_p = Node.removePri old_p inst
375
      int_s = Node.removeSec old_s inst
376
      force_p = Node.offline old_p
377
      new_nl = do -- Maybe monad
378
        new_p <- Node.addPriEx force_p int_s inst
379
        new_s <- Node.addSec int_p inst old_sdx
380
        let new_inst = Instance.setBoth inst old_sdx old_pdx
381
        return (Container.addTwo old_pdx new_s old_sdx new_p nl,
382
                new_inst, old_sdx, old_pdx)
383
  in new_nl
384

    
385
-- Replace the primary (f:, r:np, f)
386
applyMove nl inst (ReplacePrimary new_pdx) =
387
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
388
      tgt_n = Container.find new_pdx nl
389
      int_p = Node.removePri old_p inst
390
      int_s = Node.removeSec old_s inst
391
      force_p = Node.offline old_p
392
      new_nl = do -- Maybe monad
393
                  -- check that the current secondary can host the instance
394
                  -- during the migration
395
        tmp_s <- Node.addPriEx force_p int_s inst
396
        let tmp_s' = Node.removePri tmp_s inst
397
        new_p <- Node.addPriEx force_p tgt_n inst
398
        new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
399
        let new_inst = Instance.setPri inst new_pdx
400
        return (Container.add new_pdx new_p $
401
                Container.addTwo old_pdx int_p old_sdx new_s nl,
402
                new_inst, new_pdx, old_sdx)
403
  in new_nl
404

    
405
-- Replace the secondary (r:ns)
406
applyMove nl inst (ReplaceSecondary new_sdx) =
407
  let old_pdx = Instance.pNode inst
408
      old_sdx = Instance.sNode inst
409
      old_s = Container.find old_sdx nl
410
      tgt_n = Container.find new_sdx nl
411
      int_s = Node.removeSec old_s inst
412
      force_s = Node.offline old_s
413
      new_inst = Instance.setSec inst new_sdx
414
      new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
415
               \new_s -> return (Container.addTwo new_sdx
416
                                 new_s old_sdx int_s nl,
417
                                 new_inst, old_pdx, new_sdx)
418
  in new_nl
419

    
420
-- Replace the secondary and failover (r:np, f)
421
applyMove nl inst (ReplaceAndFailover new_pdx) =
422
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
423
      tgt_n = Container.find new_pdx nl
424
      int_p = Node.removePri old_p inst
425
      int_s = Node.removeSec old_s inst
426
      force_s = Node.offline old_s
427
      new_nl = do -- Maybe monad
428
        new_p <- Node.addPri tgt_n inst
429
        new_s <- Node.addSecEx force_s int_p inst new_pdx
430
        let new_inst = Instance.setBoth inst new_pdx old_pdx
431
        return (Container.add new_pdx new_p $
432
                Container.addTwo old_pdx new_s old_sdx int_s nl,
433
                new_inst, new_pdx, old_pdx)
434
  in new_nl
435

    
436
-- Failver and replace the secondary (f, r:ns)
437
applyMove nl inst (FailoverAndReplace new_sdx) =
438
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
439
      tgt_n = Container.find new_sdx nl
440
      int_p = Node.removePri old_p inst
441
      int_s = Node.removeSec old_s inst
442
      force_p = Node.offline old_p
443
      new_nl = do -- Maybe monad
444
        new_p <- Node.addPriEx force_p int_s inst
445
        new_s <- Node.addSecEx force_p tgt_n inst old_sdx
446
        let new_inst = Instance.setBoth inst old_sdx new_sdx
447
        return (Container.add new_sdx new_s $
448
                Container.addTwo old_sdx new_p old_pdx int_p nl,
449
                new_inst, old_sdx, new_sdx)
450
  in new_nl
451

    
452
-- | Tries to allocate an instance on one given node.
453
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
454
                 -> OpResult Node.AllocElement
455
allocateOnSingle nl inst new_pdx =
456
  let p = Container.find new_pdx nl
457
      new_inst = Instance.setBoth inst new_pdx Node.noSecondary
458
  in  Node.addPri p inst >>= \new_p -> do
459
    let new_nl = Container.add new_pdx new_p nl
460
        new_score = compCV nl
461
    return (new_nl, new_inst, [new_p], new_score)
462

    
463
-- | Tries to allocate an instance on a given pair of nodes.
464
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
465
               -> OpResult Node.AllocElement
466
allocateOnPair nl inst new_pdx new_sdx =
467
  let tgt_p = Container.find new_pdx nl
468
      tgt_s = Container.find new_sdx nl
469
  in do
470
    new_p <- Node.addPri tgt_p inst
471
    new_s <- Node.addSec tgt_s inst new_pdx
472
    let new_inst = Instance.setBoth inst new_pdx new_sdx
473
        new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
474
    return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
475

    
476
-- | Tries to perform an instance move and returns the best table
477
-- between the original one and the new one.
478
checkSingleStep :: Table -- ^ The original table
479
                -> Instance.Instance -- ^ The instance to move
480
                -> Table -- ^ The current best table
481
                -> IMove -- ^ The move to apply
482
                -> Table -- ^ The final best table
483
checkSingleStep ini_tbl target cur_tbl move =
484
  let Table ini_nl ini_il _ ini_plc = ini_tbl
485
      tmp_resu = applyMove ini_nl target move
486
  in case tmp_resu of
487
       OpFail _ -> cur_tbl
488
       OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
489
         let tgt_idx = Instance.idx target
490
             upd_cvar = compCV upd_nl
491
             upd_il = Container.add tgt_idx new_inst ini_il
492
             upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
493
             upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
494
         in compareTables cur_tbl upd_tbl
495

    
496
-- | Given the status of the current secondary as a valid new node and
497
-- the current candidate target node, generate the possible moves for
498
-- a instance.
499
possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
500
              -> Bool      -- ^ Whether we can change the primary node
501
              -> Ndx       -- ^ Target node candidate
502
              -> [IMove]   -- ^ List of valid result moves
503

    
504
possibleMoves _ False tdx =
505
  [ReplaceSecondary tdx]
506

    
507
possibleMoves True True tdx =
508
  [ ReplaceSecondary tdx
509
  , ReplaceAndFailover tdx
510
  , ReplacePrimary tdx
511
  , FailoverAndReplace tdx
512
  ]
513

    
514
possibleMoves False True tdx =
515
  [ ReplaceSecondary tdx
516
  , ReplaceAndFailover tdx
517
  ]
518

    
519
-- | Compute the best move for a given instance.
520
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
521
                  -> Bool              -- ^ Whether disk moves are allowed
522
                  -> Bool              -- ^ Whether instance moves are allowed
523
                  -> Table             -- ^ Original table
524
                  -> Instance.Instance -- ^ Instance to move
525
                  -> Table             -- ^ Best new table for this instance
526
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
527
  let opdx = Instance.pNode target
528
      osdx = Instance.sNode target
529
      nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
530
      use_secondary = elem osdx nodes_idx && inst_moves
531
      aft_failover = if use_secondary -- if allowed to failover
532
                       then checkSingleStep ini_tbl target ini_tbl Failover
533
                       else ini_tbl
534
      all_moves = if disk_moves
535
                    then concatMap
536
                           (possibleMoves use_secondary inst_moves) nodes
537
                    else []
538
    in
539
      -- iterate over the possible nodes for this instance
540
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
541

    
542
-- | Compute the best next move.
543
checkMove :: [Ndx]               -- ^ Allowed target node indices
544
          -> Bool                -- ^ Whether disk moves are allowed
545
          -> Bool                -- ^ Whether instance moves are allowed
546
          -> Table               -- ^ The current solution
547
          -> [Instance.Instance] -- ^ List of instances still to move
548
          -> Table               -- ^ The new solution
549
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
550
  let Table _ _ _ ini_plc = ini_tbl
551
      -- we're using rwhnf from the Control.Parallel.Strategies
552
      -- package; we don't need to use rnf as that would force too
553
      -- much evaluation in single-threaded cases, and in
554
      -- multi-threaded case the weak head normal form is enough to
555
      -- spark the evaluation
556
      tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
557
                             inst_moves ini_tbl)
558
               victims
559
      -- iterate over all instances, computing the best move
560
      best_tbl = foldl' compareTables ini_tbl tables
561
      Table _ _ _ best_plc = best_tbl
562
  in if length best_plc == length ini_plc
563
       then ini_tbl -- no advancement
564
       else best_tbl
565

    
566
-- | Check if we are allowed to go deeper in the balancing.
567
doNextBalance :: Table     -- ^ The starting table
568
              -> Int       -- ^ Remaining length
569
              -> Score     -- ^ Score at which to stop
570
              -> Bool      -- ^ The resulting table and commands
571
doNextBalance ini_tbl max_rounds min_score =
572
  let Table _ _ ini_cv ini_plc = ini_tbl
573
      ini_plc_len = length ini_plc
574
  in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
575

    
576
-- | Run a balance move.
577
tryBalance :: Table       -- ^ The starting table
578
           -> Bool        -- ^ Allow disk moves
579
           -> Bool        -- ^ Allow instance moves
580
           -> Bool        -- ^ Only evacuate moves
581
           -> Score       -- ^ Min gain threshold
582
           -> Score       -- ^ Min gain
583
           -> Maybe Table -- ^ The resulting table and commands
584
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
585
    let Table ini_nl ini_il ini_cv _ = ini_tbl
586
        all_inst = Container.elems ini_il
587
        all_inst' = if evac_mode
588
                    then let bad_nodes = map Node.idx . filter Node.offline $
589
                                         Container.elems ini_nl
590
                         in filter (any (`elem` bad_nodes) . Instance.allNodes)
591
                            all_inst
592
                    else all_inst
593
        reloc_inst = filter Instance.movable all_inst'
594
        node_idx = map Node.idx . filter (not . Node.offline) $
595
                   Container.elems ini_nl
596
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
597
        (Table _ _ fin_cv _) = fin_tbl
598
    in
599
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
600
      then Just fin_tbl -- this round made success, return the new table
601
      else Nothing
602

    
603
-- * Allocation functions
604

    
605
-- | Build failure stats out of a list of failures.
606
collapseFailures :: [FailMode] -> FailStats
607
collapseFailures flst =
608
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
609
            [minBound..maxBound]
610

    
611
-- | Compares two Maybe AllocElement and chooses the besst score.
612
bestAllocElement :: Maybe Node.AllocElement
613
                 -> Maybe Node.AllocElement
614
                 -> Maybe Node.AllocElement
615
bestAllocElement a Nothing = a
616
bestAllocElement Nothing b = b
617
bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
618
  if ascore < bscore then a else b
619

    
620
-- | Update current Allocation solution and failure stats with new
621
-- elements.
622
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
623
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
624

    
625
concatAllocs as (OpGood ns) =
626
  let -- Choose the old or new solution, based on the cluster score
627
    cntok = asAllocs as
628
    osols = asSolution as
629
    nsols = bestAllocElement osols (Just ns)
630
    nsuc = cntok + 1
631
    -- Note: we force evaluation of nsols here in order to keep the
632
    -- memory profile low - we know that we will need nsols for sure
633
    -- in the next cycle, so we force evaluation of nsols, since the
634
    -- foldl' in the caller will only evaluate the tuple, but not the
635
    -- elements of the tuple
636
  in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
637

    
638
-- | Sums two 'AllocSolution' structures.
639
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
640
sumAllocs (AllocSolution aFails aAllocs aSols aLog)
641
          (AllocSolution bFails bAllocs bSols bLog) =
642
  -- note: we add b first, since usually it will be smaller; when
643
  -- fold'ing, a will grow and grow whereas b is the per-group
644
  -- result, hence smaller
645
  let nFails  = bFails ++ aFails
646
      nAllocs = aAllocs + bAllocs
647
      nSols   = bestAllocElement aSols bSols
648
      nLog    = bLog ++ aLog
649
  in AllocSolution nFails nAllocs nSols nLog
650

    
651
-- | Given a solution, generates a reasonable description for it.
652
describeSolution :: AllocSolution -> String
653
describeSolution as =
654
  let fcnt = asFailures as
655
      sols = asSolution as
656
      freasons =
657
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
658
        filter ((> 0) . snd) . collapseFailures $ fcnt
659
  in case sols of
660
     Nothing -> "No valid allocation solutions, failure reasons: " ++
661
                (if null fcnt then "unknown reasons" else freasons)
662
     Just (_, _, nodes, cv) ->
663
         printf ("score: %.8f, successes %d, failures %d (%s)" ++
664
                 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
665
               (intercalate "/" . map Node.name $ nodes)
666

    
667
-- | Annotates a solution with the appropriate string.
668
annotateSolution :: AllocSolution -> AllocSolution
669
annotateSolution as = as { asLog = describeSolution as : asLog as }
670

    
671
-- | Reverses an evacuation solution.
672
--
673
-- Rationale: we always concat the results to the top of the lists, so
674
-- for proper jobset execution, we should reverse all lists.
675
reverseEvacSolution :: EvacSolution -> EvacSolution
676
reverseEvacSolution (EvacSolution f m o) =
677
  EvacSolution (reverse f) (reverse m) (reverse o)
678

    
679
-- | Generate the valid node allocation singles or pairs for a new instance.
680
genAllocNodes :: Group.List        -- ^ Group list
681
              -> Node.List         -- ^ The node map
682
              -> Int               -- ^ The number of nodes required
683
              -> Bool              -- ^ Whether to drop or not
684
                                   -- unallocable nodes
685
              -> Result AllocNodes -- ^ The (monadic) result
686
genAllocNodes gl nl count drop_unalloc =
687
  let filter_fn = if drop_unalloc
688
                    then filter (Group.isAllocable .
689
                                 flip Container.find gl . Node.group)
690
                    else id
691
      all_nodes = filter_fn $ getOnline nl
692
      all_pairs = [(Node.idx p,
693
                    [Node.idx s | s <- all_nodes,
694
                                       Node.idx p /= Node.idx s,
695
                                       Node.group p == Node.group s]) |
696
                   p <- all_nodes]
697
  in case count of
698
       1 -> Ok (Left (map Node.idx all_nodes))
699
       2 -> Ok (Right (filter (not . null . snd) all_pairs))
700
       _ -> Bad "Unsupported number of nodes, only one or two  supported"
701

    
702
-- | Try to allocate an instance on the cluster.
703
tryAlloc :: (Monad m) =>
704
            Node.List         -- ^ The node list
705
         -> Instance.List     -- ^ The instance list
706
         -> Instance.Instance -- ^ The instance to allocate
707
         -> AllocNodes        -- ^ The allocation targets
708
         -> m AllocSolution   -- ^ Possible solution list
709
tryAlloc _  _ _    (Right []) = fail "Not enough online nodes"
710
tryAlloc nl _ inst (Right ok_pairs) =
711
  let psols = parMap rwhnf (\(p, ss) ->
712
                              foldl' (\cstate ->
713
                                        concatAllocs cstate .
714
                                        allocateOnPair nl inst p)
715
                              emptyAllocSolution ss) ok_pairs
716
      sols = foldl' sumAllocs emptyAllocSolution psols
717
  in return $ annotateSolution sols
718

    
719
tryAlloc _  _ _    (Left []) = fail "No online nodes"
720
tryAlloc nl _ inst (Left all_nodes) =
721
  let sols = foldl' (\cstate ->
722
                       concatAllocs cstate . allocateOnSingle nl inst
723
                    ) emptyAllocSolution all_nodes
724
  in return $ annotateSolution sols
725

    
726
-- | Given a group/result, describe it as a nice (list of) messages.
727
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
728
solutionDescription gl (groupId, result) =
729
  case result of
730
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
731
    Bad message -> [printf "Group %s: error %s" gname message]
732
  where grp = Container.find groupId gl
733
        gname = Group.name grp
734
        pol = allocPolicyToRaw (Group.allocPolicy grp)
735

    
736
-- | From a list of possibly bad and possibly empty solutions, filter
737
-- only the groups with a valid result. Note that the result will be
738
-- reversed compared to the original list.
739
filterMGResults :: Group.List
740
                -> [(Gdx, Result AllocSolution)]
741
                -> [(Gdx, AllocSolution)]
742
filterMGResults gl = foldl' fn []
743
  where unallocable = not . Group.isAllocable . flip Container.find gl
744
        fn accu (gdx, rasol) =
745
          case rasol of
746
            Bad _ -> accu
747
            Ok sol | isNothing (asSolution sol) -> accu
748
                   | unallocable gdx -> accu
749
                   | otherwise -> (gdx, sol):accu
750

    
751
-- | Sort multigroup results based on policy and score.
752
sortMGResults :: Group.List
753
             -> [(Gdx, AllocSolution)]
754
             -> [(Gdx, AllocSolution)]
755
sortMGResults gl sols =
756
  let extractScore (_, _, _, x) = x
757
      solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
758
                             (extractScore . fromJust . asSolution) sol)
759
  in sortBy (comparing solScore) sols
760

    
761
-- | Finds the best group for an instance on a multi-group cluster.
762
--
763
-- Only solutions in @preferred@ and @last_resort@ groups will be
764
-- accepted as valid, and additionally if the allowed groups parameter
765
-- is not null then allocation will only be run for those group
766
-- indices.
767
findBestAllocGroup :: Group.List           -- ^ The group list
768
                   -> Node.List            -- ^ The node list
769
                   -> Instance.List        -- ^ The instance list
770
                   -> Maybe [Gdx]          -- ^ The allowed groups
771
                   -> Instance.Instance    -- ^ The instance to allocate
772
                   -> Int                  -- ^ Required number of nodes
773
                   -> Result (Gdx, AllocSolution, [String])
774
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
775
  let groups = splitCluster mgnl mgil
776
      groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
777
                allowed_gdxs
778
      sols = map (\(gid, (nl, il)) ->
779
                   (gid, genAllocNodes mggl nl cnt False >>=
780
                       tryAlloc nl il inst))
781
             groups'::[(Gdx, Result AllocSolution)]
782
      all_msgs = concatMap (solutionDescription mggl) sols
783
      goodSols = filterMGResults mggl sols
784
      sortedSols = sortMGResults mggl goodSols
785
  in if null sortedSols
786
       then Bad $ intercalate ", " all_msgs
787
       else let (final_group, final_sol) = head sortedSols
788
            in return (final_group, final_sol, all_msgs)
789

    
790
-- | Try to allocate an instance on a multi-group cluster.
791
tryMGAlloc :: Group.List           -- ^ The group list
792
           -> Node.List            -- ^ The node list
793
           -> Instance.List        -- ^ The instance list
794
           -> Instance.Instance    -- ^ The instance to allocate
795
           -> Int                  -- ^ Required number of nodes
796
           -> Result AllocSolution -- ^ Possible solution list
797
tryMGAlloc mggl mgnl mgil inst cnt = do
798
  (best_group, solution, all_msgs) <-
799
      findBestAllocGroup mggl mgnl mgil Nothing inst cnt
800
  let group_name = Group.name $ Container.find best_group mggl
801
      selmsg = "Selected group: " ++ group_name
802
  return $ solution { asLog = selmsg:all_msgs }
803

    
804
-- | Try to relocate an instance on the cluster.
805
tryReloc :: (Monad m) =>
806
            Node.List       -- ^ The node list
807
         -> Instance.List   -- ^ The instance list
808
         -> Idx             -- ^ The index of the instance to move
809
         -> Int             -- ^ The number of nodes required
810
         -> [Ndx]           -- ^ Nodes which should not be used
811
         -> m AllocSolution -- ^ Solution list
812
tryReloc nl il xid 1 ex_idx =
813
  let all_nodes = getOnline nl
814
      inst = Container.find xid il
815
      ex_idx' = Instance.pNode inst:ex_idx
816
      valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
817
      valid_idxes = map Node.idx valid_nodes
818
      sols1 = foldl' (\cstate x ->
819
                        let em = do
820
                              (mnl, i, _, _) <-
821
                                applyMove nl inst (ReplaceSecondary x)
822
                              return (mnl, i, [Container.find x mnl],
823
                                         compCV mnl)
824
                        in concatAllocs cstate em
825
                     ) emptyAllocSolution valid_idxes
826
  in return sols1
827

    
828
tryReloc _ _ _ reqn _  = fail $ "Unsupported number of relocation \
829
                                \destinations required (" ++ show reqn ++
830
                                                  "), only one supported"
831

    
832
-- | Function which fails if the requested mode is change secondary.
833
--
834
-- This is useful since except DRBD, no other disk template can
835
-- execute change secondary; thus, we can just call this function
836
-- instead of always checking for secondary mode. After the call to
837
-- this function, whatever mode we have is just a primary change.
838
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
839
failOnSecondaryChange ChangeSecondary dt =
840
  fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++
841
         "' can't execute change secondary"
842
failOnSecondaryChange _ _ = return ()
843

    
844
-- | Run evacuation for a single instance.
845
--
846
-- /Note:/ this function should correctly execute both intra-group
847
-- evacuations (in all modes) and inter-group evacuations (in the
848
-- 'ChangeAll' mode). Of course, this requires that the correct list
849
-- of target nodes is passed.
850
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
851
                 -> Instance.List     -- ^ Instance list (cluster-wide)
852
                 -> EvacMode          -- ^ The evacuation mode
853
                 -> Instance.Instance -- ^ The instance to be evacuated
854
                 -> Gdx               -- ^ The group we're targetting
855
                 -> [Ndx]             -- ^ The list of available nodes
856
                                      -- for allocation
857
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
858
nodeEvacInstance _ _ mode (Instance.Instance
859
                           {Instance.diskTemplate = dt@DTDiskless}) _ _ =
860
                  failOnSecondaryChange mode dt >>
861
                  fail "Diskless relocations not implemented yet"
862

    
863
nodeEvacInstance _ _ _ (Instance.Instance
864
                        {Instance.diskTemplate = DTPlain}) _ _ =
865
                  fail "Instances of type plain cannot be relocated"
866

    
867
nodeEvacInstance _ _ _ (Instance.Instance
868
                        {Instance.diskTemplate = DTFile}) _ _ =
869
                  fail "Instances of type file cannot be relocated"
870

    
871
nodeEvacInstance _ _ mode  (Instance.Instance
872
                            {Instance.diskTemplate = dt@DTSharedFile}) _ _ =
873
                  failOnSecondaryChange mode dt >>
874
                  fail "Shared file relocations not implemented yet"
875

    
876
nodeEvacInstance _ _ mode (Instance.Instance
877
                           {Instance.diskTemplate = dt@DTBlock}) _ _ =
878
                  failOnSecondaryChange mode dt >>
879
                  fail "Block device relocations not implemented yet"
880

    
881
nodeEvacInstance nl il ChangePrimary
882
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
883
                 _ _ =
884
  do
885
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
886
    let idx = Instance.idx inst
887
        il' = Container.add idx inst' il
888
        ops = iMoveToJob nl' il' idx Failover
889
    return (nl', il', ops)
890

    
891
nodeEvacInstance nl il ChangeSecondary
892
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
893
                 gdx avail_nodes =
894
  do
895
    (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
896
                            eitherToResult $
897
                            foldl' (evacDrbdSecondaryInner nl inst gdx)
898
                            (Left "no nodes available") avail_nodes
899
    let idx = Instance.idx inst
900
        il' = Container.add idx inst' il
901
        ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx)
902
    return (nl', il', ops)
903

    
904
-- The algorithm for ChangeAll is as follows:
905
--
906
-- * generate all (primary, secondary) node pairs for the target groups
907
-- * for each pair, execute the needed moves (r:s, f, r:s) and compute
908
--   the final node list state and group score
909
-- * select the best choice via a foldl that uses the same Either
910
--   String solution as the ChangeSecondary mode
911
nodeEvacInstance nl il ChangeAll
912
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
913
                 gdx avail_nodes =
914
  do
915
    let no_nodes = Left "no nodes available"
916
        node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
917
    (nl', il', ops, _) <-
918
        annotateResult "Can't find any good nodes for relocation" $
919
        eitherToResult $
920
        foldl'
921
        (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
922
                          Bad msg ->
923
                              case accu of
924
                                Right _ -> accu
925
                                -- we don't need more details (which
926
                                -- nodes, etc.) as we only selected
927
                                -- this group if we can allocate on
928
                                -- it, hence failures will not
929
                                -- propagate out of this fold loop
930
                                Left _ -> Left $ "Allocation failed: " ++ msg
931
                          Ok result@(_, _, _, new_cv) ->
932
                              let new_accu = Right result in
933
                              case accu of
934
                                Left _ -> new_accu
935
                                Right (_, _, _, old_cv) ->
936
                                    if old_cv < new_cv
937
                                    then accu
938
                                    else new_accu
939
        ) no_nodes node_pairs
940

    
941
    return (nl', il', ops)
942

    
943
-- | Inner fold function for changing secondary of a DRBD instance.
944
--
945
-- The running solution is either a @Left String@, which means we
946
-- don't have yet a working solution, or a @Right (...)@, which
947
-- represents a valid solution; it holds the modified node list, the
948
-- modified instance (after evacuation), the score of that solution,
949
-- and the new secondary node index.
950
evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list
951
                       -> Instance.Instance -- ^ Instance being evacuated
952
                       -> Gdx -- ^ The group index of the instance
953
                       -> Either String ( Node.List
954
                                        , Instance.Instance
955
                                        , Score
956
                                        , Ndx)  -- ^ Current best solution
957
                       -> Ndx  -- ^ Node we're evaluating as new secondary
958
                       -> Either String ( Node.List
959
                                        , Instance.Instance
960
                                        , Score
961
                                        , Ndx) -- ^ New best solution
962
evacDrbdSecondaryInner nl inst gdx accu ndx =
963
  case applyMove nl inst (ReplaceSecondary ndx) of
964
    OpFail fm ->
965
      case accu of
966
        Right _ -> accu
967
        Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
968
                  " failed: " ++ show fm
969
    OpGood (nl', inst', _, _) ->
970
      let nodes = Container.elems nl'
971
          -- The fromJust below is ugly (it can fail nastily), but
972
          -- at this point we should have any internal mismatches,
973
          -- and adding a monad here would be quite involved
974
          grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
975
          new_cv = compCVNodes grpnodes
976
          new_accu = Right (nl', inst', new_cv, ndx)
977
      in case accu of
978
           Left _ -> new_accu
979
           Right (_, _, old_cv, _) ->
980
             if old_cv < new_cv
981
               then accu
982
               else new_accu
983

    
984
-- | Compute result of changing all nodes of a DRBD instance.
985
--
986
-- Given the target primary and secondary node (which might be in a
987
-- different group or not), this function will 'execute' all the
988
-- required steps and assuming all operations succceed, will return
989
-- the modified node and instance lists, the opcodes needed for this
990
-- and the new group score.
991
evacDrbdAllInner :: Node.List         -- ^ Cluster node list
992
                 -> Instance.List     -- ^ Cluster instance list
993
                 -> Instance.Instance -- ^ The instance to be moved
994
                 -> Gdx               -- ^ The target group index
995
                                      -- (which can differ from the
996
                                      -- current group of the
997
                                      -- instance)
998
                 -> (Ndx, Ndx)        -- ^ Tuple of new
999
                                      -- primary\/secondary nodes
1000
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
1001
evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
1002
  let primary = Container.find (Instance.pNode inst) nl
1003
      idx = Instance.idx inst
1004
  -- if the primary is offline, then we first failover
1005
  (nl1, inst1, ops1) <-
1006
    if Node.offline primary
1007
      then do
1008
        (nl', inst', _, _) <-
1009
          annotateResult "Failing over to the secondary" $
1010
          opToResult $ applyMove nl inst Failover
1011
        return (nl', inst', [Failover])
1012
      else return (nl, inst, [])
1013
  let (o1, o2, o3) = (ReplaceSecondary t_pdx,
1014
                      Failover,
1015
                      ReplaceSecondary t_sdx)
1016
  -- we now need to execute a replace secondary to the future
1017
  -- primary node
1018
  (nl2, inst2, _, _) <-
1019
    annotateResult "Changing secondary to new primary" $
1020
    opToResult $
1021
    applyMove nl1 inst1 o1
1022
  let ops2 = o1:ops1
1023
  -- we now execute another failover, the primary stays fixed now
1024
  (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
1025
                        opToResult $ applyMove nl2 inst2 o2
1026
  let ops3 = o2:ops2
1027
  -- and finally another replace secondary, to the final secondary
1028
  (nl4, inst4, _, _) <-
1029
    annotateResult "Changing secondary to final secondary" $
1030
    opToResult $
1031
    applyMove nl3 inst3 o3
1032
  let ops4 = o3:ops3
1033
      il' = Container.add idx inst4 il
1034
      ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1035
  let nodes = Container.elems nl4
1036
      -- The fromJust below is ugly (it can fail nastily), but
1037
      -- at this point we should have any internal mismatches,
1038
      -- and adding a monad here would be quite involved
1039
      grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1040
      new_cv = compCVNodes grpnodes
1041
  return (nl4, il', ops, new_cv)
1042

    
1043
-- | Computes the nodes in a given group which are available for
1044
-- allocation.
1045
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1046
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
1047
                    -> Gdx            -- ^ The group for which we
1048
                                      -- query the nodes
1049
                    -> Result [Ndx]   -- ^ List of available node indices
1050
availableGroupNodes group_nodes excl_ndx gdx = do
1051
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1052
                 Ok (lookup gdx group_nodes)
1053
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1054
  return avail_nodes
1055

    
1056
-- | Updates the evac solution with the results of an instance
1057
-- evacuation.
1058
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1059
                   -> Idx
1060
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1061
                   -> (Node.List, Instance.List, EvacSolution)
1062
updateEvacSolution (nl, il, es) idx (Bad msg) =
1063
  (nl, il, es { esFailed = (idx, msg):esFailed es})
1064
updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1065
  (nl, il, es { esMoved = new_elem:esMoved es
1066
              , esOpCodes = opcodes:esOpCodes es })
1067
    where inst = Container.find idx il
1068
          new_elem = (idx,
1069
                      instancePriGroup nl inst,
1070
                      Instance.allNodes inst)
1071

    
1072
-- | Node-evacuation IAllocator mode main function.
1073
tryNodeEvac :: Group.List    -- ^ The cluster groups
1074
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
1075
            -> Instance.List -- ^ Instance list (cluster-wide)
1076
            -> EvacMode      -- ^ The evacuation mode
1077
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
1078
            -> Result (Node.List, Instance.List, EvacSolution)
1079
tryNodeEvac _ ini_nl ini_il mode idxs =
1080
  let evac_ndx = nodesToEvacuate ini_il mode idxs
1081
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1082
      excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1083
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1084
                                           (Container.elems nl))) $
1085
                  splitCluster ini_nl ini_il
1086
      (fin_nl, fin_il, esol) =
1087
        foldl' (\state@(nl, il, _) inst ->
1088
                  let gdx = instancePriGroup nl inst
1089
                      pdx = Instance.pNode inst in
1090
                  updateEvacSolution state (Instance.idx inst) $
1091
                  availableGroupNodes group_ndx
1092
                    (IntSet.insert pdx excl_ndx) gdx >>=
1093
                      nodeEvacInstance nl il mode inst gdx
1094
               )
1095
        (ini_nl, ini_il, emptyEvacSolution)
1096
        (map (`Container.find` ini_il) idxs)
1097
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1098

    
1099
-- | Change-group IAllocator mode main function.
1100
--
1101
-- This is very similar to 'tryNodeEvac', the only difference is that
1102
-- we don't choose as target group the current instance group, but
1103
-- instead:
1104
--
1105
--   1. at the start of the function, we compute which are the target
1106
--   groups; either no groups were passed in, in which case we choose
1107
--   all groups out of which we don't evacuate instance, or there were
1108
--   some groups passed, in which case we use those
1109
--
1110
--   2. for each instance, we use 'findBestAllocGroup' to choose the
1111
--   best group to hold the instance, and then we do what
1112
--   'tryNodeEvac' does, except for this group instead of the current
1113
--   instance group.
1114
--
1115
-- Note that the correct behaviour of this function relies on the
1116
-- function 'nodeEvacInstance' to be able to do correctly both
1117
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1118
tryChangeGroup :: Group.List    -- ^ The cluster groups
1119
               -> Node.List     -- ^ The node list (cluster-wide)
1120
               -> Instance.List -- ^ Instance list (cluster-wide)
1121
               -> [Gdx]         -- ^ Target groups; if empty, any
1122
                                -- groups not being evacuated
1123
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
1124
               -> Result (Node.List, Instance.List, EvacSolution)
1125
tryChangeGroup gl ini_nl ini_il gdxs idxs =
1126
  let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1127
                             flip Container.find ini_il) idxs
1128
      target_gdxs = (if null gdxs
1129
                       then Container.keys gl
1130
                       else gdxs) \\ evac_gdxs
1131
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1132
      excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1133
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1134
                                           (Container.elems nl))) $
1135
                  splitCluster ini_nl ini_il
1136
      (fin_nl, fin_il, esol) =
1137
        foldl' (\state@(nl, il, _) inst ->
1138
                  let solution = do
1139
                        let ncnt = Instance.requiredNodes $
1140
                                   Instance.diskTemplate inst
1141
                        (gdx, _, _) <- findBestAllocGroup gl nl il
1142
                                       (Just target_gdxs) inst ncnt
1143
                        av_nodes <- availableGroupNodes group_ndx
1144
                                    excl_ndx gdx
1145
                        nodeEvacInstance nl il ChangeAll inst gdx av_nodes
1146
                  in updateEvacSolution state (Instance.idx inst) solution
1147
               )
1148
        (ini_nl, ini_il, emptyEvacSolution)
1149
        (map (`Container.find` ini_il) idxs)
1150
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1151

    
1152
-- | Standard-sized allocation method.
1153
--
1154
-- This places instances of the same size on the cluster until we're
1155
-- out of space. The result will be a list of identically-sized
1156
-- instances.
1157
iterateAlloc :: AllocMethod
1158
iterateAlloc nl il limit newinst allocnodes ixes cstats =
1159
  let depth = length ixes
1160
      newname = printf "new-%d" depth::String
1161
      newidx = length (Container.elems il) + depth
1162
      newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1163
      newlimit = fmap (flip (-) 1) limit
1164
  in case tryAlloc nl il newi2 allocnodes of
1165
       Bad s -> Bad s
1166
       Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
1167
         let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1168
         case sols3 of
1169
           Nothing -> newsol
1170
           Just (xnl, xi, _, _) ->
1171
             if limit == Just 0
1172
               then newsol
1173
               else iterateAlloc xnl (Container.add newidx xi il)
1174
                      newlimit newinst allocnodes (xi:ixes)
1175
                      (totalResources xnl:cstats)
1176

    
1177
-- | Tiered allocation method.
1178
--
1179
-- This places instances on the cluster, and decreases the spec until
1180
-- we can allocate again. The result will be a list of decreasing
1181
-- instance specs.
1182
tieredAlloc :: AllocMethod
1183
tieredAlloc nl il limit newinst allocnodes ixes cstats =
1184
  case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1185
    Bad s -> Bad s
1186
    Ok (errs, nl', il', ixes', cstats') ->
1187
      let newsol = Ok (errs, nl', il', ixes', cstats')
1188
          ixes_cnt = length ixes'
1189
          (stop, newlimit) = case limit of
1190
                               Nothing -> (False, Nothing)
1191
                               Just n -> (n <= ixes_cnt,
1192
                                            Just (n - ixes_cnt)) in
1193
      if stop then newsol else
1194
          case Instance.shrinkByType newinst . fst . last $
1195
               sortBy (comparing snd) errs of
1196
            Bad _ -> newsol
1197
            Ok newinst' -> tieredAlloc nl' il' newlimit
1198
                           newinst' allocnodes ixes' cstats'
1199

    
1200
-- * Formatting functions
1201

    
1202
-- | Given the original and final nodes, computes the relocation description.
1203
computeMoves :: Instance.Instance -- ^ The instance to be moved
1204
             -> String -- ^ The instance name
1205
             -> IMove  -- ^ The move being performed
1206
             -> String -- ^ New primary
1207
             -> String -- ^ New secondary
1208
             -> (String, [String])
1209
                -- ^ Tuple of moves and commands list; moves is containing
1210
                -- either @/f/@ for failover or @/r:name/@ for replace
1211
                -- secondary, while the command list holds gnt-instance
1212
                -- commands (without that prefix), e.g \"@failover instance1@\"
1213
computeMoves i inam mv c d =
1214
  case mv of
1215
    Failover -> ("f", [mig])
1216
    FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1217
    ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1218
    ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1219
    ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1220
  where morf = if Instance.instanceRunning i then "migrate" else "failover"
1221
        mig = printf "%s -f %s" morf inam::String
1222
        rep n = printf "replace-disks -n %s %s" n inam
1223

    
1224
-- | Converts a placement to string format.
1225
printSolutionLine :: Node.List     -- ^ The node list
1226
                  -> Instance.List -- ^ The instance list
1227
                  -> Int           -- ^ Maximum node name length
1228
                  -> Int           -- ^ Maximum instance name length
1229
                  -> Placement     -- ^ The current placement
1230
                  -> Int           -- ^ The index of the placement in
1231
                                   -- the solution
1232
                  -> (String, [String])
1233
printSolutionLine nl il nmlen imlen plc pos =
1234
  let pmlen = (2*nmlen + 1)
1235
      (i, p, s, mv, c) = plc
1236
      inst = Container.find i il
1237
      inam = Instance.alias inst
1238
      npri = Node.alias $ Container.find p nl
1239
      nsec = Node.alias $ Container.find s nl
1240
      opri = Node.alias $ Container.find (Instance.pNode inst) nl
1241
      osec = Node.alias $ Container.find (Instance.sNode inst) nl
1242
      (moves, cmds) =  computeMoves inst inam mv npri nsec
1243
      ostr = printf "%s:%s" opri osec::String
1244
      nstr = printf "%s:%s" npri nsec::String
1245
  in (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
1246
      pos imlen inam pmlen ostr
1247
      pmlen nstr c moves,
1248
      cmds)
1249

    
1250
-- | Return the instance and involved nodes in an instance move.
1251
--
1252
-- Note that the output list length can vary, and is not required nor
1253
-- guaranteed to be of any specific length.
1254
involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
1255
                               -- the instance from its index; note
1256
                               -- that this /must/ be the original
1257
                               -- instance list, so that we can
1258
                               -- retrieve the old nodes
1259
              -> Placement     -- ^ The placement we're investigating,
1260
                               -- containing the new nodes and
1261
                               -- instance index
1262
              -> [Ndx]         -- ^ Resulting list of node indices
1263
involvedNodes il plc =
1264
  let (i, np, ns, _, _) = plc
1265
      inst = Container.find i il
1266
  in nub $ [np, ns] ++ Instance.allNodes inst
1267

    
1268
-- | Inner function for splitJobs, that either appends the next job to
1269
-- the current jobset, or starts a new jobset.
1270
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1271
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1272
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1273
  | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1274
  | otherwise = ([n]:cjs, ndx)
1275

    
1276
-- | Break a list of moves into independent groups. Note that this
1277
-- will reverse the order of jobs.
1278
splitJobs :: [MoveJob] -> [JobSet]
1279
splitJobs = fst . foldl mergeJobs ([], [])
1280

    
1281
-- | Given a list of commands, prefix them with @gnt-instance@ and
1282
-- also beautify the display a little.
1283
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1284
formatJob jsn jsl (sn, (_, _, _, cmds)) =
1285
  let out =
1286
        printf "  echo job %d/%d" jsn sn:
1287
        printf "  check":
1288
        map ("  gnt-instance " ++) cmds
1289
  in if sn == 1
1290
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1291
       else out
1292

    
1293
-- | Given a list of commands, prefix them with @gnt-instance@ and
1294
-- also beautify the display a little.
1295
formatCmds :: [JobSet] -> String
1296
formatCmds =
1297
  unlines .
1298
  concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1299
                           (zip [1..] js)) .
1300
  zip [1..]
1301

    
1302
-- | Print the node list.
1303
printNodes :: Node.List -> [String] -> String
1304
printNodes nl fs =
1305
  let fields = case fs of
1306
                 [] -> Node.defaultFields
1307
                 "+":rest -> Node.defaultFields ++ rest
1308
                 _ -> fs
1309
      snl = sortBy (comparing Node.idx) (Container.elems nl)
1310
      (header, isnum) = unzip $ map Node.showHeader fields
1311
  in unlines . map ((:) ' ' .  intercalate " ") $
1312
     formatTable (header:map (Node.list fields) snl) isnum
1313

    
1314
-- | Print the instance list.
1315
printInsts :: Node.List -> Instance.List -> String
1316
printInsts nl il =
1317
  let sil = sortBy (comparing Instance.idx) (Container.elems il)
1318
      helper inst = [ if Instance.instanceRunning inst then "R" else " "
1319
                    , Instance.name inst
1320
                    , Container.nameOf nl (Instance.pNode inst)
1321
                    , let sdx = Instance.sNode inst
1322
                      in if sdx == Node.noSecondary
1323
                           then  ""
1324
                           else Container.nameOf nl sdx
1325
                    , if Instance.autoBalance inst then "Y" else "N"
1326
                    , printf "%3d" $ Instance.vcpus inst
1327
                    , printf "%5d" $ Instance.mem inst
1328
                    , printf "%5d" $ Instance.dsk inst `div` 1024
1329
                    , printf "%5.3f" lC
1330
                    , printf "%5.3f" lM
1331
                    , printf "%5.3f" lD
1332
                    , printf "%5.3f" lN
1333
                    ]
1334
          where DynUtil lC lM lD lN = Instance.util inst
1335
      header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1336
               , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1337
      isnum = False:False:False:False:False:repeat True
1338
  in unlines . map ((:) ' ' . intercalate " ") $
1339
     formatTable (header:map helper sil) isnum
1340

    
1341
-- | Shows statistics for a given node list.
1342
printStats :: Node.List -> String
1343
printStats nl =
1344
  let dcvs = compDetailedCV $ Container.elems nl
1345
      (weights, names) = unzip detailedCVInfo
1346
      hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1347
      formatted = map (\(w, header, val) ->
1348
                         printf "%s=%.8f(x%.2f)" header val w::String) hd
1349
  in intercalate ", " formatted
1350

    
1351
-- | Convert a placement into a list of OpCodes (basically a job).
1352
iMoveToJob :: Node.List        -- ^ The node list; only used for node
1353
                               -- names, so any version is good
1354
                               -- (before or after the operation)
1355
           -> Instance.List    -- ^ The instance list; also used for
1356
                               -- names only
1357
           -> Idx              -- ^ The index of the instance being
1358
                               -- moved
1359
           -> IMove            -- ^ The actual move to be described
1360
           -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
1361
                               -- the given move
1362
iMoveToJob nl il idx move =
1363
  let inst = Container.find idx il
1364
      iname = Instance.name inst
1365
      lookNode  = Just . Container.nameOf nl
1366
      opF = OpCodes.OpInstanceMigrate iname True False True Nothing
1367
      opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1368
              OpCodes.ReplaceNewSecondary [] Nothing
1369
  in case move of
1370
       Failover -> [ opF ]
1371
       ReplacePrimary np -> [ opF, opR np, opF ]
1372
       ReplaceSecondary ns -> [ opR ns ]
1373
       ReplaceAndFailover np -> [ opR np, opF ]
1374
       FailoverAndReplace ns -> [ opF, opR ns ]
1375

    
1376
-- * Node group functions
1377

    
1378
-- | Computes the group of an instance.
1379
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1380
instanceGroup nl i =
1381
  let sidx = Instance.sNode i
1382
      pnode = Container.find (Instance.pNode i) nl
1383
      snode = if sidx == Node.noSecondary
1384
              then pnode
1385
              else Container.find sidx nl
1386
      pgroup = Node.group pnode
1387
      sgroup = Node.group snode
1388
  in if pgroup /= sgroup
1389
       then fail ("Instance placed accross two node groups, primary " ++
1390
                  show pgroup ++ ", secondary " ++ show sgroup)
1391
       else return pgroup
1392

    
1393
-- | Computes the group of an instance per the primary node.
1394
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1395
instancePriGroup nl i =
1396
  let pnode = Container.find (Instance.pNode i) nl
1397
  in  Node.group pnode
1398

    
1399
-- | Compute the list of badly allocated instances (split across node
1400
-- groups).
1401
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1402
findSplitInstances nl =
1403
  filter (not . isOk . instanceGroup nl) . Container.elems
1404

    
1405
-- | Splits a cluster into the component node groups.
1406
splitCluster :: Node.List -> Instance.List ->
1407
                [(Gdx, (Node.List, Instance.List))]
1408
splitCluster nl il =
1409
  let ngroups = Node.computeGroups (Container.elems nl)
1410
  in map (\(guuid, nodes) ->
1411
           let nidxs = map Node.idx nodes
1412
               nodes' = zip nidxs nodes
1413
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1414
           in (guuid, (Container.fromList nodes', instances))) ngroups
1415

    
1416
-- | Compute the list of nodes that are to be evacuated, given a list
1417
-- of instances and an evacuation mode.
1418
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1419
                -> EvacMode      -- ^ The evacuation mode we're using
1420
                -> [Idx]         -- ^ List of instance indices being evacuated
1421
                -> IntSet.IntSet -- ^ Set of node indices
1422
nodesToEvacuate il mode =
1423
  IntSet.delete Node.noSecondary .
1424
  foldl' (\ns idx ->
1425
            let i = Container.find idx il
1426
                pdx = Instance.pNode i
1427
                sdx = Instance.sNode i
1428
                dt = Instance.diskTemplate i
1429
                withSecondary = case dt of
1430
                                  DTDrbd8 -> IntSet.insert sdx ns
1431
                                  _ -> ns
1432
            in case mode of
1433
                 ChangePrimary   -> IntSet.insert pdx ns
1434
                 ChangeSecondary -> withSecondary
1435
                 ChangeAll       -> IntSet.insert pdx withSecondary
1436
         ) IntSet.empty