1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011, 2012 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
  (
31
    -- * Types
32
    AllocSolution(..)
33
  , EvacSolution(..)
34
  , Table(..)
35
  , CStats(..)
36
  , AllocStats
37
  , AllocResult
38
  , AllocMethod
39
  -- * Generic functions
40
  , totalResources
41
  , computeAllocationDelta
42
  -- * First phase functions
43
  , computeBadItems
44
  -- * Second phase functions
45
  , printSolutionLine
46
  , formatCmds
47
  , involvedNodes
48
  , splitJobs
49
  -- * Display functions
50
  , printNodes
51
  , printInsts
52
  -- * Balancing functions
53
  , checkMove
54
  , doNextBalance
55
  , tryBalance
56
  , compCV
57
  , compCVNodes
58
  , compDetailedCV
59
  , printStats
60
  , iMoveToJob
61
  -- * IAllocator functions
62
  , genAllocNodes
63
  , tryAlloc
64
  , tryMGAlloc
65
  , tryReloc
66
  , tryNodeEvac
67
  , tryChangeGroup
68
  , collapseFailures
69
  -- * Allocation functions
70
  , iterateAlloc
71
  , tieredAlloc
72
  -- * Node group functions
73
  , instanceGroup
74
  , findSplitInstances
75
  , splitCluster
76
  ) where
77

    
78
import qualified Data.IntSet as IntSet
79
import Data.List
80
import Data.Maybe (fromJust, isNothing)
81
import Data.Ord (comparing)
82
import Text.Printf (printf)
83

    
84
import qualified Ganeti.HTools.Container as Container
85
import qualified Ganeti.HTools.Instance as Instance
86
import qualified Ganeti.HTools.Node as Node
87
import qualified Ganeti.HTools.Group as Group
88
import Ganeti.HTools.Types
89
import Ganeti.HTools.Utils
90
import Ganeti.HTools.Compat
91
import qualified Ganeti.OpCodes as OpCodes
92

    
93
-- * Types
94

    
95
-- | Allocation\/relocation solution.
96
data AllocSolution = AllocSolution
97
  { asFailures :: [FailMode]              -- ^ Failure counts
98
  , asAllocs   :: Int                     -- ^ Good allocation count
99
  , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
100
  , asLog      :: [String]                -- ^ Informational messages
101
  }
102

    
103
-- | Node evacuation/group change iallocator result type. This result
104
-- type consists of actual opcodes (a restricted subset) that are
105
-- transmitted back to Ganeti.
106
data EvacSolution = EvacSolution
107
  { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
108
  , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
109
                                      -- relocated
110
  , esOpCodes :: [[OpCodes.OpCode]]   -- ^ List of jobs
111
  } deriving (Show)
112

    
113
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
114
type AllocResult = (FailStats, Node.List, Instance.List,
115
                    [Instance.Instance], [CStats])
116

    
117
-- | A type denoting the valid allocation mode/pairs.
118
--
119
-- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
120
-- for a two-node allocation, this will be a @Right [('Ndx',
121
-- ['Ndx'])]@. In the latter case, the list is basically an
122
-- association list, grouped by primary node and holding the potential
123
-- secondary nodes in the sub-list.
124
type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
125

    
126
-- | The empty solution we start with when computing allocations.
127
emptyAllocSolution :: AllocSolution
128
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
129
                                   , asSolution = Nothing, asLog = [] }
130

    
131
-- | The empty evac solution.
132
emptyEvacSolution :: EvacSolution
133
emptyEvacSolution = EvacSolution { esMoved = []
134
                                 , esFailed = []
135
                                 , esOpCodes = []
136
                                 }
137

    
138
-- | The complete state for the balancing solution.
139
data Table = Table Node.List Instance.List Score [Placement]
140
             deriving (Show, Read)
141

    
142
-- | Cluster statistics data type.
143
data CStats = CStats { csFmem :: Integer -- ^ Cluster free mem
144
                     , csFdsk :: Integer -- ^ Cluster free disk
145
                     , csAmem :: Integer -- ^ Cluster allocatable mem
146
                     , csAdsk :: Integer -- ^ Cluster allocatable disk
147
                     , csAcpu :: Integer -- ^ Cluster allocatable cpus
148
                     , csMmem :: Integer -- ^ Max node allocatable mem
149
                     , csMdsk :: Integer -- ^ Max node allocatable disk
150
                     , csMcpu :: Integer -- ^ Max node allocatable cpu
151
                     , csImem :: Integer -- ^ Instance used mem
152
                     , csIdsk :: Integer -- ^ Instance used disk
153
                     , csIcpu :: Integer -- ^ Instance used cpu
154
                     , csTmem :: Double  -- ^ Cluster total mem
155
                     , csTdsk :: Double  -- ^ Cluster total disk
156
                     , csTcpu :: Double  -- ^ Cluster total cpus
157
                     , csVcpu :: Integer -- ^ Cluster virtual cpus (if
158
                                         -- node pCpu has been set,
159
                                         -- otherwise -1)
160
                     , csXmem :: Integer -- ^ Unaccounted for mem
161
                     , csNmem :: Integer -- ^ Node own memory
162
                     , csScore :: Score  -- ^ The cluster score
163
                     , csNinst :: Int    -- ^ The total number of instances
164
                     }
165
            deriving (Show, Read)
166

    
167
-- | Resources currently used, additionally allocated, and left unallocable.
168
type AllocStats = (RSpec, RSpec, RSpec)
169

    
170
-- | A simple type for allocation functions.
171
type AllocMethod =  Node.List           -- ^ Node list
172
                 -> Instance.List       -- ^ Instance list
173
                 -> Maybe Int           -- ^ Optional allocation limit
174
                 -> Instance.Instance   -- ^ Instance spec for allocation
175
                 -> AllocNodes          -- ^ Which nodes we should allocate on
176
                 -> [Instance.Instance] -- ^ Allocated instances
177
                 -> [CStats]            -- ^ Running cluster stats
178
                 -> Result AllocResult  -- ^ Allocation result
179

    
180
-- * Utility functions
181

    
182
-- | Verifies the N+1 status and returns the affected nodes.
183
verifyN1 :: [Node.Node] -> [Node.Node]
184
verifyN1 = filter Node.failN1
185

    
186
{-| Computes the pair of bad nodes and instances.
187

    
188
The bad node list is computed via a simple 'verifyN1' check, and the
189
bad instance list is the list of primary and secondary instances of
190
those nodes.
191

    
192
-}
193
computeBadItems :: Node.List -> Instance.List ->
194
                   ([Node.Node], [Instance.Instance])
195
computeBadItems nl il =
196
  let bad_nodes = verifyN1 $ getOnline nl
197
      bad_instances = map (`Container.find` il) .
198
                      sort . nub $
199
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
200
  in
201
    (bad_nodes, bad_instances)
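
-- An illustrative property of the above (a sketch, not taken from the
-- original source; @nl@ and @il@ are an existing node and instance
-- list): every returned instance lives on at least one of the returned
-- bad nodes, since the instance list is built from the primary and
-- secondary lists of exactly those nodes.
--
-- > let (bad_nodes, bad_insts) = computeBadItems nl il
-- > in all (\i -> any ((`elem` Instance.allNodes i) . Node.idx) bad_nodes)
-- >        bad_insts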
202

    
203
-- | Extracts the node pairs for an instance. This can fail if the
204
-- instance is single-homed. FIXME: this needs to be improved,
205
-- together with the general enhancement for handling non-DRBD moves.
206
instanceNodes :: Node.List -> Instance.Instance ->
207
                 (Ndx, Ndx, Node.Node, Node.Node)
208
instanceNodes nl inst =
209
  let old_pdx = Instance.pNode inst
210
      old_sdx = Instance.sNode inst
211
      old_p = Container.find old_pdx nl
212
      old_s = Container.find old_sdx nl
213
  in (old_pdx, old_sdx, old_p, old_s)
214

    
215
-- | Zero-initializer for the CStats type.
216
emptyCStats :: CStats
217
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
218

    
219
-- | Update stats with data from a new node.
220
updateCStats :: CStats -> Node.Node -> CStats
221
updateCStats cs node =
222
  let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
223
               csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
224
               csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
225
               csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
226
               csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
227
               csVcpu = x_vcpu,
228
               csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
229
             }
230
        = cs
231
      inc_amem = Node.fMem node - Node.rMem node
232
      inc_amem' = if inc_amem > 0 then inc_amem else 0
233
      inc_adsk = Node.availDisk node
234
      inc_imem = truncate (Node.tMem node) - Node.nMem node
235
                 - Node.xMem node - Node.fMem node
236
      inc_icpu = Node.uCpu node
237
      inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
238
      inc_vcpu = Node.hiCpu node
239
      inc_acpu = Node.availCpu node
240
  in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
241
        , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
242
        , csAmem = x_amem + fromIntegral inc_amem'
243
        , csAdsk = x_adsk + fromIntegral inc_adsk
244
        , csAcpu = x_acpu + fromIntegral inc_acpu
245
        , csMmem = max x_mmem (fromIntegral inc_amem')
246
        , csMdsk = max x_mdsk (fromIntegral inc_adsk)
247
        , csMcpu = max x_mcpu (fromIntegral inc_acpu)
248
        , csImem = x_imem + fromIntegral inc_imem
249
        , csIdsk = x_idsk + fromIntegral inc_idsk
250
        , csIcpu = x_icpu + fromIntegral inc_icpu
251
        , csTmem = x_tmem + Node.tMem node
252
        , csTdsk = x_tdsk + Node.tDsk node
253
        , csTcpu = x_tcpu + Node.tCpu node
254
        , csVcpu = x_vcpu + fromIntegral inc_vcpu
255
        , csXmem = x_xmem + fromIntegral (Node.xMem node)
256
        , csNmem = x_nmem + fromIntegral (Node.nMem node)
257
        , csNinst = x_ninst + length (Node.pList node)
258
        }
259

    
260
-- | Compute the total free disk and memory in the cluster.
261
totalResources :: Node.List -> CStats
262
totalResources nl =
263
  let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
264
  in cs { csScore = compCV nl }
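
-- An illustrative property (not from the original source): the score
-- embedded in the summed statistics is exactly the cluster score, so
-- for any node list @nl@:
--
-- > csScore (totalResources nl) == compCV nl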
265

    
266
-- | Compute the delta between two cluster states.
267
--
268
-- This is used when doing allocations, to understand better the
269
-- available cluster resources. The return value is a triple of the
270
-- current used values, the delta that was still allocated, and what
271
-- was left unallocated.
272
computeAllocationDelta :: CStats -> CStats -> AllocStats
273
computeAllocationDelta cini cfin =
274
  let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
275
      CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
276
              csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
277
      rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem)
278
             (fromIntegral i_idsk)
279
      rfin = RSpec (fromIntegral (f_icpu - i_icpu))
280
             (fromIntegral (f_imem - i_imem))
281
             (fromIntegral (f_idsk - i_idsk))
282
      un_cpu = fromIntegral (v_cpu - f_icpu)::Int
283
      runa = RSpec un_cpu (truncate t_mem - fromIntegral f_imem)
284
             (truncate t_dsk - fromIntegral f_idsk)
285
  in (rini, rfin, runa)
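
-- As an illustrative reading of the result (a sketch, not from the
-- original source): if the initial state had instances using 4 CPUs
-- and the final state has instances using 10 CPUs, then the first
-- 'RSpec' reports the 4 CPUs already in use, the second reports the 6
-- CPUs newly allocated, and the third reports whatever CPU capacity
-- (per 'csVcpu') is still left unallocated.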
286

    
287
-- | The names and weights of the individual elements in the CV list.
288
detailedCVInfo :: [(Double, String)]
289
detailedCVInfo = [ (1,  "free_mem_cv")
290
                 , (1,  "free_disk_cv")
291
                 , (1,  "n1_cnt")
292
                 , (1,  "reserved_mem_cv")
293
                 , (4,  "offline_all_cnt")
294
                 , (16, "offline_pri_cnt")
295
                 , (1,  "vcpu_ratio_cv")
296
                 , (1,  "cpu_load_cv")
297
                 , (1,  "mem_load_cv")
298
                 , (1,  "disk_load_cv")
299
                 , (1,  "net_load_cv")
300
                 , (2,  "pri_tags_score")
301
                 ]
302

    
303
-- | Holds the weights used by 'compCVNodes' for each metric.
304
detailedCVWeights :: [Double]
305
detailedCVWeights = map fst detailedCVInfo
306

    
307
-- | Compute the detailed cluster metrics (the individual CV scores).
308
compDetailedCV :: [Node.Node] -> [Double]
309
compDetailedCV all_nodes =
310
  let (offline, nodes) = partition Node.offline all_nodes
311
      mem_l = map Node.pMem nodes
312
      dsk_l = map Node.pDsk nodes
313
      -- metric: memory covariance
314
      mem_cv = stdDev mem_l
315
      -- metric: disk covariance
316
      dsk_cv = stdDev dsk_l
317
      -- metric: count of instances living on N1 failing nodes
318
      n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
319
                                                 length (Node.pList n)) .
320
                 filter Node.failN1 $ nodes :: Double
321
      res_l = map Node.pRem nodes
322
      -- metric: reserved memory covariance
323
      res_cv = stdDev res_l
324
      -- offline instances metrics
325
      offline_ipri = sum . map (length . Node.pList) $ offline
326
      offline_isec = sum . map (length . Node.sList) $ offline
327
      -- metric: count of instances on offline nodes
328
      off_score = fromIntegral (offline_ipri + offline_isec)::Double
329
      -- metric: count of primary instances on offline nodes (this
330
      -- helps with evacuation/failover of primary instances on
331
      -- 2-node clusters with one node offline)
332
      off_pri_score = fromIntegral offline_ipri::Double
333
      cpu_l = map Node.pCpu nodes
334
      -- metric: covariance of vcpu/pcpu ratio
335
      cpu_cv = stdDev cpu_l
336
      -- metrics: covariance of cpu, memory, disk and network load
337
      (c_load, m_load, d_load, n_load) =
338
        unzip4 $ map (\n ->
339
                      let DynUtil c1 m1 d1 n1 = Node.utilLoad n
340
                          DynUtil c2 m2 d2 n2 = Node.utilPool n
341
                      in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes
342
      -- metric: conflicting instance count
343
      pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
344
      pri_tags_score = fromIntegral pri_tags_inst::Double
345
  in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
346
     , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
347
     , pri_tags_score ]
348

    
349
-- | Compute the /total/ variance.
350
compCVNodes :: [Node.Node] -> Double
351
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
352

    
353
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
354
compCV :: Node.List -> Double
355
compCV = compCVNodes . Container.elems
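
-- The relationship between the detailed metrics and the final score,
-- restated as an example (nothing new, just the definitions above):
--
-- > compCV nl ==
-- >   sum (zipWith (*) detailedCVWeights
-- >                    (compDetailedCV (Container.elems nl)))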
356

    
357
-- | Compute online nodes from a 'Node.List'.
358
getOnline :: Node.List -> [Node.Node]
359
getOnline = filter (not . Node.offline) . Container.elems
360

    
361
-- * Balancing functions
362

    
363
-- | Compute best table. Note that the ordering of the arguments is important.
364
compareTables :: Table -> Table -> Table
365
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
366
  if a_cv > b_cv then b else a
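
-- Illustrative behaviour (with @nl@, @il@, @plc1@ and @plc2@ assumed
-- to be existing values): the table with the lower score wins, and
-- equal scores keep the first argument.
--
-- > compareTables (Table nl il 0.5 plc1) (Table nl il 0.3 plc2)
--
-- returns the second table (score 0.3).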
367

    
368
-- | Applies an instance move to a given node list and instance.
369
applyMove :: Node.List -> Instance.Instance
370
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
371
-- Failover (f)
372
applyMove nl inst Failover =
373
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
374
      int_p = Node.removePri old_p inst
375
      int_s = Node.removeSec old_s inst
376
      new_nl = do -- Maybe monad
377
        new_p <- Node.addPriEx (Node.offline old_p) int_s inst
378
        new_s <- Node.addSec int_p inst old_sdx
379
        let new_inst = Instance.setBoth inst old_sdx old_pdx
380
        return (Container.addTwo old_pdx new_s old_sdx new_p nl,
381
                new_inst, old_sdx, old_pdx)
382
  in new_nl
383

    
384
-- Replace the primary (f:, r:np, f)
385
applyMove nl inst (ReplacePrimary new_pdx) =
386
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
387
      tgt_n = Container.find new_pdx nl
388
      int_p = Node.removePri old_p inst
389
      int_s = Node.removeSec old_s inst
390
      force_p = Node.offline old_p
391
      new_nl = do -- Maybe monad
392
                  -- check that the current secondary can host the instance
393
                  -- during the migration
394
        tmp_s <- Node.addPriEx force_p int_s inst
395
        let tmp_s' = Node.removePri tmp_s inst
396
        new_p <- Node.addPriEx force_p tgt_n inst
397
        new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
398
        let new_inst = Instance.setPri inst new_pdx
399
        return (Container.add new_pdx new_p $
400
                Container.addTwo old_pdx int_p old_sdx new_s nl,
401
                new_inst, new_pdx, old_sdx)
402
  in new_nl
403

    
404
-- Replace the secondary (r:ns)
405
applyMove nl inst (ReplaceSecondary new_sdx) =
406
  let old_pdx = Instance.pNode inst
407
      old_sdx = Instance.sNode inst
408
      old_s = Container.find old_sdx nl
409
      tgt_n = Container.find new_sdx nl
410
      int_s = Node.removeSec old_s inst
411
      force_s = Node.offline old_s
412
      new_inst = Instance.setSec inst new_sdx
413
      new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
414
               \new_s -> return (Container.addTwo new_sdx
415
                                 new_s old_sdx int_s nl,
416
                                 new_inst, old_pdx, new_sdx)
417
  in new_nl
418

    
419
-- Replace the secondary and failover (r:np, f)
420
applyMove nl inst (ReplaceAndFailover new_pdx) =
421
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
422
      tgt_n = Container.find new_pdx nl
423
      int_p = Node.removePri old_p inst
424
      int_s = Node.removeSec old_s inst
425
      force_s = Node.offline old_s
426
      new_nl = do -- Maybe monad
427
        new_p <- Node.addPri tgt_n inst
428
        new_s <- Node.addSecEx force_s int_p inst new_pdx
429
        let new_inst = Instance.setBoth inst new_pdx old_pdx
430
        return (Container.add new_pdx new_p $
431
                Container.addTwo old_pdx new_s old_sdx int_s nl,
432
                new_inst, new_pdx, old_pdx)
433
  in new_nl
434

    
435
-- Failover and replace the secondary (f, r:ns)
436
applyMove nl inst (FailoverAndReplace new_sdx) =
437
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
438
      tgt_n = Container.find new_sdx nl
439
      int_p = Node.removePri old_p inst
440
      int_s = Node.removeSec old_s inst
441
      force_p = Node.offline old_p
442
      new_nl = do -- Maybe monad
443
        new_p <- Node.addPriEx force_p int_s inst
444
        new_s <- Node.addSecEx force_p tgt_n inst old_sdx
445
        let new_inst = Instance.setBoth inst old_sdx new_sdx
446
        return (Container.add new_sdx new_s $
447
                Container.addTwo old_sdx new_p old_pdx int_p nl,
448
                new_inst, old_sdx, new_sdx)
449
  in new_nl
450

    
451
-- | Tries to allocate an instance on one given node.
452
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
453
                 -> OpResult Node.AllocElement
454
allocateOnSingle nl inst new_pdx =
455
  let p = Container.find new_pdx nl
456
      new_inst = Instance.setBoth inst new_pdx Node.noSecondary
457
  in do
458
    Instance.instMatchesPolicy inst (Node.iPolicy p)
459
    new_p <- Node.addPri p inst
460
    let new_nl = Container.add new_pdx new_p nl
461
        new_score = compCV new_nl
462
    return (new_nl, new_inst, [new_p], new_score)
463

    
464
-- | Tries to allocate an instance on a given pair of nodes.
465
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
466
               -> OpResult Node.AllocElement
467
allocateOnPair nl inst new_pdx new_sdx =
468
  let tgt_p = Container.find new_pdx nl
469
      tgt_s = Container.find new_sdx nl
470
  in do
471
    Instance.instMatchesPolicy inst (Node.iPolicy tgt_p)
472
    new_p <- Node.addPri tgt_p inst
473
    new_s <- Node.addSec tgt_s inst new_pdx
474
    let new_inst = Instance.setBoth inst new_pdx new_sdx
475
        new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
476
    return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
477

    
478
-- | Tries to perform an instance move and returns the best table
479
-- between the original one and the new one.
480
checkSingleStep :: Table -- ^ The original table
481
                -> Instance.Instance -- ^ The instance to move
482
                -> Table -- ^ The current best table
483
                -> IMove -- ^ The move to apply
484
                -> Table -- ^ The final best table
485
checkSingleStep ini_tbl target cur_tbl move =
486
  let Table ini_nl ini_il _ ini_plc = ini_tbl
487
      tmp_resu = applyMove ini_nl target move
488
  in case tmp_resu of
489
       OpFail _ -> cur_tbl
490
       OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
491
         let tgt_idx = Instance.idx target
492
             upd_cvar = compCV upd_nl
493
             upd_il = Container.add tgt_idx new_inst ini_il
494
             upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
495
             upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
496
         in compareTables cur_tbl upd_tbl
497

    
498
-- | Given the status of the current secondary as a valid new node and
499
-- the current candidate target node, generate the possible moves for
500
-- an instance.
501
possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
502
              -> Bool      -- ^ Whether we can change the primary node
503
              -> Ndx       -- ^ Target node candidate
504
              -> [IMove]   -- ^ List of valid result moves
505

    
506
possibleMoves _ False tdx =
507
  [ReplaceSecondary tdx]
508

    
509
possibleMoves True True tdx =
510
  [ ReplaceSecondary tdx
511
  , ReplaceAndFailover tdx
512
  , ReplacePrimary tdx
513
  , FailoverAndReplace tdx
514
  ]
515

    
516
possibleMoves False True tdx =
517
  [ ReplaceSecondary tdx
518
  , ReplaceAndFailover tdx
519
  ]
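
-- Illustrative expansions of the clauses above (node index 5 is an
-- arbitrary example):
--
-- > possibleMoves True True 5
-- >   -- [ReplaceSecondary 5, ReplaceAndFailover 5, ReplacePrimary 5,
-- >   --  FailoverAndReplace 5]
-- > possibleMoves False True 5
-- >   -- [ReplaceSecondary 5, ReplaceAndFailover 5]
-- > possibleMoves True False 5
-- >   -- [ReplaceSecondary 5]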
520

    
521
-- | Compute the best move for a given instance.
522
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
523
                  -> Bool              -- ^ Whether disk moves are allowed
524
                  -> Bool              -- ^ Whether instance moves are allowed
525
                  -> Table             -- ^ Original table
526
                  -> Instance.Instance -- ^ Instance to move
527
                  -> Table             -- ^ Best new table for this instance
528
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
529
  let opdx = Instance.pNode target
530
      osdx = Instance.sNode target
531
      bad_nodes = [opdx, osdx]
532
      nodes = filter (`notElem` bad_nodes) nodes_idx
533
      use_secondary = elem osdx nodes_idx && inst_moves
534
      aft_failover = if use_secondary -- if allowed to failover
535
                       then checkSingleStep ini_tbl target ini_tbl Failover
536
                       else ini_tbl
537
      all_moves = if disk_moves
538
                    then concatMap
539
                           (possibleMoves use_secondary inst_moves) nodes
540
                    else []
541
    in
542
      -- iterate over the possible nodes for this instance
543
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
544

    
545
-- | Compute the best next move.
546
checkMove :: [Ndx]               -- ^ Allowed target node indices
547
          -> Bool                -- ^ Whether disk moves are allowed
548
          -> Bool                -- ^ Whether instance moves are allowed
549
          -> Table               -- ^ The current solution
550
          -> [Instance.Instance] -- ^ List of instances still to move
551
          -> Table               -- ^ The new solution
552
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
553
  let Table _ _ _ ini_plc = ini_tbl
554
      -- we're using rwhnf from the Control.Parallel.Strategies
555
      -- package; we don't need to use rnf as that would force too
556
      -- much evaluation in single-threaded cases, and in
557
      -- multi-threaded case the weak head normal form is enough to
558
      -- spark the evaluation
559
      tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
560
                             inst_moves ini_tbl)
561
               victims
562
      -- iterate over all instances, computing the best move
563
      best_tbl = foldl' compareTables ini_tbl tables
564
      Table _ _ _ best_plc = best_tbl
565
  in if length best_plc == length ini_plc
566
       then ini_tbl -- no advancement
567
       else best_tbl
568

    
569
-- | Check if we are allowed to go deeper in the balancing.
570
doNextBalance :: Table     -- ^ The starting table
571
              -> Int       -- ^ Remaining length
572
              -> Score     -- ^ Score at which to stop
573
              -> Bool      -- ^ Whether to continue with the balancing
574
doNextBalance ini_tbl max_rounds min_score =
575
  let Table _ _ ini_cv ini_plc = ini_tbl
576
      ini_plc_len = length ini_plc
577
  in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
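
-- Illustrative use of the stopping rule (with @tbl@ an assumed 'Table'
-- whose score is 0.5 and which has no placements yet): a negative
-- rounds limit means only the score threshold matters.
--
-- > doNextBalance tbl (-1) 0.0  -- True: no round limit, score above 0.0
-- > doNextBalance tbl (-1) 1.0  -- False: score is already below the target
-- > doNextBalance tbl 0 0.0     -- False: the round limit is exhausted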
578

    
579
-- | Run a balance move.
580
tryBalance :: Table       -- ^ The starting table
581
           -> Bool        -- ^ Allow disk moves
582
           -> Bool        -- ^ Allow instance moves
583
           -> Bool        -- ^ Only evacuate moves
584
           -> Score       -- ^ Min gain threshold
585
           -> Score       -- ^ Min gain
586
           -> Maybe Table -- ^ The resulting table and commands
587
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
588
    let Table ini_nl ini_il ini_cv _ = ini_tbl
589
        all_inst = Container.elems ini_il
590
        all_inst' = if evac_mode
591
                    then let bad_nodes = map Node.idx . filter Node.offline $
592
                                         Container.elems ini_nl
593
                         in filter (any (`elem` bad_nodes) . Instance.allNodes)
594
                            all_inst
595
                    else all_inst
596
        reloc_inst = filter Instance.movable all_inst'
597
        node_idx = map Node.idx . filter (not . Node.offline) $
598
                   Container.elems ini_nl
599
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
600
        (Table _ _ fin_cv _) = fin_tbl
601
    in
602
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
603
      then Just fin_tbl -- this round was successful, return the new table
604
      else Nothing
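
-- A minimal driver sketch (not part of the original module) showing
-- how 'doNextBalance' and 'tryBalance' are meant to interact: keep
-- asking for rounds while the round/score limits and the gain checks
-- allow it. The concrete thresholds used here are illustrative only.
balanceLoopSketch :: Table -> Int -> Score -> Table
balanceLoopSketch tbl max_rounds min_score
  | not (doNextBalance tbl max_rounds min_score) = tbl
  | otherwise =
      case tryBalance tbl True True False 1e-2 1e-2 of
        Nothing   -> tbl
        Just tbl' -> balanceLoopSketch tbl' max_rounds min_score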
605

    
606
-- * Allocation functions
607

    
608
-- | Build failure stats out of a list of failures.
609
collapseFailures :: [FailMode] -> FailStats
610
collapseFailures flst =
611
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
612
            [minBound..maxBound]
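
-- Illustrative behaviour (assuming 'FailMem' and 'FailDisk' are among
-- the 'FailMode' constructors defined in 'Ganeti.HTools.Types'):
--
-- > collapseFailures [FailMem, FailDisk, FailMem]
--
-- yields an association list covering every 'FailMode' value, with a
-- count of 2 for 'FailMem', 1 for 'FailDisk' and 0 for the rest.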
613

    
614
-- | Compares two Maybe AllocElement values and chooses the one with the best score.
615
bestAllocElement :: Maybe Node.AllocElement
616
                 -> Maybe Node.AllocElement
617
                 -> Maybe Node.AllocElement
618
bestAllocElement a Nothing = a
619
bestAllocElement Nothing b = b
620
bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
621
  if ascore < bscore then a else b
622

    
623
-- | Update current Allocation solution and failure stats with new
624
-- elements.
625
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
626
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
627

    
628
concatAllocs as (OpGood ns) =
629
  let -- Choose the old or new solution, based on the cluster score
630
    cntok = asAllocs as
631
    osols = asSolution as
632
    nsols = bestAllocElement osols (Just ns)
633
    nsuc = cntok + 1
634
    -- Note: we force evaluation of nsols here in order to keep the
635
    -- memory profile low - we know that we will need nsols for sure
636
    -- in the next cycle, so we force evaluation of nsols, since the
637
    -- foldl' in the caller will only evaluate the tuple, but not the
638
    -- elements of the tuple
639
  in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
640

    
641
-- | Sums two 'AllocSolution' structures.
642
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
643
sumAllocs (AllocSolution aFails aAllocs aSols aLog)
644
          (AllocSolution bFails bAllocs bSols bLog) =
645
  -- note: we add b first, since usually it will be smaller; when
646
  -- fold'ing, a will grow and grow whereas b is the per-group
647
  -- result, hence smaller
648
  let nFails  = bFails ++ aFails
649
      nAllocs = aAllocs + bAllocs
650
      nSols   = bestAllocElement aSols bSols
651
      nLog    = bLog ++ aLog
652
  in AllocSolution nFails nAllocs nSols nLog
653

    
654
-- | Given a solution, generates a reasonable description for it.
655
describeSolution :: AllocSolution -> String
656
describeSolution as =
657
  let fcnt = asFailures as
658
      sols = asSolution as
659
      freasons =
660
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
661
        filter ((> 0) . snd) . collapseFailures $ fcnt
662
  in case sols of
663
     Nothing -> "No valid allocation solutions, failure reasons: " ++
664
                (if null fcnt then "unknown reasons" else freasons)
665
     Just (_, _, nodes, cv) ->
666
         printf ("score: %.8f, successes %d, failures %d (%s)" ++
667
                 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
668
               (intercalate "/" . map Node.name $ nodes)
669

    
670
-- | Annotates a solution with the appropriate string.
671
annotateSolution :: AllocSolution -> AllocSolution
672
annotateSolution as = as { asLog = describeSolution as : asLog as }
673

    
674
-- | Reverses an evacuation solution.
675
--
676
-- Rationale: we always concat the results to the top of the lists, so
677
-- for proper jobset execution, we should reverse all lists.
678
reverseEvacSolution :: EvacSolution -> EvacSolution
679
reverseEvacSolution (EvacSolution f m o) =
680
  EvacSolution (reverse f) (reverse m) (reverse o)
681

    
682
-- | Generate the valid node allocation singles or pairs for a new instance.
683
genAllocNodes :: Group.List        -- ^ Group list
684
              -> Node.List         -- ^ The node map
685
              -> Int               -- ^ The number of nodes required
686
              -> Bool              -- ^ Whether or not to drop
687
                                   -- unallocable nodes
688
              -> Result AllocNodes -- ^ The (monadic) result
689
genAllocNodes gl nl count drop_unalloc =
690
  let filter_fn = if drop_unalloc
691
                    then filter (Group.isAllocable .
692
                                 flip Container.find gl . Node.group)
693
                    else id
694
      all_nodes = filter_fn $ getOnline nl
695
      all_pairs = [(Node.idx p,
696
                    [Node.idx s | s <- all_nodes,
697
                                       Node.idx p /= Node.idx s,
698
                                       Node.group p == Node.group s]) |
699
                   p <- all_nodes]
700
  in case count of
701
       1 -> Ok (Left (map Node.idx all_nodes))
702
       2 -> Ok (Right (filter (not . null . snd) all_pairs))
703
       _ -> Bad "Unsupported number of nodes, only one or two supported"
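
-- Illustrative results (with @gl@ and @nl@ an existing group and node
-- list; the bodies of the results are sketched, not literal output):
--
-- > genAllocNodes gl nl 1 True  -- Ok (Left [... online node indices ...])
-- > genAllocNodes gl nl 2 True  -- Ok (Right [(pri, [sec1, sec2, ...]), ...])
-- > genAllocNodes gl nl 3 True  -- Bad "Unsupported number of nodes ..."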
704

    
705
-- | Try to allocate an instance on the cluster.
706
tryAlloc :: (Monad m) =>
707
            Node.List         -- ^ The node list
708
         -> Instance.List     -- ^ The instance list
709
         -> Instance.Instance -- ^ The instance to allocate
710
         -> AllocNodes        -- ^ The allocation targets
711
         -> m AllocSolution   -- ^ Possible solution list
712
tryAlloc _  _ _    (Right []) = fail "Not enough online nodes"
713
tryAlloc nl _ inst (Right ok_pairs) =
714
  let psols = parMap rwhnf (\(p, ss) ->
715
                              foldl' (\cstate ->
716
                                        concatAllocs cstate .
717
                                        allocateOnPair nl inst p)
718
                              emptyAllocSolution ss) ok_pairs
719
      sols = foldl' sumAllocs emptyAllocSolution psols
720
  in return $ annotateSolution sols
721

    
722
tryAlloc _  _ _    (Left []) = fail "No online nodes"
723
tryAlloc nl _ inst (Left all_nodes) =
724
  let sols = foldl' (\cstate ->
725
                       concatAllocs cstate . allocateOnSingle nl inst
726
                    ) emptyAllocSolution all_nodes
727
  in return $ annotateSolution sols
728

    
729
-- | Given a group/result, describe it as a nice (list of) messages.
730
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
731
solutionDescription gl (groupId, result) =
732
  case result of
733
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
734
    Bad message -> [printf "Group %s: error %s" gname message]
735
  where grp = Container.find groupId gl
736
        gname = Group.name grp
737
        pol = allocPolicyToRaw (Group.allocPolicy grp)
738

    
739
-- | From a list of possibly bad and possibly empty solutions, filter
740
-- only the groups with a valid result. Note that the result will be
741
-- reversed compared to the original list.
742
filterMGResults :: Group.List
743
                -> [(Gdx, Result AllocSolution)]
744
                -> [(Gdx, AllocSolution)]
745
filterMGResults gl = foldl' fn []
746
  where unallocable = not . Group.isAllocable . flip Container.find gl
747
        fn accu (gdx, rasol) =
748
          case rasol of
749
            Bad _ -> accu
750
            Ok sol | isNothing (asSolution sol) -> accu
751
                   | unallocable gdx -> accu
752
                   | otherwise -> (gdx, sol):accu
753

    
754
-- | Sort multigroup results based on policy and score.
755
sortMGResults :: Group.List
756
             -> [(Gdx, AllocSolution)]
757
             -> [(Gdx, AllocSolution)]
758
sortMGResults gl sols =
759
  let extractScore (_, _, _, x) = x
760
      solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
761
                             (extractScore . fromJust . asSolution) sol)
762
  in sortBy (comparing solScore) sols
763

    
764
-- | Finds the best group for an instance on a multi-group cluster.
765
--
766
-- Only solutions in @preferred@ and @last_resort@ groups will be
767
-- accepted as valid, and additionally if the allowed groups parameter
768
-- is not null then allocation will only be run for those group
769
-- indices.
770
findBestAllocGroup :: Group.List           -- ^ The group list
771
                   -> Node.List            -- ^ The node list
772
                   -> Instance.List        -- ^ The instance list
773
                   -> Maybe [Gdx]          -- ^ The allowed groups
774
                   -> Instance.Instance    -- ^ The instance to allocate
775
                   -> Int                  -- ^ Required number of nodes
776
                   -> Result (Gdx, AllocSolution, [String])
777
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
778
  let groups = splitCluster mgnl mgil
779
      groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
780
                allowed_gdxs
781
      sols = map (\(gid, (nl, il)) ->
782
                   (gid, genAllocNodes mggl nl cnt False >>=
783
                       tryAlloc nl il inst))
784
             groups'::[(Gdx, Result AllocSolution)]
785
      all_msgs = concatMap (solutionDescription mggl) sols
786
      goodSols = filterMGResults mggl sols
787
      sortedSols = sortMGResults mggl goodSols
788
  in if null sortedSols
789
       then if null groups'
790
              then Bad $ "no groups for evacuation: allowed groups was " ++
791
                     show allowed_gdxs ++ ", all groups: " ++
792
                     show (map fst groups)
793
              else Bad $ intercalate ", " all_msgs
794
       else let (final_group, final_sol) = head sortedSols
795
            in return (final_group, final_sol, all_msgs)
796

    
797
-- | Try to allocate an instance on a multi-group cluster.
798
tryMGAlloc :: Group.List           -- ^ The group list
799
           -> Node.List            -- ^ The node list
800
           -> Instance.List        -- ^ The instance list
801
           -> Instance.Instance    -- ^ The instance to allocate
802
           -> Int                  -- ^ Required number of nodes
803
           -> Result AllocSolution -- ^ Possible solution list
804
tryMGAlloc mggl mgnl mgil inst cnt = do
805
  (best_group, solution, all_msgs) <-
806
      findBestAllocGroup mggl mgnl mgil Nothing inst cnt
807
  let group_name = Group.name $ Container.find best_group mggl
808
      selmsg = "Selected group: " ++ group_name
809
  return $ solution { asLog = selmsg:all_msgs }
810

    
811
-- | Try to relocate an instance on the cluster.
812
tryReloc :: (Monad m) =>
813
            Node.List       -- ^ The node list
814
         -> Instance.List   -- ^ The instance list
815
         -> Idx             -- ^ The index of the instance to move
816
         -> Int             -- ^ The number of nodes required
817
         -> [Ndx]           -- ^ Nodes which should not be used
818
         -> m AllocSolution -- ^ Solution list
819
tryReloc nl il xid 1 ex_idx =
820
  let all_nodes = getOnline nl
821
      inst = Container.find xid il
822
      ex_idx' = Instance.pNode inst:ex_idx
823
      valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
824
      valid_idxes = map Node.idx valid_nodes
825
      sols1 = foldl' (\cstate x ->
826
                        let em = do
827
                              (mnl, i, _, _) <-
828
                                applyMove nl inst (ReplaceSecondary x)
829
                              return (mnl, i, [Container.find x mnl],
830
                                         compCV mnl)
831
                        in concatAllocs cstate em
832
                     ) emptyAllocSolution valid_idxes
833
  in return sols1
834

    
835
tryReloc _ _ _ reqn _  = fail $ "Unsupported number of relocation \
836
                                \destinations required (" ++ show reqn ++
837
                                                  "), only one supported"
838

    
839
-- | Function which fails if the requested mode is change secondary.
840
--
841
-- This is useful since, except for DRBD, no other disk template can
842
-- execute change secondary; thus, we can just call this function
843
-- instead of always checking for secondary mode. After the call to
844
-- this function, whatever mode we have is just a primary change.
845
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
846
failOnSecondaryChange ChangeSecondary dt =
847
  fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++
848
         "' can't execute change secondary"
849
failOnSecondaryChange _ _ = return ()
850

    
851
-- | Run evacuation for a single instance.
852
--
853
-- /Note:/ this function should correctly execute both intra-group
854
-- evacuations (in all modes) and inter-group evacuations (in the
855
-- 'ChangeAll' mode). Of course, this requires that the correct list
856
-- of target nodes is passed.
857
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
858
                 -> Instance.List     -- ^ Instance list (cluster-wide)
859
                 -> EvacMode          -- ^ The evacuation mode
860
                 -> Instance.Instance -- ^ The instance to be evacuated
861
                 -> Gdx               -- ^ The group we're targeting
862
                 -> [Ndx]             -- ^ The list of available nodes
863
                                      -- for allocation
864
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
865
nodeEvacInstance _ _ mode (Instance.Instance
866
                           {Instance.diskTemplate = dt@DTDiskless}) _ _ =
867
                  failOnSecondaryChange mode dt >>
868
                  fail "Diskless relocations not implemented yet"
869

    
870
nodeEvacInstance _ _ _ (Instance.Instance
871
                        {Instance.diskTemplate = DTPlain}) _ _ =
872
                  fail "Instances of type plain cannot be relocated"
873

    
874
nodeEvacInstance _ _ _ (Instance.Instance
875
                        {Instance.diskTemplate = DTFile}) _ _ =
876
                  fail "Instances of type file cannot be relocated"
877

    
878
nodeEvacInstance _ _ mode  (Instance.Instance
879
                            {Instance.diskTemplate = dt@DTSharedFile}) _ _ =
880
                  failOnSecondaryChange mode dt >>
881
                  fail "Shared file relocations not implemented yet"
882

    
883
nodeEvacInstance _ _ mode (Instance.Instance
884
                           {Instance.diskTemplate = dt@DTBlock}) _ _ =
885
                  failOnSecondaryChange mode dt >>
886
                  fail "Block device relocations not implemented yet"
887

    
888
nodeEvacInstance nl il ChangePrimary
889
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
890
                 _ _ =
891
  do
892
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
893
    let idx = Instance.idx inst
894
        il' = Container.add idx inst' il
895
        ops = iMoveToJob nl' il' idx Failover
896
    return (nl', il', ops)
897

    
898
nodeEvacInstance nl il ChangeSecondary
899
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
900
                 gdx avail_nodes =
901
  do
902
    (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
903
                            eitherToResult $
904
                            foldl' (evacDrbdSecondaryInner nl inst gdx)
905
                            (Left "no nodes available") avail_nodes
906
    let idx = Instance.idx inst
907
        il' = Container.add idx inst' il
908
        ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx)
909
    return (nl', il', ops)
910

    
911
-- The algorithm for ChangeAll is as follows:
912
--
913
-- * generate all (primary, secondary) node pairs for the target groups
914
-- * for each pair, execute the needed moves (r:s, f, r:s) and compute
915
--   the final node list state and group score
916
-- * select the best choice via a foldl that uses the same Either
917
--   String solution as the ChangeSecondary mode
918
nodeEvacInstance nl il ChangeAll
919
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
920
                 gdx avail_nodes =
921
  do
922
    let no_nodes = Left "no nodes available"
923
        node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
924
    (nl', il', ops, _) <-
925
        annotateResult "Can't find any good nodes for relocation" $
926
        eitherToResult $
927
        foldl'
928
        (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
929
                          Bad msg ->
930
                              case accu of
931
                                Right _ -> accu
932
                                -- we don't need more details (which
933
                                -- nodes, etc.) as we only selected
934
                                -- this group if we can allocate on
935
                                -- it, hence failures will not
936
                                -- propagate out of this fold loop
937
                                Left _ -> Left $ "Allocation failed: " ++ msg
938
                          Ok result@(_, _, _, new_cv) ->
939
                              let new_accu = Right result in
940
                              case accu of
941
                                Left _ -> new_accu
942
                                Right (_, _, _, old_cv) ->
943
                                    if old_cv < new_cv
944
                                    then accu
945
                                    else new_accu
946
        ) no_nodes node_pairs
947

    
948
    return (nl', il', ops)
949

    
950
-- | Inner fold function for changing secondary of a DRBD instance.
951
--
952
-- The running solution is either a @Left String@, which means we
953
-- don't have yet a working solution, or a @Right (...)@, which
954
-- represents a valid solution; it holds the modified node list, the
955
-- modified instance (after evacuation), the score of that solution,
956
-- and the new secondary node index.
957
evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list
958
                       -> Instance.Instance -- ^ Instance being evacuated
959
                       -> Gdx -- ^ The group index of the instance
960
                       -> Either String ( Node.List
961
                                        , Instance.Instance
962
                                        , Score
963
                                        , Ndx)  -- ^ Current best solution
964
                       -> Ndx  -- ^ Node we're evaluating as new secondary
965
                       -> Either String ( Node.List
966
                                        , Instance.Instance
967
                                        , Score
968
                                        , Ndx) -- ^ New best solution
969
evacDrbdSecondaryInner nl inst gdx accu ndx =
970
  case applyMove nl inst (ReplaceSecondary ndx) of
971
    OpFail fm ->
972
      case accu of
973
        Right _ -> accu
974
        Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
975
                  " failed: " ++ show fm
976
    OpGood (nl', inst', _, _) ->
977
      let nodes = Container.elems nl'
978
          -- The fromJust below is ugly (it can fail nastily), but
979
          -- at this point we should not have any internal mismatches,
980
          -- and adding a monad here would be quite involved
981
          grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
982
          new_cv = compCVNodes grpnodes
983
          new_accu = Right (nl', inst', new_cv, ndx)
984
      in case accu of
985
           Left _ -> new_accu
986
           Right (_, _, old_cv, _) ->
987
             if old_cv < new_cv
988
               then accu
989
               else new_accu
990

    
991
-- | Compute result of changing all nodes of a DRBD instance.
992
--
993
-- Given the target primary and secondary node (which might be in a
994
-- different group or not), this function will 'execute' all the
995
-- required steps and assuming all operations succeed, will return
996
-- the modified node and instance lists, the opcodes needed for this
997
-- and the new group score.
998
evacDrbdAllInner :: Node.List         -- ^ Cluster node list
999
                 -> Instance.List     -- ^ Cluster instance list
1000
                 -> Instance.Instance -- ^ The instance to be moved
1001
                 -> Gdx               -- ^ The target group index
1002
                                      -- (which can differ from the
1003
                                      -- current group of the
1004
                                      -- instance)
1005
                 -> (Ndx, Ndx)        -- ^ Tuple of new
1006
                                      -- primary\/secondary nodes
1007
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
1008
evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
1009
  let primary = Container.find (Instance.pNode inst) nl
1010
      idx = Instance.idx inst
1011
  -- if the primary is offline, then we first failover
1012
  (nl1, inst1, ops1) <-
1013
    if Node.offline primary
1014
      then do
1015
        (nl', inst', _, _) <-
1016
          annotateResult "Failing over to the secondary" $
1017
          opToResult $ applyMove nl inst Failover
1018
        return (nl', inst', [Failover])
1019
      else return (nl, inst, [])
1020
  let (o1, o2, o3) = (ReplaceSecondary t_pdx,
1021
                      Failover,
1022
                      ReplaceSecondary t_sdx)
1023
  -- we now need to execute a replace secondary to the future
1024
  -- primary node
1025
  (nl2, inst2, _, _) <-
1026
    annotateResult "Changing secondary to new primary" $
1027
    opToResult $
1028
    applyMove nl1 inst1 o1
1029
  let ops2 = o1:ops1
1030
  -- we now execute another failover, the primary stays fixed now
1031
  (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
1032
                        opToResult $ applyMove nl2 inst2 o2
1033
  let ops3 = o2:ops2
1034
  -- and finally another replace secondary, to the final secondary
1035
  (nl4, inst4, _, _) <-
1036
    annotateResult "Changing secondary to final secondary" $
1037
    opToResult $
1038
    applyMove nl3 inst3 o3
1039
  let ops4 = o3:ops3
1040
      il' = Container.add idx inst4 il
1041
      ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1042
  let nodes = Container.elems nl4
1043
      -- The fromJust below is ugly (it can fail nastily), but
1044
      -- at this point we should have any internal mismatches,
1045
      -- and adding a monad here would be quite involved
1046
      grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1047
      new_cv = compCVNodes grpnodes
1048
  return (nl4, il', ops, new_cv)
1049

    
1050
-- | Computes the nodes in a given group which are available for
1051
-- allocation.
1052
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1053
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
1054
                    -> Gdx            -- ^ The group for which we
1055
                                      -- query the nodes
1056
                    -> Result [Ndx]   -- ^ List of available node indices
1057
availableGroupNodes group_nodes excl_ndx gdx = do
1058
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1059
                 Ok (lookup gdx group_nodes)
1060
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1061
  return avail_nodes
1062

    
1063
-- | Updates the evac solution with the results of an instance
1064
-- evacuation.
1065
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1066
                   -> Idx
1067
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1068
                   -> (Node.List, Instance.List, EvacSolution)
1069
updateEvacSolution (nl, il, es) idx (Bad msg) =
1070
  (nl, il, es { esFailed = (idx, msg):esFailed es})
1071
updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1072
  (nl, il, es { esMoved = new_elem:esMoved es
1073
              , esOpCodes = opcodes:esOpCodes es })
1074
    where inst = Container.find idx il
1075
          new_elem = (idx,
1076
                      instancePriGroup nl inst,
1077
                      Instance.allNodes inst)
1078

    
1079
-- | Node-evacuation IAllocator mode main function.
1080
tryNodeEvac :: Group.List    -- ^ The cluster groups
1081
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
1082
            -> Instance.List -- ^ Instance list (cluster-wide)
1083
            -> EvacMode      -- ^ The evacuation mode
1084
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
1085
            -> Result (Node.List, Instance.List, EvacSolution)
1086
tryNodeEvac _ ini_nl ini_il mode idxs =
1087
  let evac_ndx = nodesToEvacuate ini_il mode idxs
1088
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1089
      excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1090
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1091
                                           (Container.elems nl))) $
1092
                  splitCluster ini_nl ini_il
1093
      (fin_nl, fin_il, esol) =
1094
        foldl' (\state@(nl, il, _) inst ->
1095
                  let gdx = instancePriGroup nl inst
1096
                      pdx = Instance.pNode inst in
1097
                  updateEvacSolution state (Instance.idx inst) $
1098
                  availableGroupNodes group_ndx
1099
                    (IntSet.insert pdx excl_ndx) gdx >>=
1100
                      nodeEvacInstance nl il mode inst gdx
1101
               )
1102
        (ini_nl, ini_il, emptyEvacSolution)
1103
        (map (`Container.find` ini_il) idxs)
1104
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1105

    
1106
-- | Change-group IAllocator mode main function.
1107
--
1108
-- This is very similar to 'tryNodeEvac', the only difference is that
1109
-- we don't choose as target group the current instance group, but
1110
-- instead:
1111
--
1112
--   1. at the start of the function, we compute which are the target
1113
--   groups; either no groups were passed in, in which case we choose
1114
--   all groups out of which we don't evacuate instances, or there were
1115
--   some groups passed, in which case we use those
1116
--
1117
--   2. for each instance, we use 'findBestAllocGroup' to choose the
1118
--   best group to hold the instance, and then we do what
1119
--   'tryNodeEvac' does, except for this group instead of the current
1120
--   instance group.
1121
--
1122
-- Note that the correct behaviour of this function relies on the
1123
-- function 'nodeEvacInstance' to be able to do correctly both
1124
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1125
tryChangeGroup :: Group.List    -- ^ The cluster groups
1126
               -> Node.List     -- ^ The node list (cluster-wide)
1127
               -> Instance.List -- ^ Instance list (cluster-wide)
1128
               -> [Gdx]         -- ^ Target groups; if empty, any
1129
                                -- groups not being evacuated
1130
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
1131
               -> Result (Node.List, Instance.List, EvacSolution)
1132
tryChangeGroup gl ini_nl ini_il gdxs idxs =
1133
  let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1134
                             flip Container.find ini_il) idxs
1135
      target_gdxs = (if null gdxs
1136
                       then Container.keys gl
1137
                       else gdxs) \\ evac_gdxs
1138
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1139
      excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1140
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1141
                                           (Container.elems nl))) $
1142
                  splitCluster ini_nl ini_il
1143
      (fin_nl, fin_il, esol) =
1144
        foldl' (\state@(nl, il, _) inst ->
1145
                  let solution = do
1146
                        let ncnt = Instance.requiredNodes $
1147
                                   Instance.diskTemplate inst
1148
                        (gdx, _, _) <- findBestAllocGroup gl nl il
1149
                                       (Just target_gdxs) inst ncnt
1150
                        av_nodes <- availableGroupNodes group_ndx
1151
                                    excl_ndx gdx
1152
                        nodeEvacInstance nl il ChangeAll inst gdx av_nodes
1153
                  in updateEvacSolution state (Instance.idx inst) solution
1154
               )
1155
        (ini_nl, ini_il, emptyEvacSolution)
1156
        (map (`Container.find` ini_il) idxs)
1157
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1158

    
1159
-- | Standard-sized allocation method.
--
-- This places instances of the same size on the cluster until we're
-- out of space. The result will be a list of identically-sized
-- instances.
iterateAlloc :: AllocMethod
iterateAlloc nl il limit newinst allocnodes ixes cstats =
  let depth = length ixes
      newname = printf "new-%d" depth::String
      newidx = Container.size il
      newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
      newlimit = fmap (flip (-) 1) limit
  in case tryAlloc nl il newi2 allocnodes of
       Bad s -> Bad s
       Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
         let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
         case sols3 of
           Nothing -> newsol
           Just (xnl, xi, _, _) ->
             if limit == Just 0
               then newsol
               else iterateAlloc xnl (Container.add newidx xi il)
                      newlimit newinst allocnodes (xi:ixes)
                      (totalResources xnl:cstats)
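-- A hypothetical invocation (names are illustrative): try to place at
-- most ten copies of the template instance @inst@ on the allocation
-- nodes @allocnodes@ (as produced by 'genAllocNodes'), starting from
-- empty accumulators:
--
-- > case iterateAlloc nl il (Just 10) inst allocnodes [] [] of
-- >   Bad msg -> putStrLn msg
-- >   Ok (_, _, _, placed, _) ->
-- >     putStrLn ("placed " ++ show (length placed) ++ " instances")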

-- | Tiered allocation method.
--
-- This places instances on the cluster, and decreases the spec until
-- we can allocate again. The result will be a list of decreasing
-- instance specs.
tieredAlloc :: AllocMethod
tieredAlloc nl il limit newinst allocnodes ixes cstats =
  case iterateAlloc nl il limit newinst allocnodes ixes cstats of
    Bad s -> Bad s
    Ok (errs, nl', il', ixes', cstats') ->
      let newsol = Ok (errs, nl', il', ixes', cstats')
          ixes_cnt = length ixes'
          (stop, newlimit) = case limit of
                               Nothing -> (False, Nothing)
                               Just n -> (n <= ixes_cnt,
                                            Just (n - ixes_cnt)) in
      if stop then newsol else
          case Instance.shrinkByType newinst . fst . last $
               sortBy (comparing snd) errs of
            Bad _ -> newsol
            Ok newinst' -> tieredAlloc nl' il' newlimit
                           newinst' allocnodes ixes' cstats'
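-- A hedged sketch of a tiered run: start from a maximal candidate spec
-- @maxinst@ (hypothetical) and let the allocator shrink it as the
-- cluster fills up; the distinct specs actually placed can then be
-- inspected:
--
-- > case tieredAlloc nl il Nothing maxinst allocnodes [] [] of
-- >   Bad msg -> putStrLn msg
-- >   Ok (_, _, _, placed, _) ->
-- >     mapM_ print . nub $
-- >       map (\i -> (Instance.mem i, Instance.dsk i, Instance.vcpus i)) placed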

-- * Formatting functions

-- | Given the original and final nodes, computes the relocation description.
computeMoves :: Instance.Instance -- ^ The instance to be moved
             -> String -- ^ The instance name
             -> IMove  -- ^ The move being performed
             -> String -- ^ New primary
             -> String -- ^ New secondary
             -> (String, [String])
                -- ^ Tuple of moves and commands list; the moves string
                -- contains either @/f/@ for failover or @/r:name/@ for
                -- replace secondary, while the command list holds
                -- gnt-instance commands (without that prefix), e.g.
                -- \"@failover instance1@\"
computeMoves i inam mv c d =
  case mv of
    Failover -> ("f", [mig])
    FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
    ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
    ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
    ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
  where morf = if Instance.instanceRunning i then "migrate" else "failover"
        mig = printf "%s -f %s" morf inam::String
        rep n = printf "replace-disks -n %s %s" n inam
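-- For example (sketch; assuming the instance behind @inst@ is running
-- and is named @inst1@), a 'ReplaceAndFailover' onto node @node4@
-- would produce:
--
-- > computeMoves inst "inst1" (ReplaceAndFailover 4) "node4" "node3"
-- >   == ("r:node4 f", ["replace-disks -n node4 inst1", "migrate -f inst1"])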

-- | Converts a placement to string format.
printSolutionLine :: Node.List     -- ^ The node list
                  -> Instance.List -- ^ The instance list
                  -> Int           -- ^ Maximum node name length
                  -> Int           -- ^ Maximum instance name length
                  -> Placement     -- ^ The current placement
                  -> Int           -- ^ The index of the placement in
                                   -- the solution
                  -> (String, [String])
printSolutionLine nl il nmlen imlen plc pos =
  let pmlen = (2*nmlen + 1)
      (i, p, s, mv, c) = plc
      inst = Container.find i il
      inam = Instance.alias inst
      npri = Node.alias $ Container.find p nl
      nsec = Node.alias $ Container.find s nl
      opri = Node.alias $ Container.find (Instance.pNode inst) nl
      osec = Node.alias $ Container.find (Instance.sNode inst) nl
      (moves, cmds) =  computeMoves inst inam mv npri nsec
      ostr = printf "%s:%s" opri osec::String
      nstr = printf "%s:%s" npri nsec::String
  in (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
      pos imlen inam pmlen ostr
      pmlen nstr c moves,
      cmds)
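-- Hedged usage sketch: given the placements @plcs@ of a balancing
-- solution and column widths of 12 (nodes) and 8 (instances), print
-- one line per placement:
--
-- > mapM_ (putStrLn . fst . uncurry (printSolutionLine nl il 12 8))
-- >       (zip plcs [1..])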

-- | Return the instance and involved nodes in an instance move.
--
-- Note that the output list length can vary, and callers should not
-- rely on it having any specific length.
involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
                               -- the instance from its index; note
                               -- that this /must/ be the original
                               -- instance list, so that we can
                               -- retrieve the old nodes
              -> Placement     -- ^ The placement we're investigating,
                               -- containing the new nodes and
                               -- instance index
              -> [Ndx]         -- ^ Resulting list of node indices
involvedNodes il plc =
  let (i, np, ns, _, _) = plc
      inst = Container.find i il
  in nub $ [np, ns] ++ Instance.allNodes inst

-- | Inner function for splitJobs, that either appends the next job to
-- the current jobset, or starts a new jobset.
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
  | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
  | otherwise = ([n]:cjs, ndx)

-- | Break a list of moves into independent groups. Note that this
-- will reverse the order of jobs.
splitJobs :: [MoveJob] -> [JobSet]
splitJobs = fst . foldl mergeJobs ([], [])

-- | Given a list of commands, prefix them with @gnt-instance@ and
-- also beautify the display a little.
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
formatJob jsn jsl (sn, (_, _, _, cmds)) =
  let out =
        printf "  echo job %d/%d" jsn sn:
        printf "  check":
        map ("  gnt-instance " ++) cmds
  in if sn == 1
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
       else out

-- | Given a list of jobsets, render them (via 'formatJob') as a
-- shell-friendly listing of @gnt-instance@ commands.
formatCmds :: [JobSet] -> String
formatCmds =
  unlines .
  concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
                           (zip [1..] js)) .
  zip [1..]
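-- A hedged end-to-end sketch: given the move jobs @moves@ collected
-- from a balancing solution (each holding its involved nodes and
-- gnt-instance commands), group them into independent jobsets and
-- render them:
--
-- > putStr . formatCmds . splitJobs $ moves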

-- | Print the node list.
printNodes :: Node.List -> [String] -> String
printNodes nl fs =
  let fields = case fs of
                 [] -> Node.defaultFields
                 "+":rest -> Node.defaultFields ++ rest
                 _ -> fs
      snl = sortBy (comparing Node.idx) (Container.elems nl)
      (header, isnum) = unzip $ map Node.showHeader fields
  in unlines . map ((:) ' ' .  unwords) $
     formatTable (header:map (Node.list fields) snl) isnum
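-- Usage sketch: render the node list with the default fields (an
-- empty field list); extra field names may be appended after a @"+"@
-- marker, as handled above:
--
-- > putStr (printNodes nl [])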

-- | Print the instance list.
printInsts :: Node.List -> Instance.List -> String
printInsts nl il =
  let sil = sortBy (comparing Instance.idx) (Container.elems il)
      helper inst = [ if Instance.instanceRunning inst then "R" else " "
                    , Instance.name inst
                    , Container.nameOf nl (Instance.pNode inst)
                    , let sdx = Instance.sNode inst
                      in if sdx == Node.noSecondary
                           then  ""
                           else Container.nameOf nl sdx
                    , if Instance.autoBalance inst then "Y" else "N"
                    , printf "%3d" $ Instance.vcpus inst
                    , printf "%5d" $ Instance.mem inst
                    , printf "%5d" $ Instance.dsk inst `div` 1024
                    , printf "%5.3f" lC
                    , printf "%5.3f" lM
                    , printf "%5.3f" lD
                    , printf "%5.3f" lN
                    ]
          where DynUtil lC lM lD lN = Instance.util inst
      header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
               , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
      isnum = False:False:False:False:False:repeat True
  in unlines . map ((:) ' ' . unwords) $
     formatTable (header:map helper sil) isnum

-- | Shows statistics for a given node list.
printStats :: Node.List -> String
printStats nl =
  let dcvs = compDetailedCV $ Container.elems nl
      (weights, names) = unzip detailedCVInfo
      hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
      formatted = map (\(w, header, val) ->
                         printf "%s=%.8f(x%.2f)" header val w::String) hd
  in intercalate ", " formatted
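-- Hedged sketch: a one-line summary of the cluster scores before and
-- after a balancing step, given node lists @nl@ and @nl'@ (assumed
-- bindings):
--
-- > putStrLn ("initial: " ++ printStats nl)
-- > putStrLn ("final:   " ++ printStats nl')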

-- | Convert a placement into a list of OpCodes (basically a job).
iMoveToJob :: Node.List        -- ^ The node list; only used for node
                               -- names, so any version is good
                               -- (before or after the operation)
           -> Instance.List    -- ^ The instance list; also used for
                               -- names only
           -> Idx              -- ^ The index of the instance being
                               -- moved
           -> IMove            -- ^ The actual move to be described
           -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
                               -- the given move
iMoveToJob nl il idx move =
  let inst = Container.find idx il
      iname = Instance.name inst
      lookNode  = Just . Container.nameOf nl
      opF = OpCodes.OpInstanceMigrate iname True False True Nothing
      opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
              OpCodes.ReplaceNewSecondary [] Nothing
  in case move of
       Failover -> [ opF ]
       ReplacePrimary np -> [ opF, opR np, opF ]
       ReplaceSecondary ns -> [ opR ns ]
       ReplaceAndFailover np -> [ opR np, opF ]
       FailoverAndReplace ns -> [ opF, opR ns ]
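-- Illustrative sketch: turning every placement of a solution into its
-- opcode list (one inner list per move), assuming @plcs@ holds the
-- placements computed by the balancer:
--
-- > map (\(idx, _, _, mv, _) -> iMoveToJob nl il idx mv) plcs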

-- * Node group functions

-- | Computes the group of an instance.
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
instanceGroup nl i =
  let sidx = Instance.sNode i
      pnode = Container.find (Instance.pNode i) nl
      snode = if sidx == Node.noSecondary
              then pnode
              else Container.find sidx nl
      pgroup = Node.group pnode
      sgroup = Node.group snode
  in if pgroup /= sgroup
       then fail ("Instance placed across two node groups, primary " ++
                  show pgroup ++ ", secondary " ++ show sgroup)
       else return pgroup
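-- Hedged usage sketch (assuming @inst@ is some 'Instance.Instance'
-- from the cluster): report the group, or warn when the instance is
-- split across groups:
--
-- > case instanceGroup nl inst of
-- >   Ok gdx  -> putStrLn ("instance is in group " ++ show gdx)
-- >   Bad msg -> putStrLn ("warning: " ++ msg)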

-- | Computes the group of an instance per the primary node.
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
instancePriGroup nl i =
  let pnode = Container.find (Instance.pNode i) nl
  in  Node.group pnode

-- | Compute the list of badly allocated instances (split across node
-- groups).
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
findSplitInstances nl =
  filter (not . isOk . instanceGroup nl) . Container.elems
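-- For example (sketch), listing the names of all split instances:
--
-- > map Instance.name (findSplitInstances nl il)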

-- | Splits a cluster into the component node groups.
splitCluster :: Node.List -> Instance.List ->
                [(Gdx, (Node.List, Instance.List))]
splitCluster nl il =
  let ngroups = Node.computeGroups (Container.elems nl)
  in map (\(guuid, nodes) ->
           let nidxs = map Node.idx nodes
               nodes' = zip nidxs nodes
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
           in (guuid, (Container.fromList nodes', instances))) ngroups
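-- Hedged sketch (given the usual @nl@/@il@ bindings): report how many
-- nodes and instances each group contains after splitting:
--
-- > mapM_ (\(gdx, (gnl, gil)) ->
-- >          printf "group %d: %d nodes, %d instances\n"
-- >            gdx (Container.size gnl) (Container.size gil))
-- >       (splitCluster nl il)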

-- | Compute the list of nodes that are to be evacuated, given a list
-- of instances and an evacuation mode.
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
                -> EvacMode      -- ^ The evacuation mode we're using
                -> [Idx]         -- ^ List of instance indices being evacuated
                -> IntSet.IntSet -- ^ Set of node indices
nodesToEvacuate il mode =
  IntSet.delete Node.noSecondary .
  foldl' (\ns idx ->
            let i = Container.find idx il
                pdx = Instance.pNode i
                sdx = Instance.sNode i
                dt = Instance.diskTemplate i
                withSecondary = case dt of
                                  DTDrbd8 -> IntSet.insert sdx ns
                                  _ -> ns
            in case mode of
                 ChangePrimary   -> IntSet.insert pdx ns
                 ChangeSecondary -> withSecondary
                 ChangeAll       -> IntSet.insert pdx withSecondary
         ) IntSet.empty
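-- Hedged usage sketch: compute the nodes touched by evacuating
-- instances 0 and 1 when both their nodes are allowed to change:
--
-- > IntSet.toList (nodesToEvacuate il ChangeAll [0, 1])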