Statistics
| Branch: | Tag: | Revision:

root / htools / Ganeti / HTools / Cluster.hs @ 29a30533

History | View | Annotate | Download (65.8 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011, 2012 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
  (
31
    -- * Types
32
    AllocSolution(..)
33
  , EvacSolution(..)
34
  , Table(..)
35
  , CStats(..)
36
  , AllocResult
37
  , AllocMethod
38
  , AllocSolutionList
39
  -- * Generic functions
40
  , totalResources
41
  , computeAllocationDelta
42
  -- * First phase functions
43
  , computeBadItems
44
  -- * Second phase functions
45
  , printSolutionLine
46
  , formatCmds
47
  , involvedNodes
48
  , splitJobs
49
  -- * Display functions
50
  , printNodes
51
  , printInsts
52
  -- * Balacing functions
53
  , checkMove
54
  , doNextBalance
55
  , tryBalance
56
  , compCV
57
  , compCVNodes
58
  , compDetailedCV
59
  , printStats
60
  , iMoveToJob
61
  -- * IAllocator functions
62
  , genAllocNodes
63
  , tryAlloc
64
  , tryMGAlloc
65
  , tryNodeEvac
66
  , tryChangeGroup
67
  , collapseFailures
68
  , allocList
69
  -- * Allocation functions
70
  , iterateAlloc
71
  , tieredAlloc
72
  -- * Node group functions
73
  , instanceGroup
74
  , findSplitInstances
75
  , splitCluster
76
  ) where
77

    
78
import qualified Data.IntSet as IntSet
79
import Data.List
80
import Data.Maybe (fromJust, isNothing)
81
import Data.Ord (comparing)
82
import Text.Printf (printf)
83

    
84
import Ganeti.BasicTypes
85
import qualified Ganeti.HTools.Container as Container
86
import qualified Ganeti.HTools.Instance as Instance
87
import qualified Ganeti.HTools.Node as Node
88
import qualified Ganeti.HTools.Group as Group
89
import Ganeti.HTools.Types
90
import Ganeti.Compat
91
import qualified Ganeti.OpCodes as OpCodes
92
import Ganeti.Utils
93

    
94
-- * Types
95

    
96
-- | Allocation\/relocation solution.
97
data AllocSolution = AllocSolution
98
  { asFailures :: [FailMode]              -- ^ Failure counts
99
  , asAllocs   :: Int                     -- ^ Good allocation count
100
  , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
101
  , asLog      :: [String]                -- ^ Informational messages
102
  }
103

    
104
-- | Node evacuation/group change iallocator result type. This result
105
-- type consists of actual opcodes (a restricted subset) that are
106
-- transmitted back to Ganeti.
107
data EvacSolution = EvacSolution
108
  { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
109
  , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
110
                                      -- relocated
111
  , esOpCodes :: [[OpCodes.OpCode]]   -- ^ List of jobs
112
  } deriving (Show)
113

    
114
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
115
type AllocResult = (FailStats, Node.List, Instance.List,
116
                    [Instance.Instance], [CStats])
117

    
118
-- | Type alias for easier handling.
119
type AllocSolutionList = [(Instance.Instance, AllocSolution)]
120

    
121
-- | A type denoting the valid allocation mode/pairs.
122
--
123
-- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
124
-- for a two-node allocation, this will be a @Right [('Ndx',
125
-- ['Ndx'])]@. In the latter case, the list is basically an
126
-- association list, grouped by primary node and holding the potential
127
-- secondary nodes in the sub-list.
128
type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
129

    
130
-- | The empty solution we start with when computing allocations.
131
emptyAllocSolution :: AllocSolution
132
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
133
                                   , asSolution = Nothing, asLog = [] }
134

    
135
-- | The empty evac solution.
136
emptyEvacSolution :: EvacSolution
137
emptyEvacSolution = EvacSolution { esMoved = []
138
                                 , esFailed = []
139
                                 , esOpCodes = []
140
                                 }
141

    
142
-- | The complete state for the balancing solution.
143
data Table = Table Node.List Instance.List Score [Placement]
144
             deriving (Show, Read)
145

    
146
-- | Cluster statistics data type.
147
data CStats = CStats
148
  { csFmem :: Integer -- ^ Cluster free mem
149
  , csFdsk :: Integer -- ^ Cluster free disk
150
  , csAmem :: Integer -- ^ Cluster allocatable mem
151
  , csAdsk :: Integer -- ^ Cluster allocatable disk
152
  , csAcpu :: Integer -- ^ Cluster allocatable cpus
153
  , csMmem :: Integer -- ^ Max node allocatable mem
154
  , csMdsk :: Integer -- ^ Max node allocatable disk
155
  , csMcpu :: Integer -- ^ Max node allocatable cpu
156
  , csImem :: Integer -- ^ Instance used mem
157
  , csIdsk :: Integer -- ^ Instance used disk
158
  , csIcpu :: Integer -- ^ Instance used cpu
159
  , csTmem :: Double  -- ^ Cluster total mem
160
  , csTdsk :: Double  -- ^ Cluster total disk
161
  , csTcpu :: Double  -- ^ Cluster total cpus
162
  , csVcpu :: Integer -- ^ Cluster total virtual cpus
163
  , csNcpu :: Double  -- ^ Equivalent to 'csIcpu' but in terms of
164
                      -- physical CPUs, i.e. normalised used phys CPUs
165
  , csXmem :: Integer -- ^ Unnacounted for mem
166
  , csNmem :: Integer -- ^ Node own memory
167
  , csScore :: Score  -- ^ The cluster score
168
  , csNinst :: Int    -- ^ The total number of instances
169
  } deriving (Show, Read)
170

    
171
-- | A simple type for allocation functions.
172
type AllocMethod =  Node.List           -- ^ Node list
173
                 -> Instance.List       -- ^ Instance list
174
                 -> Maybe Int           -- ^ Optional allocation limit
175
                 -> Instance.Instance   -- ^ Instance spec for allocation
176
                 -> AllocNodes          -- ^ Which nodes we should allocate on
177
                 -> [Instance.Instance] -- ^ Allocated instances
178
                 -> [CStats]            -- ^ Running cluster stats
179
                 -> Result AllocResult  -- ^ Allocation result
180

    
181
-- | A simple type for the running solution of evacuations.
182
type EvacInnerState =
183
  Either String (Node.List, Instance.Instance, Score, Ndx)
184

    
185
-- * Utility functions
186

    
187
-- | Verifies the N+1 status and return the affected nodes.
188
verifyN1 :: [Node.Node] -> [Node.Node]
189
verifyN1 = filter Node.failN1
190

    
191
{-| Computes the pair of bad nodes and instances.
192

    
193
The bad node list is computed via a simple 'verifyN1' check, and the
194
bad instance list is the list of primary and secondary instances of
195
those nodes.
196

    
197
-}
198
computeBadItems :: Node.List -> Instance.List ->
199
                   ([Node.Node], [Instance.Instance])
200
computeBadItems nl il =
201
  let bad_nodes = verifyN1 $ getOnline nl
202
      bad_instances = map (`Container.find` il) .
203
                      sort . nub $
204
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
205
  in
206
    (bad_nodes, bad_instances)
207

    
208
-- | Extracts the node pairs for an instance. This can fail if the
209
-- instance is single-homed. FIXME: this needs to be improved,
210
-- together with the general enhancement for handling non-DRBD moves.
211
instanceNodes :: Node.List -> Instance.Instance ->
212
                 (Ndx, Ndx, Node.Node, Node.Node)
213
instanceNodes nl inst =
214
  let old_pdx = Instance.pNode inst
215
      old_sdx = Instance.sNode inst
216
      old_p = Container.find old_pdx nl
217
      old_s = Container.find old_sdx nl
218
  in (old_pdx, old_sdx, old_p, old_s)
219

    
220
-- | Zero-initializer for the CStats type.
221
emptyCStats :: CStats
222
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
223

    
224
-- | Update stats with data from a new node.
225
updateCStats :: CStats -> Node.Node -> CStats
226
updateCStats cs node =
227
  let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
228
               csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
229
               csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
230
               csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
231
               csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
232
               csVcpu = x_vcpu, csNcpu = x_ncpu,
233
               csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
234
             }
235
        = cs
236
      inc_amem = Node.fMem node - Node.rMem node
237
      inc_amem' = if inc_amem > 0 then inc_amem else 0
238
      inc_adsk = Node.availDisk node
239
      inc_imem = truncate (Node.tMem node) - Node.nMem node
240
                 - Node.xMem node - Node.fMem node
241
      inc_icpu = Node.uCpu node
242
      inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
243
      inc_vcpu = Node.hiCpu node
244
      inc_acpu = Node.availCpu node
245
      inc_ncpu = fromIntegral (Node.uCpu node) /
246
                 iPolicyVcpuRatio (Node.iPolicy node)
247
  in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
248
        , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
249
        , csAmem = x_amem + fromIntegral inc_amem'
250
        , csAdsk = x_adsk + fromIntegral inc_adsk
251
        , csAcpu = x_acpu + fromIntegral inc_acpu
252
        , csMmem = max x_mmem (fromIntegral inc_amem')
253
        , csMdsk = max x_mdsk (fromIntegral inc_adsk)
254
        , csMcpu = max x_mcpu (fromIntegral inc_acpu)
255
        , csImem = x_imem + fromIntegral inc_imem
256
        , csIdsk = x_idsk + fromIntegral inc_idsk
257
        , csIcpu = x_icpu + fromIntegral inc_icpu
258
        , csTmem = x_tmem + Node.tMem node
259
        , csTdsk = x_tdsk + Node.tDsk node
260
        , csTcpu = x_tcpu + Node.tCpu node
261
        , csVcpu = x_vcpu + fromIntegral inc_vcpu
262
        , csNcpu = x_ncpu + inc_ncpu
263
        , csXmem = x_xmem + fromIntegral (Node.xMem node)
264
        , csNmem = x_nmem + fromIntegral (Node.nMem node)
265
        , csNinst = x_ninst + length (Node.pList node)
266
        }
267

    
268
-- | Compute the total free disk and memory in the cluster.
269
totalResources :: Node.List -> CStats
270
totalResources nl =
271
  let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
272
  in cs { csScore = compCV nl }
273

    
274
-- | Compute the delta between two cluster state.
275
--
276
-- This is used when doing allocations, to understand better the
277
-- available cluster resources. The return value is a triple of the
278
-- current used values, the delta that was still allocated, and what
279
-- was left unallocated.
280
computeAllocationDelta :: CStats -> CStats -> AllocStats
281
computeAllocationDelta cini cfin =
282
  let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu,
283
              csNcpu = i_ncpu } = cini
284
      CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
285
              csTmem = t_mem, csTdsk = t_dsk, csVcpu = f_vcpu,
286
              csNcpu = f_ncpu, csTcpu = f_tcpu } = cfin
287
      rini = AllocInfo { allocInfoVCpus = fromIntegral i_icpu
288
                       , allocInfoNCpus = i_ncpu
289
                       , allocInfoMem   = fromIntegral i_imem
290
                       , allocInfoDisk  = fromIntegral i_idsk
291
                       }
292
      rfin = AllocInfo { allocInfoVCpus = fromIntegral (f_icpu - i_icpu)
293
                       , allocInfoNCpus = f_ncpu - i_ncpu
294
                       , allocInfoMem   = fromIntegral (f_imem - i_imem)
295
                       , allocInfoDisk  = fromIntegral (f_idsk - i_idsk)
296
                       }
297
      runa = AllocInfo { allocInfoVCpus = fromIntegral (f_vcpu - f_icpu)
298
                       , allocInfoNCpus = f_tcpu - f_ncpu
299
                       , allocInfoMem   = truncate t_mem - fromIntegral f_imem
300
                       , allocInfoDisk  = truncate t_dsk - fromIntegral f_idsk
301
                       }
302
  in (rini, rfin, runa)
303

    
304
-- | The names and weights of the individual elements in the CV list.
305
detailedCVInfo :: [(Double, String)]
306
detailedCVInfo = [ (1,  "free_mem_cv")
307
                 , (1,  "free_disk_cv")
308
                 , (1,  "n1_cnt")
309
                 , (1,  "reserved_mem_cv")
310
                 , (4,  "offline_all_cnt")
311
                 , (16, "offline_pri_cnt")
312
                 , (1,  "vcpu_ratio_cv")
313
                 , (1,  "cpu_load_cv")
314
                 , (1,  "mem_load_cv")
315
                 , (1,  "disk_load_cv")
316
                 , (1,  "net_load_cv")
317
                 , (2,  "pri_tags_score")
318
                 , (1,  "spindles_cv")
319
                 ]
320

    
321
-- | Holds the weights used by 'compCVNodes' for each metric.
322
detailedCVWeights :: [Double]
323
detailedCVWeights = map fst detailedCVInfo
324

    
325
-- | Compute the mem and disk covariance.
326
compDetailedCV :: [Node.Node] -> [Double]
327
compDetailedCV all_nodes =
328
  let (offline, nodes) = partition Node.offline all_nodes
329
      mem_l = map Node.pMem nodes
330
      dsk_l = map Node.pDsk nodes
331
      -- metric: memory covariance
332
      mem_cv = stdDev mem_l
333
      -- metric: disk covariance
334
      dsk_cv = stdDev dsk_l
335
      -- metric: count of instances living on N1 failing nodes
336
      n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
337
                                                 length (Node.pList n)) .
338
                 filter Node.failN1 $ nodes :: Double
339
      res_l = map Node.pRem nodes
340
      -- metric: reserved memory covariance
341
      res_cv = stdDev res_l
342
      -- offline instances metrics
343
      offline_ipri = sum . map (length . Node.pList) $ offline
344
      offline_isec = sum . map (length . Node.sList) $ offline
345
      -- metric: count of instances on offline nodes
346
      off_score = fromIntegral (offline_ipri + offline_isec)::Double
347
      -- metric: count of primary instances on offline nodes (this
348
      -- helps with evacuation/failover of primary instances on
349
      -- 2-node clusters with one node offline)
350
      off_pri_score = fromIntegral offline_ipri::Double
351
      cpu_l = map Node.pCpu nodes
352
      -- metric: covariance of vcpu/pcpu ratio
353
      cpu_cv = stdDev cpu_l
354
      -- metrics: covariance of cpu, memory, disk and network load
355
      (c_load, m_load, d_load, n_load) =
356
        unzip4 $ map (\n ->
357
                      let DynUtil c1 m1 d1 n1 = Node.utilLoad n
358
                          DynUtil c2 m2 d2 n2 = Node.utilPool n
359
                      in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes
360
      -- metric: conflicting instance count
361
      pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
362
      pri_tags_score = fromIntegral pri_tags_inst::Double
363
      -- metric: spindles %
364
      spindles_cv = map (\n -> Node.instSpindles n / Node.hiSpindles n) nodes
365
  in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
366
     , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
367
     , pri_tags_score, stdDev spindles_cv ]
368

    
369
-- | Compute the /total/ variance.
370
compCVNodes :: [Node.Node] -> Double
371
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
372

    
373
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
374
compCV :: Node.List -> Double
375
compCV = compCVNodes . Container.elems
376

    
377
-- | Compute online nodes from a 'Node.List'.
378
getOnline :: Node.List -> [Node.Node]
379
getOnline = filter (not . Node.offline) . Container.elems
380

    
381
-- * Balancing functions
382

    
383
-- | Compute best table. Note that the ordering of the arguments is important.
384
compareTables :: Table -> Table -> Table
385
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
386
  if a_cv > b_cv then b else a
387

    
388
-- | Applies an instance move to a given node list and instance.
389
applyMove :: Node.List -> Instance.Instance
390
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
391
-- Failover (f)
392
applyMove nl inst Failover =
393
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
394
      int_p = Node.removePri old_p inst
395
      int_s = Node.removeSec old_s inst
396
      new_nl = do -- Maybe monad
397
        new_p <- Node.addPriEx (Node.offline old_p) int_s inst
398
        new_s <- Node.addSec int_p inst old_sdx
399
        let new_inst = Instance.setBoth inst old_sdx old_pdx
400
        return (Container.addTwo old_pdx new_s old_sdx new_p nl,
401
                new_inst, old_sdx, old_pdx)
402
  in new_nl
403

    
404
-- Failover to any (fa)
405
applyMove nl inst (FailoverToAny new_pdx) = do
406
  let (old_pdx, old_sdx, old_pnode, _) = instanceNodes nl inst
407
      new_pnode = Container.find new_pdx nl
408
      force_failover = Node.offline old_pnode
409
  new_pnode' <- Node.addPriEx force_failover new_pnode inst
410
  let old_pnode' = Node.removePri old_pnode inst
411
      inst' = Instance.setPri inst new_pdx
412
      nl' = Container.addTwo old_pdx old_pnode' new_pdx new_pnode' nl
413
  return (nl', inst', new_pdx, old_sdx)
414

    
415
-- Replace the primary (f:, r:np, f)
416
applyMove nl inst (ReplacePrimary new_pdx) =
417
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
418
      tgt_n = Container.find new_pdx nl
419
      int_p = Node.removePri old_p inst
420
      int_s = Node.removeSec old_s inst
421
      force_p = Node.offline old_p
422
      new_nl = do -- Maybe monad
423
                  -- check that the current secondary can host the instance
424
                  -- during the migration
425
        tmp_s <- Node.addPriEx force_p int_s inst
426
        let tmp_s' = Node.removePri tmp_s inst
427
        new_p <- Node.addPriEx force_p tgt_n inst
428
        new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
429
        let new_inst = Instance.setPri inst new_pdx
430
        return (Container.add new_pdx new_p $
431
                Container.addTwo old_pdx int_p old_sdx new_s nl,
432
                new_inst, new_pdx, old_sdx)
433
  in new_nl
434

    
435
-- Replace the secondary (r:ns)
436
applyMove nl inst (ReplaceSecondary new_sdx) =
437
  let old_pdx = Instance.pNode inst
438
      old_sdx = Instance.sNode inst
439
      old_s = Container.find old_sdx nl
440
      tgt_n = Container.find new_sdx nl
441
      int_s = Node.removeSec old_s inst
442
      force_s = Node.offline old_s
443
      new_inst = Instance.setSec inst new_sdx
444
      new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
445
               \new_s -> return (Container.addTwo new_sdx
446
                                 new_s old_sdx int_s nl,
447
                                 new_inst, old_pdx, new_sdx)
448
  in new_nl
449

    
450
-- Replace the secondary and failover (r:np, f)
451
applyMove nl inst (ReplaceAndFailover new_pdx) =
452
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
453
      tgt_n = Container.find new_pdx nl
454
      int_p = Node.removePri old_p inst
455
      int_s = Node.removeSec old_s inst
456
      force_s = Node.offline old_s
457
      new_nl = do -- Maybe monad
458
        new_p <- Node.addPri tgt_n inst
459
        new_s <- Node.addSecEx force_s int_p inst new_pdx
460
        let new_inst = Instance.setBoth inst new_pdx old_pdx
461
        return (Container.add new_pdx new_p $
462
                Container.addTwo old_pdx new_s old_sdx int_s nl,
463
                new_inst, new_pdx, old_pdx)
464
  in new_nl
465

    
466
-- Failver and replace the secondary (f, r:ns)
467
applyMove nl inst (FailoverAndReplace new_sdx) =
468
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
469
      tgt_n = Container.find new_sdx nl
470
      int_p = Node.removePri old_p inst
471
      int_s = Node.removeSec old_s inst
472
      force_p = Node.offline old_p
473
      new_nl = do -- Maybe monad
474
        new_p <- Node.addPriEx force_p int_s inst
475
        new_s <- Node.addSecEx force_p tgt_n inst old_sdx
476
        let new_inst = Instance.setBoth inst old_sdx new_sdx
477
        return (Container.add new_sdx new_s $
478
                Container.addTwo old_sdx new_p old_pdx int_p nl,
479
                new_inst, old_sdx, new_sdx)
480
  in new_nl
481

    
482
-- | Tries to allocate an instance on one given node.
483
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
484
                 -> OpResult Node.AllocElement
485
allocateOnSingle nl inst new_pdx =
486
  let p = Container.find new_pdx nl
487
      new_inst = Instance.setBoth inst new_pdx Node.noSecondary
488
  in do
489
    Instance.instMatchesPolicy inst (Node.iPolicy p)
490
    new_p <- Node.addPri p inst
491
    let new_nl = Container.add new_pdx new_p nl
492
        new_score = compCV new_nl
493
    return (new_nl, new_inst, [new_p], new_score)
494

    
495
-- | Tries to allocate an instance on a given pair of nodes.
496
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
497
               -> OpResult Node.AllocElement
498
allocateOnPair nl inst new_pdx new_sdx =
499
  let tgt_p = Container.find new_pdx nl
500
      tgt_s = Container.find new_sdx nl
501
  in do
502
    Instance.instMatchesPolicy inst (Node.iPolicy tgt_p)
503
    new_p <- Node.addPri tgt_p inst
504
    new_s <- Node.addSec tgt_s inst new_pdx
505
    let new_inst = Instance.setBoth inst new_pdx new_sdx
506
        new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
507
    return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
508

    
509
-- | Tries to perform an instance move and returns the best table
510
-- between the original one and the new one.
511
checkSingleStep :: Table -- ^ The original table
512
                -> Instance.Instance -- ^ The instance to move
513
                -> Table -- ^ The current best table
514
                -> IMove -- ^ The move to apply
515
                -> Table -- ^ The final best table
516
checkSingleStep ini_tbl target cur_tbl move =
517
  let Table ini_nl ini_il _ ini_plc = ini_tbl
518
      tmp_resu = applyMove ini_nl target move
519
  in case tmp_resu of
520
       Bad _ -> cur_tbl
521
       Ok (upd_nl, new_inst, pri_idx, sec_idx) ->
522
         let tgt_idx = Instance.idx target
523
             upd_cvar = compCV upd_nl
524
             upd_il = Container.add tgt_idx new_inst ini_il
525
             upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
526
             upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
527
         in compareTables cur_tbl upd_tbl
528

    
529
-- | Given the status of the current secondary as a valid new node and
530
-- the current candidate target node, generate the possible moves for
531
-- a instance.
532
possibleMoves :: MirrorType -- ^ The mirroring type of the instance
533
              -> Bool       -- ^ Whether the secondary node is a valid new node
534
              -> Bool       -- ^ Whether we can change the primary node
535
              -> Ndx        -- ^ Target node candidate
536
              -> [IMove]    -- ^ List of valid result moves
537

    
538
possibleMoves MirrorNone _ _ _ = []
539

    
540
possibleMoves MirrorExternal _ False _ = []
541

    
542
possibleMoves MirrorExternal _ True tdx =
543
  [ FailoverToAny tdx ]
544

    
545
possibleMoves MirrorInternal _ False tdx =
546
  [ ReplaceSecondary tdx ]
547

    
548
possibleMoves MirrorInternal True True tdx =
549
  [ ReplaceSecondary tdx
550
  , ReplaceAndFailover tdx
551
  , ReplacePrimary tdx
552
  , FailoverAndReplace tdx
553
  ]
554

    
555
possibleMoves MirrorInternal False True tdx =
556
  [ ReplaceSecondary tdx
557
  , ReplaceAndFailover tdx
558
  ]
559

    
560
-- | Compute the best move for a given instance.
561
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
562
                  -> Bool              -- ^ Whether disk moves are allowed
563
                  -> Bool              -- ^ Whether instance moves are allowed
564
                  -> Table             -- ^ Original table
565
                  -> Instance.Instance -- ^ Instance to move
566
                  -> Table             -- ^ Best new table for this instance
567
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
568
  let opdx = Instance.pNode target
569
      osdx = Instance.sNode target
570
      bad_nodes = [opdx, osdx]
571
      nodes = filter (`notElem` bad_nodes) nodes_idx
572
      mir_type = Instance.mirrorType target
573
      use_secondary = elem osdx nodes_idx && inst_moves
574
      aft_failover = if mir_type == MirrorInternal && use_secondary
575
                       -- if drbd and allowed to failover
576
                       then checkSingleStep ini_tbl target ini_tbl Failover
577
                       else ini_tbl
578
      all_moves =
579
        if disk_moves
580
          then concatMap (possibleMoves mir_type use_secondary inst_moves)
581
               nodes
582
          else []
583
    in
584
      -- iterate over the possible nodes for this instance
585
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
586

    
587
-- | Compute the best next move.
588
checkMove :: [Ndx]               -- ^ Allowed target node indices
589
          -> Bool                -- ^ Whether disk moves are allowed
590
          -> Bool                -- ^ Whether instance moves are allowed
591
          -> Table               -- ^ The current solution
592
          -> [Instance.Instance] -- ^ List of instances still to move
593
          -> Table               -- ^ The new solution
594
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
595
  let Table _ _ _ ini_plc = ini_tbl
596
      -- we're using rwhnf from the Control.Parallel.Strategies
597
      -- package; we don't need to use rnf as that would force too
598
      -- much evaluation in single-threaded cases, and in
599
      -- multi-threaded case the weak head normal form is enough to
600
      -- spark the evaluation
601
      tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
602
                             inst_moves ini_tbl)
603
               victims
604
      -- iterate over all instances, computing the best move
605
      best_tbl = foldl' compareTables ini_tbl tables
606
      Table _ _ _ best_plc = best_tbl
607
  in if length best_plc == length ini_plc
608
       then ini_tbl -- no advancement
609
       else best_tbl
610

    
611
-- | Check if we are allowed to go deeper in the balancing.
612
doNextBalance :: Table     -- ^ The starting table
613
              -> Int       -- ^ Remaining length
614
              -> Score     -- ^ Score at which to stop
615
              -> Bool      -- ^ The resulting table and commands
616
doNextBalance ini_tbl max_rounds min_score =
617
  let Table _ _ ini_cv ini_plc = ini_tbl
618
      ini_plc_len = length ini_plc
619
  in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
620

    
621
-- | Run a balance move.
622
tryBalance :: Table       -- ^ The starting table
623
           -> Bool        -- ^ Allow disk moves
624
           -> Bool        -- ^ Allow instance moves
625
           -> Bool        -- ^ Only evacuate moves
626
           -> Score       -- ^ Min gain threshold
627
           -> Score       -- ^ Min gain
628
           -> Maybe Table -- ^ The resulting table and commands
629
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
630
    let Table ini_nl ini_il ini_cv _ = ini_tbl
631
        all_inst = Container.elems ini_il
632
        all_nodes = Container.elems ini_nl
633
        (offline_nodes, online_nodes) = partition Node.offline all_nodes
634
        all_inst' = if evac_mode
635
                      then let bad_nodes = map Node.idx offline_nodes
636
                           in filter (any (`elem` bad_nodes) .
637
                                          Instance.allNodes) all_inst
638
                      else all_inst
639
        reloc_inst = filter Instance.movable all_inst'
640
        node_idx = map Node.idx online_nodes
641
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
642
        (Table _ _ fin_cv _) = fin_tbl
643
    in
644
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
645
      then Just fin_tbl -- this round made success, return the new table
646
      else Nothing
647

    
648
-- * Allocation functions
649

    
650
-- | Build failure stats out of a list of failures.
651
collapseFailures :: [FailMode] -> FailStats
652
collapseFailures flst =
653
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
654
            [minBound..maxBound]
655

    
656
-- | Compares two Maybe AllocElement and chooses the besst score.
657
bestAllocElement :: Maybe Node.AllocElement
658
                 -> Maybe Node.AllocElement
659
                 -> Maybe Node.AllocElement
660
bestAllocElement a Nothing = a
661
bestAllocElement Nothing b = b
662
bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
663
  if ascore < bscore then a else b
664

    
665
-- | Update current Allocation solution and failure stats with new
666
-- elements.
667
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
668
concatAllocs as (Bad reason) = as { asFailures = reason : asFailures as }
669

    
670
concatAllocs as (Ok ns) =
671
  let -- Choose the old or new solution, based on the cluster score
672
    cntok = asAllocs as
673
    osols = asSolution as
674
    nsols = bestAllocElement osols (Just ns)
675
    nsuc = cntok + 1
676
    -- Note: we force evaluation of nsols here in order to keep the
677
    -- memory profile low - we know that we will need nsols for sure
678
    -- in the next cycle, so we force evaluation of nsols, since the
679
    -- foldl' in the caller will only evaluate the tuple, but not the
680
    -- elements of the tuple
681
  in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
682

    
683
-- | Sums two 'AllocSolution' structures.
684
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
685
sumAllocs (AllocSolution aFails aAllocs aSols aLog)
686
          (AllocSolution bFails bAllocs bSols bLog) =
687
  -- note: we add b first, since usually it will be smaller; when
688
  -- fold'ing, a will grow and grow whereas b is the per-group
689
  -- result, hence smaller
690
  let nFails  = bFails ++ aFails
691
      nAllocs = aAllocs + bAllocs
692
      nSols   = bestAllocElement aSols bSols
693
      nLog    = bLog ++ aLog
694
  in AllocSolution nFails nAllocs nSols nLog
695

    
696
-- | Given a solution, generates a reasonable description for it.
697
describeSolution :: AllocSolution -> String
698
describeSolution as =
699
  let fcnt = asFailures as
700
      sols = asSolution as
701
      freasons =
702
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
703
        filter ((> 0) . snd) . collapseFailures $ fcnt
704
  in case sols of
705
     Nothing -> "No valid allocation solutions, failure reasons: " ++
706
                (if null fcnt then "unknown reasons" else freasons)
707
     Just (_, _, nodes, cv) ->
708
         printf ("score: %.8f, successes %d, failures %d (%s)" ++
709
                 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
710
               (intercalate "/" . map Node.name $ nodes)
711

    
712
-- | Annotates a solution with the appropriate string.
713
annotateSolution :: AllocSolution -> AllocSolution
714
annotateSolution as = as { asLog = describeSolution as : asLog as }
715

    
716
-- | Reverses an evacuation solution.
717
--
718
-- Rationale: we always concat the results to the top of the lists, so
719
-- for proper jobset execution, we should reverse all lists.
720
reverseEvacSolution :: EvacSolution -> EvacSolution
721
reverseEvacSolution (EvacSolution f m o) =
722
  EvacSolution (reverse f) (reverse m) (reverse o)
723

    
724
-- | Generate the valid node allocation singles or pairs for a new instance.
725
genAllocNodes :: Group.List        -- ^ Group list
726
              -> Node.List         -- ^ The node map
727
              -> Int               -- ^ The number of nodes required
728
              -> Bool              -- ^ Whether to drop or not
729
                                   -- unallocable nodes
730
              -> Result AllocNodes -- ^ The (monadic) result
731
genAllocNodes gl nl count drop_unalloc =
732
  let filter_fn = if drop_unalloc
733
                    then filter (Group.isAllocable .
734
                                 flip Container.find gl . Node.group)
735
                    else id
736
      all_nodes = filter_fn $ getOnline nl
737
      all_pairs = [(Node.idx p,
738
                    [Node.idx s | s <- all_nodes,
739
                                       Node.idx p /= Node.idx s,
740
                                       Node.group p == Node.group s]) |
741
                   p <- all_nodes]
742
  in case count of
743
       1 -> Ok (Left (map Node.idx all_nodes))
744
       2 -> Ok (Right (filter (not . null . snd) all_pairs))
745
       _ -> Bad "Unsupported number of nodes, only one or two  supported"
746

    
747
-- | Try to allocate an instance on the cluster.
748
tryAlloc :: (Monad m) =>
749
            Node.List         -- ^ The node list
750
         -> Instance.List     -- ^ The instance list
751
         -> Instance.Instance -- ^ The instance to allocate
752
         -> AllocNodes        -- ^ The allocation targets
753
         -> m AllocSolution   -- ^ Possible solution list
754
tryAlloc _  _ _    (Right []) = fail "Not enough online nodes"
755
tryAlloc nl _ inst (Right ok_pairs) =
756
  let psols = parMap rwhnf (\(p, ss) ->
757
                              foldl' (\cstate ->
758
                                        concatAllocs cstate .
759
                                        allocateOnPair nl inst p)
760
                              emptyAllocSolution ss) ok_pairs
761
      sols = foldl' sumAllocs emptyAllocSolution psols
762
  in return $ annotateSolution sols
763

    
764
tryAlloc _  _ _    (Left []) = fail "No online nodes"
765
tryAlloc nl _ inst (Left all_nodes) =
766
  let sols = foldl' (\cstate ->
767
                       concatAllocs cstate . allocateOnSingle nl inst
768
                    ) emptyAllocSolution all_nodes
769
  in return $ annotateSolution sols
770

    
771
-- | Given a group/result, describe it as a nice (list of) messages.
772
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
773
solutionDescription gl (groupId, result) =
774
  case result of
775
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
776
    Bad message -> [printf "Group %s: error %s" gname message]
777
  where grp = Container.find groupId gl
778
        gname = Group.name grp
779
        pol = allocPolicyToRaw (Group.allocPolicy grp)
780

    
781
-- | From a list of possibly bad and possibly empty solutions, filter
782
-- only the groups with a valid result. Note that the result will be
783
-- reversed compared to the original list.
784
filterMGResults :: Group.List
785
                -> [(Gdx, Result AllocSolution)]
786
                -> [(Gdx, AllocSolution)]
787
filterMGResults gl = foldl' fn []
788
  where unallocable = not . Group.isAllocable . flip Container.find gl
789
        fn accu (gdx, rasol) =
790
          case rasol of
791
            Bad _ -> accu
792
            Ok sol | isNothing (asSolution sol) -> accu
793
                   | unallocable gdx -> accu
794
                   | otherwise -> (gdx, sol):accu
795

    
796
-- | Sort multigroup results based on policy and score.
797
sortMGResults :: Group.List
798
             -> [(Gdx, AllocSolution)]
799
             -> [(Gdx, AllocSolution)]
800
sortMGResults gl sols =
801
  let extractScore (_, _, _, x) = x
802
      solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
803
                             (extractScore . fromJust . asSolution) sol)
804
  in sortBy (comparing solScore) sols
805

    
806
-- | Finds the best group for an instance on a multi-group cluster.
807
--
808
-- Only solutions in @preferred@ and @last_resort@ groups will be
809
-- accepted as valid, and additionally if the allowed groups parameter
810
-- is not null then allocation will only be run for those group
811
-- indices.
812
findBestAllocGroup :: Group.List           -- ^ The group list
813
                   -> Node.List            -- ^ The node list
814
                   -> Instance.List        -- ^ The instance list
815
                   -> Maybe [Gdx]          -- ^ The allowed groups
816
                   -> Instance.Instance    -- ^ The instance to allocate
817
                   -> Int                  -- ^ Required number of nodes
818
                   -> Result (Gdx, AllocSolution, [String])
819
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
820
  let groups = splitCluster mgnl mgil
821
      groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
822
                allowed_gdxs
823
      sols = map (\(gid, (nl, il)) ->
824
                   (gid, genAllocNodes mggl nl cnt False >>=
825
                       tryAlloc nl il inst))
826
             groups'::[(Gdx, Result AllocSolution)]
827
      all_msgs = concatMap (solutionDescription mggl) sols
828
      goodSols = filterMGResults mggl sols
829
      sortedSols = sortMGResults mggl goodSols
830
  in if null sortedSols
831
       then Bad $ if null groups'
832
                    then "no groups for evacuation: allowed groups was" ++
833
                         show allowed_gdxs ++ ", all groups: " ++
834
                         show (map fst groups)
835
                    else intercalate ", " all_msgs
836
       else let (final_group, final_sol) = head sortedSols
837
            in return (final_group, final_sol, all_msgs)
838

    
839
-- | Try to allocate an instance on a multi-group cluster.
840
tryMGAlloc :: Group.List           -- ^ The group list
841
           -> Node.List            -- ^ The node list
842
           -> Instance.List        -- ^ The instance list
843
           -> Instance.Instance    -- ^ The instance to allocate
844
           -> Int                  -- ^ Required number of nodes
845
           -> Result AllocSolution -- ^ Possible solution list
846
tryMGAlloc mggl mgnl mgil inst cnt = do
847
  (best_group, solution, all_msgs) <-
848
      findBestAllocGroup mggl mgnl mgil Nothing inst cnt
849
  let group_name = Group.name $ Container.find best_group mggl
850
      selmsg = "Selected group: " ++ group_name
851
  return $ solution { asLog = selmsg:all_msgs }
852

    
853
-- | Calculate the new instance list after allocation solution.
854
updateIl :: Instance.List           -- ^ The original instance list
855
         -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
856
         -> Instance.List           -- ^ The updated instance list
857
updateIl il Nothing = il
858
updateIl il (Just (_, xi, _, _)) = Container.add (Container.size il) xi il
859

    
860
-- | Extract the the new node list from the allocation solution.
861
extractNl :: Node.List               -- ^ The original node list
862
          -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
863
          -> Node.List               -- ^ The new node list
864
extractNl nl Nothing = nl
865
extractNl _ (Just (xnl, _, _, _)) = xnl
866

    
867
-- | Try to allocate a list of instances on a multi-group cluster.
868
allocList :: Group.List                  -- ^ The group list
869
          -> Node.List                   -- ^ The node list
870
          -> Instance.List               -- ^ The instance list
871
          -> [(Instance.Instance, Int)]  -- ^ The instance to allocate
872
          -> AllocSolutionList           -- ^ Possible solution list
873
          -> Result (Node.List, Instance.List,
874
                     AllocSolutionList)  -- ^ The final solution list
875
allocList _  nl il [] result = Ok (nl, il, result)
876
allocList gl nl il ((xi, xicnt):xies) result = do
877
  ares <- tryMGAlloc gl nl il xi xicnt
878
  let sol = asSolution ares
879
      nl' = extractNl nl sol
880
      il' = updateIl il sol
881
  allocList gl nl' il' xies ((xi, ares):result)
882

    
883
-- | Function which fails if the requested mode is change secondary.
884
--
885
-- This is useful since except DRBD, no other disk template can
886
-- execute change secondary; thus, we can just call this function
887
-- instead of always checking for secondary mode. After the call to
888
-- this function, whatever mode we have is just a primary change.
889
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
890
failOnSecondaryChange ChangeSecondary dt =
891
  fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++
892
         "' can't execute change secondary"
893
failOnSecondaryChange _ _ = return ()
894

    
895
-- | Run evacuation for a single instance.
896
--
897
-- /Note:/ this function should correctly execute both intra-group
898
-- evacuations (in all modes) and inter-group evacuations (in the
899
-- 'ChangeAll' mode). Of course, this requires that the correct list
900
-- of target nodes is passed.
901
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
902
                 -> Instance.List     -- ^ Instance list (cluster-wide)
903
                 -> EvacMode          -- ^ The evacuation mode
904
                 -> Instance.Instance -- ^ The instance to be evacuated
905
                 -> Gdx               -- ^ The group we're targetting
906
                 -> [Ndx]             -- ^ The list of available nodes
907
                                      -- for allocation
908
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
909
nodeEvacInstance nl il mode inst@(Instance.Instance
910
                                  {Instance.diskTemplate = dt@DTDiskless})
911
                 gdx avail_nodes =
912
                   failOnSecondaryChange mode dt >>
913
                   evacOneNodeOnly nl il inst gdx avail_nodes
914

    
915
nodeEvacInstance _ _ _ (Instance.Instance
916
                        {Instance.diskTemplate = DTPlain}) _ _ =
917
                  fail "Instances of type plain cannot be relocated"
918

    
919
nodeEvacInstance _ _ _ (Instance.Instance
920
                        {Instance.diskTemplate = DTFile}) _ _ =
921
                  fail "Instances of type file cannot be relocated"
922

    
923
nodeEvacInstance nl il mode inst@(Instance.Instance
924
                                  {Instance.diskTemplate = dt@DTSharedFile})
925
                 gdx avail_nodes =
926
                   failOnSecondaryChange mode dt >>
927
                   evacOneNodeOnly nl il inst gdx avail_nodes
928

    
929
nodeEvacInstance nl il mode inst@(Instance.Instance
930
                                  {Instance.diskTemplate = dt@DTBlock})
931
                 gdx avail_nodes =
932
                   failOnSecondaryChange mode dt >>
933
                   evacOneNodeOnly nl il inst gdx avail_nodes
934

    
935
nodeEvacInstance nl il mode inst@(Instance.Instance
936
                                  {Instance.diskTemplate = dt@DTRbd})
937
                 gdx avail_nodes =
938
                   failOnSecondaryChange mode dt >>
939
                   evacOneNodeOnly nl il inst gdx avail_nodes
940

    
941
nodeEvacInstance nl il ChangePrimary
942
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
943
                 _ _ =
944
  do
945
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
946
    let idx = Instance.idx inst
947
        il' = Container.add idx inst' il
948
        ops = iMoveToJob nl' il' idx Failover
949
    return (nl', il', ops)
950

    
951
nodeEvacInstance nl il ChangeSecondary
952
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
953
                 gdx avail_nodes =
954
  evacOneNodeOnly nl il inst gdx avail_nodes
955

    
956
-- The algorithm for ChangeAll is as follows:
957
--
958
-- * generate all (primary, secondary) node pairs for the target groups
959
-- * for each pair, execute the needed moves (r:s, f, r:s) and compute
960
--   the final node list state and group score
961
-- * select the best choice via a foldl that uses the same Either
962
--   String solution as the ChangeSecondary mode
963
nodeEvacInstance nl il ChangeAll
964
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
965
                 gdx avail_nodes =
966
  do
967
    let no_nodes = Left "no nodes available"
968
        node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
969
    (nl', il', ops, _) <-
970
        annotateResult "Can't find any good nodes for relocation" .
971
        eitherToResult $
972
        foldl'
973
        (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
974
                          Bad msg ->
975
                              case accu of
976
                                Right _ -> accu
977
                                -- we don't need more details (which
978
                                -- nodes, etc.) as we only selected
979
                                -- this group if we can allocate on
980
                                -- it, hence failures will not
981
                                -- propagate out of this fold loop
982
                                Left _ -> Left $ "Allocation failed: " ++ msg
983
                          Ok result@(_, _, _, new_cv) ->
984
                              let new_accu = Right result in
985
                              case accu of
986
                                Left _ -> new_accu
987
                                Right (_, _, _, old_cv) ->
988
                                    if old_cv < new_cv
989
                                    then accu
990
                                    else new_accu
991
        ) no_nodes node_pairs
992

    
993
    return (nl', il', ops)
994

    
995
-- | Generic function for changing one node of an instance.
996
--
997
-- This is similar to 'nodeEvacInstance' but will be used in a few of
998
-- its sub-patterns. It folds the inner function 'evacOneNodeInner'
999
-- over the list of available nodes, which results in the best choice
1000
-- for relocation.
1001
evacOneNodeOnly :: Node.List         -- ^ The node list (cluster-wide)
1002
                -> Instance.List     -- ^ Instance list (cluster-wide)
1003
                -> Instance.Instance -- ^ The instance to be evacuated
1004
                -> Gdx               -- ^ The group we're targetting
1005
                -> [Ndx]             -- ^ The list of available nodes
1006
                                      -- for allocation
1007
                -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1008
evacOneNodeOnly nl il inst gdx avail_nodes = do
1009
  op_fn <- case Instance.mirrorType inst of
1010
             MirrorNone -> Bad "Can't relocate/evacuate non-mirrored instances"
1011
             MirrorInternal -> Ok ReplaceSecondary
1012
             MirrorExternal -> Ok FailoverToAny
1013
  (nl', inst', _, ndx) <- annotateResult "Can't find any good node" .
1014
                          eitherToResult $
1015
                          foldl' (evacOneNodeInner nl inst gdx op_fn)
1016
                          (Left "no nodes available") avail_nodes
1017
  let idx = Instance.idx inst
1018
      il' = Container.add idx inst' il
1019
      ops = iMoveToJob nl' il' idx (op_fn ndx)
1020
  return (nl', il', ops)
1021

    
1022
-- | Inner fold function for changing one node of an instance.
1023
--
1024
-- Depending on the instance disk template, this will either change
1025
-- the secondary (for DRBD) or the primary node (for shared
1026
-- storage). However, the operation is generic otherwise.
1027
--
1028
-- The running solution is either a @Left String@, which means we
1029
-- don't have yet a working solution, or a @Right (...)@, which
1030
-- represents a valid solution; it holds the modified node list, the
1031
-- modified instance (after evacuation), the score of that solution,
1032
-- and the new secondary node index.
1033
evacOneNodeInner :: Node.List         -- ^ Cluster node list
1034
                 -> Instance.Instance -- ^ Instance being evacuated
1035
                 -> Gdx               -- ^ The group index of the instance
1036
                 -> (Ndx -> IMove)    -- ^ Operation constructor
1037
                 -> EvacInnerState    -- ^ Current best solution
1038
                 -> Ndx               -- ^ Node we're evaluating as target
1039
                 -> EvacInnerState    -- ^ New best solution
1040
evacOneNodeInner nl inst gdx op_fn accu ndx =
1041
  case applyMove nl inst (op_fn ndx) of
1042
    Bad fm -> let fail_msg = "Node " ++ Container.nameOf nl ndx ++
1043
                             " failed: " ++ show fm
1044
              in either (const $ Left fail_msg) (const accu) accu
1045
    Ok (nl', inst', _, _) ->
1046
      let nodes = Container.elems nl'
1047
          -- The fromJust below is ugly (it can fail nastily), but
1048
          -- at this point we should have any internal mismatches,
1049
          -- and adding a monad here would be quite involved
1050
          grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1051
          new_cv = compCVNodes grpnodes
1052
          new_accu = Right (nl', inst', new_cv, ndx)
1053
      in case accu of
1054
           Left _ -> new_accu
1055
           Right (_, _, old_cv, _) ->
1056
             if old_cv < new_cv
1057
               then accu
1058
               else new_accu
1059

    
1060
-- | Compute result of changing all nodes of a DRBD instance.
1061
--
1062
-- Given the target primary and secondary node (which might be in a
1063
-- different group or not), this function will 'execute' all the
1064
-- required steps and assuming all operations succceed, will return
1065
-- the modified node and instance lists, the opcodes needed for this
1066
-- and the new group score.
1067
evacDrbdAllInner :: Node.List         -- ^ Cluster node list
1068
                 -> Instance.List     -- ^ Cluster instance list
1069
                 -> Instance.Instance -- ^ The instance to be moved
1070
                 -> Gdx               -- ^ The target group index
1071
                                      -- (which can differ from the
1072
                                      -- current group of the
1073
                                      -- instance)
1074
                 -> (Ndx, Ndx)        -- ^ Tuple of new
1075
                                      -- primary\/secondary nodes
1076
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
1077
evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
1078
  let primary = Container.find (Instance.pNode inst) nl
1079
      idx = Instance.idx inst
1080
  -- if the primary is offline, then we first failover
1081
  (nl1, inst1, ops1) <-
1082
    if Node.offline primary
1083
      then do
1084
        (nl', inst', _, _) <-
1085
          annotateResult "Failing over to the secondary" .
1086
          opToResult $ applyMove nl inst Failover
1087
        return (nl', inst', [Failover])
1088
      else return (nl, inst, [])
1089
  let (o1, o2, o3) = (ReplaceSecondary t_pdx,
1090
                      Failover,
1091
                      ReplaceSecondary t_sdx)
1092
  -- we now need to execute a replace secondary to the future
1093
  -- primary node
1094
  (nl2, inst2, _, _) <-
1095
    annotateResult "Changing secondary to new primary" .
1096
    opToResult $
1097
    applyMove nl1 inst1 o1
1098
  let ops2 = o1:ops1
1099
  -- we now execute another failover, the primary stays fixed now
1100
  (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" .
1101
                        opToResult $ applyMove nl2 inst2 o2
1102
  let ops3 = o2:ops2
1103
  -- and finally another replace secondary, to the final secondary
1104
  (nl4, inst4, _, _) <-
1105
    annotateResult "Changing secondary to final secondary" .
1106
    opToResult $
1107
    applyMove nl3 inst3 o3
1108
  let ops4 = o3:ops3
1109
      il' = Container.add idx inst4 il
1110
      ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1111
  let nodes = Container.elems nl4
1112
      -- The fromJust below is ugly (it can fail nastily), but
1113
      -- at this point we should have any internal mismatches,
1114
      -- and adding a monad here would be quite involved
1115
      grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1116
      new_cv = compCVNodes grpnodes
1117
  return (nl4, il', ops, new_cv)
1118

    
1119
-- | Computes the nodes in a given group which are available for
1120
-- allocation.
1121
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1122
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
1123
                    -> Gdx            -- ^ The group for which we
1124
                                      -- query the nodes
1125
                    -> Result [Ndx]   -- ^ List of available node indices
1126
availableGroupNodes group_nodes excl_ndx gdx = do
1127
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1128
                 Ok (lookup gdx group_nodes)
1129
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1130
  return avail_nodes
1131

    
1132
-- | Updates the evac solution with the results of an instance
1133
-- evacuation.
1134
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1135
                   -> Idx
1136
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1137
                   -> (Node.List, Instance.List, EvacSolution)
1138
updateEvacSolution (nl, il, es) idx (Bad msg) =
1139
  (nl, il, es { esFailed = (idx, msg):esFailed es})
1140
updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1141
  (nl, il, es { esMoved = new_elem:esMoved es
1142
              , esOpCodes = opcodes:esOpCodes es })
1143
    where inst = Container.find idx il
1144
          new_elem = (idx,
1145
                      instancePriGroup nl inst,
1146
                      Instance.allNodes inst)
1147

    
1148
-- | Node-evacuation IAllocator mode main function.
1149
tryNodeEvac :: Group.List    -- ^ The cluster groups
1150
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
1151
            -> Instance.List -- ^ Instance list (cluster-wide)
1152
            -> EvacMode      -- ^ The evacuation mode
1153
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
1154
            -> Result (Node.List, Instance.List, EvacSolution)
1155
tryNodeEvac _ ini_nl ini_il mode idxs =
1156
  let evac_ndx = nodesToEvacuate ini_il mode idxs
1157
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1158
      excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1159
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1160
                                           (Container.elems nl))) $
1161
                  splitCluster ini_nl ini_il
1162
      (fin_nl, fin_il, esol) =
1163
        foldl' (\state@(nl, il, _) inst ->
1164
                  let gdx = instancePriGroup nl inst
1165
                      pdx = Instance.pNode inst in
1166
                  updateEvacSolution state (Instance.idx inst) $
1167
                  availableGroupNodes group_ndx
1168
                    (IntSet.insert pdx excl_ndx) gdx >>=
1169
                      nodeEvacInstance nl il mode inst gdx
1170
               )
1171
        (ini_nl, ini_il, emptyEvacSolution)
1172
        (map (`Container.find` ini_il) idxs)
1173
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1174

    
1175
-- | Change-group IAllocator mode main function.
1176
--
1177
-- This is very similar to 'tryNodeEvac', the only difference is that
1178
-- we don't choose as target group the current instance group, but
1179
-- instead:
1180
--
1181
--   1. at the start of the function, we compute which are the target
1182
--   groups; either no groups were passed in, in which case we choose
1183
--   all groups out of which we don't evacuate instance, or there were
1184
--   some groups passed, in which case we use those
1185
--
1186
--   2. for each instance, we use 'findBestAllocGroup' to choose the
1187
--   best group to hold the instance, and then we do what
1188
--   'tryNodeEvac' does, except for this group instead of the current
1189
--   instance group.
1190
--
1191
-- Note that the correct behaviour of this function relies on the
1192
-- function 'nodeEvacInstance' to be able to do correctly both
1193
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1194
tryChangeGroup :: Group.List    -- ^ The cluster groups
1195
               -> Node.List     -- ^ The node list (cluster-wide)
1196
               -> Instance.List -- ^ Instance list (cluster-wide)
1197
               -> [Gdx]         -- ^ Target groups; if empty, any
1198
                                -- groups not being evacuated
1199
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
1200
               -> Result (Node.List, Instance.List, EvacSolution)
1201
tryChangeGroup gl ini_nl ini_il gdxs idxs =
1202
  let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1203
                             flip Container.find ini_il) idxs
1204
      target_gdxs = (if null gdxs
1205
                       then Container.keys gl
1206
                       else gdxs) \\ evac_gdxs
1207
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1208
      excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1209
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1210
                                           (Container.elems nl))) $
1211
                  splitCluster ini_nl ini_il
1212
      (fin_nl, fin_il, esol) =
1213
        foldl' (\state@(nl, il, _) inst ->
1214
                  let solution = do
1215
                        let ncnt = Instance.requiredNodes $
1216
                                   Instance.diskTemplate inst
1217
                        (gdx, _, _) <- findBestAllocGroup gl nl il
1218
                                       (Just target_gdxs) inst ncnt
1219
                        av_nodes <- availableGroupNodes group_ndx
1220
                                    excl_ndx gdx
1221
                        nodeEvacInstance nl il ChangeAll inst gdx av_nodes
1222
                  in updateEvacSolution state (Instance.idx inst) solution
1223
               )
1224
        (ini_nl, ini_il, emptyEvacSolution)
1225
        (map (`Container.find` ini_il) idxs)
1226
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1227

    
1228
-- | Standard-sized allocation method.
1229
--
1230
-- This places instances of the same size on the cluster until we're
1231
-- out of space. The result will be a list of identically-sized
1232
-- instances.
1233
iterateAlloc :: AllocMethod
1234
iterateAlloc nl il limit newinst allocnodes ixes cstats =
1235
  let depth = length ixes
1236
      newname = printf "new-%d" depth::String
1237
      newidx = Container.size il
1238
      newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1239
      newlimit = fmap (flip (-) 1) limit
1240
  in case tryAlloc nl il newi2 allocnodes of
1241
       Bad s -> Bad s
1242
       Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
1243
         let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1244
         case sols3 of
1245
           Nothing -> newsol
1246
           Just (xnl, xi, _, _) ->
1247
             if limit == Just 0
1248
               then newsol
1249
               else iterateAlloc xnl (Container.add newidx xi il)
1250
                      newlimit newinst allocnodes (xi:ixes)
1251
                      (totalResources xnl:cstats)
1252

    
1253
-- | Tiered allocation method.
1254
--
1255
-- This places instances on the cluster, and decreases the spec until
1256
-- we can allocate again. The result will be a list of decreasing
1257
-- instance specs.
1258
tieredAlloc :: AllocMethod
1259
tieredAlloc nl il limit newinst allocnodes ixes cstats =
1260
  case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1261
    Bad s -> Bad s
1262
    Ok (errs, nl', il', ixes', cstats') ->
1263
      let newsol = Ok (errs, nl', il', ixes', cstats')
1264
          ixes_cnt = length ixes'
1265
          (stop, newlimit) = case limit of
1266
                               Nothing -> (False, Nothing)
1267
                               Just n -> (n <= ixes_cnt,
1268
                                            Just (n - ixes_cnt)) in
1269
      if stop then newsol else
1270
          case Instance.shrinkByType newinst . fst . last $
1271
               sortBy (comparing snd) errs of
1272
            Bad _ -> newsol
1273
            Ok newinst' -> tieredAlloc nl' il' newlimit
1274
                           newinst' allocnodes ixes' cstats'
1275

    
1276
-- * Formatting functions
1277

    
1278
-- | Given the original and final nodes, computes the relocation description.
1279
computeMoves :: Instance.Instance -- ^ The instance to be moved
1280
             -> String -- ^ The instance name
1281
             -> IMove  -- ^ The move being performed
1282
             -> String -- ^ New primary
1283
             -> String -- ^ New secondary
1284
             -> (String, [String])
1285
                -- ^ Tuple of moves and commands list; moves is containing
1286
                -- either @/f/@ for failover or @/r:name/@ for replace
1287
                -- secondary, while the command list holds gnt-instance
1288
                -- commands (without that prefix), e.g \"@failover instance1@\"
1289
computeMoves i inam mv c d =
1290
  case mv of
1291
    Failover -> ("f", [mig])
1292
    FailoverToAny _ -> (printf "fa:%s" c, [mig_any])
1293
    FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1294
    ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1295
    ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1296
    ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1297
  where morf = if Instance.isRunning i then "migrate" else "failover"
1298
        mig = printf "%s -f %s" morf inam::String
1299
        mig_any = printf "%s -f -n %s %s" morf c inam::String
1300
        rep n = printf "replace-disks -n %s %s" n inam::String
1301

    
1302
-- | Converts a placement to string format.
1303
printSolutionLine :: Node.List     -- ^ The node list
1304
                  -> Instance.List -- ^ The instance list
1305
                  -> Int           -- ^ Maximum node name length
1306
                  -> Int           -- ^ Maximum instance name length
1307
                  -> Placement     -- ^ The current placement
1308
                  -> Int           -- ^ The index of the placement in
1309
                                   -- the solution
1310
                  -> (String, [String])
1311
printSolutionLine nl il nmlen imlen plc pos =
1312
  let pmlen = (2*nmlen + 1)
1313
      (i, p, s, mv, c) = plc
1314
      old_sec = Instance.sNode inst
1315
      inst = Container.find i il
1316
      inam = Instance.alias inst
1317
      npri = Node.alias $ Container.find p nl
1318
      nsec = Node.alias $ Container.find s nl
1319
      opri = Node.alias $ Container.find (Instance.pNode inst) nl
1320
      osec = Node.alias $ Container.find old_sec nl
1321
      (moves, cmds) =  computeMoves inst inam mv npri nsec
1322
      -- FIXME: this should check instead/also the disk template
1323
      ostr = if old_sec == Node.noSecondary
1324
               then printf "%s" opri::String
1325
               else printf "%s:%s" opri osec::String
1326
      nstr = if s == Node.noSecondary
1327
               then printf "%s" npri::String
1328
               else printf "%s:%s" npri nsec::String
1329
  in (printf "  %3d. %-*s %-*s => %-*s %12.8f a=%s"
1330
      pos imlen inam pmlen ostr pmlen nstr c moves,
1331
      cmds)
1332

    
1333
-- | Return the instance and involved nodes in an instance move.
1334
--
1335
-- Note that the output list length can vary, and is not required nor
1336
-- guaranteed to be of any specific length.
1337
involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
1338
                               -- the instance from its index; note
1339
                               -- that this /must/ be the original
1340
                               -- instance list, so that we can
1341
                               -- retrieve the old nodes
1342
              -> Placement     -- ^ The placement we're investigating,
1343
                               -- containing the new nodes and
1344
                               -- instance index
1345
              -> [Ndx]         -- ^ Resulting list of node indices
1346
involvedNodes il plc =
1347
  let (i, np, ns, _, _) = plc
1348
      inst = Container.find i il
1349
  in nub $ [np, ns] ++ Instance.allNodes inst
1350

    
1351
-- | Inner function for splitJobs, that either appends the next job to
1352
-- the current jobset, or starts a new jobset.
1353
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1354
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1355
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1356
  | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1357
  | otherwise = ([n]:cjs, ndx)
1358

    
1359
-- | Break a list of moves into independent groups. Note that this
1360
-- will reverse the order of jobs.
1361
splitJobs :: [MoveJob] -> [JobSet]
1362
splitJobs = fst . foldl mergeJobs ([], [])
1363

    
1364
-- | Given a list of commands, prefix them with @gnt-instance@ and
1365
-- also beautify the display a little.
1366
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1367
formatJob jsn jsl (sn, (_, _, _, cmds)) =
1368
  let out =
1369
        printf "  echo job %d/%d" jsn sn:
1370
        printf "  check":
1371
        map ("  gnt-instance " ++) cmds
1372
  in if sn == 1
1373
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1374
       else out
1375

    
1376
-- | Given a list of commands, prefix them with @gnt-instance@ and
1377
-- also beautify the display a little.
1378
formatCmds :: [JobSet] -> String
1379
formatCmds =
1380
  unlines .
1381
  concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1382
                           (zip [1..] js)) .
1383
  zip [1..]
1384

    
1385
-- | Print the node list.
1386
printNodes :: Node.List -> [String] -> String
1387
printNodes nl fs =
1388
  let fields = case fs of
1389
                 [] -> Node.defaultFields
1390
                 "+":rest -> Node.defaultFields ++ rest
1391
                 _ -> fs
1392
      snl = sortBy (comparing Node.idx) (Container.elems nl)
1393
      (header, isnum) = unzip $ map Node.showHeader fields
1394
  in printTable "" header (map (Node.list fields) snl) isnum
1395

    
1396
-- | Print the instance list.
1397
printInsts :: Node.List -> Instance.List -> String
1398
printInsts nl il =
1399
  let sil = sortBy (comparing Instance.idx) (Container.elems il)
1400
      helper inst = [ if Instance.isRunning inst then "R" else " "
1401
                    , Instance.name inst
1402
                    , Container.nameOf nl (Instance.pNode inst)
1403
                    , let sdx = Instance.sNode inst
1404
                      in if sdx == Node.noSecondary
1405
                           then  ""
1406
                           else Container.nameOf nl sdx
1407
                    , if Instance.autoBalance inst then "Y" else "N"
1408
                    , printf "%3d" $ Instance.vcpus inst
1409
                    , printf "%5d" $ Instance.mem inst
1410
                    , printf "%5d" $ Instance.dsk inst `div` 1024
1411
                    , printf "%5.3f" lC
1412
                    , printf "%5.3f" lM
1413
                    , printf "%5.3f" lD
1414
                    , printf "%5.3f" lN
1415
                    ]
1416
          where DynUtil lC lM lD lN = Instance.util inst
1417
      header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1418
               , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1419
      isnum = False:False:False:False:False:repeat True
1420
  in printTable "" header (map helper sil) isnum
1421

    
1422
-- | Shows statistics for a given node list.
1423
printStats :: String -> Node.List -> String
1424
printStats lp nl =
1425
  let dcvs = compDetailedCV $ Container.elems nl
1426
      (weights, names) = unzip detailedCVInfo
1427
      hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1428
      header = [ "Field", "Value", "Weight" ]
1429
      formatted = map (\(w, h, val) ->
1430
                         [ h
1431
                         , printf "%.8f" val
1432
                         , printf "x%.2f" w
1433
                         ]) hd
1434
  in printTable lp header formatted $ False:repeat True
1435

    
1436
-- | Convert a placement into a list of OpCodes (basically a job).
1437
iMoveToJob :: Node.List        -- ^ The node list; only used for node
1438
                               -- names, so any version is good
1439
                               -- (before or after the operation)
1440
           -> Instance.List    -- ^ The instance list; also used for
1441
                               -- names only
1442
           -> Idx              -- ^ The index of the instance being
1443
                               -- moved
1444
           -> IMove            -- ^ The actual move to be described
1445
           -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
1446
                               -- the given move
1447
iMoveToJob nl il idx move =
1448
  let inst = Container.find idx il
1449
      iname = Instance.name inst
1450
      lookNode  = Just . Container.nameOf nl
1451
      opF = OpCodes.OpInstanceMigrate iname True False True Nothing
1452
      opFA n = OpCodes.OpInstanceMigrate iname True False True (lookNode n)
1453
      opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1454
              OpCodes.ReplaceNewSecondary [] Nothing
1455
  in case move of
1456
       Failover -> [ opF ]
1457
       FailoverToAny np -> [ opFA np ]
1458
       ReplacePrimary np -> [ opF, opR np, opF ]
1459
       ReplaceSecondary ns -> [ opR ns ]
1460
       ReplaceAndFailover np -> [ opR np, opF ]
1461
       FailoverAndReplace ns -> [ opF, opR ns ]
1462

    
1463
-- * Node group functions
1464

    
1465
-- | Computes the group of an instance.
1466
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1467
instanceGroup nl i =
1468
  let sidx = Instance.sNode i
1469
      pnode = Container.find (Instance.pNode i) nl
1470
      snode = if sidx == Node.noSecondary
1471
              then pnode
1472
              else Container.find sidx nl
1473
      pgroup = Node.group pnode
1474
      sgroup = Node.group snode
1475
  in if pgroup /= sgroup
1476
       then fail ("Instance placed accross two node groups, primary " ++
1477
                  show pgroup ++ ", secondary " ++ show sgroup)
1478
       else return pgroup
1479

    
1480
-- | Computes the group of an instance per the primary node.
1481
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1482
instancePriGroup nl i =
1483
  let pnode = Container.find (Instance.pNode i) nl
1484
  in  Node.group pnode
1485

    
1486
-- | Compute the list of badly allocated instances (split across node
1487
-- groups).
1488
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1489
findSplitInstances nl =
1490
  filter (not . isOk . instanceGroup nl) . Container.elems
1491

    
1492
-- | Splits a cluster into the component node groups.
1493
splitCluster :: Node.List -> Instance.List ->
1494
                [(Gdx, (Node.List, Instance.List))]
1495
splitCluster nl il =
1496
  let ngroups = Node.computeGroups (Container.elems nl)
1497
  in map (\(guuid, nodes) ->
1498
           let nidxs = map Node.idx nodes
1499
               nodes' = zip nidxs nodes
1500
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1501
           in (guuid, (Container.fromList nodes', instances))) ngroups
1502

    
1503
-- | Compute the list of nodes that are to be evacuated, given a list
1504
-- of instances and an evacuation mode.
1505
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1506
                -> EvacMode      -- ^ The evacuation mode we're using
1507
                -> [Idx]         -- ^ List of instance indices being evacuated
1508
                -> IntSet.IntSet -- ^ Set of node indices
1509
nodesToEvacuate il mode =
1510
  IntSet.delete Node.noSecondary .
1511
  foldl' (\ns idx ->
1512
            let i = Container.find idx il
1513
                pdx = Instance.pNode i
1514
                sdx = Instance.sNode i
1515
                dt = Instance.diskTemplate i
1516
                withSecondary = case dt of
1517
                                  DTDrbd8 -> IntSet.insert sdx ns
1518
                                  _ -> ns
1519
            in case mode of
1520
                 ChangePrimary   -> IntSet.insert pdx ns
1521
                 ChangeSecondary -> withSecondary
1522
                 ChangeAll       -> IntSet.insert pdx withSecondary
1523
         ) IntSet.empty