Statistics
| Branch: | Tag: | Revision:

root / src / Ganeti / HTools / Cluster.hs @ 72747d91

History | View | Annotate | Download (67.2 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011, 2012, 2013 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
  (
31
    -- * Types
32
    AllocSolution(..)
33
  , EvacSolution(..)
34
  , Table(..)
35
  , CStats(..)
36
  , AllocResult
37
  , AllocMethod
38
  , AllocSolutionList
39
  -- * Generic functions
40
  , totalResources
41
  , computeAllocationDelta
42
  -- * First phase functions
43
  , computeBadItems
44
  -- * Second phase functions
45
  , printSolutionLine
46
  , formatCmds
47
  , involvedNodes
48
  , splitJobs
49
  -- * Display functions
50
  , printNodes
51
  , printInsts
52
  -- * Balacing functions
53
  , checkMove
54
  , doNextBalance
55
  , tryBalance
56
  , compCV
57
  , compCVNodes
58
  , compDetailedCV
59
  , printStats
60
  , iMoveToJob
61
  -- * IAllocator functions
62
  , genAllocNodes
63
  , tryAlloc
64
  , tryMGAlloc
65
  , tryNodeEvac
66
  , tryChangeGroup
67
  , collapseFailures
68
  , allocList
69
  -- * Allocation functions
70
  , iterateAlloc
71
  , tieredAlloc
72
  -- * Node group functions
73
  , instanceGroup
74
  , findSplitInstances
75
  , splitCluster
76
  ) where
77

    
78
import qualified Data.IntSet as IntSet
79
import Data.List
80
import Data.Maybe (fromJust, isNothing)
81
import Data.Ord (comparing)
82
import Text.Printf (printf)
83

    
84
import Ganeti.BasicTypes
85
import qualified Ganeti.HTools.Container as Container
86
import qualified Ganeti.HTools.Instance as Instance
87
import qualified Ganeti.HTools.Node as Node
88
import qualified Ganeti.HTools.Group as Group
89
import Ganeti.HTools.Types
90
import Ganeti.Compat
91
import qualified Ganeti.OpCodes as OpCodes
92
import Ganeti.Utils
93
import Ganeti.Types (mkNonEmpty)
94

    
95
-- * Types
96

    
97
-- | Allocation\/relocation solution.
98
data AllocSolution = AllocSolution
99
  { asFailures :: [FailMode]              -- ^ Failure counts
100
  , asAllocs   :: Int                     -- ^ Good allocation count
101
  , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
102
  , asLog      :: [String]                -- ^ Informational messages
103
  }
104

    
105
-- | Node evacuation/group change iallocator result type. This result
106
-- type consists of actual opcodes (a restricted subset) that are
107
-- transmitted back to Ganeti.
108
data EvacSolution = EvacSolution
109
  { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
110
  , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
111
                                      -- relocated
112
  , esOpCodes :: [[OpCodes.OpCode]]   -- ^ List of jobs
113
  } deriving (Show)
114

    
115
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
116
type AllocResult = (FailStats, Node.List, Instance.List,
117
                    [Instance.Instance], [CStats])
118

    
119
-- | Type alias for easier handling.
120
type AllocSolutionList = [(Instance.Instance, AllocSolution)]
121

    
122
-- | A type denoting the valid allocation mode/pairs.
123
--
124
-- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
125
-- for a two-node allocation, this will be a @Right [('Ndx',
126
-- ['Ndx'])]@. In the latter case, the list is basically an
127
-- association list, grouped by primary node and holding the potential
128
-- secondary nodes in the sub-list.
129
type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
130

    
131
-- | The empty solution we start with when computing allocations.
132
emptyAllocSolution :: AllocSolution
133
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
134
                                   , asSolution = Nothing, asLog = [] }
135

    
136
-- | The empty evac solution.
137
emptyEvacSolution :: EvacSolution
138
emptyEvacSolution = EvacSolution { esMoved = []
139
                                 , esFailed = []
140
                                 , esOpCodes = []
141
                                 }
142

    
143
-- | The complete state for the balancing solution.
144
data Table = Table Node.List Instance.List Score [Placement]
145
             deriving (Show)
146

    
147
-- | Cluster statistics data type.
148
data CStats = CStats
149
  { csFmem :: Integer -- ^ Cluster free mem
150
  , csFdsk :: Integer -- ^ Cluster free disk
151
  , csAmem :: Integer -- ^ Cluster allocatable mem
152
  , csAdsk :: Integer -- ^ Cluster allocatable disk
153
  , csAcpu :: Integer -- ^ Cluster allocatable cpus
154
  , csMmem :: Integer -- ^ Max node allocatable mem
155
  , csMdsk :: Integer -- ^ Max node allocatable disk
156
  , csMcpu :: Integer -- ^ Max node allocatable cpu
157
  , csImem :: Integer -- ^ Instance used mem
158
  , csIdsk :: Integer -- ^ Instance used disk
159
  , csIcpu :: Integer -- ^ Instance used cpu
160
  , csTmem :: Double  -- ^ Cluster total mem
161
  , csTdsk :: Double  -- ^ Cluster total disk
162
  , csTcpu :: Double  -- ^ Cluster total cpus
163
  , csVcpu :: Integer -- ^ Cluster total virtual cpus
164
  , csNcpu :: Double  -- ^ Equivalent to 'csIcpu' but in terms of
165
                      -- physical CPUs, i.e. normalised used phys CPUs
166
  , csXmem :: Integer -- ^ Unnacounted for mem
167
  , csNmem :: Integer -- ^ Node own memory
168
  , csScore :: Score  -- ^ The cluster score
169
  , csNinst :: Int    -- ^ The total number of instances
170
  } deriving (Show)
171

    
172
-- | A simple type for allocation functions.
173
type AllocMethod =  Node.List           -- ^ Node list
174
                 -> Instance.List       -- ^ Instance list
175
                 -> Maybe Int           -- ^ Optional allocation limit
176
                 -> Instance.Instance   -- ^ Instance spec for allocation
177
                 -> AllocNodes          -- ^ Which nodes we should allocate on
178
                 -> [Instance.Instance] -- ^ Allocated instances
179
                 -> [CStats]            -- ^ Running cluster stats
180
                 -> Result AllocResult  -- ^ Allocation result
181

    
182
-- | A simple type for the running solution of evacuations.
183
type EvacInnerState =
184
  Either String (Node.List, Instance.Instance, Score, Ndx)
185

    
186
-- * Utility functions
187

    
188
-- | Verifies the N+1 status and return the affected nodes.
189
verifyN1 :: [Node.Node] -> [Node.Node]
190
verifyN1 = filter Node.failN1
191

    
192
{-| Computes the pair of bad nodes and instances.
193

    
194
The bad node list is computed via a simple 'verifyN1' check, and the
195
bad instance list is the list of primary and secondary instances of
196
those nodes.
197

    
198
-}
199
computeBadItems :: Node.List -> Instance.List ->
200
                   ([Node.Node], [Instance.Instance])
201
computeBadItems nl il =
202
  let bad_nodes = verifyN1 $ getOnline nl
203
      bad_instances = map (`Container.find` il) .
204
                      sort . nub $
205
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
206
  in
207
    (bad_nodes, bad_instances)
208

    
209
-- | Extracts the node pairs for an instance. This can fail if the
210
-- instance is single-homed. FIXME: this needs to be improved,
211
-- together with the general enhancement for handling non-DRBD moves.
212
instanceNodes :: Node.List -> Instance.Instance ->
213
                 (Ndx, Ndx, Node.Node, Node.Node)
214
instanceNodes nl inst =
215
  let old_pdx = Instance.pNode inst
216
      old_sdx = Instance.sNode inst
217
      old_p = Container.find old_pdx nl
218
      old_s = Container.find old_sdx nl
219
  in (old_pdx, old_sdx, old_p, old_s)
220

    
221
-- | Zero-initializer for the CStats type.
222
emptyCStats :: CStats
223
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
224

    
225
-- | Update stats with data from a new node.
226
updateCStats :: CStats -> Node.Node -> CStats
227
updateCStats cs node =
228
  let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
229
               csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
230
               csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
231
               csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
232
               csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
233
               csVcpu = x_vcpu, csNcpu = x_ncpu,
234
               csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
235
             }
236
        = cs
237
      inc_amem = Node.fMem node - Node.rMem node
238
      inc_amem' = if inc_amem > 0 then inc_amem else 0
239
      inc_adsk = Node.availDisk node
240
      inc_imem = truncate (Node.tMem node) - Node.nMem node
241
                 - Node.xMem node - Node.fMem node
242
      inc_icpu = Node.uCpu node
243
      inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
244
      inc_vcpu = Node.hiCpu node
245
      inc_acpu = Node.availCpu node
246
      inc_ncpu = fromIntegral (Node.uCpu node) /
247
                 iPolicyVcpuRatio (Node.iPolicy node)
248
  in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
249
        , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
250
        , csAmem = x_amem + fromIntegral inc_amem'
251
        , csAdsk = x_adsk + fromIntegral inc_adsk
252
        , csAcpu = x_acpu + fromIntegral inc_acpu
253
        , csMmem = max x_mmem (fromIntegral inc_amem')
254
        , csMdsk = max x_mdsk (fromIntegral inc_adsk)
255
        , csMcpu = max x_mcpu (fromIntegral inc_acpu)
256
        , csImem = x_imem + fromIntegral inc_imem
257
        , csIdsk = x_idsk + fromIntegral inc_idsk
258
        , csIcpu = x_icpu + fromIntegral inc_icpu
259
        , csTmem = x_tmem + Node.tMem node
260
        , csTdsk = x_tdsk + Node.tDsk node
261
        , csTcpu = x_tcpu + Node.tCpu node
262
        , csVcpu = x_vcpu + fromIntegral inc_vcpu
263
        , csNcpu = x_ncpu + inc_ncpu
264
        , csXmem = x_xmem + fromIntegral (Node.xMem node)
265
        , csNmem = x_nmem + fromIntegral (Node.nMem node)
266
        , csNinst = x_ninst + length (Node.pList node)
267
        }
268

    
269
-- | Compute the total free disk and memory in the cluster.
270
totalResources :: Node.List -> CStats
271
totalResources nl =
272
  let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
273
  in cs { csScore = compCV nl }
274

    
275
-- | Compute the delta between two cluster state.
276
--
277
-- This is used when doing allocations, to understand better the
278
-- available cluster resources. The return value is a triple of the
279
-- current used values, the delta that was still allocated, and what
280
-- was left unallocated.
281
computeAllocationDelta :: CStats -> CStats -> AllocStats
282
computeAllocationDelta cini cfin =
283
  let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu,
284
              csNcpu = i_ncpu } = cini
285
      CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
286
              csTmem = t_mem, csTdsk = t_dsk, csVcpu = f_vcpu,
287
              csNcpu = f_ncpu, csTcpu = f_tcpu } = cfin
288
      rini = AllocInfo { allocInfoVCpus = fromIntegral i_icpu
289
                       , allocInfoNCpus = i_ncpu
290
                       , allocInfoMem   = fromIntegral i_imem
291
                       , allocInfoDisk  = fromIntegral i_idsk
292
                       }
293
      rfin = AllocInfo { allocInfoVCpus = fromIntegral (f_icpu - i_icpu)
294
                       , allocInfoNCpus = f_ncpu - i_ncpu
295
                       , allocInfoMem   = fromIntegral (f_imem - i_imem)
296
                       , allocInfoDisk  = fromIntegral (f_idsk - i_idsk)
297
                       }
298
      runa = AllocInfo { allocInfoVCpus = fromIntegral (f_vcpu - f_icpu)
299
                       , allocInfoNCpus = f_tcpu - f_ncpu
300
                       , allocInfoMem   = truncate t_mem - fromIntegral f_imem
301
                       , allocInfoDisk  = truncate t_dsk - fromIntegral f_idsk
302
                       }
303
  in (rini, rfin, runa)
304

    
305
-- | The names and weights of the individual elements in the CV list.
306
detailedCVInfo :: [(Double, String)]
307
detailedCVInfo = [ (1,  "free_mem_cv")
308
                 , (1,  "free_disk_cv")
309
                 , (1,  "n1_cnt")
310
                 , (1,  "reserved_mem_cv")
311
                 , (4,  "offline_all_cnt")
312
                 , (16, "offline_pri_cnt")
313
                 , (1,  "vcpu_ratio_cv")
314
                 , (1,  "cpu_load_cv")
315
                 , (1,  "mem_load_cv")
316
                 , (1,  "disk_load_cv")
317
                 , (1,  "net_load_cv")
318
                 , (2,  "pri_tags_score")
319
                 , (1,  "spindles_cv")
320
                 ]
321

    
322
-- | Holds the weights used by 'compCVNodes' for each metric.
323
detailedCVWeights :: [Double]
324
detailedCVWeights = map fst detailedCVInfo
325

    
326
-- | Compute the mem and disk covariance.
327
compDetailedCV :: [Node.Node] -> [Double]
328
compDetailedCV all_nodes =
329
  let (offline, nodes) = partition Node.offline all_nodes
330
      mem_l = map Node.pMem nodes
331
      dsk_l = map Node.pDsk nodes
332
      -- metric: memory covariance
333
      mem_cv = stdDev mem_l
334
      -- metric: disk covariance
335
      dsk_cv = stdDev dsk_l
336
      -- metric: count of instances living on N1 failing nodes
337
      n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
338
                                                 length (Node.pList n)) .
339
                 filter Node.failN1 $ nodes :: Double
340
      res_l = map Node.pRem nodes
341
      -- metric: reserved memory covariance
342
      res_cv = stdDev res_l
343
      -- offline instances metrics
344
      offline_ipri = sum . map (length . Node.pList) $ offline
345
      offline_isec = sum . map (length . Node.sList) $ offline
346
      -- metric: count of instances on offline nodes
347
      off_score = fromIntegral (offline_ipri + offline_isec)::Double
348
      -- metric: count of primary instances on offline nodes (this
349
      -- helps with evacuation/failover of primary instances on
350
      -- 2-node clusters with one node offline)
351
      off_pri_score = fromIntegral offline_ipri::Double
352
      cpu_l = map Node.pCpu nodes
353
      -- metric: covariance of vcpu/pcpu ratio
354
      cpu_cv = stdDev cpu_l
355
      -- metrics: covariance of cpu, memory, disk and network load
356
      (c_load, m_load, d_load, n_load) =
357
        unzip4 $ map (\n ->
358
                      let DynUtil c1 m1 d1 n1 = Node.utilLoad n
359
                          DynUtil c2 m2 d2 n2 = Node.utilPool n
360
                      in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes
361
      -- metric: conflicting instance count
362
      pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
363
      pri_tags_score = fromIntegral pri_tags_inst::Double
364
      -- metric: spindles %
365
      spindles_cv = map (\n -> Node.instSpindles n / Node.hiSpindles n) nodes
366
  in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
367
     , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
368
     , pri_tags_score, stdDev spindles_cv ]
369

    
370
-- | Compute the /total/ variance.
371
compCVNodes :: [Node.Node] -> Double
372
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
373

    
374
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
375
compCV :: Node.List -> Double
376
compCV = compCVNodes . Container.elems
377

    
378
-- | Compute online nodes from a 'Node.List'.
379
getOnline :: Node.List -> [Node.Node]
380
getOnline = filter (not . Node.offline) . Container.elems
381

    
382
-- * Balancing functions
383

    
384
-- | Compute best table. Note that the ordering of the arguments is important.
385
compareTables :: Table -> Table -> Table
386
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
387
  if a_cv > b_cv then b else a
388

    
389
-- | Applies an instance move to a given node list and instance.
390
applyMove :: Node.List -> Instance.Instance
391
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
392
-- Failover (f)
393
applyMove nl inst Failover =
394
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
395
      int_p = Node.removePri old_p inst
396
      int_s = Node.removeSec old_s inst
397
      new_nl = do -- Maybe monad
398
        new_p <- Node.addPriEx (Node.offline old_p) int_s inst
399
        new_s <- Node.addSec int_p inst old_sdx
400
        let new_inst = Instance.setBoth inst old_sdx old_pdx
401
        return (Container.addTwo old_pdx new_s old_sdx new_p nl,
402
                new_inst, old_sdx, old_pdx)
403
  in new_nl
404

    
405
-- Failover to any (fa)
406
applyMove nl inst (FailoverToAny new_pdx) = do
407
  let (old_pdx, old_sdx, old_pnode, _) = instanceNodes nl inst
408
      new_pnode = Container.find new_pdx nl
409
      force_failover = Node.offline old_pnode
410
  new_pnode' <- Node.addPriEx force_failover new_pnode inst
411
  let old_pnode' = Node.removePri old_pnode inst
412
      inst' = Instance.setPri inst new_pdx
413
      nl' = Container.addTwo old_pdx old_pnode' new_pdx new_pnode' nl
414
  return (nl', inst', new_pdx, old_sdx)
415

    
416
-- Replace the primary (f:, r:np, f)
417
applyMove nl inst (ReplacePrimary new_pdx) =
418
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
419
      tgt_n = Container.find new_pdx nl
420
      int_p = Node.removePri old_p inst
421
      int_s = Node.removeSec old_s inst
422
      force_p = Node.offline old_p
423
      new_nl = do -- Maybe monad
424
                  -- check that the current secondary can host the instance
425
                  -- during the migration
426
        tmp_s <- Node.addPriEx force_p int_s inst
427
        let tmp_s' = Node.removePri tmp_s inst
428
        new_p <- Node.addPriEx force_p tgt_n inst
429
        new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
430
        let new_inst = Instance.setPri inst new_pdx
431
        return (Container.add new_pdx new_p $
432
                Container.addTwo old_pdx int_p old_sdx new_s nl,
433
                new_inst, new_pdx, old_sdx)
434
  in new_nl
435

    
436
-- Replace the secondary (r:ns)
437
applyMove nl inst (ReplaceSecondary new_sdx) =
438
  let old_pdx = Instance.pNode inst
439
      old_sdx = Instance.sNode inst
440
      old_s = Container.find old_sdx nl
441
      tgt_n = Container.find new_sdx nl
442
      int_s = Node.removeSec old_s inst
443
      force_s = Node.offline old_s
444
      new_inst = Instance.setSec inst new_sdx
445
      new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
446
               \new_s -> return (Container.addTwo new_sdx
447
                                 new_s old_sdx int_s nl,
448
                                 new_inst, old_pdx, new_sdx)
449
  in new_nl
450

    
451
-- Replace the secondary and failover (r:np, f)
452
applyMove nl inst (ReplaceAndFailover new_pdx) =
453
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
454
      tgt_n = Container.find new_pdx nl
455
      int_p = Node.removePri old_p inst
456
      int_s = Node.removeSec old_s inst
457
      force_s = Node.offline old_s
458
      new_nl = do -- Maybe monad
459
        new_p <- Node.addPri tgt_n inst
460
        new_s <- Node.addSecEx force_s int_p inst new_pdx
461
        let new_inst = Instance.setBoth inst new_pdx old_pdx
462
        return (Container.add new_pdx new_p $
463
                Container.addTwo old_pdx new_s old_sdx int_s nl,
464
                new_inst, new_pdx, old_pdx)
465
  in new_nl
466

    
467
-- Failver and replace the secondary (f, r:ns)
468
applyMove nl inst (FailoverAndReplace new_sdx) =
469
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
470
      tgt_n = Container.find new_sdx nl
471
      int_p = Node.removePri old_p inst
472
      int_s = Node.removeSec old_s inst
473
      force_p = Node.offline old_p
474
      new_nl = do -- Maybe monad
475
        new_p <- Node.addPriEx force_p int_s inst
476
        new_s <- Node.addSecEx force_p tgt_n inst old_sdx
477
        let new_inst = Instance.setBoth inst old_sdx new_sdx
478
        return (Container.add new_sdx new_s $
479
                Container.addTwo old_sdx new_p old_pdx int_p nl,
480
                new_inst, old_sdx, new_sdx)
481
  in new_nl
482

    
483
-- | Tries to allocate an instance on one given node.
484
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
485
                 -> OpResult Node.AllocElement
486
allocateOnSingle nl inst new_pdx =
487
  let p = Container.find new_pdx nl
488
      new_inst = Instance.setBoth inst new_pdx Node.noSecondary
489
  in do
490
    Instance.instMatchesPolicy inst (Node.iPolicy p)
491
    new_p <- Node.addPri p inst
492
    let new_nl = Container.add new_pdx new_p nl
493
        new_score = compCV new_nl
494
    return (new_nl, new_inst, [new_p], new_score)
495

    
496
-- | Tries to allocate an instance on a given pair of nodes.
497
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
498
               -> OpResult Node.AllocElement
499
allocateOnPair nl inst new_pdx new_sdx =
500
  let tgt_p = Container.find new_pdx nl
501
      tgt_s = Container.find new_sdx nl
502
  in do
503
    Instance.instMatchesPolicy inst (Node.iPolicy tgt_p)
504
    new_p <- Node.addPri tgt_p inst
505
    new_s <- Node.addSec tgt_s inst new_pdx
506
    let new_inst = Instance.setBoth inst new_pdx new_sdx
507
        new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
508
    return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
509

    
510
-- | Tries to perform an instance move and returns the best table
511
-- between the original one and the new one.
512
checkSingleStep :: Table -- ^ The original table
513
                -> Instance.Instance -- ^ The instance to move
514
                -> Table -- ^ The current best table
515
                -> IMove -- ^ The move to apply
516
                -> Table -- ^ The final best table
517
checkSingleStep ini_tbl target cur_tbl move =
518
  let Table ini_nl ini_il _ ini_plc = ini_tbl
519
      tmp_resu = applyMove ini_nl target move
520
  in case tmp_resu of
521
       Bad _ -> cur_tbl
522
       Ok (upd_nl, new_inst, pri_idx, sec_idx) ->
523
         let tgt_idx = Instance.idx target
524
             upd_cvar = compCV upd_nl
525
             upd_il = Container.add tgt_idx new_inst ini_il
526
             upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
527
             upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
528
         in compareTables cur_tbl upd_tbl
529

    
530
-- | Given the status of the current secondary as a valid new node and
531
-- the current candidate target node, generate the possible moves for
532
-- a instance.
533
possibleMoves :: MirrorType -- ^ The mirroring type of the instance
534
              -> Bool       -- ^ Whether the secondary node is a valid new node
535
              -> Bool       -- ^ Whether we can change the primary node
536
              -> Ndx        -- ^ Target node candidate
537
              -> [IMove]    -- ^ List of valid result moves
538

    
539
possibleMoves MirrorNone _ _ _ = []
540

    
541
possibleMoves MirrorExternal _ False _ = []
542

    
543
possibleMoves MirrorExternal _ True tdx =
544
  [ FailoverToAny tdx ]
545

    
546
possibleMoves MirrorInternal _ False tdx =
547
  [ ReplaceSecondary tdx ]
548

    
549
possibleMoves MirrorInternal True True tdx =
550
  [ ReplaceSecondary tdx
551
  , ReplaceAndFailover tdx
552
  , ReplacePrimary tdx
553
  , FailoverAndReplace tdx
554
  ]
555

    
556
possibleMoves MirrorInternal False True tdx =
557
  [ ReplaceSecondary tdx
558
  , ReplaceAndFailover tdx
559
  ]
560

    
561
-- | Compute the best move for a given instance.
562
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
563
                  -> Bool              -- ^ Whether disk moves are allowed
564
                  -> Bool              -- ^ Whether instance moves are allowed
565
                  -> Table             -- ^ Original table
566
                  -> Instance.Instance -- ^ Instance to move
567
                  -> Table             -- ^ Best new table for this instance
568
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
569
  let opdx = Instance.pNode target
570
      osdx = Instance.sNode target
571
      bad_nodes = [opdx, osdx]
572
      nodes = filter (`notElem` bad_nodes) nodes_idx
573
      mir_type = Instance.mirrorType target
574
      use_secondary = elem osdx nodes_idx && inst_moves
575
      aft_failover = if mir_type == MirrorInternal && use_secondary
576
                       -- if drbd and allowed to failover
577
                       then checkSingleStep ini_tbl target ini_tbl Failover
578
                       else ini_tbl
579
      all_moves =
580
        if disk_moves
581
          then concatMap (possibleMoves mir_type use_secondary inst_moves)
582
               nodes
583
          else []
584
    in
585
      -- iterate over the possible nodes for this instance
586
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
587

    
588
-- | Compute the best next move.
589
checkMove :: [Ndx]               -- ^ Allowed target node indices
590
          -> Bool                -- ^ Whether disk moves are allowed
591
          -> Bool                -- ^ Whether instance moves are allowed
592
          -> Table               -- ^ The current solution
593
          -> [Instance.Instance] -- ^ List of instances still to move
594
          -> Table               -- ^ The new solution
595
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
596
  let Table _ _ _ ini_plc = ini_tbl
597
      -- we're using rwhnf from the Control.Parallel.Strategies
598
      -- package; we don't need to use rnf as that would force too
599
      -- much evaluation in single-threaded cases, and in
600
      -- multi-threaded case the weak head normal form is enough to
601
      -- spark the evaluation
602
      tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
603
                             inst_moves ini_tbl)
604
               victims
605
      -- iterate over all instances, computing the best move
606
      best_tbl = foldl' compareTables ini_tbl tables
607
      Table _ _ _ best_plc = best_tbl
608
  in if length best_plc == length ini_plc
609
       then ini_tbl -- no advancement
610
       else best_tbl
611

    
612
-- | Check if we are allowed to go deeper in the balancing.
613
doNextBalance :: Table     -- ^ The starting table
614
              -> Int       -- ^ Remaining length
615
              -> Score     -- ^ Score at which to stop
616
              -> Bool      -- ^ The resulting table and commands
617
doNextBalance ini_tbl max_rounds min_score =
618
  let Table _ _ ini_cv ini_plc = ini_tbl
619
      ini_plc_len = length ini_plc
620
  in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
621

    
622
-- | Run a balance move.
623
tryBalance :: Table       -- ^ The starting table
624
           -> Bool        -- ^ Allow disk moves
625
           -> Bool        -- ^ Allow instance moves
626
           -> Bool        -- ^ Only evacuate moves
627
           -> Score       -- ^ Min gain threshold
628
           -> Score       -- ^ Min gain
629
           -> Maybe Table -- ^ The resulting table and commands
630
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
631
    let Table ini_nl ini_il ini_cv _ = ini_tbl
632
        all_inst = Container.elems ini_il
633
        all_nodes = Container.elems ini_nl
634
        (offline_nodes, online_nodes) = partition Node.offline all_nodes
635
        all_inst' = if evac_mode
636
                      then let bad_nodes = map Node.idx offline_nodes
637
                           in filter (any (`elem` bad_nodes) .
638
                                          Instance.allNodes) all_inst
639
                      else all_inst
640
        reloc_inst = filter (\i -> Instance.movable i &&
641
                                   Instance.autoBalance i) all_inst'
642
        node_idx = map Node.idx online_nodes
643
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
644
        (Table _ _ fin_cv _) = fin_tbl
645
    in
646
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
647
      then Just fin_tbl -- this round made success, return the new table
648
      else Nothing
649

    
650
-- * Allocation functions
651

    
652
-- | Build failure stats out of a list of failures.
653
collapseFailures :: [FailMode] -> FailStats
654
collapseFailures flst =
655
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
656
            [minBound..maxBound]
657

    
658
-- | Compares two Maybe AllocElement and chooses the best score.
659
bestAllocElement :: Maybe Node.AllocElement
660
                 -> Maybe Node.AllocElement
661
                 -> Maybe Node.AllocElement
662
bestAllocElement a Nothing = a
663
bestAllocElement Nothing b = b
664
bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
665
  if ascore < bscore then a else b
666

    
667
-- | Update current Allocation solution and failure stats with new
668
-- elements.
669
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
670
concatAllocs as (Bad reason) = as { asFailures = reason : asFailures as }
671

    
672
concatAllocs as (Ok ns) =
673
  let -- Choose the old or new solution, based on the cluster score
674
    cntok = asAllocs as
675
    osols = asSolution as
676
    nsols = bestAllocElement osols (Just ns)
677
    nsuc = cntok + 1
678
    -- Note: we force evaluation of nsols here in order to keep the
679
    -- memory profile low - we know that we will need nsols for sure
680
    -- in the next cycle, so we force evaluation of nsols, since the
681
    -- foldl' in the caller will only evaluate the tuple, but not the
682
    -- elements of the tuple
683
  in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
684

    
685
-- | Sums two 'AllocSolution' structures.
686
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
687
sumAllocs (AllocSolution aFails aAllocs aSols aLog)
688
          (AllocSolution bFails bAllocs bSols bLog) =
689
  -- note: we add b first, since usually it will be smaller; when
690
  -- fold'ing, a will grow and grow whereas b is the per-group
691
  -- result, hence smaller
692
  let nFails  = bFails ++ aFails
693
      nAllocs = aAllocs + bAllocs
694
      nSols   = bestAllocElement aSols bSols
695
      nLog    = bLog ++ aLog
696
  in AllocSolution nFails nAllocs nSols nLog
697

    
698
-- | Given a solution, generates a reasonable description for it.
699
describeSolution :: AllocSolution -> String
700
describeSolution as =
701
  let fcnt = asFailures as
702
      sols = asSolution as
703
      freasons =
704
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
705
        filter ((> 0) . snd) . collapseFailures $ fcnt
706
  in case sols of
707
     Nothing -> "No valid allocation solutions, failure reasons: " ++
708
                (if null fcnt then "unknown reasons" else freasons)
709
     Just (_, _, nodes, cv) ->
710
         printf ("score: %.8f, successes %d, failures %d (%s)" ++
711
                 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
712
               (intercalate "/" . map Node.name $ nodes)
713

    
714
-- | Annotates a solution with the appropriate string.
715
annotateSolution :: AllocSolution -> AllocSolution
716
annotateSolution as = as { asLog = describeSolution as : asLog as }
717

    
718
-- | Reverses an evacuation solution.
719
--
720
-- Rationale: we always concat the results to the top of the lists, so
721
-- for proper jobset execution, we should reverse all lists.
722
reverseEvacSolution :: EvacSolution -> EvacSolution
723
reverseEvacSolution (EvacSolution f m o) =
724
  EvacSolution (reverse f) (reverse m) (reverse o)
725

    
726
-- | Generate the valid node allocation singles or pairs for a new instance.
727
genAllocNodes :: Group.List        -- ^ Group list
728
              -> Node.List         -- ^ The node map
729
              -> Int               -- ^ The number of nodes required
730
              -> Bool              -- ^ Whether to drop or not
731
                                   -- unallocable nodes
732
              -> Result AllocNodes -- ^ The (monadic) result
733
genAllocNodes gl nl count drop_unalloc =
734
  let filter_fn = if drop_unalloc
735
                    then filter (Group.isAllocable .
736
                                 flip Container.find gl . Node.group)
737
                    else id
738
      all_nodes = filter_fn $ getOnline nl
739
      all_pairs = [(Node.idx p,
740
                    [Node.idx s | s <- all_nodes,
741
                                       Node.idx p /= Node.idx s,
742
                                       Node.group p == Node.group s]) |
743
                   p <- all_nodes]
744
  in case count of
745
       1 -> Ok (Left (map Node.idx all_nodes))
746
       2 -> Ok (Right (filter (not . null . snd) all_pairs))
747
       _ -> Bad "Unsupported number of nodes, only one or two  supported"
748

    
749
-- | Try to allocate an instance on the cluster.
750
tryAlloc :: (Monad m) =>
751
            Node.List         -- ^ The node list
752
         -> Instance.List     -- ^ The instance list
753
         -> Instance.Instance -- ^ The instance to allocate
754
         -> AllocNodes        -- ^ The allocation targets
755
         -> m AllocSolution   -- ^ Possible solution list
756
tryAlloc _  _ _    (Right []) = fail "Not enough online nodes"
757
tryAlloc nl _ inst (Right ok_pairs) =
758
  let psols = parMap rwhnf (\(p, ss) ->
759
                              foldl' (\cstate ->
760
                                        concatAllocs cstate .
761
                                        allocateOnPair nl inst p)
762
                              emptyAllocSolution ss) ok_pairs
763
      sols = foldl' sumAllocs emptyAllocSolution psols
764
  in return $ annotateSolution sols
765

    
766
tryAlloc _  _ _    (Left []) = fail "No online nodes"
767
tryAlloc nl _ inst (Left all_nodes) =
768
  let sols = foldl' (\cstate ->
769
                       concatAllocs cstate . allocateOnSingle nl inst
770
                    ) emptyAllocSolution all_nodes
771
  in return $ annotateSolution sols
772

    
773
-- | Given a group/result, describe it as a nice (list of) messages.
774
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
775
solutionDescription gl (groupId, result) =
776
  case result of
777
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
778
    Bad message -> [printf "Group %s: error %s" gname message]
779
  where grp = Container.find groupId gl
780
        gname = Group.name grp
781
        pol = allocPolicyToRaw (Group.allocPolicy grp)
782

    
783
-- | From a list of possibly bad and possibly empty solutions, filter
784
-- only the groups with a valid result. Note that the result will be
785
-- reversed compared to the original list.
786
filterMGResults :: Group.List
787
                -> [(Gdx, Result AllocSolution)]
788
                -> [(Gdx, AllocSolution)]
789
filterMGResults gl = foldl' fn []
790
  where unallocable = not . Group.isAllocable . flip Container.find gl
791
        fn accu (gdx, rasol) =
792
          case rasol of
793
            Bad _ -> accu
794
            Ok sol | isNothing (asSolution sol) -> accu
795
                   | unallocable gdx -> accu
796
                   | otherwise -> (gdx, sol):accu
797

    
798
-- | Sort multigroup results based on policy and score.
799
sortMGResults :: Group.List
800
             -> [(Gdx, AllocSolution)]
801
             -> [(Gdx, AllocSolution)]
802
sortMGResults gl sols =
803
  let extractScore (_, _, _, x) = x
804
      solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
805
                             (extractScore . fromJust . asSolution) sol)
806
  in sortBy (comparing solScore) sols
807

    
808
-- | Finds the best group for an instance on a multi-group cluster.
809
--
810
-- Only solutions in @preferred@ and @last_resort@ groups will be
811
-- accepted as valid, and additionally if the allowed groups parameter
812
-- is not null then allocation will only be run for those group
813
-- indices.
814
findBestAllocGroup :: Group.List           -- ^ The group list
815
                   -> Node.List            -- ^ The node list
816
                   -> Instance.List        -- ^ The instance list
817
                   -> Maybe [Gdx]          -- ^ The allowed groups
818
                   -> Instance.Instance    -- ^ The instance to allocate
819
                   -> Int                  -- ^ Required number of nodes
820
                   -> Result (Gdx, AllocSolution, [String])
821
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
822
  let groups = splitCluster mgnl mgil
823
      groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
824
                allowed_gdxs
825
      sols = map (\(gid, (nl, il)) ->
826
                   (gid, genAllocNodes mggl nl cnt False >>=
827
                       tryAlloc nl il inst))
828
             groups'::[(Gdx, Result AllocSolution)]
829
      all_msgs = concatMap (solutionDescription mggl) sols
830
      goodSols = filterMGResults mggl sols
831
      sortedSols = sortMGResults mggl goodSols
832
  in case sortedSols of
833
       [] -> Bad $ if null groups'
834
                     then "no groups for evacuation: allowed groups was" ++
835
                          show allowed_gdxs ++ ", all groups: " ++
836
                          show (map fst groups)
837
                     else intercalate ", " all_msgs
838
       (final_group, final_sol):_ -> return (final_group, final_sol, all_msgs)
839

    
840
-- | Try to allocate an instance on a multi-group cluster.
841
tryMGAlloc :: Group.List           -- ^ The group list
842
           -> Node.List            -- ^ The node list
843
           -> Instance.List        -- ^ The instance list
844
           -> Instance.Instance    -- ^ The instance to allocate
845
           -> Int                  -- ^ Required number of nodes
846
           -> Result AllocSolution -- ^ Possible solution list
847
tryMGAlloc mggl mgnl mgil inst cnt = do
848
  (best_group, solution, all_msgs) <-
849
      findBestAllocGroup mggl mgnl mgil Nothing inst cnt
850
  let group_name = Group.name $ Container.find best_group mggl
851
      selmsg = "Selected group: " ++ group_name
852
  return $ solution { asLog = selmsg:all_msgs }
853

    
854
-- | Calculate the new instance list after allocation solution.
855
updateIl :: Instance.List           -- ^ The original instance list
856
         -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
857
         -> Instance.List           -- ^ The updated instance list
858
updateIl il Nothing = il
859
updateIl il (Just (_, xi, _, _)) = Container.add (Container.size il) xi il
860

    
861
-- | Extract the the new node list from the allocation solution.
862
extractNl :: Node.List               -- ^ The original node list
863
          -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
864
          -> Node.List               -- ^ The new node list
865
extractNl nl Nothing = nl
866
extractNl _ (Just (xnl, _, _, _)) = xnl
867

    
868
-- | Try to allocate a list of instances on a multi-group cluster.
869
allocList :: Group.List                  -- ^ The group list
870
          -> Node.List                   -- ^ The node list
871
          -> Instance.List               -- ^ The instance list
872
          -> [(Instance.Instance, Int)]  -- ^ The instance to allocate
873
          -> AllocSolutionList           -- ^ Possible solution list
874
          -> Result (Node.List, Instance.List,
875
                     AllocSolutionList)  -- ^ The final solution list
876
allocList _  nl il [] result = Ok (nl, il, result)
877
allocList gl nl il ((xi, xicnt):xies) result = do
878
  ares <- tryMGAlloc gl nl il xi xicnt
879
  let sol = asSolution ares
880
      nl' = extractNl nl sol
881
      il' = updateIl il sol
882
  allocList gl nl' il' xies ((xi, ares):result)
883

    
884
-- | Function which fails if the requested mode is change secondary.
885
--
886
-- This is useful since except DRBD, no other disk template can
887
-- execute change secondary; thus, we can just call this function
888
-- instead of always checking for secondary mode. After the call to
889
-- this function, whatever mode we have is just a primary change.
890
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
891
failOnSecondaryChange ChangeSecondary dt =
892
  fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++
893
         "' can't execute change secondary"
894
failOnSecondaryChange _ _ = return ()
895

    
896
-- | Run evacuation for a single instance.
897
--
898
-- /Note:/ this function should correctly execute both intra-group
899
-- evacuations (in all modes) and inter-group evacuations (in the
900
-- 'ChangeAll' mode). Of course, this requires that the correct list
901
-- of target nodes is passed.
902
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
903
                 -> Instance.List     -- ^ Instance list (cluster-wide)
904
                 -> EvacMode          -- ^ The evacuation mode
905
                 -> Instance.Instance -- ^ The instance to be evacuated
906
                 -> Gdx               -- ^ The group we're targetting
907
                 -> [Ndx]             -- ^ The list of available nodes
908
                                      -- for allocation
909
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
910
nodeEvacInstance nl il mode inst@(Instance.Instance
911
                                  {Instance.diskTemplate = dt@DTDiskless})
912
                 gdx avail_nodes =
913
                   failOnSecondaryChange mode dt >>
914
                   evacOneNodeOnly nl il inst gdx avail_nodes
915

    
916
nodeEvacInstance _ _ _ (Instance.Instance
917
                        {Instance.diskTemplate = DTPlain}) _ _ =
918
                  fail "Instances of type plain cannot be relocated"
919

    
920
nodeEvacInstance _ _ _ (Instance.Instance
921
                        {Instance.diskTemplate = DTFile}) _ _ =
922
                  fail "Instances of type file cannot be relocated"
923

    
924
nodeEvacInstance nl il mode inst@(Instance.Instance
925
                                  {Instance.diskTemplate = dt@DTSharedFile})
926
                 gdx avail_nodes =
927
                   failOnSecondaryChange mode dt >>
928
                   evacOneNodeOnly nl il inst gdx avail_nodes
929

    
930
nodeEvacInstance nl il mode inst@(Instance.Instance
931
                                  {Instance.diskTemplate = dt@DTBlock})
932
                 gdx avail_nodes =
933
                   failOnSecondaryChange mode dt >>
934
                   evacOneNodeOnly nl il inst gdx avail_nodes
935

    
936
nodeEvacInstance nl il mode inst@(Instance.Instance
937
                                  {Instance.diskTemplate = dt@DTRbd})
938
                 gdx avail_nodes =
939
                   failOnSecondaryChange mode dt >>
940
                   evacOneNodeOnly nl il inst gdx avail_nodes
941

    
942
nodeEvacInstance nl il mode inst@(Instance.Instance
943
                                  {Instance.diskTemplate = dt@DTExt})
944
                 gdx avail_nodes =
945
                   failOnSecondaryChange mode dt >>
946
                   evacOneNodeOnly nl il inst gdx avail_nodes
947

    
948
nodeEvacInstance nl il ChangePrimary
949
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
950
                 _ _ =
951
  do
952
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
953
    let idx = Instance.idx inst
954
        il' = Container.add idx inst' il
955
        ops = iMoveToJob nl' il' idx Failover
956
    return (nl', il', ops)
957

    
958
nodeEvacInstance nl il ChangeSecondary
959
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
960
                 gdx avail_nodes =
961
  evacOneNodeOnly nl il inst gdx avail_nodes
962

    
963
-- The algorithm for ChangeAll is as follows:
964
--
965
-- * generate all (primary, secondary) node pairs for the target groups
966
-- * for each pair, execute the needed moves (r:s, f, r:s) and compute
967
--   the final node list state and group score
968
-- * select the best choice via a foldl that uses the same Either
969
--   String solution as the ChangeSecondary mode
970
nodeEvacInstance nl il ChangeAll
971
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
972
                 gdx avail_nodes =
973
  do
974
    let no_nodes = Left "no nodes available"
975
        node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
976
    (nl', il', ops, _) <-
977
        annotateResult "Can't find any good nodes for relocation" .
978
        eitherToResult $
979
        foldl'
980
        (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
981
                          Bad msg ->
982
                              case accu of
983
                                Right _ -> accu
984
                                -- we don't need more details (which
985
                                -- nodes, etc.) as we only selected
986
                                -- this group if we can allocate on
987
                                -- it, hence failures will not
988
                                -- propagate out of this fold loop
989
                                Left _ -> Left $ "Allocation failed: " ++ msg
990
                          Ok result@(_, _, _, new_cv) ->
991
                              let new_accu = Right result in
992
                              case accu of
993
                                Left _ -> new_accu
994
                                Right (_, _, _, old_cv) ->
995
                                    if old_cv < new_cv
996
                                    then accu
997
                                    else new_accu
998
        ) no_nodes node_pairs
999

    
1000
    return (nl', il', ops)
1001

    
1002
-- | Generic function for changing one node of an instance.
1003
--
1004
-- This is similar to 'nodeEvacInstance' but will be used in a few of
1005
-- its sub-patterns. It folds the inner function 'evacOneNodeInner'
1006
-- over the list of available nodes, which results in the best choice
1007
-- for relocation.
1008
evacOneNodeOnly :: Node.List         -- ^ The node list (cluster-wide)
1009
                -> Instance.List     -- ^ Instance list (cluster-wide)
1010
                -> Instance.Instance -- ^ The instance to be evacuated
1011
                -> Gdx               -- ^ The group we're targetting
1012
                -> [Ndx]             -- ^ The list of available nodes
1013
                                      -- for allocation
1014
                -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1015
evacOneNodeOnly nl il inst gdx avail_nodes = do
1016
  op_fn <- case Instance.mirrorType inst of
1017
             MirrorNone -> Bad "Can't relocate/evacuate non-mirrored instances"
1018
             MirrorInternal -> Ok ReplaceSecondary
1019
             MirrorExternal -> Ok FailoverToAny
1020
  (nl', inst', _, ndx) <- annotateResult "Can't find any good node" .
1021
                          eitherToResult $
1022
                          foldl' (evacOneNodeInner nl inst gdx op_fn)
1023
                          (Left "no nodes available") avail_nodes
1024
  let idx = Instance.idx inst
1025
      il' = Container.add idx inst' il
1026
      ops = iMoveToJob nl' il' idx (op_fn ndx)
1027
  return (nl', il', ops)
1028

    
1029
-- | Inner fold function for changing one node of an instance.
1030
--
1031
-- Depending on the instance disk template, this will either change
1032
-- the secondary (for DRBD) or the primary node (for shared
1033
-- storage). However, the operation is generic otherwise.
1034
--
1035
-- The running solution is either a @Left String@, which means we
1036
-- don't have yet a working solution, or a @Right (...)@, which
1037
-- represents a valid solution; it holds the modified node list, the
1038
-- modified instance (after evacuation), the score of that solution,
1039
-- and the new secondary node index.
1040
evacOneNodeInner :: Node.List         -- ^ Cluster node list
1041
                 -> Instance.Instance -- ^ Instance being evacuated
1042
                 -> Gdx               -- ^ The group index of the instance
1043
                 -> (Ndx -> IMove)    -- ^ Operation constructor
1044
                 -> EvacInnerState    -- ^ Current best solution
1045
                 -> Ndx               -- ^ Node we're evaluating as target
1046
                 -> EvacInnerState    -- ^ New best solution
1047
evacOneNodeInner nl inst gdx op_fn accu ndx =
1048
  case applyMove nl inst (op_fn ndx) of
1049
    Bad fm -> let fail_msg = "Node " ++ Container.nameOf nl ndx ++
1050
                             " failed: " ++ show fm
1051
              in either (const $ Left fail_msg) (const accu) accu
1052
    Ok (nl', inst', _, _) ->
1053
      let nodes = Container.elems nl'
1054
          -- The fromJust below is ugly (it can fail nastily), but
1055
          -- at this point we should have any internal mismatches,
1056
          -- and adding a monad here would be quite involved
1057
          grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1058
          new_cv = compCVNodes grpnodes
1059
          new_accu = Right (nl', inst', new_cv, ndx)
1060
      in case accu of
1061
           Left _ -> new_accu
1062
           Right (_, _, old_cv, _) ->
1063
             if old_cv < new_cv
1064
               then accu
1065
               else new_accu
1066

    
1067
-- | Compute result of changing all nodes of a DRBD instance.
1068
--
1069
-- Given the target primary and secondary node (which might be in a
1070
-- different group or not), this function will 'execute' all the
1071
-- required steps and assuming all operations succceed, will return
1072
-- the modified node and instance lists, the opcodes needed for this
1073
-- and the new group score.
1074
evacDrbdAllInner :: Node.List         -- ^ Cluster node list
1075
                 -> Instance.List     -- ^ Cluster instance list
1076
                 -> Instance.Instance -- ^ The instance to be moved
1077
                 -> Gdx               -- ^ The target group index
1078
                                      -- (which can differ from the
1079
                                      -- current group of the
1080
                                      -- instance)
1081
                 -> (Ndx, Ndx)        -- ^ Tuple of new
1082
                                      -- primary\/secondary nodes
1083
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
1084
evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
1085
  let primary = Container.find (Instance.pNode inst) nl
1086
      idx = Instance.idx inst
1087
  -- if the primary is offline, then we first failover
1088
  (nl1, inst1, ops1) <-
1089
    if Node.offline primary
1090
      then do
1091
        (nl', inst', _, _) <-
1092
          annotateResult "Failing over to the secondary" .
1093
          opToResult $ applyMove nl inst Failover
1094
        return (nl', inst', [Failover])
1095
      else return (nl, inst, [])
1096
  let (o1, o2, o3) = (ReplaceSecondary t_pdx,
1097
                      Failover,
1098
                      ReplaceSecondary t_sdx)
1099
  -- we now need to execute a replace secondary to the future
1100
  -- primary node
1101
  (nl2, inst2, _, _) <-
1102
    annotateResult "Changing secondary to new primary" .
1103
    opToResult $
1104
    applyMove nl1 inst1 o1
1105
  let ops2 = o1:ops1
1106
  -- we now execute another failover, the primary stays fixed now
1107
  (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" .
1108
                        opToResult $ applyMove nl2 inst2 o2
1109
  let ops3 = o2:ops2
1110
  -- and finally another replace secondary, to the final secondary
1111
  (nl4, inst4, _, _) <-
1112
    annotateResult "Changing secondary to final secondary" .
1113
    opToResult $
1114
    applyMove nl3 inst3 o3
1115
  let ops4 = o3:ops3
1116
      il' = Container.add idx inst4 il
1117
      ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1118
  let nodes = Container.elems nl4
1119
      -- The fromJust below is ugly (it can fail nastily), but
1120
      -- at this point we should have any internal mismatches,
1121
      -- and adding a monad here would be quite involved
1122
      grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1123
      new_cv = compCVNodes grpnodes
1124
  return (nl4, il', ops, new_cv)
1125

    
1126
-- | Computes the nodes in a given group which are available for
1127
-- allocation.
1128
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1129
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
1130
                    -> Gdx            -- ^ The group for which we
1131
                                      -- query the nodes
1132
                    -> Result [Ndx]   -- ^ List of available node indices
1133
availableGroupNodes group_nodes excl_ndx gdx = do
1134
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1135
                 Ok (lookup gdx group_nodes)
1136
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1137
  return avail_nodes
1138

    
1139
-- | Updates the evac solution with the results of an instance
1140
-- evacuation.
1141
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1142
                   -> Idx
1143
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1144
                   -> (Node.List, Instance.List, EvacSolution)
1145
updateEvacSolution (nl, il, es) idx (Bad msg) =
1146
  (nl, il, es { esFailed = (idx, msg):esFailed es})
1147
updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1148
  (nl, il, es { esMoved = new_elem:esMoved es
1149
              , esOpCodes = opcodes:esOpCodes es })
1150
    where inst = Container.find idx il
1151
          new_elem = (idx,
1152
                      instancePriGroup nl inst,
1153
                      Instance.allNodes inst)
1154

    
1155
-- | Node-evacuation IAllocator mode main function.
1156
tryNodeEvac :: Group.List    -- ^ The cluster groups
1157
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
1158
            -> Instance.List -- ^ Instance list (cluster-wide)
1159
            -> EvacMode      -- ^ The evacuation mode
1160
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
1161
            -> Result (Node.List, Instance.List, EvacSolution)
1162
tryNodeEvac _ ini_nl ini_il mode idxs =
1163
  let evac_ndx = nodesToEvacuate ini_il mode idxs
1164
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1165
      excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1166
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1167
                                           (Container.elems nl))) $
1168
                  splitCluster ini_nl ini_il
1169
      (fin_nl, fin_il, esol) =
1170
        foldl' (\state@(nl, il, _) inst ->
1171
                  let gdx = instancePriGroup nl inst
1172
                      pdx = Instance.pNode inst in
1173
                  updateEvacSolution state (Instance.idx inst) $
1174
                  availableGroupNodes group_ndx
1175
                    (IntSet.insert pdx excl_ndx) gdx >>=
1176
                      nodeEvacInstance nl il mode inst gdx
1177
               )
1178
        (ini_nl, ini_il, emptyEvacSolution)
1179
        (map (`Container.find` ini_il) idxs)
1180
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1181

    
1182
-- | Change-group IAllocator mode main function.
1183
--
1184
-- This is very similar to 'tryNodeEvac', the only difference is that
1185
-- we don't choose as target group the current instance group, but
1186
-- instead:
1187
--
1188
--   1. at the start of the function, we compute which are the target
1189
--   groups; either no groups were passed in, in which case we choose
1190
--   all groups out of which we don't evacuate instance, or there were
1191
--   some groups passed, in which case we use those
1192
--
1193
--   2. for each instance, we use 'findBestAllocGroup' to choose the
1194
--   best group to hold the instance, and then we do what
1195
--   'tryNodeEvac' does, except for this group instead of the current
1196
--   instance group.
1197
--
1198
-- Note that the correct behaviour of this function relies on the
1199
-- function 'nodeEvacInstance' to be able to do correctly both
1200
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1201
tryChangeGroup :: Group.List    -- ^ The cluster groups
1202
               -> Node.List     -- ^ The node list (cluster-wide)
1203
               -> Instance.List -- ^ Instance list (cluster-wide)
1204
               -> [Gdx]         -- ^ Target groups; if empty, any
1205
                                -- groups not being evacuated
1206
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
1207
               -> Result (Node.List, Instance.List, EvacSolution)
1208
tryChangeGroup gl ini_nl ini_il gdxs idxs =
1209
  let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1210
                             flip Container.find ini_il) idxs
1211
      target_gdxs = (if null gdxs
1212
                       then Container.keys gl
1213
                       else gdxs) \\ evac_gdxs
1214
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1215
      excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1216
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1217
                                           (Container.elems nl))) $
1218
                  splitCluster ini_nl ini_il
1219
      (fin_nl, fin_il, esol) =
1220
        foldl' (\state@(nl, il, _) inst ->
1221
                  let solution = do
1222
                        let ncnt = Instance.requiredNodes $
1223
                                   Instance.diskTemplate inst
1224
                        (gdx, _, _) <- findBestAllocGroup gl nl il
1225
                                       (Just target_gdxs) inst ncnt
1226
                        av_nodes <- availableGroupNodes group_ndx
1227
                                    excl_ndx gdx
1228
                        nodeEvacInstance nl il ChangeAll inst gdx av_nodes
1229
                  in updateEvacSolution state (Instance.idx inst) solution
1230
               )
1231
        (ini_nl, ini_il, emptyEvacSolution)
1232
        (map (`Container.find` ini_il) idxs)
1233
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1234

    
1235
-- | Standard-sized allocation method.
1236
--
1237
-- This places instances of the same size on the cluster until we're
1238
-- out of space. The result will be a list of identically-sized
1239
-- instances.
1240
iterateAlloc :: AllocMethod
1241
iterateAlloc nl il limit newinst allocnodes ixes cstats =
1242
  let depth = length ixes
1243
      newname = printf "new-%d" depth::String
1244
      newidx = Container.size il
1245
      newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1246
      newlimit = fmap (flip (-) 1) limit
1247
  in case tryAlloc nl il newi2 allocnodes of
1248
       Bad s -> Bad s
1249
       Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
1250
         let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1251
         case sols3 of
1252
           Nothing -> newsol
1253
           Just (xnl, xi, _, _) ->
1254
             if limit == Just 0
1255
               then newsol
1256
               else iterateAlloc xnl (Container.add newidx xi il)
1257
                      newlimit newinst allocnodes (xi:ixes)
1258
                      (totalResources xnl:cstats)
1259

    
1260
-- | Tiered allocation method.
1261
--
1262
-- This places instances on the cluster, and decreases the spec until
1263
-- we can allocate again. The result will be a list of decreasing
1264
-- instance specs.
1265
tieredAlloc :: AllocMethod
1266
tieredAlloc nl il limit newinst allocnodes ixes cstats =
1267
  case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1268
    Bad s -> Bad s
1269
    Ok (errs, nl', il', ixes', cstats') ->
1270
      let newsol = Ok (errs, nl', il', ixes', cstats')
1271
          ixes_cnt = length ixes'
1272
          (stop, newlimit) = case limit of
1273
                               Nothing -> (False, Nothing)
1274
                               Just n -> (n <= ixes_cnt,
1275
                                            Just (n - ixes_cnt)) in
1276
      if stop then newsol else
1277
          case Instance.shrinkByType newinst . fst . last $
1278
               sortBy (comparing snd) errs of
1279
            Bad _ -> newsol
1280
            Ok newinst' -> tieredAlloc nl' il' newlimit
1281
                           newinst' allocnodes ixes' cstats'
1282

    
1283
-- * Formatting functions
1284

    
1285
-- | Given the original and final nodes, computes the relocation description.
1286
computeMoves :: Instance.Instance -- ^ The instance to be moved
1287
             -> String -- ^ The instance name
1288
             -> IMove  -- ^ The move being performed
1289
             -> String -- ^ New primary
1290
             -> String -- ^ New secondary
1291
             -> (String, [String])
1292
                -- ^ Tuple of moves and commands list; moves is containing
1293
                -- either @/f/@ for failover or @/r:name/@ for replace
1294
                -- secondary, while the command list holds gnt-instance
1295
                -- commands (without that prefix), e.g \"@failover instance1@\"
1296
computeMoves i inam mv c d =
1297
  case mv of
1298
    Failover -> ("f", [mig])
1299
    FailoverToAny _ -> (printf "fa:%s" c, [mig_any])
1300
    FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1301
    ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1302
    ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1303
    ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1304
  where morf = if Instance.isRunning i then "migrate" else "failover"
1305
        mig = printf "%s -f %s" morf inam::String
1306
        mig_any = printf "%s -f -n %s %s" morf c inam::String
1307
        rep n = printf "replace-disks -n %s %s" n inam::String
1308

    
1309
-- | Converts a placement to string format.
1310
printSolutionLine :: Node.List     -- ^ The node list
1311
                  -> Instance.List -- ^ The instance list
1312
                  -> Int           -- ^ Maximum node name length
1313
                  -> Int           -- ^ Maximum instance name length
1314
                  -> Placement     -- ^ The current placement
1315
                  -> Int           -- ^ The index of the placement in
1316
                                   -- the solution
1317
                  -> (String, [String])
1318
printSolutionLine nl il nmlen imlen plc pos =
1319
  let pmlen = (2*nmlen + 1)
1320
      (i, p, s, mv, c) = plc
1321
      old_sec = Instance.sNode inst
1322
      inst = Container.find i il
1323
      inam = Instance.alias inst
1324
      npri = Node.alias $ Container.find p nl
1325
      nsec = Node.alias $ Container.find s nl
1326
      opri = Node.alias $ Container.find (Instance.pNode inst) nl
1327
      osec = Node.alias $ Container.find old_sec nl
1328
      (moves, cmds) =  computeMoves inst inam mv npri nsec
1329
      -- FIXME: this should check instead/also the disk template
1330
      ostr = if old_sec == Node.noSecondary
1331
               then printf "%s" opri::String
1332
               else printf "%s:%s" opri osec::String
1333
      nstr = if s == Node.noSecondary
1334
               then printf "%s" npri::String
1335
               else printf "%s:%s" npri nsec::String
1336
  in (printf "  %3d. %-*s %-*s => %-*s %12.8f a=%s"
1337
      pos imlen inam pmlen ostr pmlen nstr c moves,
1338
      cmds)
1339

    
1340
-- | Return the instance and involved nodes in an instance move.
1341
--
1342
-- Note that the output list length can vary, and is not required nor
1343
-- guaranteed to be of any specific length.
1344
involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
1345
                               -- the instance from its index; note
1346
                               -- that this /must/ be the original
1347
                               -- instance list, so that we can
1348
                               -- retrieve the old nodes
1349
              -> Placement     -- ^ The placement we're investigating,
1350
                               -- containing the new nodes and
1351
                               -- instance index
1352
              -> [Ndx]         -- ^ Resulting list of node indices
1353
involvedNodes il plc =
1354
  let (i, np, ns, _, _) = plc
1355
      inst = Container.find i il
1356
  in nub $ [np, ns] ++ Instance.allNodes inst
1357

    
1358
-- | Inner function for splitJobs, that either appends the next job to
1359
-- the current jobset, or starts a new jobset.
1360
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1361
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1362
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1363
  | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1364
  | otherwise = ([n]:cjs, ndx)
1365

    
1366
-- | Break a list of moves into independent groups. Note that this
1367
-- will reverse the order of jobs.
1368
splitJobs :: [MoveJob] -> [JobSet]
1369
splitJobs = fst . foldl mergeJobs ([], [])
1370

    
1371
-- | Given a list of commands, prefix them with @gnt-instance@ and
1372
-- also beautify the display a little.
1373
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1374
formatJob jsn jsl (sn, (_, _, _, cmds)) =
1375
  let out =
1376
        printf "  echo job %d/%d" jsn sn:
1377
        printf "  check":
1378
        map ("  gnt-instance " ++) cmds
1379
  in if sn == 1
1380
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1381
       else out
1382

    
1383
-- | Given a list of commands, prefix them with @gnt-instance@ and
1384
-- also beautify the display a little.
1385
formatCmds :: [JobSet] -> String
1386
formatCmds =
1387
  unlines .
1388
  concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1389
                           (zip [1..] js)) .
1390
  zip [1..]
1391

    
1392
-- | Print the node list.
1393
printNodes :: Node.List -> [String] -> String
1394
printNodes nl fs =
1395
  let fields = case fs of
1396
                 [] -> Node.defaultFields
1397
                 "+":rest -> Node.defaultFields ++ rest
1398
                 _ -> fs
1399
      snl = sortBy (comparing Node.idx) (Container.elems nl)
1400
      (header, isnum) = unzip $ map Node.showHeader fields
1401
  in printTable "" header (map (Node.list fields) snl) isnum
1402

    
1403
-- | Print the instance list.
1404
printInsts :: Node.List -> Instance.List -> String
1405
printInsts nl il =
1406
  let sil = sortBy (comparing Instance.idx) (Container.elems il)
1407
      helper inst = [ if Instance.isRunning inst then "R" else " "
1408
                    , Instance.name inst
1409
                    , Container.nameOf nl (Instance.pNode inst)
1410
                    , let sdx = Instance.sNode inst
1411
                      in if sdx == Node.noSecondary
1412
                           then  ""
1413
                           else Container.nameOf nl sdx
1414
                    , if Instance.autoBalance inst then "Y" else "N"
1415
                    , printf "%3d" $ Instance.vcpus inst
1416
                    , printf "%5d" $ Instance.mem inst
1417
                    , printf "%5d" $ Instance.dsk inst `div` 1024
1418
                    , printf "%5.3f" lC
1419
                    , printf "%5.3f" lM
1420
                    , printf "%5.3f" lD
1421
                    , printf "%5.3f" lN
1422
                    ]
1423
          where DynUtil lC lM lD lN = Instance.util inst
1424
      header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1425
               , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1426
      isnum = False:False:False:False:False:repeat True
1427
  in printTable "" header (map helper sil) isnum
1428

    
1429
-- | Shows statistics for a given node list.
1430
printStats :: String -> Node.List -> String
1431
printStats lp nl =
1432
  let dcvs = compDetailedCV $ Container.elems nl
1433
      (weights, names) = unzip detailedCVInfo
1434
      hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1435
      header = [ "Field", "Value", "Weight" ]
1436
      formatted = map (\(w, h, val) ->
1437
                         [ h
1438
                         , printf "%.8f" val
1439
                         , printf "x%.2f" w
1440
                         ]) hd
1441
  in printTable lp header formatted $ False:repeat True
1442

    
1443
-- | Convert a placement into a list of OpCodes (basically a job).
1444
iMoveToJob :: Node.List        -- ^ The node list; only used for node
1445
                               -- names, so any version is good
1446
                               -- (before or after the operation)
1447
           -> Instance.List    -- ^ The instance list; also used for
1448
                               -- names only
1449
           -> Idx              -- ^ The index of the instance being
1450
                               -- moved
1451
           -> IMove            -- ^ The actual move to be described
1452
           -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
1453
                               -- the given move
1454
iMoveToJob nl il idx move =
1455
  let inst = Container.find idx il
1456
      iname = Instance.name inst
1457
      lookNode  n = case mkNonEmpty (Container.nameOf nl n) of
1458
                      -- FIXME: convert htools codebase to non-empty strings
1459
                      Bad msg -> error $ "Empty node name for idx " ++
1460
                                 show n ++ ": " ++ msg ++ "??"
1461
                      Ok ne -> Just ne
1462
      opF = OpCodes.OpInstanceMigrate
1463
              { OpCodes.opInstanceName        = iname
1464
              , OpCodes.opMigrationMode       = Nothing -- default
1465
              , OpCodes.opOldLiveMode         = Nothing -- default as well
1466
              , OpCodes.opTargetNode          = Nothing -- this is drbd
1467
              , OpCodes.opAllowRuntimeChanges = False
1468
              , OpCodes.opIgnoreIpolicy       = False
1469
              , OpCodes.opMigrationCleanup    = False
1470
              , OpCodes.opIallocator          = Nothing
1471
              , OpCodes.opAllowFailover       = True }
1472
      opFA n = opF { OpCodes.opTargetNode = lookNode n } -- not drbd
1473
      opR n = OpCodes.OpInstanceReplaceDisks
1474
                { OpCodes.opInstanceName     = iname
1475
                , OpCodes.opEarlyRelease     = False
1476
                , OpCodes.opIgnoreIpolicy    = False
1477
                , OpCodes.opReplaceDisksMode = OpCodes.ReplaceNewSecondary
1478
                , OpCodes.opReplaceDisksList = []
1479
                , OpCodes.opRemoteNode       = lookNode n
1480
                , OpCodes.opIallocator       = Nothing
1481
                }
1482
  in case move of
1483
       Failover -> [ opF ]
1484
       FailoverToAny np -> [ opFA np ]
1485
       ReplacePrimary np -> [ opF, opR np, opF ]
1486
       ReplaceSecondary ns -> [ opR ns ]
1487
       ReplaceAndFailover np -> [ opR np, opF ]
1488
       FailoverAndReplace ns -> [ opF, opR ns ]
1489

    
1490
-- * Node group functions
1491

    
1492
-- | Computes the group of an instance.
1493
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1494
instanceGroup nl i =
1495
  let sidx = Instance.sNode i
1496
      pnode = Container.find (Instance.pNode i) nl
1497
      snode = if sidx == Node.noSecondary
1498
              then pnode
1499
              else Container.find sidx nl
1500
      pgroup = Node.group pnode
1501
      sgroup = Node.group snode
1502
  in if pgroup /= sgroup
1503
       then fail ("Instance placed accross two node groups, primary " ++
1504
                  show pgroup ++ ", secondary " ++ show sgroup)
1505
       else return pgroup
1506

    
1507
-- | Computes the group of an instance per the primary node.
1508
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1509
instancePriGroup nl i =
1510
  let pnode = Container.find (Instance.pNode i) nl
1511
  in  Node.group pnode
1512

    
1513
-- | Compute the list of badly allocated instances (split across node
1514
-- groups).
1515
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1516
findSplitInstances nl =
1517
  filter (not . isOk . instanceGroup nl) . Container.elems
1518

    
1519
-- | Splits a cluster into the component node groups.
1520
splitCluster :: Node.List -> Instance.List ->
1521
                [(Gdx, (Node.List, Instance.List))]
1522
splitCluster nl il =
1523
  let ngroups = Node.computeGroups (Container.elems nl)
1524
  in map (\(guuid, nodes) ->
1525
           let nidxs = map Node.idx nodes
1526
               nodes' = zip nidxs nodes
1527
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1528
           in (guuid, (Container.fromList nodes', instances))) ngroups
1529

    
1530
-- | Compute the list of nodes that are to be evacuated, given a list
1531
-- of instances and an evacuation mode.
1532
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1533
                -> EvacMode      -- ^ The evacuation mode we're using
1534
                -> [Idx]         -- ^ List of instance indices being evacuated
1535
                -> IntSet.IntSet -- ^ Set of node indices
1536
nodesToEvacuate il mode =
1537
  IntSet.delete Node.noSecondary .
1538
  foldl' (\ns idx ->
1539
            let i = Container.find idx il
1540
                pdx = Instance.pNode i
1541
                sdx = Instance.sNode i
1542
                dt = Instance.diskTemplate i
1543
                withSecondary = case dt of
1544
                                  DTDrbd8 -> IntSet.insert sdx ns
1545
                                  _ -> ns
1546
            in case mode of
1547
                 ChangePrimary   -> IntSet.insert pdx ns
1548
                 ChangeSecondary -> withSecondary
1549
                 ChangeAll       -> IntSet.insert pdx withSecondary
1550
         ) IntSet.empty