Statistics
| Branch: | Tag: | Revision:

root / src / Ganeti / HTools / Cluster.hs @ 53822ec4

History | View | Annotate | Download (67.2 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011, 2012, 2013 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
  (
31
    -- * Types
32
    AllocSolution(..)
33
  , EvacSolution(..)
34
  , Table(..)
35
  , CStats(..)
36
  , AllocNodes
37
  , AllocResult
38
  , AllocMethod
39
  , AllocSolutionList
40
  -- * Generic functions
41
  , totalResources
42
  , computeAllocationDelta
43
  -- * First phase functions
44
  , computeBadItems
45
  -- * Second phase functions
46
  , printSolutionLine
47
  , formatCmds
48
  , involvedNodes
49
  , splitJobs
50
  -- * Display functions
51
  , printNodes
52
  , printInsts
53
  -- * Balacing functions
54
  , checkMove
55
  , doNextBalance
56
  , tryBalance
57
  , compCV
58
  , compCVNodes
59
  , compDetailedCV
60
  , printStats
61
  , iMoveToJob
62
  -- * IAllocator functions
63
  , genAllocNodes
64
  , tryAlloc
65
  , tryMGAlloc
66
  , tryNodeEvac
67
  , tryChangeGroup
68
  , collapseFailures
69
  , allocList
70
  -- * Allocation functions
71
  , iterateAlloc
72
  , tieredAlloc
73
  -- * Node group functions
74
  , instanceGroup
75
  , findSplitInstances
76
  , splitCluster
77
  ) where
78

    
79
import qualified Data.IntSet as IntSet
80
import Data.List
81
import Data.Maybe (fromJust, isNothing)
82
import Data.Ord (comparing)
83
import Text.Printf (printf)
84

    
85
import Ganeti.BasicTypes
86
import qualified Ganeti.HTools.Container as Container
87
import qualified Ganeti.HTools.Instance as Instance
88
import qualified Ganeti.HTools.Node as Node
89
import qualified Ganeti.HTools.Group as Group
90
import Ganeti.HTools.Types
91
import Ganeti.Compat
92
import qualified Ganeti.OpCodes as OpCodes
93
import Ganeti.Utils
94
import Ganeti.Types (mkNonEmpty)
95

    
96
-- * Types
97

    
98
-- | Allocation\/relocation solution.
99
data AllocSolution = AllocSolution
100
  { asFailures :: [FailMode]              -- ^ Failure counts
101
  , asAllocs   :: Int                     -- ^ Good allocation count
102
  , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
103
  , asLog      :: [String]                -- ^ Informational messages
104
  }
105

    
106
-- | Node evacuation/group change iallocator result type. This result
107
-- type consists of actual opcodes (a restricted subset) that are
108
-- transmitted back to Ganeti.
109
data EvacSolution = EvacSolution
110
  { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
111
  , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
112
                                      -- relocated
113
  , esOpCodes :: [[OpCodes.OpCode]]   -- ^ List of jobs
114
  } deriving (Show)
115

    
116
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
117
type AllocResult = (FailStats, Node.List, Instance.List,
118
                    [Instance.Instance], [CStats])
119

    
120
-- | Type alias for easier handling.
121
type AllocSolutionList = [(Instance.Instance, AllocSolution)]
122

    
123
-- | A type denoting the valid allocation mode/pairs.
124
--
125
-- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
126
-- for a two-node allocation, this will be a @Right [('Ndx',
127
-- ['Ndx'])]@. In the latter case, the list is basically an
128
-- association list, grouped by primary node and holding the potential
129
-- secondary nodes in the sub-list.
130
type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
131

    
132
-- | The empty solution we start with when computing allocations.
133
emptyAllocSolution :: AllocSolution
134
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
135
                                   , asSolution = Nothing, asLog = [] }
136

    
137
-- | The empty evac solution.
138
emptyEvacSolution :: EvacSolution
139
emptyEvacSolution = EvacSolution { esMoved = []
140
                                 , esFailed = []
141
                                 , esOpCodes = []
142
                                 }
143

    
144
-- | The complete state for the balancing solution.
145
data Table = Table Node.List Instance.List Score [Placement]
146
             deriving (Show)
147

    
148
-- | Cluster statistics data type.
149
data CStats = CStats
150
  { csFmem :: Integer -- ^ Cluster free mem
151
  , csFdsk :: Integer -- ^ Cluster free disk
152
  , csAmem :: Integer -- ^ Cluster allocatable mem
153
  , csAdsk :: Integer -- ^ Cluster allocatable disk
154
  , csAcpu :: Integer -- ^ Cluster allocatable cpus
155
  , csMmem :: Integer -- ^ Max node allocatable mem
156
  , csMdsk :: Integer -- ^ Max node allocatable disk
157
  , csMcpu :: Integer -- ^ Max node allocatable cpu
158
  , csImem :: Integer -- ^ Instance used mem
159
  , csIdsk :: Integer -- ^ Instance used disk
160
  , csIcpu :: Integer -- ^ Instance used cpu
161
  , csTmem :: Double  -- ^ Cluster total mem
162
  , csTdsk :: Double  -- ^ Cluster total disk
163
  , csTcpu :: Double  -- ^ Cluster total cpus
164
  , csVcpu :: Integer -- ^ Cluster total virtual cpus
165
  , csNcpu :: Double  -- ^ Equivalent to 'csIcpu' but in terms of
166
                      -- physical CPUs, i.e. normalised used phys CPUs
167
  , csXmem :: Integer -- ^ Unnacounted for mem
168
  , csNmem :: Integer -- ^ Node own memory
169
  , csScore :: Score  -- ^ The cluster score
170
  , csNinst :: Int    -- ^ The total number of instances
171
  } deriving (Show)
172

    
173
-- | A simple type for allocation functions.
174
type AllocMethod =  Node.List           -- ^ Node list
175
                 -> Instance.List       -- ^ Instance list
176
                 -> Maybe Int           -- ^ Optional allocation limit
177
                 -> Instance.Instance   -- ^ Instance spec for allocation
178
                 -> AllocNodes          -- ^ Which nodes we should allocate on
179
                 -> [Instance.Instance] -- ^ Allocated instances
180
                 -> [CStats]            -- ^ Running cluster stats
181
                 -> Result AllocResult  -- ^ Allocation result
182

    
183
-- | A simple type for the running solution of evacuations.
184
type EvacInnerState =
185
  Either String (Node.List, Instance.Instance, Score, Ndx)
186

    
187
-- * Utility functions
188

    
189
-- | Verifies the N+1 status and return the affected nodes.
190
verifyN1 :: [Node.Node] -> [Node.Node]
191
verifyN1 = filter Node.failN1
192

    
193
{-| Computes the pair of bad nodes and instances.
194

    
195
The bad node list is computed via a simple 'verifyN1' check, and the
196
bad instance list is the list of primary and secondary instances of
197
those nodes.
198

    
199
-}
200
computeBadItems :: Node.List -> Instance.List ->
201
                   ([Node.Node], [Instance.Instance])
202
computeBadItems nl il =
203
  let bad_nodes = verifyN1 $ getOnline nl
204
      bad_instances = map (`Container.find` il) .
205
                      sort . nub $
206
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
207
  in
208
    (bad_nodes, bad_instances)
209

    
210
-- | Extracts the node pairs for an instance. This can fail if the
211
-- instance is single-homed. FIXME: this needs to be improved,
212
-- together with the general enhancement for handling non-DRBD moves.
213
instanceNodes :: Node.List -> Instance.Instance ->
214
                 (Ndx, Ndx, Node.Node, Node.Node)
215
instanceNodes nl inst =
216
  let old_pdx = Instance.pNode inst
217
      old_sdx = Instance.sNode inst
218
      old_p = Container.find old_pdx nl
219
      old_s = Container.find old_sdx nl
220
  in (old_pdx, old_sdx, old_p, old_s)
221

    
222
-- | Zero-initializer for the CStats type.
223
emptyCStats :: CStats
224
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
225

    
226
-- | Update stats with data from a new node.
227
updateCStats :: CStats -> Node.Node -> CStats
228
updateCStats cs node =
229
  let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
230
               csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
231
               csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
232
               csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
233
               csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
234
               csVcpu = x_vcpu, csNcpu = x_ncpu,
235
               csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
236
             }
237
        = cs
238
      inc_amem = Node.fMem node - Node.rMem node
239
      inc_amem' = if inc_amem > 0 then inc_amem else 0
240
      inc_adsk = Node.availDisk node
241
      inc_imem = truncate (Node.tMem node) - Node.nMem node
242
                 - Node.xMem node - Node.fMem node
243
      inc_icpu = Node.uCpu node
244
      inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
245
      inc_vcpu = Node.hiCpu node
246
      inc_acpu = Node.availCpu node
247
      inc_ncpu = fromIntegral (Node.uCpu node) /
248
                 iPolicyVcpuRatio (Node.iPolicy node)
249
  in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
250
        , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
251
        , csAmem = x_amem + fromIntegral inc_amem'
252
        , csAdsk = x_adsk + fromIntegral inc_adsk
253
        , csAcpu = x_acpu + fromIntegral inc_acpu
254
        , csMmem = max x_mmem (fromIntegral inc_amem')
255
        , csMdsk = max x_mdsk (fromIntegral inc_adsk)
256
        , csMcpu = max x_mcpu (fromIntegral inc_acpu)
257
        , csImem = x_imem + fromIntegral inc_imem
258
        , csIdsk = x_idsk + fromIntegral inc_idsk
259
        , csIcpu = x_icpu + fromIntegral inc_icpu
260
        , csTmem = x_tmem + Node.tMem node
261
        , csTdsk = x_tdsk + Node.tDsk node
262
        , csTcpu = x_tcpu + Node.tCpu node
263
        , csVcpu = x_vcpu + fromIntegral inc_vcpu
264
        , csNcpu = x_ncpu + inc_ncpu
265
        , csXmem = x_xmem + fromIntegral (Node.xMem node)
266
        , csNmem = x_nmem + fromIntegral (Node.nMem node)
267
        , csNinst = x_ninst + length (Node.pList node)
268
        }
269

    
270
-- | Compute the total free disk and memory in the cluster.
271
totalResources :: Node.List -> CStats
272
totalResources nl =
273
  let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
274
  in cs { csScore = compCV nl }
275

    
276
-- | Compute the delta between two cluster state.
277
--
278
-- This is used when doing allocations, to understand better the
279
-- available cluster resources. The return value is a triple of the
280
-- current used values, the delta that was still allocated, and what
281
-- was left unallocated.
282
computeAllocationDelta :: CStats -> CStats -> AllocStats
283
computeAllocationDelta cini cfin =
284
  let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu,
285
              csNcpu = i_ncpu } = cini
286
      CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
287
              csTmem = t_mem, csTdsk = t_dsk, csVcpu = f_vcpu,
288
              csNcpu = f_ncpu, csTcpu = f_tcpu } = cfin
289
      rini = AllocInfo { allocInfoVCpus = fromIntegral i_icpu
290
                       , allocInfoNCpus = i_ncpu
291
                       , allocInfoMem   = fromIntegral i_imem
292
                       , allocInfoDisk  = fromIntegral i_idsk
293
                       }
294
      rfin = AllocInfo { allocInfoVCpus = fromIntegral (f_icpu - i_icpu)
295
                       , allocInfoNCpus = f_ncpu - i_ncpu
296
                       , allocInfoMem   = fromIntegral (f_imem - i_imem)
297
                       , allocInfoDisk  = fromIntegral (f_idsk - i_idsk)
298
                       }
299
      runa = AllocInfo { allocInfoVCpus = fromIntegral (f_vcpu - f_icpu)
300
                       , allocInfoNCpus = f_tcpu - f_ncpu
301
                       , allocInfoMem   = truncate t_mem - fromIntegral f_imem
302
                       , allocInfoDisk  = truncate t_dsk - fromIntegral f_idsk
303
                       }
304
  in (rini, rfin, runa)
305

    
306
-- | The names and weights of the individual elements in the CV list.
307
detailedCVInfo :: [(Double, String)]
308
detailedCVInfo = [ (1,  "free_mem_cv")
309
                 , (1,  "free_disk_cv")
310
                 , (1,  "n1_cnt")
311
                 , (1,  "reserved_mem_cv")
312
                 , (4,  "offline_all_cnt")
313
                 , (16, "offline_pri_cnt")
314
                 , (1,  "vcpu_ratio_cv")
315
                 , (1,  "cpu_load_cv")
316
                 , (1,  "mem_load_cv")
317
                 , (1,  "disk_load_cv")
318
                 , (1,  "net_load_cv")
319
                 , (2,  "pri_tags_score")
320
                 , (1,  "spindles_cv")
321
                 ]
322

    
323
-- | Holds the weights used by 'compCVNodes' for each metric.
324
detailedCVWeights :: [Double]
325
detailedCVWeights = map fst detailedCVInfo
326

    
327
-- | Compute the mem and disk covariance.
328
compDetailedCV :: [Node.Node] -> [Double]
329
compDetailedCV all_nodes =
330
  let (offline, nodes) = partition Node.offline all_nodes
331
      mem_l = map Node.pMem nodes
332
      dsk_l = map Node.pDsk nodes
333
      -- metric: memory covariance
334
      mem_cv = stdDev mem_l
335
      -- metric: disk covariance
336
      dsk_cv = stdDev dsk_l
337
      -- metric: count of instances living on N1 failing nodes
338
      n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
339
                                                 length (Node.pList n)) .
340
                 filter Node.failN1 $ nodes :: Double
341
      res_l = map Node.pRem nodes
342
      -- metric: reserved memory covariance
343
      res_cv = stdDev res_l
344
      -- offline instances metrics
345
      offline_ipri = sum . map (length . Node.pList) $ offline
346
      offline_isec = sum . map (length . Node.sList) $ offline
347
      -- metric: count of instances on offline nodes
348
      off_score = fromIntegral (offline_ipri + offline_isec)::Double
349
      -- metric: count of primary instances on offline nodes (this
350
      -- helps with evacuation/failover of primary instances on
351
      -- 2-node clusters with one node offline)
352
      off_pri_score = fromIntegral offline_ipri::Double
353
      cpu_l = map Node.pCpu nodes
354
      -- metric: covariance of vcpu/pcpu ratio
355
      cpu_cv = stdDev cpu_l
356
      -- metrics: covariance of cpu, memory, disk and network load
357
      (c_load, m_load, d_load, n_load) =
358
        unzip4 $ map (\n ->
359
                      let DynUtil c1 m1 d1 n1 = Node.utilLoad n
360
                          DynUtil c2 m2 d2 n2 = Node.utilPool n
361
                      in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes
362
      -- metric: conflicting instance count
363
      pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
364
      pri_tags_score = fromIntegral pri_tags_inst::Double
365
      -- metric: spindles %
366
      spindles_cv = map (\n -> Node.instSpindles n / Node.hiSpindles n) nodes
367
  in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
368
     , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
369
     , pri_tags_score, stdDev spindles_cv ]
370

    
371
-- | Compute the /total/ variance.
372
compCVNodes :: [Node.Node] -> Double
373
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
374

    
375
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
376
compCV :: Node.List -> Double
377
compCV = compCVNodes . Container.elems
378

    
379
-- | Compute online nodes from a 'Node.List'.
380
getOnline :: Node.List -> [Node.Node]
381
getOnline = filter (not . Node.offline) . Container.elems
382

    
383
-- * Balancing functions
384

    
385
-- | Compute best table. Note that the ordering of the arguments is important.
386
compareTables :: Table -> Table -> Table
387
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
388
  if a_cv > b_cv then b else a
389

    
390
-- | Applies an instance move to a given node list and instance.
391
applyMove :: Node.List -> Instance.Instance
392
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
393
-- Failover (f)
394
applyMove nl inst Failover =
395
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
396
      int_p = Node.removePri old_p inst
397
      int_s = Node.removeSec old_s inst
398
      new_nl = do -- Maybe monad
399
        new_p <- Node.addPriEx (Node.offline old_p) int_s inst
400
        new_s <- Node.addSec int_p inst old_sdx
401
        let new_inst = Instance.setBoth inst old_sdx old_pdx
402
        return (Container.addTwo old_pdx new_s old_sdx new_p nl,
403
                new_inst, old_sdx, old_pdx)
404
  in new_nl
405

    
406
-- Failover to any (fa)
407
applyMove nl inst (FailoverToAny new_pdx) = do
408
  let (old_pdx, old_sdx, old_pnode, _) = instanceNodes nl inst
409
      new_pnode = Container.find new_pdx nl
410
      force_failover = Node.offline old_pnode
411
  new_pnode' <- Node.addPriEx force_failover new_pnode inst
412
  let old_pnode' = Node.removePri old_pnode inst
413
      inst' = Instance.setPri inst new_pdx
414
      nl' = Container.addTwo old_pdx old_pnode' new_pdx new_pnode' nl
415
  return (nl', inst', new_pdx, old_sdx)
416

    
417
-- Replace the primary (f:, r:np, f)
418
applyMove nl inst (ReplacePrimary new_pdx) =
419
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
420
      tgt_n = Container.find new_pdx nl
421
      int_p = Node.removePri old_p inst
422
      int_s = Node.removeSec old_s inst
423
      force_p = Node.offline old_p
424
      new_nl = do -- Maybe monad
425
                  -- check that the current secondary can host the instance
426
                  -- during the migration
427
        tmp_s <- Node.addPriEx force_p int_s inst
428
        let tmp_s' = Node.removePri tmp_s inst
429
        new_p <- Node.addPriEx force_p tgt_n inst
430
        new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
431
        let new_inst = Instance.setPri inst new_pdx
432
        return (Container.add new_pdx new_p $
433
                Container.addTwo old_pdx int_p old_sdx new_s nl,
434
                new_inst, new_pdx, old_sdx)
435
  in new_nl
436

    
437
-- Replace the secondary (r:ns)
438
applyMove nl inst (ReplaceSecondary new_sdx) =
439
  let old_pdx = Instance.pNode inst
440
      old_sdx = Instance.sNode inst
441
      old_s = Container.find old_sdx nl
442
      tgt_n = Container.find new_sdx nl
443
      int_s = Node.removeSec old_s inst
444
      force_s = Node.offline old_s
445
      new_inst = Instance.setSec inst new_sdx
446
      new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
447
               \new_s -> return (Container.addTwo new_sdx
448
                                 new_s old_sdx int_s nl,
449
                                 new_inst, old_pdx, new_sdx)
450
  in new_nl
451

    
452
-- Replace the secondary and failover (r:np, f)
453
applyMove nl inst (ReplaceAndFailover new_pdx) =
454
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
455
      tgt_n = Container.find new_pdx nl
456
      int_p = Node.removePri old_p inst
457
      int_s = Node.removeSec old_s inst
458
      force_s = Node.offline old_s
459
      new_nl = do -- Maybe monad
460
        new_p <- Node.addPri tgt_n inst
461
        new_s <- Node.addSecEx force_s int_p inst new_pdx
462
        let new_inst = Instance.setBoth inst new_pdx old_pdx
463
        return (Container.add new_pdx new_p $
464
                Container.addTwo old_pdx new_s old_sdx int_s nl,
465
                new_inst, new_pdx, old_pdx)
466
  in new_nl
467

    
468
-- Failver and replace the secondary (f, r:ns)
469
applyMove nl inst (FailoverAndReplace new_sdx) =
470
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
471
      tgt_n = Container.find new_sdx nl
472
      int_p = Node.removePri old_p inst
473
      int_s = Node.removeSec old_s inst
474
      force_p = Node.offline old_p
475
      new_nl = do -- Maybe monad
476
        new_p <- Node.addPriEx force_p int_s inst
477
        new_s <- Node.addSecEx force_p tgt_n inst old_sdx
478
        let new_inst = Instance.setBoth inst old_sdx new_sdx
479
        return (Container.add new_sdx new_s $
480
                Container.addTwo old_sdx new_p old_pdx int_p nl,
481
                new_inst, old_sdx, new_sdx)
482
  in new_nl
483

    
484
-- | Tries to allocate an instance on one given node.
485
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
486
                 -> OpResult Node.AllocElement
487
allocateOnSingle nl inst new_pdx =
488
  let p = Container.find new_pdx nl
489
      new_inst = Instance.setBoth inst new_pdx Node.noSecondary
490
  in do
491
    Instance.instMatchesPolicy inst (Node.iPolicy p)
492
    new_p <- Node.addPri p inst
493
    let new_nl = Container.add new_pdx new_p nl
494
        new_score = compCV new_nl
495
    return (new_nl, new_inst, [new_p], new_score)
496

    
497
-- | Tries to allocate an instance on a given pair of nodes.
498
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
499
               -> OpResult Node.AllocElement
500
allocateOnPair nl inst new_pdx new_sdx =
501
  let tgt_p = Container.find new_pdx nl
502
      tgt_s = Container.find new_sdx nl
503
  in do
504
    Instance.instMatchesPolicy inst (Node.iPolicy tgt_p)
505
    new_p <- Node.addPri tgt_p inst
506
    new_s <- Node.addSec tgt_s inst new_pdx
507
    let new_inst = Instance.setBoth inst new_pdx new_sdx
508
        new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
509
    return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
510

    
511
-- | Tries to perform an instance move and returns the best table
512
-- between the original one and the new one.
513
checkSingleStep :: Table -- ^ The original table
514
                -> Instance.Instance -- ^ The instance to move
515
                -> Table -- ^ The current best table
516
                -> IMove -- ^ The move to apply
517
                -> Table -- ^ The final best table
518
checkSingleStep ini_tbl target cur_tbl move =
519
  let Table ini_nl ini_il _ ini_plc = ini_tbl
520
      tmp_resu = applyMove ini_nl target move
521
  in case tmp_resu of
522
       Bad _ -> cur_tbl
523
       Ok (upd_nl, new_inst, pri_idx, sec_idx) ->
524
         let tgt_idx = Instance.idx target
525
             upd_cvar = compCV upd_nl
526
             upd_il = Container.add tgt_idx new_inst ini_il
527
             upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
528
             upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
529
         in compareTables cur_tbl upd_tbl
530

    
531
-- | Given the status of the current secondary as a valid new node and
532
-- the current candidate target node, generate the possible moves for
533
-- a instance.
534
possibleMoves :: MirrorType -- ^ The mirroring type of the instance
535
              -> Bool       -- ^ Whether the secondary node is a valid new node
536
              -> Bool       -- ^ Whether we can change the primary node
537
              -> Ndx        -- ^ Target node candidate
538
              -> [IMove]    -- ^ List of valid result moves
539

    
540
possibleMoves MirrorNone _ _ _ = []
541

    
542
possibleMoves MirrorExternal _ False _ = []
543

    
544
possibleMoves MirrorExternal _ True tdx =
545
  [ FailoverToAny tdx ]
546

    
547
possibleMoves MirrorInternal _ False tdx =
548
  [ ReplaceSecondary tdx ]
549

    
550
possibleMoves MirrorInternal True True tdx =
551
  [ ReplaceSecondary tdx
552
  , ReplaceAndFailover tdx
553
  , ReplacePrimary tdx
554
  , FailoverAndReplace tdx
555
  ]
556

    
557
possibleMoves MirrorInternal False True tdx =
558
  [ ReplaceSecondary tdx
559
  , ReplaceAndFailover tdx
560
  ]
561

    
562
-- | Compute the best move for a given instance.
563
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
564
                  -> Bool              -- ^ Whether disk moves are allowed
565
                  -> Bool              -- ^ Whether instance moves are allowed
566
                  -> Table             -- ^ Original table
567
                  -> Instance.Instance -- ^ Instance to move
568
                  -> Table             -- ^ Best new table for this instance
569
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
570
  let opdx = Instance.pNode target
571
      osdx = Instance.sNode target
572
      bad_nodes = [opdx, osdx]
573
      nodes = filter (`notElem` bad_nodes) nodes_idx
574
      mir_type = Instance.mirrorType target
575
      use_secondary = elem osdx nodes_idx && inst_moves
576
      aft_failover = if mir_type == MirrorInternal && use_secondary
577
                       -- if drbd and allowed to failover
578
                       then checkSingleStep ini_tbl target ini_tbl Failover
579
                       else ini_tbl
580
      all_moves =
581
        if disk_moves
582
          then concatMap (possibleMoves mir_type use_secondary inst_moves)
583
               nodes
584
          else []
585
    in
586
      -- iterate over the possible nodes for this instance
587
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
588

    
589
-- | Compute the best next move.
590
checkMove :: [Ndx]               -- ^ Allowed target node indices
591
          -> Bool                -- ^ Whether disk moves are allowed
592
          -> Bool                -- ^ Whether instance moves are allowed
593
          -> Table               -- ^ The current solution
594
          -> [Instance.Instance] -- ^ List of instances still to move
595
          -> Table               -- ^ The new solution
596
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
597
  let Table _ _ _ ini_plc = ini_tbl
598
      -- we're using rwhnf from the Control.Parallel.Strategies
599
      -- package; we don't need to use rnf as that would force too
600
      -- much evaluation in single-threaded cases, and in
601
      -- multi-threaded case the weak head normal form is enough to
602
      -- spark the evaluation
603
      tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
604
                             inst_moves ini_tbl)
605
               victims
606
      -- iterate over all instances, computing the best move
607
      best_tbl = foldl' compareTables ini_tbl tables
608
      Table _ _ _ best_plc = best_tbl
609
  in if length best_plc == length ini_plc
610
       then ini_tbl -- no advancement
611
       else best_tbl
612

    
613
-- | Check if we are allowed to go deeper in the balancing.
614
doNextBalance :: Table     -- ^ The starting table
615
              -> Int       -- ^ Remaining length
616
              -> Score     -- ^ Score at which to stop
617
              -> Bool      -- ^ The resulting table and commands
618
doNextBalance ini_tbl max_rounds min_score =
619
  let Table _ _ ini_cv ini_plc = ini_tbl
620
      ini_plc_len = length ini_plc
621
  in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
622

    
623
-- | Run a balance move.
624
tryBalance :: Table       -- ^ The starting table
625
           -> Bool        -- ^ Allow disk moves
626
           -> Bool        -- ^ Allow instance moves
627
           -> Bool        -- ^ Only evacuate moves
628
           -> Score       -- ^ Min gain threshold
629
           -> Score       -- ^ Min gain
630
           -> Maybe Table -- ^ The resulting table and commands
631
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
632
    let Table ini_nl ini_il ini_cv _ = ini_tbl
633
        all_inst = Container.elems ini_il
634
        all_nodes = Container.elems ini_nl
635
        (offline_nodes, online_nodes) = partition Node.offline all_nodes
636
        all_inst' = if evac_mode
637
                      then let bad_nodes = map Node.idx offline_nodes
638
                           in filter (any (`elem` bad_nodes) .
639
                                          Instance.allNodes) all_inst
640
                      else all_inst
641
        reloc_inst = filter (\i -> Instance.movable i &&
642
                                   Instance.autoBalance i) all_inst'
643
        node_idx = map Node.idx online_nodes
644
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
645
        (Table _ _ fin_cv _) = fin_tbl
646
    in
647
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
648
      then Just fin_tbl -- this round made success, return the new table
649
      else Nothing
650

    
651
-- * Allocation functions
652

    
653
-- | Build failure stats out of a list of failures.
654
collapseFailures :: [FailMode] -> FailStats
655
collapseFailures flst =
656
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
657
            [minBound..maxBound]
658

    
659
-- | Compares two Maybe AllocElement and chooses the best score.
660
bestAllocElement :: Maybe Node.AllocElement
661
                 -> Maybe Node.AllocElement
662
                 -> Maybe Node.AllocElement
663
bestAllocElement a Nothing = a
664
bestAllocElement Nothing b = b
665
bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
666
  if ascore < bscore then a else b
667

    
668
-- | Update current Allocation solution and failure stats with new
669
-- elements.
670
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
671
concatAllocs as (Bad reason) = as { asFailures = reason : asFailures as }
672

    
673
concatAllocs as (Ok ns) =
674
  let -- Choose the old or new solution, based on the cluster score
675
    cntok = asAllocs as
676
    osols = asSolution as
677
    nsols = bestAllocElement osols (Just ns)
678
    nsuc = cntok + 1
679
    -- Note: we force evaluation of nsols here in order to keep the
680
    -- memory profile low - we know that we will need nsols for sure
681
    -- in the next cycle, so we force evaluation of nsols, since the
682
    -- foldl' in the caller will only evaluate the tuple, but not the
683
    -- elements of the tuple
684
  in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
685

    
686
-- | Sums two 'AllocSolution' structures.
687
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
688
sumAllocs (AllocSolution aFails aAllocs aSols aLog)
689
          (AllocSolution bFails bAllocs bSols bLog) =
690
  -- note: we add b first, since usually it will be smaller; when
691
  -- fold'ing, a will grow and grow whereas b is the per-group
692
  -- result, hence smaller
693
  let nFails  = bFails ++ aFails
694
      nAllocs = aAllocs + bAllocs
695
      nSols   = bestAllocElement aSols bSols
696
      nLog    = bLog ++ aLog
697
  in AllocSolution nFails nAllocs nSols nLog
698

    
699
-- | Given a solution, generates a reasonable description for it.
700
describeSolution :: AllocSolution -> String
701
describeSolution as =
702
  let fcnt = asFailures as
703
      sols = asSolution as
704
      freasons =
705
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
706
        filter ((> 0) . snd) . collapseFailures $ fcnt
707
  in case sols of
708
     Nothing -> "No valid allocation solutions, failure reasons: " ++
709
                (if null fcnt then "unknown reasons" else freasons)
710
     Just (_, _, nodes, cv) ->
711
         printf ("score: %.8f, successes %d, failures %d (%s)" ++
712
                 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
713
               (intercalate "/" . map Node.name $ nodes)
714

    
715
-- | Annotates a solution with the appropriate string.
716
annotateSolution :: AllocSolution -> AllocSolution
717
annotateSolution as = as { asLog = describeSolution as : asLog as }
718

    
719
-- | Reverses an evacuation solution.
720
--
721
-- Rationale: we always concat the results to the top of the lists, so
722
-- for proper jobset execution, we should reverse all lists.
723
reverseEvacSolution :: EvacSolution -> EvacSolution
724
reverseEvacSolution (EvacSolution f m o) =
725
  EvacSolution (reverse f) (reverse m) (reverse o)
726

    
727
-- | Generate the valid node allocation singles or pairs for a new instance.
728
genAllocNodes :: Group.List        -- ^ Group list
729
              -> Node.List         -- ^ The node map
730
              -> Int               -- ^ The number of nodes required
731
              -> Bool              -- ^ Whether to drop or not
732
                                   -- unallocable nodes
733
              -> Result AllocNodes -- ^ The (monadic) result
734
genAllocNodes gl nl count drop_unalloc =
735
  let filter_fn = if drop_unalloc
736
                    then filter (Group.isAllocable .
737
                                 flip Container.find gl . Node.group)
738
                    else id
739
      all_nodes = filter_fn $ getOnline nl
740
      all_pairs = [(Node.idx p,
741
                    [Node.idx s | s <- all_nodes,
742
                                       Node.idx p /= Node.idx s,
743
                                       Node.group p == Node.group s]) |
744
                   p <- all_nodes]
745
  in case count of
746
       1 -> Ok (Left (map Node.idx all_nodes))
747
       2 -> Ok (Right (filter (not . null . snd) all_pairs))
748
       _ -> Bad "Unsupported number of nodes, only one or two  supported"
749

    
750
-- | Try to allocate an instance on the cluster.
751
tryAlloc :: (Monad m) =>
752
            Node.List         -- ^ The node list
753
         -> Instance.List     -- ^ The instance list
754
         -> Instance.Instance -- ^ The instance to allocate
755
         -> AllocNodes        -- ^ The allocation targets
756
         -> m AllocSolution   -- ^ Possible solution list
757
tryAlloc _  _ _    (Right []) = fail "Not enough online nodes"
758
tryAlloc nl _ inst (Right ok_pairs) =
759
  let psols = parMap rwhnf (\(p, ss) ->
760
                              foldl' (\cstate ->
761
                                        concatAllocs cstate .
762
                                        allocateOnPair nl inst p)
763
                              emptyAllocSolution ss) ok_pairs
764
      sols = foldl' sumAllocs emptyAllocSolution psols
765
  in return $ annotateSolution sols
766

    
767
tryAlloc _  _ _    (Left []) = fail "No online nodes"
768
tryAlloc nl _ inst (Left all_nodes) =
769
  let sols = foldl' (\cstate ->
770
                       concatAllocs cstate . allocateOnSingle nl inst
771
                    ) emptyAllocSolution all_nodes
772
  in return $ annotateSolution sols
773

    
774
-- | Given a group/result, describe it as a nice (list of) messages.
775
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
776
solutionDescription gl (groupId, result) =
777
  case result of
778
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
779
    Bad message -> [printf "Group %s: error %s" gname message]
780
  where grp = Container.find groupId gl
781
        gname = Group.name grp
782
        pol = allocPolicyToRaw (Group.allocPolicy grp)
783

    
784
-- | From a list of possibly bad and possibly empty solutions, filter
785
-- only the groups with a valid result. Note that the result will be
786
-- reversed compared to the original list.
787
filterMGResults :: Group.List
788
                -> [(Gdx, Result AllocSolution)]
789
                -> [(Gdx, AllocSolution)]
790
filterMGResults gl = foldl' fn []
791
  where unallocable = not . Group.isAllocable . flip Container.find gl
792
        fn accu (gdx, rasol) =
793
          case rasol of
794
            Bad _ -> accu
795
            Ok sol | isNothing (asSolution sol) -> accu
796
                   | unallocable gdx -> accu
797
                   | otherwise -> (gdx, sol):accu
798

    
799
-- | Sort multigroup results based on policy and score.
800
sortMGResults :: Group.List
801
             -> [(Gdx, AllocSolution)]
802
             -> [(Gdx, AllocSolution)]
803
sortMGResults gl sols =
804
  let extractScore (_, _, _, x) = x
805
      solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
806
                             (extractScore . fromJust . asSolution) sol)
807
  in sortBy (comparing solScore) sols
808

    
809
-- | Finds the best group for an instance on a multi-group cluster.
810
--
811
-- Only solutions in @preferred@ and @last_resort@ groups will be
812
-- accepted as valid, and additionally if the allowed groups parameter
813
-- is not null then allocation will only be run for those group
814
-- indices.
815
findBestAllocGroup :: Group.List           -- ^ The group list
816
                   -> Node.List            -- ^ The node list
817
                   -> Instance.List        -- ^ The instance list
818
                   -> Maybe [Gdx]          -- ^ The allowed groups
819
                   -> Instance.Instance    -- ^ The instance to allocate
820
                   -> Int                  -- ^ Required number of nodes
821
                   -> Result (Gdx, AllocSolution, [String])
822
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
823
  let groups = splitCluster mgnl mgil
824
      groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
825
                allowed_gdxs
826
      sols = map (\(gid, (nl, il)) ->
827
                   (gid, genAllocNodes mggl nl cnt False >>=
828
                       tryAlloc nl il inst))
829
             groups'::[(Gdx, Result AllocSolution)]
830
      all_msgs = concatMap (solutionDescription mggl) sols
831
      goodSols = filterMGResults mggl sols
832
      sortedSols = sortMGResults mggl goodSols
833
  in case sortedSols of
834
       [] -> Bad $ if null groups'
835
                     then "no groups for evacuation: allowed groups was" ++
836
                          show allowed_gdxs ++ ", all groups: " ++
837
                          show (map fst groups)
838
                     else intercalate ", " all_msgs
839
       (final_group, final_sol):_ -> return (final_group, final_sol, all_msgs)
840

    
841
-- | Try to allocate an instance on a multi-group cluster.
842
tryMGAlloc :: Group.List           -- ^ The group list
843
           -> Node.List            -- ^ The node list
844
           -> Instance.List        -- ^ The instance list
845
           -> Instance.Instance    -- ^ The instance to allocate
846
           -> Int                  -- ^ Required number of nodes
847
           -> Result AllocSolution -- ^ Possible solution list
848
tryMGAlloc mggl mgnl mgil inst cnt = do
849
  (best_group, solution, all_msgs) <-
850
      findBestAllocGroup mggl mgnl mgil Nothing inst cnt
851
  let group_name = Group.name $ Container.find best_group mggl
852
      selmsg = "Selected group: " ++ group_name
853
  return $ solution { asLog = selmsg:all_msgs }
854

    
855
-- | Calculate the new instance list after allocation solution.
856
updateIl :: Instance.List           -- ^ The original instance list
857
         -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
858
         -> Instance.List           -- ^ The updated instance list
859
updateIl il Nothing = il
860
updateIl il (Just (_, xi, _, _)) = Container.add (Container.size il) xi il
861

    
862
-- | Extract the the new node list from the allocation solution.
863
extractNl :: Node.List               -- ^ The original node list
864
          -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
865
          -> Node.List               -- ^ The new node list
866
extractNl nl Nothing = nl
867
extractNl _ (Just (xnl, _, _, _)) = xnl
868

    
869
-- | Try to allocate a list of instances on a multi-group cluster.
870
allocList :: Group.List                  -- ^ The group list
871
          -> Node.List                   -- ^ The node list
872
          -> Instance.List               -- ^ The instance list
873
          -> [(Instance.Instance, Int)]  -- ^ The instance to allocate
874
          -> AllocSolutionList           -- ^ Possible solution list
875
          -> Result (Node.List, Instance.List,
876
                     AllocSolutionList)  -- ^ The final solution list
877
allocList _  nl il [] result = Ok (nl, il, result)
878
allocList gl nl il ((xi, xicnt):xies) result = do
879
  ares <- tryMGAlloc gl nl il xi xicnt
880
  let sol = asSolution ares
881
      nl' = extractNl nl sol
882
      il' = updateIl il sol
883
  allocList gl nl' il' xies ((xi, ares):result)
884

    
885
-- | Function which fails if the requested mode is change secondary.
886
--
887
-- This is useful since except DRBD, no other disk template can
888
-- execute change secondary; thus, we can just call this function
889
-- instead of always checking for secondary mode. After the call to
890
-- this function, whatever mode we have is just a primary change.
891
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
892
failOnSecondaryChange ChangeSecondary dt =
893
  fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++
894
         "' can't execute change secondary"
895
failOnSecondaryChange _ _ = return ()
896

    
897
-- | Run evacuation for a single instance.
898
--
899
-- /Note:/ this function should correctly execute both intra-group
900
-- evacuations (in all modes) and inter-group evacuations (in the
901
-- 'ChangeAll' mode). Of course, this requires that the correct list
902
-- of target nodes is passed.
903
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
904
                 -> Instance.List     -- ^ Instance list (cluster-wide)
905
                 -> EvacMode          -- ^ The evacuation mode
906
                 -> Instance.Instance -- ^ The instance to be evacuated
907
                 -> Gdx               -- ^ The group we're targetting
908
                 -> [Ndx]             -- ^ The list of available nodes
909
                                      -- for allocation
910
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
911
nodeEvacInstance nl il mode inst@(Instance.Instance
912
                                  {Instance.diskTemplate = dt@DTDiskless})
913
                 gdx avail_nodes =
914
                   failOnSecondaryChange mode dt >>
915
                   evacOneNodeOnly nl il inst gdx avail_nodes
916

    
917
nodeEvacInstance _ _ _ (Instance.Instance
918
                        {Instance.diskTemplate = DTPlain}) _ _ =
919
                  fail "Instances of type plain cannot be relocated"
920

    
921
nodeEvacInstance _ _ _ (Instance.Instance
922
                        {Instance.diskTemplate = DTFile}) _ _ =
923
                  fail "Instances of type file cannot be relocated"
924

    
925
nodeEvacInstance nl il mode inst@(Instance.Instance
926
                                  {Instance.diskTemplate = dt@DTSharedFile})
927
                 gdx avail_nodes =
928
                   failOnSecondaryChange mode dt >>
929
                   evacOneNodeOnly nl il inst gdx avail_nodes
930

    
931
nodeEvacInstance nl il mode inst@(Instance.Instance
932
                                  {Instance.diskTemplate = dt@DTBlock})
933
                 gdx avail_nodes =
934
                   failOnSecondaryChange mode dt >>
935
                   evacOneNodeOnly nl il inst gdx avail_nodes
936

    
937
nodeEvacInstance nl il mode inst@(Instance.Instance
938
                                  {Instance.diskTemplate = dt@DTRbd})
939
                 gdx avail_nodes =
940
                   failOnSecondaryChange mode dt >>
941
                   evacOneNodeOnly nl il inst gdx avail_nodes
942

    
943
nodeEvacInstance nl il mode inst@(Instance.Instance
944
                                  {Instance.diskTemplate = dt@DTExt})
945
                 gdx avail_nodes =
946
                   failOnSecondaryChange mode dt >>
947
                   evacOneNodeOnly nl il inst gdx avail_nodes
948

    
949
nodeEvacInstance nl il ChangePrimary
950
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
951
                 _ _ =
952
  do
953
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
954
    let idx = Instance.idx inst
955
        il' = Container.add idx inst' il
956
        ops = iMoveToJob nl' il' idx Failover
957
    return (nl', il', ops)
958

    
959
nodeEvacInstance nl il ChangeSecondary
960
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
961
                 gdx avail_nodes =
962
  evacOneNodeOnly nl il inst gdx avail_nodes
963

    
964
-- The algorithm for ChangeAll is as follows:
965
--
966
-- * generate all (primary, secondary) node pairs for the target groups
967
-- * for each pair, execute the needed moves (r:s, f, r:s) and compute
968
--   the final node list state and group score
969
-- * select the best choice via a foldl that uses the same Either
970
--   String solution as the ChangeSecondary mode
971
nodeEvacInstance nl il ChangeAll
972
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
973
                 gdx avail_nodes =
974
  do
975
    let no_nodes = Left "no nodes available"
976
        node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
977
    (nl', il', ops, _) <-
978
        annotateResult "Can't find any good nodes for relocation" .
979
        eitherToResult $
980
        foldl'
981
        (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
982
                          Bad msg ->
983
                              case accu of
984
                                Right _ -> accu
985
                                -- we don't need more details (which
986
                                -- nodes, etc.) as we only selected
987
                                -- this group if we can allocate on
988
                                -- it, hence failures will not
989
                                -- propagate out of this fold loop
990
                                Left _ -> Left $ "Allocation failed: " ++ msg
991
                          Ok result@(_, _, _, new_cv) ->
992
                              let new_accu = Right result in
993
                              case accu of
994
                                Left _ -> new_accu
995
                                Right (_, _, _, old_cv) ->
996
                                    if old_cv < new_cv
997
                                    then accu
998
                                    else new_accu
999
        ) no_nodes node_pairs
1000

    
1001
    return (nl', il', ops)
1002

    
1003
-- | Generic function for changing one node of an instance.
1004
--
1005
-- This is similar to 'nodeEvacInstance' but will be used in a few of
1006
-- its sub-patterns. It folds the inner function 'evacOneNodeInner'
1007
-- over the list of available nodes, which results in the best choice
1008
-- for relocation.
1009
evacOneNodeOnly :: Node.List         -- ^ The node list (cluster-wide)
1010
                -> Instance.List     -- ^ Instance list (cluster-wide)
1011
                -> Instance.Instance -- ^ The instance to be evacuated
1012
                -> Gdx               -- ^ The group we're targetting
1013
                -> [Ndx]             -- ^ The list of available nodes
1014
                                      -- for allocation
1015
                -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1016
evacOneNodeOnly nl il inst gdx avail_nodes = do
1017
  op_fn <- case Instance.mirrorType inst of
1018
             MirrorNone -> Bad "Can't relocate/evacuate non-mirrored instances"
1019
             MirrorInternal -> Ok ReplaceSecondary
1020
             MirrorExternal -> Ok FailoverToAny
1021
  (nl', inst', _, ndx) <- annotateResult "Can't find any good node" .
1022
                          eitherToResult $
1023
                          foldl' (evacOneNodeInner nl inst gdx op_fn)
1024
                          (Left "no nodes available") avail_nodes
1025
  let idx = Instance.idx inst
1026
      il' = Container.add idx inst' il
1027
      ops = iMoveToJob nl' il' idx (op_fn ndx)
1028
  return (nl', il', ops)
1029

    
1030
-- | Inner fold function for changing one node of an instance.
1031
--
1032
-- Depending on the instance disk template, this will either change
1033
-- the secondary (for DRBD) or the primary node (for shared
1034
-- storage). However, the operation is generic otherwise.
1035
--
1036
-- The running solution is either a @Left String@, which means we
1037
-- don't have yet a working solution, or a @Right (...)@, which
1038
-- represents a valid solution; it holds the modified node list, the
1039
-- modified instance (after evacuation), the score of that solution,
1040
-- and the new secondary node index.
1041
evacOneNodeInner :: Node.List         -- ^ Cluster node list
1042
                 -> Instance.Instance -- ^ Instance being evacuated
1043
                 -> Gdx               -- ^ The group index of the instance
1044
                 -> (Ndx -> IMove)    -- ^ Operation constructor
1045
                 -> EvacInnerState    -- ^ Current best solution
1046
                 -> Ndx               -- ^ Node we're evaluating as target
1047
                 -> EvacInnerState    -- ^ New best solution
1048
evacOneNodeInner nl inst gdx op_fn accu ndx =
1049
  case applyMove nl inst (op_fn ndx) of
1050
    Bad fm -> let fail_msg = "Node " ++ Container.nameOf nl ndx ++
1051
                             " failed: " ++ show fm
1052
              in either (const $ Left fail_msg) (const accu) accu
1053
    Ok (nl', inst', _, _) ->
1054
      let nodes = Container.elems nl'
1055
          -- The fromJust below is ugly (it can fail nastily), but
1056
          -- at this point we should have any internal mismatches,
1057
          -- and adding a monad here would be quite involved
1058
          grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1059
          new_cv = compCVNodes grpnodes
1060
          new_accu = Right (nl', inst', new_cv, ndx)
1061
      in case accu of
1062
           Left _ -> new_accu
1063
           Right (_, _, old_cv, _) ->
1064
             if old_cv < new_cv
1065
               then accu
1066
               else new_accu
1067

    
1068
-- | Compute result of changing all nodes of a DRBD instance.
1069
--
1070
-- Given the target primary and secondary node (which might be in a
1071
-- different group or not), this function will 'execute' all the
1072
-- required steps and assuming all operations succceed, will return
1073
-- the modified node and instance lists, the opcodes needed for this
1074
-- and the new group score.
1075
evacDrbdAllInner :: Node.List         -- ^ Cluster node list
1076
                 -> Instance.List     -- ^ Cluster instance list
1077
                 -> Instance.Instance -- ^ The instance to be moved
1078
                 -> Gdx               -- ^ The target group index
1079
                                      -- (which can differ from the
1080
                                      -- current group of the
1081
                                      -- instance)
1082
                 -> (Ndx, Ndx)        -- ^ Tuple of new
1083
                                      -- primary\/secondary nodes
1084
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
1085
evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
1086
  let primary = Container.find (Instance.pNode inst) nl
1087
      idx = Instance.idx inst
1088
  -- if the primary is offline, then we first failover
1089
  (nl1, inst1, ops1) <-
1090
    if Node.offline primary
1091
      then do
1092
        (nl', inst', _, _) <-
1093
          annotateResult "Failing over to the secondary" .
1094
          opToResult $ applyMove nl inst Failover
1095
        return (nl', inst', [Failover])
1096
      else return (nl, inst, [])
1097
  let (o1, o2, o3) = (ReplaceSecondary t_pdx,
1098
                      Failover,
1099
                      ReplaceSecondary t_sdx)
1100
  -- we now need to execute a replace secondary to the future
1101
  -- primary node
1102
  (nl2, inst2, _, _) <-
1103
    annotateResult "Changing secondary to new primary" .
1104
    opToResult $
1105
    applyMove nl1 inst1 o1
1106
  let ops2 = o1:ops1
1107
  -- we now execute another failover, the primary stays fixed now
1108
  (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" .
1109
                        opToResult $ applyMove nl2 inst2 o2
1110
  let ops3 = o2:ops2
1111
  -- and finally another replace secondary, to the final secondary
1112
  (nl4, inst4, _, _) <-
1113
    annotateResult "Changing secondary to final secondary" .
1114
    opToResult $
1115
    applyMove nl3 inst3 o3
1116
  let ops4 = o3:ops3
1117
      il' = Container.add idx inst4 il
1118
      ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1119
  let nodes = Container.elems nl4
1120
      -- The fromJust below is ugly (it can fail nastily), but
1121
      -- at this point we should have any internal mismatches,
1122
      -- and adding a monad here would be quite involved
1123
      grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1124
      new_cv = compCVNodes grpnodes
1125
  return (nl4, il', ops, new_cv)
1126

    
1127
-- | Computes the nodes in a given group which are available for
1128
-- allocation.
1129
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1130
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
1131
                    -> Gdx            -- ^ The group for which we
1132
                                      -- query the nodes
1133
                    -> Result [Ndx]   -- ^ List of available node indices
1134
availableGroupNodes group_nodes excl_ndx gdx = do
1135
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1136
                 Ok (lookup gdx group_nodes)
1137
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1138
  return avail_nodes
1139

    
1140
-- | Updates the evac solution with the results of an instance
1141
-- evacuation.
1142
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1143
                   -> Idx
1144
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1145
                   -> (Node.List, Instance.List, EvacSolution)
1146
updateEvacSolution (nl, il, es) idx (Bad msg) =
1147
  (nl, il, es { esFailed = (idx, msg):esFailed es})
1148
updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1149
  (nl, il, es { esMoved = new_elem:esMoved es
1150
              , esOpCodes = opcodes:esOpCodes es })
1151
    where inst = Container.find idx il
1152
          new_elem = (idx,
1153
                      instancePriGroup nl inst,
1154
                      Instance.allNodes inst)
1155

    
1156
-- | Node-evacuation IAllocator mode main function.
1157
tryNodeEvac :: Group.List    -- ^ The cluster groups
1158
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
1159
            -> Instance.List -- ^ Instance list (cluster-wide)
1160
            -> EvacMode      -- ^ The evacuation mode
1161
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
1162
            -> Result (Node.List, Instance.List, EvacSolution)
1163
tryNodeEvac _ ini_nl ini_il mode idxs =
1164
  let evac_ndx = nodesToEvacuate ini_il mode idxs
1165
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1166
      excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1167
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1168
                                           (Container.elems nl))) $
1169
                  splitCluster ini_nl ini_il
1170
      (fin_nl, fin_il, esol) =
1171
        foldl' (\state@(nl, il, _) inst ->
1172
                  let gdx = instancePriGroup nl inst
1173
                      pdx = Instance.pNode inst in
1174
                  updateEvacSolution state (Instance.idx inst) $
1175
                  availableGroupNodes group_ndx
1176
                    (IntSet.insert pdx excl_ndx) gdx >>=
1177
                      nodeEvacInstance nl il mode inst gdx
1178
               )
1179
        (ini_nl, ini_il, emptyEvacSolution)
1180
        (map (`Container.find` ini_il) idxs)
1181
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1182

    
1183
-- | Change-group IAllocator mode main function.
1184
--
1185
-- This is very similar to 'tryNodeEvac', the only difference is that
1186
-- we don't choose as target group the current instance group, but
1187
-- instead:
1188
--
1189
--   1. at the start of the function, we compute which are the target
1190
--   groups; either no groups were passed in, in which case we choose
1191
--   all groups out of which we don't evacuate instance, or there were
1192
--   some groups passed, in which case we use those
1193
--
1194
--   2. for each instance, we use 'findBestAllocGroup' to choose the
1195
--   best group to hold the instance, and then we do what
1196
--   'tryNodeEvac' does, except for this group instead of the current
1197
--   instance group.
1198
--
1199
-- Note that the correct behaviour of this function relies on the
1200
-- function 'nodeEvacInstance' to be able to do correctly both
1201
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1202
tryChangeGroup :: Group.List    -- ^ The cluster groups
1203
               -> Node.List     -- ^ The node list (cluster-wide)
1204
               -> Instance.List -- ^ Instance list (cluster-wide)
1205
               -> [Gdx]         -- ^ Target groups; if empty, any
1206
                                -- groups not being evacuated
1207
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
1208
               -> Result (Node.List, Instance.List, EvacSolution)
1209
tryChangeGroup gl ini_nl ini_il gdxs idxs =
1210
  let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1211
                             flip Container.find ini_il) idxs
1212
      target_gdxs = (if null gdxs
1213
                       then Container.keys gl
1214
                       else gdxs) \\ evac_gdxs
1215
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1216
      excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1217
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1218
                                           (Container.elems nl))) $
1219
                  splitCluster ini_nl ini_il
1220
      (fin_nl, fin_il, esol) =
1221
        foldl' (\state@(nl, il, _) inst ->
1222
                  let solution = do
1223
                        let ncnt = Instance.requiredNodes $
1224
                                   Instance.diskTemplate inst
1225
                        (gdx, _, _) <- findBestAllocGroup gl nl il
1226
                                       (Just target_gdxs) inst ncnt
1227
                        av_nodes <- availableGroupNodes group_ndx
1228
                                    excl_ndx gdx
1229
                        nodeEvacInstance nl il ChangeAll inst gdx av_nodes
1230
                  in updateEvacSolution state (Instance.idx inst) solution
1231
               )
1232
        (ini_nl, ini_il, emptyEvacSolution)
1233
        (map (`Container.find` ini_il) idxs)
1234
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1235

    
1236
-- | Standard-sized allocation method.
1237
--
1238
-- This places instances of the same size on the cluster until we're
1239
-- out of space. The result will be a list of identically-sized
1240
-- instances.
1241
iterateAlloc :: AllocMethod
1242
iterateAlloc nl il limit newinst allocnodes ixes cstats =
1243
  let depth = length ixes
1244
      newname = printf "new-%d" depth::String
1245
      newidx = Container.size il
1246
      newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1247
      newlimit = fmap (flip (-) 1) limit
1248
  in case tryAlloc nl il newi2 allocnodes of
1249
       Bad s -> Bad s
1250
       Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
1251
         let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1252
         case sols3 of
1253
           Nothing -> newsol
1254
           Just (xnl, xi, _, _) ->
1255
             if limit == Just 0
1256
               then newsol
1257
               else iterateAlloc xnl (Container.add newidx xi il)
1258
                      newlimit newinst allocnodes (xi:ixes)
1259
                      (totalResources xnl:cstats)
1260

    
1261
-- | Tiered allocation method.
1262
--
1263
-- This places instances on the cluster, and decreases the spec until
1264
-- we can allocate again. The result will be a list of decreasing
1265
-- instance specs.
1266
tieredAlloc :: AllocMethod
1267
tieredAlloc nl il limit newinst allocnodes ixes cstats =
1268
  case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1269
    Bad s -> Bad s
1270
    Ok (errs, nl', il', ixes', cstats') ->
1271
      let newsol = Ok (errs, nl', il', ixes', cstats')
1272
          ixes_cnt = length ixes'
1273
          (stop, newlimit) = case limit of
1274
                               Nothing -> (False, Nothing)
1275
                               Just n -> (n <= ixes_cnt,
1276
                                            Just (n - ixes_cnt)) in
1277
      if stop then newsol else
1278
          case Instance.shrinkByType newinst . fst . last $
1279
               sortBy (comparing snd) errs of
1280
            Bad _ -> newsol
1281
            Ok newinst' -> tieredAlloc nl' il' newlimit
1282
                           newinst' allocnodes ixes' cstats'
1283

    
1284
-- * Formatting functions
1285

    
1286
-- | Given the original and final nodes, computes the relocation description.
1287
computeMoves :: Instance.Instance -- ^ The instance to be moved
1288
             -> String -- ^ The instance name
1289
             -> IMove  -- ^ The move being performed
1290
             -> String -- ^ New primary
1291
             -> String -- ^ New secondary
1292
             -> (String, [String])
1293
                -- ^ Tuple of moves and commands list; moves is containing
1294
                -- either @/f/@ for failover or @/r:name/@ for replace
1295
                -- secondary, while the command list holds gnt-instance
1296
                -- commands (without that prefix), e.g \"@failover instance1@\"
1297
computeMoves i inam mv c d =
1298
  case mv of
1299
    Failover -> ("f", [mig])
1300
    FailoverToAny _ -> (printf "fa:%s" c, [mig_any])
1301
    FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1302
    ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1303
    ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1304
    ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1305
  where morf = if Instance.isRunning i then "migrate" else "failover"
1306
        mig = printf "%s -f %s" morf inam::String
1307
        mig_any = printf "%s -f -n %s %s" morf c inam::String
1308
        rep n = printf "replace-disks -n %s %s" n inam::String
1309

    
1310
-- | Converts a placement to string format.
1311
printSolutionLine :: Node.List     -- ^ The node list
1312
                  -> Instance.List -- ^ The instance list
1313
                  -> Int           -- ^ Maximum node name length
1314
                  -> Int           -- ^ Maximum instance name length
1315
                  -> Placement     -- ^ The current placement
1316
                  -> Int           -- ^ The index of the placement in
1317
                                   -- the solution
1318
                  -> (String, [String])
1319
printSolutionLine nl il nmlen imlen plc pos =
1320
  let pmlen = (2*nmlen + 1)
1321
      (i, p, s, mv, c) = plc
1322
      old_sec = Instance.sNode inst
1323
      inst = Container.find i il
1324
      inam = Instance.alias inst
1325
      npri = Node.alias $ Container.find p nl
1326
      nsec = Node.alias $ Container.find s nl
1327
      opri = Node.alias $ Container.find (Instance.pNode inst) nl
1328
      osec = Node.alias $ Container.find old_sec nl
1329
      (moves, cmds) =  computeMoves inst inam mv npri nsec
1330
      -- FIXME: this should check instead/also the disk template
1331
      ostr = if old_sec == Node.noSecondary
1332
               then printf "%s" opri::String
1333
               else printf "%s:%s" opri osec::String
1334
      nstr = if s == Node.noSecondary
1335
               then printf "%s" npri::String
1336
               else printf "%s:%s" npri nsec::String
1337
  in (printf "  %3d. %-*s %-*s => %-*s %12.8f a=%s"
1338
      pos imlen inam pmlen ostr pmlen nstr c moves,
1339
      cmds)
1340

    
1341
-- | Return the instance and involved nodes in an instance move.
1342
--
1343
-- Note that the output list length can vary, and is not required nor
1344
-- guaranteed to be of any specific length.
1345
involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
1346
                               -- the instance from its index; note
1347
                               -- that this /must/ be the original
1348
                               -- instance list, so that we can
1349
                               -- retrieve the old nodes
1350
              -> Placement     -- ^ The placement we're investigating,
1351
                               -- containing the new nodes and
1352
                               -- instance index
1353
              -> [Ndx]         -- ^ Resulting list of node indices
1354
involvedNodes il plc =
1355
  let (i, np, ns, _, _) = plc
1356
      inst = Container.find i il
1357
  in nub $ [np, ns] ++ Instance.allNodes inst
1358

    
1359
-- | Inner function for splitJobs, that either appends the next job to
1360
-- the current jobset, or starts a new jobset.
1361
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1362
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1363
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1364
  | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1365
  | otherwise = ([n]:cjs, ndx)
1366

    
1367
-- | Break a list of moves into independent groups. Note that this
1368
-- will reverse the order of jobs.
1369
splitJobs :: [MoveJob] -> [JobSet]
1370
splitJobs = fst . foldl mergeJobs ([], [])
1371

    
1372
-- | Given a list of commands, prefix them with @gnt-instance@ and
1373
-- also beautify the display a little.
1374
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1375
formatJob jsn jsl (sn, (_, _, _, cmds)) =
1376
  let out =
1377
        printf "  echo job %d/%d" jsn sn:
1378
        printf "  check":
1379
        map ("  gnt-instance " ++) cmds
1380
  in if sn == 1
1381
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1382
       else out
1383

    
1384
-- | Given a list of commands, prefix them with @gnt-instance@ and
1385
-- also beautify the display a little.
1386
formatCmds :: [JobSet] -> String
1387
formatCmds =
1388
  unlines .
1389
  concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1390
                           (zip [1..] js)) .
1391
  zip [1..]
1392

    
1393
-- | Print the node list.
1394
printNodes :: Node.List -> [String] -> String
1395
printNodes nl fs =
1396
  let fields = case fs of
1397
                 [] -> Node.defaultFields
1398
                 "+":rest -> Node.defaultFields ++ rest
1399
                 _ -> fs
1400
      snl = sortBy (comparing Node.idx) (Container.elems nl)
1401
      (header, isnum) = unzip $ map Node.showHeader fields
1402
  in printTable "" header (map (Node.list fields) snl) isnum
1403

    
1404
-- | Print the instance list.
1405
printInsts :: Node.List -> Instance.List -> String
1406
printInsts nl il =
1407
  let sil = sortBy (comparing Instance.idx) (Container.elems il)
1408
      helper inst = [ if Instance.isRunning inst then "R" else " "
1409
                    , Instance.name inst
1410
                    , Container.nameOf nl (Instance.pNode inst)
1411
                    , let sdx = Instance.sNode inst
1412
                      in if sdx == Node.noSecondary
1413
                           then  ""
1414
                           else Container.nameOf nl sdx
1415
                    , if Instance.autoBalance inst then "Y" else "N"
1416
                    , printf "%3d" $ Instance.vcpus inst
1417
                    , printf "%5d" $ Instance.mem inst
1418
                    , printf "%5d" $ Instance.dsk inst `div` 1024
1419
                    , printf "%5.3f" lC
1420
                    , printf "%5.3f" lM
1421
                    , printf "%5.3f" lD
1422
                    , printf "%5.3f" lN
1423
                    ]
1424
          where DynUtil lC lM lD lN = Instance.util inst
1425
      header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1426
               , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1427
      isnum = False:False:False:False:False:repeat True
1428
  in printTable "" header (map helper sil) isnum
1429

    
1430
-- | Shows statistics for a given node list.
1431
printStats :: String -> Node.List -> String
1432
printStats lp nl =
1433
  let dcvs = compDetailedCV $ Container.elems nl
1434
      (weights, names) = unzip detailedCVInfo
1435
      hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1436
      header = [ "Field", "Value", "Weight" ]
1437
      formatted = map (\(w, h, val) ->
1438
                         [ h
1439
                         , printf "%.8f" val
1440
                         , printf "x%.2f" w
1441
                         ]) hd
1442
  in printTable lp header formatted $ False:repeat True
1443

    
1444
-- | Convert a placement into a list of OpCodes (basically a job).
1445
iMoveToJob :: Node.List        -- ^ The node list; only used for node
1446
                               -- names, so any version is good
1447
                               -- (before or after the operation)
1448
           -> Instance.List    -- ^ The instance list; also used for
1449
                               -- names only
1450
           -> Idx              -- ^ The index of the instance being
1451
                               -- moved
1452
           -> IMove            -- ^ The actual move to be described
1453
           -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
1454
                               -- the given move
1455
iMoveToJob nl il idx move =
1456
  let inst = Container.find idx il
1457
      iname = Instance.name inst
1458
      lookNode  n = case mkNonEmpty (Container.nameOf nl n) of
1459
                      -- FIXME: convert htools codebase to non-empty strings
1460
                      Bad msg -> error $ "Empty node name for idx " ++
1461
                                 show n ++ ": " ++ msg ++ "??"
1462
                      Ok ne -> Just ne
1463
      opF = OpCodes.OpInstanceMigrate
1464
              { OpCodes.opInstanceName        = iname
1465
              , OpCodes.opMigrationMode       = Nothing -- default
1466
              , OpCodes.opOldLiveMode         = Nothing -- default as well
1467
              , OpCodes.opTargetNode          = Nothing -- this is drbd
1468
              , OpCodes.opAllowRuntimeChanges = False
1469
              , OpCodes.opIgnoreIpolicy       = False
1470
              , OpCodes.opMigrationCleanup    = False
1471
              , OpCodes.opIallocator          = Nothing
1472
              , OpCodes.opAllowFailover       = True }
1473
      opFA n = opF { OpCodes.opTargetNode = lookNode n } -- not drbd
1474
      opR n = OpCodes.OpInstanceReplaceDisks
1475
                { OpCodes.opInstanceName     = iname
1476
                , OpCodes.opEarlyRelease     = False
1477
                , OpCodes.opIgnoreIpolicy    = False
1478
                , OpCodes.opReplaceDisksMode = OpCodes.ReplaceNewSecondary
1479
                , OpCodes.opReplaceDisksList = []
1480
                , OpCodes.opRemoteNode       = lookNode n
1481
                , OpCodes.opIallocator       = Nothing
1482
                }
1483
  in case move of
1484
       Failover -> [ opF ]
1485
       FailoverToAny np -> [ opFA np ]
1486
       ReplacePrimary np -> [ opF, opR np, opF ]
1487
       ReplaceSecondary ns -> [ opR ns ]
1488
       ReplaceAndFailover np -> [ opR np, opF ]
1489
       FailoverAndReplace ns -> [ opF, opR ns ]
1490

    
1491
-- * Node group functions
1492

    
1493
-- | Computes the group of an instance.
1494
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1495
instanceGroup nl i =
1496
  let sidx = Instance.sNode i
1497
      pnode = Container.find (Instance.pNode i) nl
1498
      snode = if sidx == Node.noSecondary
1499
              then pnode
1500
              else Container.find sidx nl
1501
      pgroup = Node.group pnode
1502
      sgroup = Node.group snode
1503
  in if pgroup /= sgroup
1504
       then fail ("Instance placed accross two node groups, primary " ++
1505
                  show pgroup ++ ", secondary " ++ show sgroup)
1506
       else return pgroup
1507

    
1508
-- | Computes the group of an instance per the primary node.
1509
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1510
instancePriGroup nl i =
1511
  let pnode = Container.find (Instance.pNode i) nl
1512
  in  Node.group pnode
1513

    
1514
-- | Compute the list of badly allocated instances (split across node
1515
-- groups).
1516
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1517
findSplitInstances nl =
1518
  filter (not . isOk . instanceGroup nl) . Container.elems
1519

    
1520
-- | Splits a cluster into the component node groups.
1521
splitCluster :: Node.List -> Instance.List ->
1522
                [(Gdx, (Node.List, Instance.List))]
1523
splitCluster nl il =
1524
  let ngroups = Node.computeGroups (Container.elems nl)
1525
  in map (\(guuid, nodes) ->
1526
           let nidxs = map Node.idx nodes
1527
               nodes' = zip nidxs nodes
1528
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1529
           in (guuid, (Container.fromList nodes', instances))) ngroups
1530

    
1531
-- | Compute the list of nodes that are to be evacuated, given a list
1532
-- of instances and an evacuation mode.
1533
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1534
                -> EvacMode      -- ^ The evacuation mode we're using
1535
                -> [Idx]         -- ^ List of instance indices being evacuated
1536
                -> IntSet.IntSet -- ^ Set of node indices
1537
nodesToEvacuate il mode =
1538
  IntSet.delete Node.noSecondary .
1539
  foldl' (\ns idx ->
1540
            let i = Container.find idx il
1541
                pdx = Instance.pNode i
1542
                sdx = Instance.sNode i
1543
                dt = Instance.diskTemplate i
1544
                withSecondary = case dt of
1545
                                  DTDrbd8 -> IntSet.insert sdx ns
1546
                                  _ -> ns
1547
            in case mode of
1548
                 ChangePrimary   -> IntSet.insert pdx ns
1549
                 ChangeSecondary -> withSecondary
1550
                 ChangeAll       -> IntSet.insert pdx withSecondary
1551
         ) IntSet.empty