Statistics
| Branch: | Tag: | Revision:

root / src / Ganeti / HTools / Cluster.hs @ 5a13489b

History | View | Annotate | Download (68.2 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011, 2012, 2013 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
  (
31
    -- * Types
32
    AllocSolution(..)
33
  , EvacSolution(..)
34
  , Table(..)
35
  , CStats(..)
36
  , AllocNodes
37
  , AllocResult
38
  , AllocMethod
39
  , AllocSolutionList
40
  -- * Generic functions
41
  , totalResources
42
  , computeAllocationDelta
43
  -- * First phase functions
44
  , computeBadItems
45
  -- * Second phase functions
46
  , printSolutionLine
47
  , formatCmds
48
  , involvedNodes
49
  , splitJobs
50
  -- * Display functions
51
  , printNodes
52
  , printInsts
53
  -- * Balacing functions
54
  , checkMove
55
  , doNextBalance
56
  , tryBalance
57
  , compCV
58
  , compCVNodes
59
  , compDetailedCV
60
  , printStats
61
  , iMoveToJob
62
  -- * IAllocator functions
63
  , genAllocNodes
64
  , tryAlloc
65
  , tryMGAlloc
66
  , tryNodeEvac
67
  , tryChangeGroup
68
  , collapseFailures
69
  , allocList
70
  -- * Allocation functions
71
  , iterateAlloc
72
  , tieredAlloc
73
  -- * Node group functions
74
  , instanceGroup
75
  , findSplitInstances
76
  , splitCluster
77
  ) where
78

    
79
import qualified Data.IntSet as IntSet
80
import Data.List
81
import Data.Maybe (fromJust, isNothing)
82
import Data.Ord (comparing)
83
import Text.Printf (printf)
84

    
85
import Ganeti.BasicTypes
86
import qualified Ganeti.HTools.Container as Container
87
import qualified Ganeti.HTools.Instance as Instance
88
import qualified Ganeti.HTools.Nic as Nic
89
import qualified Ganeti.HTools.Node as Node
90
import qualified Ganeti.HTools.Group as Group
91
import Ganeti.HTools.Types
92
import Ganeti.Compat
93
import qualified Ganeti.OpCodes as OpCodes
94
import Ganeti.Utils
95
import Ganeti.Types (mkNonEmpty)
96

    
97
-- * Types
98

    
99
-- | Allocation\/relocation solution.
100
data AllocSolution = AllocSolution
101
  { asFailures :: [FailMode]              -- ^ Failure counts
102
  , asAllocs   :: Int                     -- ^ Good allocation count
103
  , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
104
  , asLog      :: [String]                -- ^ Informational messages
105
  }
106

    
107
-- | Node evacuation/group change iallocator result type. This result
108
-- type consists of actual opcodes (a restricted subset) that are
109
-- transmitted back to Ganeti.
110
data EvacSolution = EvacSolution
111
  { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
112
  , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
113
                                      -- relocated
114
  , esOpCodes :: [[OpCodes.OpCode]]   -- ^ List of jobs
115
  } deriving (Show)
116

    
117
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
118
type AllocResult = (FailStats, Node.List, Instance.List,
119
                    [Instance.Instance], [CStats])
120

    
121
-- | Type alias for easier handling.
122
type AllocSolutionList = [(Instance.Instance, AllocSolution)]
123

    
124
-- | A type denoting the valid allocation mode/pairs.
125
--
126
-- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
127
-- for a two-node allocation, this will be a @Right [('Ndx',
128
-- ['Ndx'])]@. In the latter case, the list is basically an
129
-- association list, grouped by primary node and holding the potential
130
-- secondary nodes in the sub-list.
131
type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
132

    
133
-- | The empty solution we start with when computing allocations.
134
emptyAllocSolution :: AllocSolution
135
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
136
                                   , asSolution = Nothing, asLog = [] }
137

    
138
-- | The empty evac solution.
139
emptyEvacSolution :: EvacSolution
140
emptyEvacSolution = EvacSolution { esMoved = []
141
                                 , esFailed = []
142
                                 , esOpCodes = []
143
                                 }
144

    
145
-- | The complete state for the balancing solution.
146
data Table = Table Node.List Instance.List Score [Placement]
147
             deriving (Show)
148

    
149
-- | Cluster statistics data type.
150
data CStats = CStats
151
  { csFmem :: Integer -- ^ Cluster free mem
152
  , csFdsk :: Integer -- ^ Cluster free disk
153
  , csAmem :: Integer -- ^ Cluster allocatable mem
154
  , csAdsk :: Integer -- ^ Cluster allocatable disk
155
  , csAcpu :: Integer -- ^ Cluster allocatable cpus
156
  , csMmem :: Integer -- ^ Max node allocatable mem
157
  , csMdsk :: Integer -- ^ Max node allocatable disk
158
  , csMcpu :: Integer -- ^ Max node allocatable cpu
159
  , csImem :: Integer -- ^ Instance used mem
160
  , csIdsk :: Integer -- ^ Instance used disk
161
  , csIcpu :: Integer -- ^ Instance used cpu
162
  , csTmem :: Double  -- ^ Cluster total mem
163
  , csTdsk :: Double  -- ^ Cluster total disk
164
  , csTcpu :: Double  -- ^ Cluster total cpus
165
  , csVcpu :: Integer -- ^ Cluster total virtual cpus
166
  , csNcpu :: Double  -- ^ Equivalent to 'csIcpu' but in terms of
167
                      -- physical CPUs, i.e. normalised used phys CPUs
168
  , csXmem :: Integer -- ^ Unnacounted for mem
169
  , csNmem :: Integer -- ^ Node own memory
170
  , csScore :: Score  -- ^ The cluster score
171
  , csNinst :: Int    -- ^ The total number of instances
172
  } deriving (Show)
173

    
174
-- | A simple type for allocation functions.
175
type AllocMethod =  Node.List           -- ^ Node list
176
                 -> Instance.List       -- ^ Instance list
177
                 -> Maybe Int           -- ^ Optional allocation limit
178
                 -> Instance.Instance   -- ^ Instance spec for allocation
179
                 -> AllocNodes          -- ^ Which nodes we should allocate on
180
                 -> [Instance.Instance] -- ^ Allocated instances
181
                 -> [CStats]            -- ^ Running cluster stats
182
                 -> Result AllocResult  -- ^ Allocation result
183

    
184
-- | A simple type for the running solution of evacuations.
185
type EvacInnerState =
186
  Either String (Node.List, Instance.Instance, Score, Ndx)
187

    
188
-- * Utility functions
189

    
190
-- | Verifies the N+1 status and return the affected nodes.
191
verifyN1 :: [Node.Node] -> [Node.Node]
192
verifyN1 = filter Node.failN1
193

    
194
{-| Computes the pair of bad nodes and instances.
195

    
196
The bad node list is computed via a simple 'verifyN1' check, and the
197
bad instance list is the list of primary and secondary instances of
198
those nodes.
199

    
200
-}
201
computeBadItems :: Node.List -> Instance.List ->
202
                   ([Node.Node], [Instance.Instance])
203
computeBadItems nl il =
204
  let bad_nodes = verifyN1 $ getOnline nl
205
      bad_instances = map (`Container.find` il) .
206
                      sort . nub $
207
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
208
  in
209
    (bad_nodes, bad_instances)
210

    
211
-- | Extracts the node pairs for an instance. This can fail if the
212
-- instance is single-homed. FIXME: this needs to be improved,
213
-- together with the general enhancement for handling non-DRBD moves.
214
instanceNodes :: Node.List -> Instance.Instance ->
215
                 (Ndx, Ndx, Node.Node, Node.Node)
216
instanceNodes nl inst =
217
  let old_pdx = Instance.pNode inst
218
      old_sdx = Instance.sNode inst
219
      old_p = Container.find old_pdx nl
220
      old_s = Container.find old_sdx nl
221
  in (old_pdx, old_sdx, old_p, old_s)
222

    
223
-- | Zero-initializer for the CStats type.
224
emptyCStats :: CStats
225
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
226

    
227
-- | Update stats with data from a new node.
228
updateCStats :: CStats -> Node.Node -> CStats
229
updateCStats cs node =
230
  let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
231
               csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
232
               csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
233
               csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
234
               csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
235
               csVcpu = x_vcpu, csNcpu = x_ncpu,
236
               csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
237
             }
238
        = cs
239
      inc_amem = Node.fMem node - Node.rMem node
240
      inc_amem' = if inc_amem > 0 then inc_amem else 0
241
      inc_adsk = Node.availDisk node
242
      inc_imem = truncate (Node.tMem node) - Node.nMem node
243
                 - Node.xMem node - Node.fMem node
244
      inc_icpu = Node.uCpu node
245
      inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
246
      inc_vcpu = Node.hiCpu node
247
      inc_acpu = Node.availCpu node
248
      inc_ncpu = fromIntegral (Node.uCpu node) /
249
                 iPolicyVcpuRatio (Node.iPolicy node)
250
  in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
251
        , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
252
        , csAmem = x_amem + fromIntegral inc_amem'
253
        , csAdsk = x_adsk + fromIntegral inc_adsk
254
        , csAcpu = x_acpu + fromIntegral inc_acpu
255
        , csMmem = max x_mmem (fromIntegral inc_amem')
256
        , csMdsk = max x_mdsk (fromIntegral inc_adsk)
257
        , csMcpu = max x_mcpu (fromIntegral inc_acpu)
258
        , csImem = x_imem + fromIntegral inc_imem
259
        , csIdsk = x_idsk + fromIntegral inc_idsk
260
        , csIcpu = x_icpu + fromIntegral inc_icpu
261
        , csTmem = x_tmem + Node.tMem node
262
        , csTdsk = x_tdsk + Node.tDsk node
263
        , csTcpu = x_tcpu + Node.tCpu node
264
        , csVcpu = x_vcpu + fromIntegral inc_vcpu
265
        , csNcpu = x_ncpu + inc_ncpu
266
        , csXmem = x_xmem + fromIntegral (Node.xMem node)
267
        , csNmem = x_nmem + fromIntegral (Node.nMem node)
268
        , csNinst = x_ninst + length (Node.pList node)
269
        }
270

    
271
-- | Compute the total free disk and memory in the cluster.
272
totalResources :: Node.List -> CStats
273
totalResources nl =
274
  let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
275
  in cs { csScore = compCV nl }
276

    
277
-- | Compute the delta between two cluster state.
278
--
279
-- This is used when doing allocations, to understand better the
280
-- available cluster resources. The return value is a triple of the
281
-- current used values, the delta that was still allocated, and what
282
-- was left unallocated.
283
computeAllocationDelta :: CStats -> CStats -> AllocStats
284
computeAllocationDelta cini cfin =
285
  let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu,
286
              csNcpu = i_ncpu } = cini
287
      CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
288
              csTmem = t_mem, csTdsk = t_dsk, csVcpu = f_vcpu,
289
              csNcpu = f_ncpu, csTcpu = f_tcpu } = cfin
290
      rini = AllocInfo { allocInfoVCpus = fromIntegral i_icpu
291
                       , allocInfoNCpus = i_ncpu
292
                       , allocInfoMem   = fromIntegral i_imem
293
                       , allocInfoDisk  = fromIntegral i_idsk
294
                       }
295
      rfin = AllocInfo { allocInfoVCpus = fromIntegral (f_icpu - i_icpu)
296
                       , allocInfoNCpus = f_ncpu - i_ncpu
297
                       , allocInfoMem   = fromIntegral (f_imem - i_imem)
298
                       , allocInfoDisk  = fromIntegral (f_idsk - i_idsk)
299
                       }
300
      runa = AllocInfo { allocInfoVCpus = fromIntegral (f_vcpu - f_icpu)
301
                       , allocInfoNCpus = f_tcpu - f_ncpu
302
                       , allocInfoMem   = truncate t_mem - fromIntegral f_imem
303
                       , allocInfoDisk  = truncate t_dsk - fromIntegral f_idsk
304
                       }
305
  in (rini, rfin, runa)
306

    
307
-- | The names and weights of the individual elements in the CV list.
308
detailedCVInfo :: [(Double, String)]
309
detailedCVInfo = [ (1,  "free_mem_cv")
310
                 , (1,  "free_disk_cv")
311
                 , (1,  "n1_cnt")
312
                 , (1,  "reserved_mem_cv")
313
                 , (4,  "offline_all_cnt")
314
                 , (16, "offline_pri_cnt")
315
                 , (1,  "vcpu_ratio_cv")
316
                 , (1,  "cpu_load_cv")
317
                 , (1,  "mem_load_cv")
318
                 , (1,  "disk_load_cv")
319
                 , (1,  "net_load_cv")
320
                 , (2,  "pri_tags_score")
321
                 , (1,  "spindles_cv")
322
                 ]
323

    
324
-- | Holds the weights used by 'compCVNodes' for each metric.
325
detailedCVWeights :: [Double]
326
detailedCVWeights = map fst detailedCVInfo
327

    
328
-- | Compute the mem and disk covariance.
329
compDetailedCV :: [Node.Node] -> [Double]
330
compDetailedCV all_nodes =
331
  let (offline, nodes) = partition Node.offline all_nodes
332
      mem_l = map Node.pMem nodes
333
      dsk_l = map Node.pDsk nodes
334
      -- metric: memory covariance
335
      mem_cv = stdDev mem_l
336
      -- metric: disk covariance
337
      dsk_cv = stdDev dsk_l
338
      -- metric: count of instances living on N1 failing nodes
339
      n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
340
                                                 length (Node.pList n)) .
341
                 filter Node.failN1 $ nodes :: Double
342
      res_l = map Node.pRem nodes
343
      -- metric: reserved memory covariance
344
      res_cv = stdDev res_l
345
      -- offline instances metrics
346
      offline_ipri = sum . map (length . Node.pList) $ offline
347
      offline_isec = sum . map (length . Node.sList) $ offline
348
      -- metric: count of instances on offline nodes
349
      off_score = fromIntegral (offline_ipri + offline_isec)::Double
350
      -- metric: count of primary instances on offline nodes (this
351
      -- helps with evacuation/failover of primary instances on
352
      -- 2-node clusters with one node offline)
353
      off_pri_score = fromIntegral offline_ipri::Double
354
      cpu_l = map Node.pCpu nodes
355
      -- metric: covariance of vcpu/pcpu ratio
356
      cpu_cv = stdDev cpu_l
357
      -- metrics: covariance of cpu, memory, disk and network load
358
      (c_load, m_load, d_load, n_load) =
359
        unzip4 $ map (\n ->
360
                      let DynUtil c1 m1 d1 n1 = Node.utilLoad n
361
                          DynUtil c2 m2 d2 n2 = Node.utilPool n
362
                      in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes
363
      -- metric: conflicting instance count
364
      pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
365
      pri_tags_score = fromIntegral pri_tags_inst::Double
366
      -- metric: spindles %
367
      spindles_cv = map (\n -> Node.instSpindles n / Node.hiSpindles n) nodes
368
  in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
369
     , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
370
     , pri_tags_score, stdDev spindles_cv ]
371

    
372
-- | Compute the /total/ variance.
373
compCVNodes :: [Node.Node] -> Double
374
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
375

    
376
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
377
compCV :: Node.List -> Double
378
compCV = compCVNodes . Container.elems
379

    
380
-- | Compute online nodes from a 'Node.List'.
381
getOnline :: Node.List -> [Node.Node]
382
getOnline = filter (not . Node.offline) . Container.elems
383

    
384
-- * Balancing functions
385

    
386
-- | Compute best table. Note that the ordering of the arguments is important.
387
compareTables :: Table -> Table -> Table
388
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
389
  if a_cv > b_cv then b else a
390

    
391
-- | Applies an instance move to a given node list and instance.
392
applyMove :: Node.List -> Instance.Instance
393
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
394
-- Failover (f)
395
applyMove nl inst Failover =
396
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
397
      int_p = Node.removePri old_p inst
398
      int_s = Node.removeSec old_s inst
399
      new_nl = do -- Maybe monad
400
        new_p <- Node.addPriEx (Node.offline old_p) int_s inst
401
        new_s <- Node.addSec int_p inst old_sdx
402
        let new_inst = Instance.setBoth inst old_sdx old_pdx
403
        return (Container.addTwo old_pdx new_s old_sdx new_p nl,
404
                new_inst, old_sdx, old_pdx)
405
  in new_nl
406

    
407
-- Failover to any (fa)
408
applyMove nl inst (FailoverToAny new_pdx) = do
409
  let (old_pdx, old_sdx, old_pnode, _) = instanceNodes nl inst
410
      new_pnode = Container.find new_pdx nl
411
      force_failover = Node.offline old_pnode
412
  new_pnode' <- Node.addPriEx force_failover new_pnode inst
413
  let old_pnode' = Node.removePri old_pnode inst
414
      inst' = Instance.setPri inst new_pdx
415
      nl' = Container.addTwo old_pdx old_pnode' new_pdx new_pnode' nl
416
  return (nl', inst', new_pdx, old_sdx)
417

    
418
-- Replace the primary (f:, r:np, f)
419
applyMove nl inst (ReplacePrimary new_pdx) =
420
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
421
      tgt_n = Container.find new_pdx nl
422
      int_p = Node.removePri old_p inst
423
      int_s = Node.removeSec old_s inst
424
      force_p = Node.offline old_p
425
      new_nl = do -- Maybe monad
426
                  -- check that the current secondary can host the instance
427
                  -- during the migration
428
        tmp_s <- Node.addPriEx force_p int_s inst
429
        let tmp_s' = Node.removePri tmp_s inst
430
        new_p <- Node.addPriEx force_p tgt_n inst
431
        new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
432
        let new_inst = Instance.setPri inst new_pdx
433
        return (Container.add new_pdx new_p $
434
                Container.addTwo old_pdx int_p old_sdx new_s nl,
435
                new_inst, new_pdx, old_sdx)
436
  in new_nl
437

    
438
-- Replace the secondary (r:ns)
439
applyMove nl inst (ReplaceSecondary new_sdx) =
440
  let old_pdx = Instance.pNode inst
441
      old_sdx = Instance.sNode inst
442
      old_s = Container.find old_sdx nl
443
      tgt_n = Container.find new_sdx nl
444
      int_s = Node.removeSec old_s inst
445
      force_s = Node.offline old_s
446
      new_inst = Instance.setSec inst new_sdx
447
      new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
448
               \new_s -> return (Container.addTwo new_sdx
449
                                 new_s old_sdx int_s nl,
450
                                 new_inst, old_pdx, new_sdx)
451
  in new_nl
452

    
453
-- Replace the secondary and failover (r:np, f)
454
applyMove nl inst (ReplaceAndFailover new_pdx) =
455
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
456
      tgt_n = Container.find new_pdx nl
457
      int_p = Node.removePri old_p inst
458
      int_s = Node.removeSec old_s inst
459
      force_s = Node.offline old_s
460
      new_nl = do -- Maybe monad
461
        new_p <- Node.addPri tgt_n inst
462
        new_s <- Node.addSecEx force_s int_p inst new_pdx
463
        let new_inst = Instance.setBoth inst new_pdx old_pdx
464
        return (Container.add new_pdx new_p $
465
                Container.addTwo old_pdx new_s old_sdx int_s nl,
466
                new_inst, new_pdx, old_pdx)
467
  in new_nl
468

    
469
-- Failver and replace the secondary (f, r:ns)
470
applyMove nl inst (FailoverAndReplace new_sdx) =
471
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
472
      tgt_n = Container.find new_sdx nl
473
      int_p = Node.removePri old_p inst
474
      int_s = Node.removeSec old_s inst
475
      force_p = Node.offline old_p
476
      new_nl = do -- Maybe monad
477
        new_p <- Node.addPriEx force_p int_s inst
478
        new_s <- Node.addSecEx force_p tgt_n inst old_sdx
479
        let new_inst = Instance.setBoth inst old_sdx new_sdx
480
        return (Container.add new_sdx new_s $
481
                Container.addTwo old_sdx new_p old_pdx int_p nl,
482
                new_inst, old_sdx, new_sdx)
483
  in new_nl
484

    
485
-- | Tries to allocate an instance on one given node.
486
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
487
                 -> OpResult Node.AllocElement
488
allocateOnSingle nl inst new_pdx =
489
  let p = Container.find new_pdx nl
490
      new_inst = Instance.setBoth inst new_pdx Node.noSecondary
491
  in do
492
    Instance.instMatchesPolicy inst (Node.iPolicy p) (Node.exclStorage p)
493
    new_p <- Node.addPri p inst
494
    let new_nl = Container.add new_pdx new_p nl
495
        new_score = compCV new_nl
496
    return (new_nl, new_inst, [new_p], new_score)
497

    
498
-- | Tries to allocate an instance on a given pair of nodes.
499
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
500
               -> OpResult Node.AllocElement
501
allocateOnPair nl inst new_pdx new_sdx =
502
  let tgt_p = Container.find new_pdx nl
503
      tgt_s = Container.find new_sdx nl
504
  in do
505
    Instance.instMatchesPolicy inst (Node.iPolicy tgt_p)
506
      (Node.exclStorage tgt_p)
507
    new_p <- Node.addPri tgt_p inst
508
    new_s <- Node.addSec tgt_s inst new_pdx
509
    let new_inst = Instance.setBoth inst new_pdx new_sdx
510
        new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
511
    return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
512

    
513
-- | Tries to perform an instance move and returns the best table
514
-- between the original one and the new one.
515
checkSingleStep :: Table -- ^ The original table
516
                -> Instance.Instance -- ^ The instance to move
517
                -> Table -- ^ The current best table
518
                -> IMove -- ^ The move to apply
519
                -> Table -- ^ The final best table
520
checkSingleStep ini_tbl target cur_tbl move =
521
  let Table ini_nl ini_il _ ini_plc = ini_tbl
522
      tmp_resu = applyMove ini_nl target move
523
  in case tmp_resu of
524
       Bad _ -> cur_tbl
525
       Ok (upd_nl, new_inst, pri_idx, sec_idx) ->
526
         let tgt_idx = Instance.idx target
527
             upd_cvar = compCV upd_nl
528
             upd_il = Container.add tgt_idx new_inst ini_il
529
             upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
530
             upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
531
         in compareTables cur_tbl upd_tbl
532

    
533
-- | Given the status of the current secondary as a valid new node and
534
-- the current candidate target node, generate the possible moves for
535
-- a instance.
536
possibleMoves :: MirrorType -- ^ The mirroring type of the instance
537
              -> Bool       -- ^ Whether the secondary node is a valid new node
538
              -> Bool       -- ^ Whether we can change the primary node
539
              -> Ndx        -- ^ Target node candidate
540
              -> [IMove]    -- ^ List of valid result moves
541

    
542
possibleMoves MirrorNone _ _ _ = []
543

    
544
possibleMoves MirrorExternal _ False _ = []
545

    
546
possibleMoves MirrorExternal _ True tdx =
547
  [ FailoverToAny tdx ]
548

    
549
possibleMoves MirrorInternal _ False tdx =
550
  [ ReplaceSecondary tdx ]
551

    
552
possibleMoves MirrorInternal True True tdx =
553
  [ ReplaceSecondary tdx
554
  , ReplaceAndFailover tdx
555
  , ReplacePrimary tdx
556
  , FailoverAndReplace tdx
557
  ]
558

    
559
possibleMoves MirrorInternal False True tdx =
560
  [ ReplaceSecondary tdx
561
  , ReplaceAndFailover tdx
562
  ]
563

    
564
-- | Compute the best move for a given instance.
565
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
566
                  -> Bool              -- ^ Whether disk moves are allowed
567
                  -> Bool              -- ^ Whether instance moves are allowed
568
                  -> Table             -- ^ Original table
569
                  -> Instance.Instance -- ^ Instance to move
570
                  -> Table             -- ^ Best new table for this instance
571
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
572
  let opdx = Instance.pNode target
573
      osdx = Instance.sNode target
574
      bad_nodes = [opdx, osdx]
575
      nodes = filter (`notElem` bad_nodes) nodes_idx
576
      mir_type = Instance.mirrorType target
577
      use_secondary = elem osdx nodes_idx && inst_moves
578
      aft_failover = if mir_type == MirrorInternal && use_secondary
579
                       -- if drbd and allowed to failover
580
                       then checkSingleStep ini_tbl target ini_tbl Failover
581
                       else ini_tbl
582
      all_moves =
583
        if disk_moves
584
          then concatMap (possibleMoves mir_type use_secondary inst_moves)
585
               nodes
586
          else []
587
    in
588
      -- iterate over the possible nodes for this instance
589
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
590

    
591
-- | Compute the best next move.
592
checkMove :: [Ndx]               -- ^ Allowed target node indices
593
          -> Bool                -- ^ Whether disk moves are allowed
594
          -> Bool                -- ^ Whether instance moves are allowed
595
          -> Table               -- ^ The current solution
596
          -> [Instance.Instance] -- ^ List of instances still to move
597
          -> Table               -- ^ The new solution
598
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
599
  let Table _ _ _ ini_plc = ini_tbl
600
      -- we're using rwhnf from the Control.Parallel.Strategies
601
      -- package; we don't need to use rnf as that would force too
602
      -- much evaluation in single-threaded cases, and in
603
      -- multi-threaded case the weak head normal form is enough to
604
      -- spark the evaluation
605
      tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
606
                             inst_moves ini_tbl)
607
               victims
608
      -- iterate over all instances, computing the best move
609
      best_tbl = foldl' compareTables ini_tbl tables
610
      Table _ _ _ best_plc = best_tbl
611
  in if length best_plc == length ini_plc
612
       then ini_tbl -- no advancement
613
       else best_tbl
614

    
615
-- | Check if we are allowed to go deeper in the balancing.
616
doNextBalance :: Table     -- ^ The starting table
617
              -> Int       -- ^ Remaining length
618
              -> Score     -- ^ Score at which to stop
619
              -> Bool      -- ^ The resulting table and commands
620
doNextBalance ini_tbl max_rounds min_score =
621
  let Table _ _ ini_cv ini_plc = ini_tbl
622
      ini_plc_len = length ini_plc
623
  in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
624

    
625
-- | Run a balance move.
626
tryBalance :: Table       -- ^ The starting table
627
           -> Bool        -- ^ Allow disk moves
628
           -> Bool        -- ^ Allow instance moves
629
           -> Bool        -- ^ Only evacuate moves
630
           -> Score       -- ^ Min gain threshold
631
           -> Score       -- ^ Min gain
632
           -> Maybe Table -- ^ The resulting table and commands
633
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
634
    let Table ini_nl ini_il ini_cv _ = ini_tbl
635
        all_inst = Container.elems ini_il
636
        all_nodes = Container.elems ini_nl
637
        (offline_nodes, online_nodes) = partition Node.offline all_nodes
638
        all_inst' = if evac_mode
639
                      then let bad_nodes = map Node.idx offline_nodes
640
                           in filter (any (`elem` bad_nodes) .
641
                                          Instance.allNodes) all_inst
642
                      else all_inst
643
        reloc_inst = filter (\i -> Instance.movable i &&
644
                                   Instance.autoBalance i) all_inst'
645
        node_idx = map Node.idx online_nodes
646
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
647
        (Table _ _ fin_cv _) = fin_tbl
648
    in
649
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
650
      then Just fin_tbl -- this round made success, return the new table
651
      else Nothing
652

    
653
-- * Allocation functions
654

    
655
-- | Build failure stats out of a list of failures.
656
collapseFailures :: [FailMode] -> FailStats
657
collapseFailures flst =
658
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
659
            [minBound..maxBound]
660

    
661
-- | Compares two Maybe AllocElement and chooses the best score.
662
bestAllocElement :: Maybe Node.AllocElement
663
                 -> Maybe Node.AllocElement
664
                 -> Maybe Node.AllocElement
665
bestAllocElement a Nothing = a
666
bestAllocElement Nothing b = b
667
bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
668
  if ascore < bscore then a else b
669

    
670
-- | Update current Allocation solution and failure stats with new
671
-- elements.
672
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
673
concatAllocs as (Bad reason) = as { asFailures = reason : asFailures as }
674

    
675
concatAllocs as (Ok ns) =
676
  let -- Choose the old or new solution, based on the cluster score
677
    cntok = asAllocs as
678
    osols = asSolution as
679
    nsols = bestAllocElement osols (Just ns)
680
    nsuc = cntok + 1
681
    -- Note: we force evaluation of nsols here in order to keep the
682
    -- memory profile low - we know that we will need nsols for sure
683
    -- in the next cycle, so we force evaluation of nsols, since the
684
    -- foldl' in the caller will only evaluate the tuple, but not the
685
    -- elements of the tuple
686
  in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
687

    
688
-- | Sums two 'AllocSolution' structures.
689
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
690
sumAllocs (AllocSolution aFails aAllocs aSols aLog)
691
          (AllocSolution bFails bAllocs bSols bLog) =
692
  -- note: we add b first, since usually it will be smaller; when
693
  -- fold'ing, a will grow and grow whereas b is the per-group
694
  -- result, hence smaller
695
  let nFails  = bFails ++ aFails
696
      nAllocs = aAllocs + bAllocs
697
      nSols   = bestAllocElement aSols bSols
698
      nLog    = bLog ++ aLog
699
  in AllocSolution nFails nAllocs nSols nLog
700

    
701
-- | Given a solution, generates a reasonable description for it.
702
describeSolution :: AllocSolution -> String
703
describeSolution as =
704
  let fcnt = asFailures as
705
      sols = asSolution as
706
      freasons =
707
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
708
        filter ((> 0) . snd) . collapseFailures $ fcnt
709
  in case sols of
710
     Nothing -> "No valid allocation solutions, failure reasons: " ++
711
                (if null fcnt then "unknown reasons" else freasons)
712
     Just (_, _, nodes, cv) ->
713
         printf ("score: %.8f, successes %d, failures %d (%s)" ++
714
                 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
715
               (intercalate "/" . map Node.name $ nodes)
716

    
717
-- | Annotates a solution with the appropriate string.
718
annotateSolution :: AllocSolution -> AllocSolution
719
annotateSolution as = as { asLog = describeSolution as : asLog as }
720

    
721
-- | Reverses an evacuation solution.
722
--
723
-- Rationale: we always concat the results to the top of the lists, so
724
-- for proper jobset execution, we should reverse all lists.
725
reverseEvacSolution :: EvacSolution -> EvacSolution
726
reverseEvacSolution (EvacSolution f m o) =
727
  EvacSolution (reverse f) (reverse m) (reverse o)
728

    
729
-- | Generate the valid node allocation singles or pairs for a new instance.
730
genAllocNodes :: Group.List        -- ^ Group list
731
              -> Node.List         -- ^ The node map
732
              -> Int               -- ^ The number of nodes required
733
              -> Bool              -- ^ Whether to drop or not
734
                                   -- unallocable nodes
735
              -> Result AllocNodes -- ^ The (monadic) result
736
genAllocNodes gl nl count drop_unalloc =
737
  let filter_fn = if drop_unalloc
738
                    then filter (Group.isAllocable .
739
                                 flip Container.find gl . Node.group)
740
                    else id
741
      all_nodes = filter_fn $ getOnline nl
742
      all_pairs = [(Node.idx p,
743
                    [Node.idx s | s <- all_nodes,
744
                                       Node.idx p /= Node.idx s,
745
                                       Node.group p == Node.group s]) |
746
                   p <- all_nodes]
747
  in case count of
748
       1 -> Ok (Left (map Node.idx all_nodes))
749
       2 -> Ok (Right (filter (not . null . snd) all_pairs))
750
       _ -> Bad "Unsupported number of nodes, only one or two  supported"
751

    
752
-- | Try to allocate an instance on the cluster.
753
tryAlloc :: (Monad m) =>
754
            Node.List         -- ^ The node list
755
         -> Instance.List     -- ^ The instance list
756
         -> Instance.Instance -- ^ The instance to allocate
757
         -> AllocNodes        -- ^ The allocation targets
758
         -> m AllocSolution   -- ^ Possible solution list
759
tryAlloc _  _ _    (Right []) = fail "Not enough online nodes"
760
tryAlloc nl _ inst (Right ok_pairs) =
761
  let psols = parMap rwhnf (\(p, ss) ->
762
                              foldl' (\cstate ->
763
                                        concatAllocs cstate .
764
                                        allocateOnPair nl inst p)
765
                              emptyAllocSolution ss) ok_pairs
766
      sols = foldl' sumAllocs emptyAllocSolution psols
767
  in return $ annotateSolution sols
768

    
769
tryAlloc _  _ _    (Left []) = fail "No online nodes"
770
tryAlloc nl _ inst (Left all_nodes) =
771
  let sols = foldl' (\cstate ->
772
                       concatAllocs cstate . allocateOnSingle nl inst
773
                    ) emptyAllocSolution all_nodes
774
  in return $ annotateSolution sols
775

    
776
-- | Given a group/result, describe it as a nice (list of) messages.
777
solutionDescription :: (Group.Group, Result AllocSolution)
778
                    -> [String]
779
solutionDescription (grp, result) =
780
  case result of
781
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
782
    Bad message -> [printf "Group %s: error %s" gname message]
783
  where gname = Group.name grp
784
        pol = allocPolicyToRaw (Group.allocPolicy grp)
785

    
786
-- | From a list of possibly bad and possibly empty solutions, filter
787
-- only the groups with a valid result. Note that the result will be
788
-- reversed compared to the original list.
789
filterMGResults :: [(Group.Group, Result AllocSolution)]
790
                -> [(Group.Group, AllocSolution)]
791
filterMGResults = foldl' fn []
792
  where unallocable = not . Group.isAllocable
793
        fn accu (grp, rasol) =
794
          case rasol of
795
            Bad _ -> accu
796
            Ok sol | isNothing (asSolution sol) -> accu
797
                   | unallocable grp -> accu
798
                   | otherwise -> (grp, sol):accu
799

    
800
-- | Sort multigroup results based on policy and score.
801
sortMGResults :: [(Group.Group, AllocSolution)]
802
              -> [(Group.Group, AllocSolution)]
803
sortMGResults sols =
804
  let extractScore (_, _, _, x) = x
805
      solScore (grp, sol) = (Group.allocPolicy grp,
806
                             (extractScore . fromJust . asSolution) sol)
807
  in sortBy (comparing solScore) sols
808

    
809
-- | Removes node groups which can't accommodate the instance
810
filterValidGroups :: [(Group.Group, (Node.List, Instance.List))]
811
                  -> Instance.Instance
812
                  -> ([(Group.Group, (Node.List, Instance.List))], [String])
813
filterValidGroups [] _ = ([], [])
814
filterValidGroups (ng:ngs) inst =
815
  let (valid_ngs, msgs) = filterValidGroups ngs inst
816
      hasNetwork nic = case Nic.network nic of
817
        Just net -> net `elem` Group.networks (fst ng)
818
        Nothing -> True
819
      hasRequiredNetworks = all hasNetwork (Instance.nics inst)
820
  in if hasRequiredNetworks
821
      then (ng:valid_ngs, msgs)
822
      else (valid_ngs,
823
            ("group " ++ Group.name (fst ng) ++
824
             " is not connected to a network required by instance " ++
825
             Instance.name inst):msgs)
826

    
827
-- | Finds the best group for an instance on a multi-group cluster.
828
--
829
-- Only solutions in @preferred@ and @last_resort@ groups will be
830
-- accepted as valid, and additionally if the allowed groups parameter
831
-- is not null then allocation will only be run for those group
832
-- indices.
833
findBestAllocGroup :: Group.List           -- ^ The group list
834
                   -> Node.List            -- ^ The node list
835
                   -> Instance.List        -- ^ The instance list
836
                   -> Maybe [Gdx]          -- ^ The allowed groups
837
                   -> Instance.Instance    -- ^ The instance to allocate
838
                   -> Int                  -- ^ Required number of nodes
839
                   -> Result (Group.Group, AllocSolution, [String])
840
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
841
  let groups_by_idx = splitCluster mgnl mgil
842
      groups = map (\(gid, d) -> (Container.find gid mggl, d)) groups_by_idx
843
      groups' = maybe groups
844
                (\gs -> filter ((`elem` gs) . Group.idx . fst) groups)
845
                allowed_gdxs
846
      (groups'', filter_group_msgs) = filterValidGroups groups' inst
847
      sols = map (\(gr, (nl, il)) ->
848
                   (gr, genAllocNodes mggl nl cnt False >>=
849
                        tryAlloc nl il inst))
850
             groups''::[(Group.Group, Result AllocSolution)]
851
      all_msgs = filter_group_msgs ++ concatMap solutionDescription sols
852
      goodSols = filterMGResults sols
853
      sortedSols = sortMGResults goodSols
854
  in case sortedSols of
855
       [] -> Bad $ if null groups'
856
                     then "no groups for evacuation: allowed groups was" ++
857
                          show allowed_gdxs ++ ", all groups: " ++
858
                          show (map fst groups)
859
                     else intercalate ", " all_msgs
860
       (final_group, final_sol):_ -> return (final_group, final_sol, all_msgs)
861

    
862
-- | Try to allocate an instance on a multi-group cluster.
863
tryMGAlloc :: Group.List           -- ^ The group list
864
           -> Node.List            -- ^ The node list
865
           -> Instance.List        -- ^ The instance list
866
           -> Instance.Instance    -- ^ The instance to allocate
867
           -> Int                  -- ^ Required number of nodes
868
           -> Result AllocSolution -- ^ Possible solution list
869
tryMGAlloc mggl mgnl mgil inst cnt = do
870
  (best_group, solution, all_msgs) <-
871
      findBestAllocGroup mggl mgnl mgil Nothing inst cnt
872
  let group_name = Group.name best_group
873
      selmsg = "Selected group: " ++ group_name
874
  return $ solution { asLog = selmsg:all_msgs }
875

    
876
-- | Calculate the new instance list after allocation solution.
877
updateIl :: Instance.List           -- ^ The original instance list
878
         -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
879
         -> Instance.List           -- ^ The updated instance list
880
updateIl il Nothing = il
881
updateIl il (Just (_, xi, _, _)) = Container.add (Container.size il) xi il
882

    
883
-- | Extract the the new node list from the allocation solution.
884
extractNl :: Node.List               -- ^ The original node list
885
          -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
886
          -> Node.List               -- ^ The new node list
887
extractNl nl Nothing = nl
888
extractNl _ (Just (xnl, _, _, _)) = xnl
889

    
890
-- | Try to allocate a list of instances on a multi-group cluster.
891
allocList :: Group.List                  -- ^ The group list
892
          -> Node.List                   -- ^ The node list
893
          -> Instance.List               -- ^ The instance list
894
          -> [(Instance.Instance, Int)]  -- ^ The instance to allocate
895
          -> AllocSolutionList           -- ^ Possible solution list
896
          -> Result (Node.List, Instance.List,
897
                     AllocSolutionList)  -- ^ The final solution list
898
allocList _  nl il [] result = Ok (nl, il, result)
899
allocList gl nl il ((xi, xicnt):xies) result = do
900
  ares <- tryMGAlloc gl nl il xi xicnt
901
  let sol = asSolution ares
902
      nl' = extractNl nl sol
903
      il' = updateIl il sol
904
  allocList gl nl' il' xies ((xi, ares):result)
905

    
906
-- | Function which fails if the requested mode is change secondary.
907
--
908
-- This is useful since except DRBD, no other disk template can
909
-- execute change secondary; thus, we can just call this function
910
-- instead of always checking for secondary mode. After the call to
911
-- this function, whatever mode we have is just a primary change.
912
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
913
failOnSecondaryChange ChangeSecondary dt =
914
  fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++
915
         "' can't execute change secondary"
916
failOnSecondaryChange _ _ = return ()
917

    
918
-- | Run evacuation for a single instance.
919
--
920
-- /Note:/ this function should correctly execute both intra-group
921
-- evacuations (in all modes) and inter-group evacuations (in the
922
-- 'ChangeAll' mode). Of course, this requires that the correct list
923
-- of target nodes is passed.
924
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
925
                 -> Instance.List     -- ^ Instance list (cluster-wide)
926
                 -> EvacMode          -- ^ The evacuation mode
927
                 -> Instance.Instance -- ^ The instance to be evacuated
928
                 -> Gdx               -- ^ The group we're targetting
929
                 -> [Ndx]             -- ^ The list of available nodes
930
                                      -- for allocation
931
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
932
nodeEvacInstance nl il mode inst@(Instance.Instance
933
                                  {Instance.diskTemplate = dt@DTDiskless})
934
                 gdx avail_nodes =
935
                   failOnSecondaryChange mode dt >>
936
                   evacOneNodeOnly nl il inst gdx avail_nodes
937

    
938
nodeEvacInstance _ _ _ (Instance.Instance
939
                        {Instance.diskTemplate = DTPlain}) _ _ =
940
                  fail "Instances of type plain cannot be relocated"
941

    
942
nodeEvacInstance _ _ _ (Instance.Instance
943
                        {Instance.diskTemplate = DTFile}) _ _ =
944
                  fail "Instances of type file cannot be relocated"
945

    
946
nodeEvacInstance nl il mode inst@(Instance.Instance
947
                                  {Instance.diskTemplate = dt@DTSharedFile})
948
                 gdx avail_nodes =
949
                   failOnSecondaryChange mode dt >>
950
                   evacOneNodeOnly nl il inst gdx avail_nodes
951

    
952
nodeEvacInstance nl il mode inst@(Instance.Instance
953
                                  {Instance.diskTemplate = dt@DTBlock})
954
                 gdx avail_nodes =
955
                   failOnSecondaryChange mode dt >>
956
                   evacOneNodeOnly nl il inst gdx avail_nodes
957

    
958
nodeEvacInstance nl il mode inst@(Instance.Instance
959
                                  {Instance.diskTemplate = dt@DTRbd})
960
                 gdx avail_nodes =
961
                   failOnSecondaryChange mode dt >>
962
                   evacOneNodeOnly nl il inst gdx avail_nodes
963

    
964
nodeEvacInstance nl il mode inst@(Instance.Instance
965
                                  {Instance.diskTemplate = dt@DTExt})
966
                 gdx avail_nodes =
967
                   failOnSecondaryChange mode dt >>
968
                   evacOneNodeOnly nl il inst gdx avail_nodes
969

    
970
nodeEvacInstance nl il ChangePrimary
971
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
972
                 _ _ =
973
  do
974
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
975
    let idx = Instance.idx inst
976
        il' = Container.add idx inst' il
977
        ops = iMoveToJob nl' il' idx Failover
978
    return (nl', il', ops)
979

    
980
nodeEvacInstance nl il ChangeSecondary
981
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
982
                 gdx avail_nodes =
983
  evacOneNodeOnly nl il inst gdx avail_nodes
984

    
985
-- The algorithm for ChangeAll is as follows:
986
--
987
-- * generate all (primary, secondary) node pairs for the target groups
988
-- * for each pair, execute the needed moves (r:s, f, r:s) and compute
989
--   the final node list state and group score
990
-- * select the best choice via a foldl that uses the same Either
991
--   String solution as the ChangeSecondary mode
992
nodeEvacInstance nl il ChangeAll
993
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
994
                 gdx avail_nodes =
995
  do
996
    let no_nodes = Left "no nodes available"
997
        node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
998
    (nl', il', ops, _) <-
999
        annotateResult "Can't find any good nodes for relocation" .
1000
        eitherToResult $
1001
        foldl'
1002
        (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
1003
                          Bad msg ->
1004
                              case accu of
1005
                                Right _ -> accu
1006
                                -- we don't need more details (which
1007
                                -- nodes, etc.) as we only selected
1008
                                -- this group if we can allocate on
1009
                                -- it, hence failures will not
1010
                                -- propagate out of this fold loop
1011
                                Left _ -> Left $ "Allocation failed: " ++ msg
1012
                          Ok result@(_, _, _, new_cv) ->
1013
                              let new_accu = Right result in
1014
                              case accu of
1015
                                Left _ -> new_accu
1016
                                Right (_, _, _, old_cv) ->
1017
                                    if old_cv < new_cv
1018
                                    then accu
1019
                                    else new_accu
1020
        ) no_nodes node_pairs
1021

    
1022
    return (nl', il', ops)
1023

    
1024
-- | Generic function for changing one node of an instance.
1025
--
1026
-- This is similar to 'nodeEvacInstance' but will be used in a few of
1027
-- its sub-patterns. It folds the inner function 'evacOneNodeInner'
1028
-- over the list of available nodes, which results in the best choice
1029
-- for relocation.
1030
evacOneNodeOnly :: Node.List         -- ^ The node list (cluster-wide)
1031
                -> Instance.List     -- ^ Instance list (cluster-wide)
1032
                -> Instance.Instance -- ^ The instance to be evacuated
1033
                -> Gdx               -- ^ The group we're targetting
1034
                -> [Ndx]             -- ^ The list of available nodes
1035
                                      -- for allocation
1036
                -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1037
evacOneNodeOnly nl il inst gdx avail_nodes = do
1038
  op_fn <- case Instance.mirrorType inst of
1039
             MirrorNone -> Bad "Can't relocate/evacuate non-mirrored instances"
1040
             MirrorInternal -> Ok ReplaceSecondary
1041
             MirrorExternal -> Ok FailoverToAny
1042
  (nl', inst', _, ndx) <- annotateResult "Can't find any good node" .
1043
                          eitherToResult $
1044
                          foldl' (evacOneNodeInner nl inst gdx op_fn)
1045
                          (Left "no nodes available") avail_nodes
1046
  let idx = Instance.idx inst
1047
      il' = Container.add idx inst' il
1048
      ops = iMoveToJob nl' il' idx (op_fn ndx)
1049
  return (nl', il', ops)
1050

    
1051
-- | Inner fold function for changing one node of an instance.
1052
--
1053
-- Depending on the instance disk template, this will either change
1054
-- the secondary (for DRBD) or the primary node (for shared
1055
-- storage). However, the operation is generic otherwise.
1056
--
1057
-- The running solution is either a @Left String@, which means we
1058
-- don't have yet a working solution, or a @Right (...)@, which
1059
-- represents a valid solution; it holds the modified node list, the
1060
-- modified instance (after evacuation), the score of that solution,
1061
-- and the new secondary node index.
1062
evacOneNodeInner :: Node.List         -- ^ Cluster node list
1063
                 -> Instance.Instance -- ^ Instance being evacuated
1064
                 -> Gdx               -- ^ The group index of the instance
1065
                 -> (Ndx -> IMove)    -- ^ Operation constructor
1066
                 -> EvacInnerState    -- ^ Current best solution
1067
                 -> Ndx               -- ^ Node we're evaluating as target
1068
                 -> EvacInnerState    -- ^ New best solution
1069
evacOneNodeInner nl inst gdx op_fn accu ndx =
1070
  case applyMove nl inst (op_fn ndx) of
1071
    Bad fm -> let fail_msg = "Node " ++ Container.nameOf nl ndx ++
1072
                             " failed: " ++ show fm
1073
              in either (const $ Left fail_msg) (const accu) accu
1074
    Ok (nl', inst', _, _) ->
1075
      let nodes = Container.elems nl'
1076
          -- The fromJust below is ugly (it can fail nastily), but
1077
          -- at this point we should have any internal mismatches,
1078
          -- and adding a monad here would be quite involved
1079
          grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1080
          new_cv = compCVNodes grpnodes
1081
          new_accu = Right (nl', inst', new_cv, ndx)
1082
      in case accu of
1083
           Left _ -> new_accu
1084
           Right (_, _, old_cv, _) ->
1085
             if old_cv < new_cv
1086
               then accu
1087
               else new_accu
1088

    
1089
-- | Compute result of changing all nodes of a DRBD instance.
1090
--
1091
-- Given the target primary and secondary node (which might be in a
1092
-- different group or not), this function will 'execute' all the
1093
-- required steps and assuming all operations succceed, will return
1094
-- the modified node and instance lists, the opcodes needed for this
1095
-- and the new group score.
1096
evacDrbdAllInner :: Node.List         -- ^ Cluster node list
1097
                 -> Instance.List     -- ^ Cluster instance list
1098
                 -> Instance.Instance -- ^ The instance to be moved
1099
                 -> Gdx               -- ^ The target group index
1100
                                      -- (which can differ from the
1101
                                      -- current group of the
1102
                                      -- instance)
1103
                 -> (Ndx, Ndx)        -- ^ Tuple of new
1104
                                      -- primary\/secondary nodes
1105
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
1106
evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
1107
  let primary = Container.find (Instance.pNode inst) nl
1108
      idx = Instance.idx inst
1109
  -- if the primary is offline, then we first failover
1110
  (nl1, inst1, ops1) <-
1111
    if Node.offline primary
1112
      then do
1113
        (nl', inst', _, _) <-
1114
          annotateResult "Failing over to the secondary" .
1115
          opToResult $ applyMove nl inst Failover
1116
        return (nl', inst', [Failover])
1117
      else return (nl, inst, [])
1118
  let (o1, o2, o3) = (ReplaceSecondary t_pdx,
1119
                      Failover,
1120
                      ReplaceSecondary t_sdx)
1121
  -- we now need to execute a replace secondary to the future
1122
  -- primary node
1123
  (nl2, inst2, _, _) <-
1124
    annotateResult "Changing secondary to new primary" .
1125
    opToResult $
1126
    applyMove nl1 inst1 o1
1127
  let ops2 = o1:ops1
1128
  -- we now execute another failover, the primary stays fixed now
1129
  (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" .
1130
                        opToResult $ applyMove nl2 inst2 o2
1131
  let ops3 = o2:ops2
1132
  -- and finally another replace secondary, to the final secondary
1133
  (nl4, inst4, _, _) <-
1134
    annotateResult "Changing secondary to final secondary" .
1135
    opToResult $
1136
    applyMove nl3 inst3 o3
1137
  let ops4 = o3:ops3
1138
      il' = Container.add idx inst4 il
1139
      ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1140
  let nodes = Container.elems nl4
1141
      -- The fromJust below is ugly (it can fail nastily), but
1142
      -- at this point we should have any internal mismatches,
1143
      -- and adding a monad here would be quite involved
1144
      grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1145
      new_cv = compCVNodes grpnodes
1146
  return (nl4, il', ops, new_cv)
1147

    
1148
-- | Computes the nodes in a given group which are available for
1149
-- allocation.
1150
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1151
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
1152
                    -> Gdx            -- ^ The group for which we
1153
                                      -- query the nodes
1154
                    -> Result [Ndx]   -- ^ List of available node indices
1155
availableGroupNodes group_nodes excl_ndx gdx = do
1156
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1157
                 Ok (lookup gdx group_nodes)
1158
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1159
  return avail_nodes
1160

    
1161
-- | Updates the evac solution with the results of an instance
1162
-- evacuation.
1163
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1164
                   -> Idx
1165
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1166
                   -> (Node.List, Instance.List, EvacSolution)
1167
updateEvacSolution (nl, il, es) idx (Bad msg) =
1168
  (nl, il, es { esFailed = (idx, msg):esFailed es})
1169
updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1170
  (nl, il, es { esMoved = new_elem:esMoved es
1171
              , esOpCodes = opcodes:esOpCodes es })
1172
    where inst = Container.find idx il
1173
          new_elem = (idx,
1174
                      instancePriGroup nl inst,
1175
                      Instance.allNodes inst)
1176

    
1177
-- | Node-evacuation IAllocator mode main function.
1178
tryNodeEvac :: Group.List    -- ^ The cluster groups
1179
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
1180
            -> Instance.List -- ^ Instance list (cluster-wide)
1181
            -> EvacMode      -- ^ The evacuation mode
1182
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
1183
            -> Result (Node.List, Instance.List, EvacSolution)
1184
tryNodeEvac _ ini_nl ini_il mode idxs =
1185
  let evac_ndx = nodesToEvacuate ini_il mode idxs
1186
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1187
      excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1188
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1189
                                           (Container.elems nl))) $
1190
                  splitCluster ini_nl ini_il
1191
      (fin_nl, fin_il, esol) =
1192
        foldl' (\state@(nl, il, _) inst ->
1193
                  let gdx = instancePriGroup nl inst
1194
                      pdx = Instance.pNode inst in
1195
                  updateEvacSolution state (Instance.idx inst) $
1196
                  availableGroupNodes group_ndx
1197
                    (IntSet.insert pdx excl_ndx) gdx >>=
1198
                      nodeEvacInstance nl il mode inst gdx
1199
               )
1200
        (ini_nl, ini_il, emptyEvacSolution)
1201
        (map (`Container.find` ini_il) idxs)
1202
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1203

    
1204
-- | Change-group IAllocator mode main function.
1205
--
1206
-- This is very similar to 'tryNodeEvac', the only difference is that
1207
-- we don't choose as target group the current instance group, but
1208
-- instead:
1209
--
1210
--   1. at the start of the function, we compute which are the target
1211
--   groups; either no groups were passed in, in which case we choose
1212
--   all groups out of which we don't evacuate instance, or there were
1213
--   some groups passed, in which case we use those
1214
--
1215
--   2. for each instance, we use 'findBestAllocGroup' to choose the
1216
--   best group to hold the instance, and then we do what
1217
--   'tryNodeEvac' does, except for this group instead of the current
1218
--   instance group.
1219
--
1220
-- Note that the correct behaviour of this function relies on the
1221
-- function 'nodeEvacInstance' to be able to do correctly both
1222
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1223
tryChangeGroup :: Group.List    -- ^ The cluster groups
1224
               -> Node.List     -- ^ The node list (cluster-wide)
1225
               -> Instance.List -- ^ Instance list (cluster-wide)
1226
               -> [Gdx]         -- ^ Target groups; if empty, any
1227
                                -- groups not being evacuated
1228
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
1229
               -> Result (Node.List, Instance.List, EvacSolution)
1230
tryChangeGroup gl ini_nl ini_il gdxs idxs =
1231
  let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1232
                             flip Container.find ini_il) idxs
1233
      target_gdxs = (if null gdxs
1234
                       then Container.keys gl
1235
                       else gdxs) \\ evac_gdxs
1236
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1237
      excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1238
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1239
                                           (Container.elems nl))) $
1240
                  splitCluster ini_nl ini_il
1241
      (fin_nl, fin_il, esol) =
1242
        foldl' (\state@(nl, il, _) inst ->
1243
                  let solution = do
1244
                        let ncnt = Instance.requiredNodes $
1245
                                   Instance.diskTemplate inst
1246
                        (grp, _, _) <- findBestAllocGroup gl nl il
1247
                                       (Just target_gdxs) inst ncnt
1248
                        let gdx = Group.idx grp
1249
                        av_nodes <- availableGroupNodes group_ndx
1250
                                    excl_ndx gdx
1251
                        nodeEvacInstance nl il ChangeAll inst gdx av_nodes
1252
                  in updateEvacSolution state (Instance.idx inst) solution
1253
               )
1254
        (ini_nl, ini_il, emptyEvacSolution)
1255
        (map (`Container.find` ini_il) idxs)
1256
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1257

    
1258
-- | Standard-sized allocation method.
1259
--
1260
-- This places instances of the same size on the cluster until we're
1261
-- out of space. The result will be a list of identically-sized
1262
-- instances.
1263
iterateAlloc :: AllocMethod
1264
iterateAlloc nl il limit newinst allocnodes ixes cstats =
1265
  let depth = length ixes
1266
      newname = printf "new-%d" depth::String
1267
      newidx = Container.size il
1268
      newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1269
      newlimit = fmap (flip (-) 1) limit
1270
  in case tryAlloc nl il newi2 allocnodes of
1271
       Bad s -> Bad s
1272
       Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
1273
         let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1274
         case sols3 of
1275
           Nothing -> newsol
1276
           Just (xnl, xi, _, _) ->
1277
             if limit == Just 0
1278
               then newsol
1279
               else iterateAlloc xnl (Container.add newidx xi il)
1280
                      newlimit newinst allocnodes (xi:ixes)
1281
                      (totalResources xnl:cstats)
1282

    
1283
-- | Tiered allocation method.
1284
--
1285
-- This places instances on the cluster, and decreases the spec until
1286
-- we can allocate again. The result will be a list of decreasing
1287
-- instance specs.
1288
tieredAlloc :: AllocMethod
1289
tieredAlloc nl il limit newinst allocnodes ixes cstats =
1290
  case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1291
    Bad s -> Bad s
1292
    Ok (errs, nl', il', ixes', cstats') ->
1293
      let newsol = Ok (errs, nl', il', ixes', cstats')
1294
          ixes_cnt = length ixes'
1295
          (stop, newlimit) = case limit of
1296
                               Nothing -> (False, Nothing)
1297
                               Just n -> (n <= ixes_cnt,
1298
                                            Just (n - ixes_cnt)) in
1299
      if stop then newsol else
1300
          case Instance.shrinkByType newinst . fst . last $
1301
               sortBy (comparing snd) errs of
1302
            Bad _ -> newsol
1303
            Ok newinst' -> tieredAlloc nl' il' newlimit
1304
                           newinst' allocnodes ixes' cstats'
1305

    
1306
-- * Formatting functions
1307

    
1308
-- | Given the original and final nodes, computes the relocation description.
1309
computeMoves :: Instance.Instance -- ^ The instance to be moved
1310
             -> String -- ^ The instance name
1311
             -> IMove  -- ^ The move being performed
1312
             -> String -- ^ New primary
1313
             -> String -- ^ New secondary
1314
             -> (String, [String])
1315
                -- ^ Tuple of moves and commands list; moves is containing
1316
                -- either @/f/@ for failover or @/r:name/@ for replace
1317
                -- secondary, while the command list holds gnt-instance
1318
                -- commands (without that prefix), e.g \"@failover instance1@\"
1319
computeMoves i inam mv c d =
1320
  case mv of
1321
    Failover -> ("f", [mig])
1322
    FailoverToAny _ -> (printf "fa:%s" c, [mig_any])
1323
    FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1324
    ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1325
    ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1326
    ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1327
  where morf = if Instance.isRunning i then "migrate" else "failover"
1328
        mig = printf "%s -f %s" morf inam::String
1329
        mig_any = printf "%s -f -n %s %s" morf c inam::String
1330
        rep n = printf "replace-disks -n %s %s" n inam::String
1331

    
1332
-- | Converts a placement to string format.
1333
printSolutionLine :: Node.List     -- ^ The node list
1334
                  -> Instance.List -- ^ The instance list
1335
                  -> Int           -- ^ Maximum node name length
1336
                  -> Int           -- ^ Maximum instance name length
1337
                  -> Placement     -- ^ The current placement
1338
                  -> Int           -- ^ The index of the placement in
1339
                                   -- the solution
1340
                  -> (String, [String])
1341
printSolutionLine nl il nmlen imlen plc pos =
1342
  let pmlen = (2*nmlen + 1)
1343
      (i, p, s, mv, c) = plc
1344
      old_sec = Instance.sNode inst
1345
      inst = Container.find i il
1346
      inam = Instance.alias inst
1347
      npri = Node.alias $ Container.find p nl
1348
      nsec = Node.alias $ Container.find s nl
1349
      opri = Node.alias $ Container.find (Instance.pNode inst) nl
1350
      osec = Node.alias $ Container.find old_sec nl
1351
      (moves, cmds) =  computeMoves inst inam mv npri nsec
1352
      -- FIXME: this should check instead/also the disk template
1353
      ostr = if old_sec == Node.noSecondary
1354
               then printf "%s" opri::String
1355
               else printf "%s:%s" opri osec::String
1356
      nstr = if s == Node.noSecondary
1357
               then printf "%s" npri::String
1358
               else printf "%s:%s" npri nsec::String
1359
  in (printf "  %3d. %-*s %-*s => %-*s %12.8f a=%s"
1360
      pos imlen inam pmlen ostr pmlen nstr c moves,
1361
      cmds)
1362

    
1363
-- | Return the instance and involved nodes in an instance move.
1364
--
1365
-- Note that the output list length can vary, and is not required nor
1366
-- guaranteed to be of any specific length.
1367
involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
1368
                               -- the instance from its index; note
1369
                               -- that this /must/ be the original
1370
                               -- instance list, so that we can
1371
                               -- retrieve the old nodes
1372
              -> Placement     -- ^ The placement we're investigating,
1373
                               -- containing the new nodes and
1374
                               -- instance index
1375
              -> [Ndx]         -- ^ Resulting list of node indices
1376
involvedNodes il plc =
1377
  let (i, np, ns, _, _) = plc
1378
      inst = Container.find i il
1379
  in nub $ [np, ns] ++ Instance.allNodes inst
1380

    
1381
-- | Inner function for splitJobs, that either appends the next job to
1382
-- the current jobset, or starts a new jobset.
1383
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1384
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1385
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1386
  | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1387
  | otherwise = ([n]:cjs, ndx)
1388

    
1389
-- | Break a list of moves into independent groups. Note that this
1390
-- will reverse the order of jobs.
1391
splitJobs :: [MoveJob] -> [JobSet]
1392
splitJobs = fst . foldl mergeJobs ([], [])
1393

    
1394
-- | Given a list of commands, prefix them with @gnt-instance@ and
1395
-- also beautify the display a little.
1396
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1397
formatJob jsn jsl (sn, (_, _, _, cmds)) =
1398
  let out =
1399
        printf "  echo job %d/%d" jsn sn:
1400
        printf "  check":
1401
        map ("  gnt-instance " ++) cmds
1402
  in if sn == 1
1403
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1404
       else out
1405

    
1406
-- | Given a list of commands, prefix them with @gnt-instance@ and
1407
-- also beautify the display a little.
1408
formatCmds :: [JobSet] -> String
1409
formatCmds =
1410
  unlines .
1411
  concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1412
                           (zip [1..] js)) .
1413
  zip [1..]
1414

    
1415
-- | Print the node list.
1416
printNodes :: Node.List -> [String] -> String
1417
printNodes nl fs =
1418
  let fields = case fs of
1419
                 [] -> Node.defaultFields
1420
                 "+":rest -> Node.defaultFields ++ rest
1421
                 _ -> fs
1422
      snl = sortBy (comparing Node.idx) (Container.elems nl)
1423
      (header, isnum) = unzip $ map Node.showHeader fields
1424
  in printTable "" header (map (Node.list fields) snl) isnum
1425

    
1426
-- | Print the instance list.
1427
printInsts :: Node.List -> Instance.List -> String
1428
printInsts nl il =
1429
  let sil = sortBy (comparing Instance.idx) (Container.elems il)
1430
      helper inst = [ if Instance.isRunning inst then "R" else " "
1431
                    , Instance.name inst
1432
                    , Container.nameOf nl (Instance.pNode inst)
1433
                    , let sdx = Instance.sNode inst
1434
                      in if sdx == Node.noSecondary
1435
                           then  ""
1436
                           else Container.nameOf nl sdx
1437
                    , if Instance.autoBalance inst then "Y" else "N"
1438
                    , printf "%3d" $ Instance.vcpus inst
1439
                    , printf "%5d" $ Instance.mem inst
1440
                    , printf "%5d" $ Instance.dsk inst `div` 1024
1441
                    , printf "%5.3f" lC
1442
                    , printf "%5.3f" lM
1443
                    , printf "%5.3f" lD
1444
                    , printf "%5.3f" lN
1445
                    ]
1446
          where DynUtil lC lM lD lN = Instance.util inst
1447
      header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1448
               , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1449
      isnum = False:False:False:False:False:repeat True
1450
  in printTable "" header (map helper sil) isnum
1451

    
1452
-- | Shows statistics for a given node list.
1453
printStats :: String -> Node.List -> String
1454
printStats lp nl =
1455
  let dcvs = compDetailedCV $ Container.elems nl
1456
      (weights, names) = unzip detailedCVInfo
1457
      hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1458
      header = [ "Field", "Value", "Weight" ]
1459
      formatted = map (\(w, h, val) ->
1460
                         [ h
1461
                         , printf "%.8f" val
1462
                         , printf "x%.2f" w
1463
                         ]) hd
1464
  in printTable lp header formatted $ False:repeat True
1465

    
1466
-- | Convert a placement into a list of OpCodes (basically a job).
1467
iMoveToJob :: Node.List        -- ^ The node list; only used for node
1468
                               -- names, so any version is good
1469
                               -- (before or after the operation)
1470
           -> Instance.List    -- ^ The instance list; also used for
1471
                               -- names only
1472
           -> Idx              -- ^ The index of the instance being
1473
                               -- moved
1474
           -> IMove            -- ^ The actual move to be described
1475
           -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
1476
                               -- the given move
1477
iMoveToJob nl il idx move =
1478
  let inst = Container.find idx il
1479
      iname = Instance.name inst
1480
      lookNode  n = case mkNonEmpty (Container.nameOf nl n) of
1481
                      -- FIXME: convert htools codebase to non-empty strings
1482
                      Bad msg -> error $ "Empty node name for idx " ++
1483
                                 show n ++ ": " ++ msg ++ "??"
1484
                      Ok ne -> Just ne
1485
      opF = OpCodes.OpInstanceMigrate
1486
              { OpCodes.opInstanceName        = iname
1487
              , OpCodes.opMigrationMode       = Nothing -- default
1488
              , OpCodes.opOldLiveMode         = Nothing -- default as well
1489
              , OpCodes.opTargetNode          = Nothing -- this is drbd
1490
              , OpCodes.opAllowRuntimeChanges = False
1491
              , OpCodes.opIgnoreIpolicy       = False
1492
              , OpCodes.opMigrationCleanup    = False
1493
              , OpCodes.opIallocator          = Nothing
1494
              , OpCodes.opAllowFailover       = True }
1495
      opFA n = opF { OpCodes.opTargetNode = lookNode n } -- not drbd
1496
      opR n = OpCodes.OpInstanceReplaceDisks
1497
                { OpCodes.opInstanceName     = iname
1498
                , OpCodes.opEarlyRelease     = False
1499
                , OpCodes.opIgnoreIpolicy    = False
1500
                , OpCodes.opReplaceDisksMode = OpCodes.ReplaceNewSecondary
1501
                , OpCodes.opReplaceDisksList = []
1502
                , OpCodes.opRemoteNode       = lookNode n
1503
                , OpCodes.opIallocator       = Nothing
1504
                }
1505
  in case move of
1506
       Failover -> [ opF ]
1507
       FailoverToAny np -> [ opFA np ]
1508
       ReplacePrimary np -> [ opF, opR np, opF ]
1509
       ReplaceSecondary ns -> [ opR ns ]
1510
       ReplaceAndFailover np -> [ opR np, opF ]
1511
       FailoverAndReplace ns -> [ opF, opR ns ]
1512

    
1513
-- * Node group functions
1514

    
1515
-- | Computes the group of an instance.
1516
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1517
instanceGroup nl i =
1518
  let sidx = Instance.sNode i
1519
      pnode = Container.find (Instance.pNode i) nl
1520
      snode = if sidx == Node.noSecondary
1521
              then pnode
1522
              else Container.find sidx nl
1523
      pgroup = Node.group pnode
1524
      sgroup = Node.group snode
1525
  in if pgroup /= sgroup
1526
       then fail ("Instance placed accross two node groups, primary " ++
1527
                  show pgroup ++ ", secondary " ++ show sgroup)
1528
       else return pgroup
1529

    
1530
-- | Computes the group of an instance per the primary node.
1531
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1532
instancePriGroup nl i =
1533
  let pnode = Container.find (Instance.pNode i) nl
1534
  in  Node.group pnode
1535

    
1536
-- | Compute the list of badly allocated instances (split across node
1537
-- groups).
1538
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1539
findSplitInstances nl =
1540
  filter (not . isOk . instanceGroup nl) . Container.elems
1541

    
1542
-- | Splits a cluster into the component node groups.
1543
splitCluster :: Node.List -> Instance.List ->
1544
                [(Gdx, (Node.List, Instance.List))]
1545
splitCluster nl il =
1546
  let ngroups = Node.computeGroups (Container.elems nl)
1547
  in map (\(gdx, nodes) ->
1548
           let nidxs = map Node.idx nodes
1549
               nodes' = zip nidxs nodes
1550
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1551
           in (gdx, (Container.fromList nodes', instances))) ngroups
1552

    
1553
-- | Compute the list of nodes that are to be evacuated, given a list
1554
-- of instances and an evacuation mode.
1555
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1556
                -> EvacMode      -- ^ The evacuation mode we're using
1557
                -> [Idx]         -- ^ List of instance indices being evacuated
1558
                -> IntSet.IntSet -- ^ Set of node indices
1559
nodesToEvacuate il mode =
1560
  IntSet.delete Node.noSecondary .
1561
  foldl' (\ns idx ->
1562
            let i = Container.find idx il
1563
                pdx = Instance.pNode i
1564
                sdx = Instance.sNode i
1565
                dt = Instance.diskTemplate i
1566
                withSecondary = case dt of
1567
                                  DTDrbd8 -> IntSet.insert sdx ns
1568
                                  _ -> ns
1569
            in case mode of
1570
                 ChangePrimary   -> IntSet.insert pdx ns
1571
                 ChangeSecondary -> withSecondary
1572
                 ChangeAll       -> IntSet.insert pdx withSecondary
1573
         ) IntSet.empty