Statistics
| Branch: | Tag: | Revision:

root / src / Ganeti / HTools / Cluster.hs @ 017160ed

History | View | Annotate | Download (68.1 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011, 2012, 2013 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
  (
31
    -- * Types
32
    AllocSolution(..)
33
  , EvacSolution(..)
34
  , Table(..)
35
  , CStats(..)
36
  , AllocNodes
37
  , AllocResult
38
  , AllocMethod
39
  , AllocSolutionList
40
  -- * Generic functions
41
  , totalResources
42
  , computeAllocationDelta
43
  -- * First phase functions
44
  , computeBadItems
45
  -- * Second phase functions
46
  , printSolutionLine
47
  , formatCmds
48
  , involvedNodes
49
  , splitJobs
50
  -- * Display functions
51
  , printNodes
52
  , printInsts
53
  -- * Balacing functions
54
  , checkMove
55
  , doNextBalance
56
  , tryBalance
57
  , compCV
58
  , compCVNodes
59
  , compDetailedCV
60
  , printStats
61
  , iMoveToJob
62
  -- * IAllocator functions
63
  , genAllocNodes
64
  , tryAlloc
65
  , tryMGAlloc
66
  , tryNodeEvac
67
  , tryChangeGroup
68
  , collapseFailures
69
  , allocList
70
  -- * Allocation functions
71
  , iterateAlloc
72
  , tieredAlloc
73
  -- * Node group functions
74
  , instanceGroup
75
  , findSplitInstances
76
  , splitCluster
77
  ) where
78

    
79
import qualified Data.IntSet as IntSet
80
import Data.List
81
import Data.Maybe (fromJust, isNothing)
82
import Data.Ord (comparing)
83
import Text.Printf (printf)
84

    
85
import Ganeti.BasicTypes
86
import qualified Ganeti.HTools.Container as Container
87
import qualified Ganeti.HTools.Instance as Instance
88
import qualified Ganeti.HTools.Nic as Nic
89
import qualified Ganeti.HTools.Node as Node
90
import qualified Ganeti.HTools.Group as Group
91
import Ganeti.HTools.Types
92
import Ganeti.Compat
93
import qualified Ganeti.OpCodes as OpCodes
94
import Ganeti.Utils
95
import Ganeti.Types (mkNonEmpty)
96

    
97
-- * Types
98

    
99
-- | Allocation\/relocation solution.
100
data AllocSolution = AllocSolution
101
  { asFailures :: [FailMode]              -- ^ Failure counts
102
  , asAllocs   :: Int                     -- ^ Good allocation count
103
  , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
104
  , asLog      :: [String]                -- ^ Informational messages
105
  }
106

    
107
-- | Node evacuation/group change iallocator result type. This result
108
-- type consists of actual opcodes (a restricted subset) that are
109
-- transmitted back to Ganeti.
110
data EvacSolution = EvacSolution
111
  { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
112
  , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
113
                                      -- relocated
114
  , esOpCodes :: [[OpCodes.OpCode]]   -- ^ List of jobs
115
  } deriving (Show)
116

    
117
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
118
type AllocResult = (FailStats, Node.List, Instance.List,
119
                    [Instance.Instance], [CStats])
120

    
121
-- | Type alias for easier handling.
122
type AllocSolutionList = [(Instance.Instance, AllocSolution)]
123

    
124
-- | A type denoting the valid allocation mode/pairs.
125
--
126
-- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
127
-- for a two-node allocation, this will be a @Right [('Ndx',
128
-- ['Ndx'])]@. In the latter case, the list is basically an
129
-- association list, grouped by primary node and holding the potential
130
-- secondary nodes in the sub-list.
131
type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
132

    
133
-- | The empty solution we start with when computing allocations.
134
emptyAllocSolution :: AllocSolution
135
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
136
                                   , asSolution = Nothing, asLog = [] }
137

    
138
-- | The empty evac solution.
139
emptyEvacSolution :: EvacSolution
140
emptyEvacSolution = EvacSolution { esMoved = []
141
                                 , esFailed = []
142
                                 , esOpCodes = []
143
                                 }
144

    
145
-- | The complete state for the balancing solution.
146
data Table = Table Node.List Instance.List Score [Placement]
147
             deriving (Show)
148

    
149
-- | Cluster statistics data type.
150
data CStats = CStats
151
  { csFmem :: Integer -- ^ Cluster free mem
152
  , csFdsk :: Integer -- ^ Cluster free disk
153
  , csAmem :: Integer -- ^ Cluster allocatable mem
154
  , csAdsk :: Integer -- ^ Cluster allocatable disk
155
  , csAcpu :: Integer -- ^ Cluster allocatable cpus
156
  , csMmem :: Integer -- ^ Max node allocatable mem
157
  , csMdsk :: Integer -- ^ Max node allocatable disk
158
  , csMcpu :: Integer -- ^ Max node allocatable cpu
159
  , csImem :: Integer -- ^ Instance used mem
160
  , csIdsk :: Integer -- ^ Instance used disk
161
  , csIcpu :: Integer -- ^ Instance used cpu
162
  , csTmem :: Double  -- ^ Cluster total mem
163
  , csTdsk :: Double  -- ^ Cluster total disk
164
  , csTcpu :: Double  -- ^ Cluster total cpus
165
  , csVcpu :: Integer -- ^ Cluster total virtual cpus
166
  , csNcpu :: Double  -- ^ Equivalent to 'csIcpu' but in terms of
167
                      -- physical CPUs, i.e. normalised used phys CPUs
168
  , csXmem :: Integer -- ^ Unnacounted for mem
169
  , csNmem :: Integer -- ^ Node own memory
170
  , csScore :: Score  -- ^ The cluster score
171
  , csNinst :: Int    -- ^ The total number of instances
172
  } deriving (Show)
173

    
174
-- | A simple type for allocation functions.
175
type AllocMethod =  Node.List           -- ^ Node list
176
                 -> Instance.List       -- ^ Instance list
177
                 -> Maybe Int           -- ^ Optional allocation limit
178
                 -> Instance.Instance   -- ^ Instance spec for allocation
179
                 -> AllocNodes          -- ^ Which nodes we should allocate on
180
                 -> [Instance.Instance] -- ^ Allocated instances
181
                 -> [CStats]            -- ^ Running cluster stats
182
                 -> Result AllocResult  -- ^ Allocation result
183

    
184
-- | A simple type for the running solution of evacuations.
185
type EvacInnerState =
186
  Either String (Node.List, Instance.Instance, Score, Ndx)
187

    
188
-- * Utility functions
189

    
190
-- | Verifies the N+1 status and return the affected nodes.
191
verifyN1 :: [Node.Node] -> [Node.Node]
192
verifyN1 = filter Node.failN1
193

    
194
{-| Computes the pair of bad nodes and instances.
195

    
196
The bad node list is computed via a simple 'verifyN1' check, and the
197
bad instance list is the list of primary and secondary instances of
198
those nodes.
199

    
200
-}
201
computeBadItems :: Node.List -> Instance.List ->
202
                   ([Node.Node], [Instance.Instance])
203
computeBadItems nl il =
204
  let bad_nodes = verifyN1 $ getOnline nl
205
      bad_instances = map (`Container.find` il) .
206
                      sort . nub $
207
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
208
  in
209
    (bad_nodes, bad_instances)
210

    
211
-- | Extracts the node pairs for an instance. This can fail if the
212
-- instance is single-homed. FIXME: this needs to be improved,
213
-- together with the general enhancement for handling non-DRBD moves.
214
instanceNodes :: Node.List -> Instance.Instance ->
215
                 (Ndx, Ndx, Node.Node, Node.Node)
216
instanceNodes nl inst =
217
  let old_pdx = Instance.pNode inst
218
      old_sdx = Instance.sNode inst
219
      old_p = Container.find old_pdx nl
220
      old_s = Container.find old_sdx nl
221
  in (old_pdx, old_sdx, old_p, old_s)
222

    
223
-- | Zero-initializer for the CStats type.
224
emptyCStats :: CStats
225
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
226

    
227
-- | Update stats with data from a new node.
228
updateCStats :: CStats -> Node.Node -> CStats
229
updateCStats cs node =
230
  let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
231
               csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
232
               csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
233
               csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
234
               csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
235
               csVcpu = x_vcpu, csNcpu = x_ncpu,
236
               csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
237
             }
238
        = cs
239
      inc_amem = Node.fMem node - Node.rMem node
240
      inc_amem' = if inc_amem > 0 then inc_amem else 0
241
      inc_adsk = Node.availDisk node
242
      inc_imem = truncate (Node.tMem node) - Node.nMem node
243
                 - Node.xMem node - Node.fMem node
244
      inc_icpu = Node.uCpu node
245
      inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
246
      inc_vcpu = Node.hiCpu node
247
      inc_acpu = Node.availCpu node
248
      inc_ncpu = fromIntegral (Node.uCpu node) /
249
                 iPolicyVcpuRatio (Node.iPolicy node)
250
  in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
251
        , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
252
        , csAmem = x_amem + fromIntegral inc_amem'
253
        , csAdsk = x_adsk + fromIntegral inc_adsk
254
        , csAcpu = x_acpu + fromIntegral inc_acpu
255
        , csMmem = max x_mmem (fromIntegral inc_amem')
256
        , csMdsk = max x_mdsk (fromIntegral inc_adsk)
257
        , csMcpu = max x_mcpu (fromIntegral inc_acpu)
258
        , csImem = x_imem + fromIntegral inc_imem
259
        , csIdsk = x_idsk + fromIntegral inc_idsk
260
        , csIcpu = x_icpu + fromIntegral inc_icpu
261
        , csTmem = x_tmem + Node.tMem node
262
        , csTdsk = x_tdsk + Node.tDsk node
263
        , csTcpu = x_tcpu + Node.tCpu node
264
        , csVcpu = x_vcpu + fromIntegral inc_vcpu
265
        , csNcpu = x_ncpu + inc_ncpu
266
        , csXmem = x_xmem + fromIntegral (Node.xMem node)
267
        , csNmem = x_nmem + fromIntegral (Node.nMem node)
268
        , csNinst = x_ninst + length (Node.pList node)
269
        }
270

    
271
-- | Compute the total free disk and memory in the cluster.
272
totalResources :: Node.List -> CStats
273
totalResources nl =
274
  let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
275
  in cs { csScore = compCV nl }
276

    
277
-- | Compute the delta between two cluster state.
278
--
279
-- This is used when doing allocations, to understand better the
280
-- available cluster resources. The return value is a triple of the
281
-- current used values, the delta that was still allocated, and what
282
-- was left unallocated.
283
computeAllocationDelta :: CStats -> CStats -> AllocStats
284
computeAllocationDelta cini cfin =
285
  let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu,
286
              csNcpu = i_ncpu } = cini
287
      CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
288
              csTmem = t_mem, csTdsk = t_dsk, csVcpu = f_vcpu,
289
              csNcpu = f_ncpu, csTcpu = f_tcpu } = cfin
290
      rini = AllocInfo { allocInfoVCpus = fromIntegral i_icpu
291
                       , allocInfoNCpus = i_ncpu
292
                       , allocInfoMem   = fromIntegral i_imem
293
                       , allocInfoDisk  = fromIntegral i_idsk
294
                       }
295
      rfin = AllocInfo { allocInfoVCpus = fromIntegral (f_icpu - i_icpu)
296
                       , allocInfoNCpus = f_ncpu - i_ncpu
297
                       , allocInfoMem   = fromIntegral (f_imem - i_imem)
298
                       , allocInfoDisk  = fromIntegral (f_idsk - i_idsk)
299
                       }
300
      runa = AllocInfo { allocInfoVCpus = fromIntegral (f_vcpu - f_icpu)
301
                       , allocInfoNCpus = f_tcpu - f_ncpu
302
                       , allocInfoMem   = truncate t_mem - fromIntegral f_imem
303
                       , allocInfoDisk  = truncate t_dsk - fromIntegral f_idsk
304
                       }
305
  in (rini, rfin, runa)
306

    
307
-- | The names and weights of the individual elements in the CV list.
308
detailedCVInfo :: [(Double, String)]
309
detailedCVInfo = [ (1,  "free_mem_cv")
310
                 , (1,  "free_disk_cv")
311
                 , (1,  "n1_cnt")
312
                 , (1,  "reserved_mem_cv")
313
                 , (4,  "offline_all_cnt")
314
                 , (16, "offline_pri_cnt")
315
                 , (1,  "vcpu_ratio_cv")
316
                 , (1,  "cpu_load_cv")
317
                 , (1,  "mem_load_cv")
318
                 , (1,  "disk_load_cv")
319
                 , (1,  "net_load_cv")
320
                 , (2,  "pri_tags_score")
321
                 , (1,  "spindles_cv")
322
                 ]
323

    
324
-- | Holds the weights used by 'compCVNodes' for each metric.
325
detailedCVWeights :: [Double]
326
detailedCVWeights = map fst detailedCVInfo
327

    
328
-- | Compute the mem and disk covariance.
329
compDetailedCV :: [Node.Node] -> [Double]
330
compDetailedCV all_nodes =
331
  let (offline, nodes) = partition Node.offline all_nodes
332
      mem_l = map Node.pMem nodes
333
      dsk_l = map Node.pDsk nodes
334
      -- metric: memory covariance
335
      mem_cv = stdDev mem_l
336
      -- metric: disk covariance
337
      dsk_cv = stdDev dsk_l
338
      -- metric: count of instances living on N1 failing nodes
339
      n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
340
                                                 length (Node.pList n)) .
341
                 filter Node.failN1 $ nodes :: Double
342
      res_l = map Node.pRem nodes
343
      -- metric: reserved memory covariance
344
      res_cv = stdDev res_l
345
      -- offline instances metrics
346
      offline_ipri = sum . map (length . Node.pList) $ offline
347
      offline_isec = sum . map (length . Node.sList) $ offline
348
      -- metric: count of instances on offline nodes
349
      off_score = fromIntegral (offline_ipri + offline_isec)::Double
350
      -- metric: count of primary instances on offline nodes (this
351
      -- helps with evacuation/failover of primary instances on
352
      -- 2-node clusters with one node offline)
353
      off_pri_score = fromIntegral offline_ipri::Double
354
      cpu_l = map Node.pCpu nodes
355
      -- metric: covariance of vcpu/pcpu ratio
356
      cpu_cv = stdDev cpu_l
357
      -- metrics: covariance of cpu, memory, disk and network load
358
      (c_load, m_load, d_load, n_load) =
359
        unzip4 $ map (\n ->
360
                      let DynUtil c1 m1 d1 n1 = Node.utilLoad n
361
                          DynUtil c2 m2 d2 n2 = Node.utilPool n
362
                      in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes
363
      -- metric: conflicting instance count
364
      pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
365
      pri_tags_score = fromIntegral pri_tags_inst::Double
366
      -- metric: spindles %
367
      spindles_cv = map (\n -> Node.instSpindles n / Node.hiSpindles n) nodes
368
  in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
369
     , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
370
     , pri_tags_score, stdDev spindles_cv ]
371

    
372
-- | Compute the /total/ variance.
373
compCVNodes :: [Node.Node] -> Double
374
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
375

    
376
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
377
compCV :: Node.List -> Double
378
compCV = compCVNodes . Container.elems
379

    
380
-- | Compute online nodes from a 'Node.List'.
381
getOnline :: Node.List -> [Node.Node]
382
getOnline = filter (not . Node.offline) . Container.elems
383

    
384
-- * Balancing functions
385

    
386
-- | Compute best table. Note that the ordering of the arguments is important.
387
compareTables :: Table -> Table -> Table
388
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
389
  if a_cv > b_cv then b else a
390

    
391
-- | Applies an instance move to a given node list and instance.
392
applyMove :: Node.List -> Instance.Instance
393
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
394
-- Failover (f)
395
applyMove nl inst Failover =
396
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
397
      int_p = Node.removePri old_p inst
398
      int_s = Node.removeSec old_s inst
399
      new_nl = do -- Maybe monad
400
        new_p <- Node.addPriEx (Node.offline old_p) int_s inst
401
        new_s <- Node.addSec int_p inst old_sdx
402
        let new_inst = Instance.setBoth inst old_sdx old_pdx
403
        return (Container.addTwo old_pdx new_s old_sdx new_p nl,
404
                new_inst, old_sdx, old_pdx)
405
  in new_nl
406

    
407
-- Failover to any (fa)
408
applyMove nl inst (FailoverToAny new_pdx) = do
409
  let (old_pdx, old_sdx, old_pnode, _) = instanceNodes nl inst
410
      new_pnode = Container.find new_pdx nl
411
      force_failover = Node.offline old_pnode
412
  new_pnode' <- Node.addPriEx force_failover new_pnode inst
413
  let old_pnode' = Node.removePri old_pnode inst
414
      inst' = Instance.setPri inst new_pdx
415
      nl' = Container.addTwo old_pdx old_pnode' new_pdx new_pnode' nl
416
  return (nl', inst', new_pdx, old_sdx)
417

    
418
-- Replace the primary (f:, r:np, f)
419
applyMove nl inst (ReplacePrimary new_pdx) =
420
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
421
      tgt_n = Container.find new_pdx nl
422
      int_p = Node.removePri old_p inst
423
      int_s = Node.removeSec old_s inst
424
      force_p = Node.offline old_p
425
      new_nl = do -- Maybe monad
426
                  -- check that the current secondary can host the instance
427
                  -- during the migration
428
        tmp_s <- Node.addPriEx force_p int_s inst
429
        let tmp_s' = Node.removePri tmp_s inst
430
        new_p <- Node.addPriEx force_p tgt_n inst
431
        new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
432
        let new_inst = Instance.setPri inst new_pdx
433
        return (Container.add new_pdx new_p $
434
                Container.addTwo old_pdx int_p old_sdx new_s nl,
435
                new_inst, new_pdx, old_sdx)
436
  in new_nl
437

    
438
-- Replace the secondary (r:ns)
439
applyMove nl inst (ReplaceSecondary new_sdx) =
440
  let old_pdx = Instance.pNode inst
441
      old_sdx = Instance.sNode inst
442
      old_s = Container.find old_sdx nl
443
      tgt_n = Container.find new_sdx nl
444
      int_s = Node.removeSec old_s inst
445
      force_s = Node.offline old_s
446
      new_inst = Instance.setSec inst new_sdx
447
      new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
448
               \new_s -> return (Container.addTwo new_sdx
449
                                 new_s old_sdx int_s nl,
450
                                 new_inst, old_pdx, new_sdx)
451
  in new_nl
452

    
453
-- Replace the secondary and failover (r:np, f)
454
applyMove nl inst (ReplaceAndFailover new_pdx) =
455
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
456
      tgt_n = Container.find new_pdx nl
457
      int_p = Node.removePri old_p inst
458
      int_s = Node.removeSec old_s inst
459
      force_s = Node.offline old_s
460
      new_nl = do -- Maybe monad
461
        new_p <- Node.addPri tgt_n inst
462
        new_s <- Node.addSecEx force_s int_p inst new_pdx
463
        let new_inst = Instance.setBoth inst new_pdx old_pdx
464
        return (Container.add new_pdx new_p $
465
                Container.addTwo old_pdx new_s old_sdx int_s nl,
466
                new_inst, new_pdx, old_pdx)
467
  in new_nl
468

    
469
-- Failver and replace the secondary (f, r:ns)
470
applyMove nl inst (FailoverAndReplace new_sdx) =
471
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
472
      tgt_n = Container.find new_sdx nl
473
      int_p = Node.removePri old_p inst
474
      int_s = Node.removeSec old_s inst
475
      force_p = Node.offline old_p
476
      new_nl = do -- Maybe monad
477
        new_p <- Node.addPriEx force_p int_s inst
478
        new_s <- Node.addSecEx force_p tgt_n inst old_sdx
479
        let new_inst = Instance.setBoth inst old_sdx new_sdx
480
        return (Container.add new_sdx new_s $
481
                Container.addTwo old_sdx new_p old_pdx int_p nl,
482
                new_inst, old_sdx, new_sdx)
483
  in new_nl
484

    
485
-- | Tries to allocate an instance on one given node.
486
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
487
                 -> OpResult Node.AllocElement
488
allocateOnSingle nl inst new_pdx =
489
  let p = Container.find new_pdx nl
490
      new_inst = Instance.setBoth inst new_pdx Node.noSecondary
491
  in do
492
    Instance.instMatchesPolicy inst (Node.iPolicy p)
493
    new_p <- Node.addPri p inst
494
    let new_nl = Container.add new_pdx new_p nl
495
        new_score = compCV new_nl
496
    return (new_nl, new_inst, [new_p], new_score)
497

    
498
-- | Tries to allocate an instance on a given pair of nodes.
499
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
500
               -> OpResult Node.AllocElement
501
allocateOnPair nl inst new_pdx new_sdx =
502
  let tgt_p = Container.find new_pdx nl
503
      tgt_s = Container.find new_sdx nl
504
  in do
505
    Instance.instMatchesPolicy inst (Node.iPolicy tgt_p)
506
    new_p <- Node.addPri tgt_p inst
507
    new_s <- Node.addSec tgt_s inst new_pdx
508
    let new_inst = Instance.setBoth inst new_pdx new_sdx
509
        new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
510
    return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
511

    
512
-- | Tries to perform an instance move and returns the best table
513
-- between the original one and the new one.
514
checkSingleStep :: Table -- ^ The original table
515
                -> Instance.Instance -- ^ The instance to move
516
                -> Table -- ^ The current best table
517
                -> IMove -- ^ The move to apply
518
                -> Table -- ^ The final best table
519
checkSingleStep ini_tbl target cur_tbl move =
520
  let Table ini_nl ini_il _ ini_plc = ini_tbl
521
      tmp_resu = applyMove ini_nl target move
522
  in case tmp_resu of
523
       Bad _ -> cur_tbl
524
       Ok (upd_nl, new_inst, pri_idx, sec_idx) ->
525
         let tgt_idx = Instance.idx target
526
             upd_cvar = compCV upd_nl
527
             upd_il = Container.add tgt_idx new_inst ini_il
528
             upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
529
             upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
530
         in compareTables cur_tbl upd_tbl
531

    
532
-- | Given the status of the current secondary as a valid new node and
533
-- the current candidate target node, generate the possible moves for
534
-- a instance.
535
possibleMoves :: MirrorType -- ^ The mirroring type of the instance
536
              -> Bool       -- ^ Whether the secondary node is a valid new node
537
              -> Bool       -- ^ Whether we can change the primary node
538
              -> Ndx        -- ^ Target node candidate
539
              -> [IMove]    -- ^ List of valid result moves
540

    
541
possibleMoves MirrorNone _ _ _ = []
542

    
543
possibleMoves MirrorExternal _ False _ = []
544

    
545
possibleMoves MirrorExternal _ True tdx =
546
  [ FailoverToAny tdx ]
547

    
548
possibleMoves MirrorInternal _ False tdx =
549
  [ ReplaceSecondary tdx ]
550

    
551
possibleMoves MirrorInternal True True tdx =
552
  [ ReplaceSecondary tdx
553
  , ReplaceAndFailover tdx
554
  , ReplacePrimary tdx
555
  , FailoverAndReplace tdx
556
  ]
557

    
558
possibleMoves MirrorInternal False True tdx =
559
  [ ReplaceSecondary tdx
560
  , ReplaceAndFailover tdx
561
  ]
562

    
563
-- | Compute the best move for a given instance.
564
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
565
                  -> Bool              -- ^ Whether disk moves are allowed
566
                  -> Bool              -- ^ Whether instance moves are allowed
567
                  -> Table             -- ^ Original table
568
                  -> Instance.Instance -- ^ Instance to move
569
                  -> Table             -- ^ Best new table for this instance
570
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
571
  let opdx = Instance.pNode target
572
      osdx = Instance.sNode target
573
      bad_nodes = [opdx, osdx]
574
      nodes = filter (`notElem` bad_nodes) nodes_idx
575
      mir_type = Instance.mirrorType target
576
      use_secondary = elem osdx nodes_idx && inst_moves
577
      aft_failover = if mir_type == MirrorInternal && use_secondary
578
                       -- if drbd and allowed to failover
579
                       then checkSingleStep ini_tbl target ini_tbl Failover
580
                       else ini_tbl
581
      all_moves =
582
        if disk_moves
583
          then concatMap (possibleMoves mir_type use_secondary inst_moves)
584
               nodes
585
          else []
586
    in
587
      -- iterate over the possible nodes for this instance
588
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
589

    
590
-- | Compute the best next move.
591
checkMove :: [Ndx]               -- ^ Allowed target node indices
592
          -> Bool                -- ^ Whether disk moves are allowed
593
          -> Bool                -- ^ Whether instance moves are allowed
594
          -> Table               -- ^ The current solution
595
          -> [Instance.Instance] -- ^ List of instances still to move
596
          -> Table               -- ^ The new solution
597
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
598
  let Table _ _ _ ini_plc = ini_tbl
599
      -- we're using rwhnf from the Control.Parallel.Strategies
600
      -- package; we don't need to use rnf as that would force too
601
      -- much evaluation in single-threaded cases, and in
602
      -- multi-threaded case the weak head normal form is enough to
603
      -- spark the evaluation
604
      tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
605
                             inst_moves ini_tbl)
606
               victims
607
      -- iterate over all instances, computing the best move
608
      best_tbl = foldl' compareTables ini_tbl tables
609
      Table _ _ _ best_plc = best_tbl
610
  in if length best_plc == length ini_plc
611
       then ini_tbl -- no advancement
612
       else best_tbl
613

    
614
-- | Check if we are allowed to go deeper in the balancing.
615
doNextBalance :: Table     -- ^ The starting table
616
              -> Int       -- ^ Remaining length
617
              -> Score     -- ^ Score at which to stop
618
              -> Bool      -- ^ The resulting table and commands
619
doNextBalance ini_tbl max_rounds min_score =
620
  let Table _ _ ini_cv ini_plc = ini_tbl
621
      ini_plc_len = length ini_plc
622
  in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
623

    
624
-- | Run a balance move.
625
tryBalance :: Table       -- ^ The starting table
626
           -> Bool        -- ^ Allow disk moves
627
           -> Bool        -- ^ Allow instance moves
628
           -> Bool        -- ^ Only evacuate moves
629
           -> Score       -- ^ Min gain threshold
630
           -> Score       -- ^ Min gain
631
           -> Maybe Table -- ^ The resulting table and commands
632
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
633
    let Table ini_nl ini_il ini_cv _ = ini_tbl
634
        all_inst = Container.elems ini_il
635
        all_nodes = Container.elems ini_nl
636
        (offline_nodes, online_nodes) = partition Node.offline all_nodes
637
        all_inst' = if evac_mode
638
                      then let bad_nodes = map Node.idx offline_nodes
639
                           in filter (any (`elem` bad_nodes) .
640
                                          Instance.allNodes) all_inst
641
                      else all_inst
642
        reloc_inst = filter (\i -> Instance.movable i &&
643
                                   Instance.autoBalance i) all_inst'
644
        node_idx = map Node.idx online_nodes
645
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
646
        (Table _ _ fin_cv _) = fin_tbl
647
    in
648
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
649
      then Just fin_tbl -- this round made success, return the new table
650
      else Nothing
651

    
652
-- * Allocation functions
653

    
654
-- | Build failure stats out of a list of failures.
655
collapseFailures :: [FailMode] -> FailStats
656
collapseFailures flst =
657
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
658
            [minBound..maxBound]
659

    
660
-- | Compares two Maybe AllocElement and chooses the best score.
661
bestAllocElement :: Maybe Node.AllocElement
662
                 -> Maybe Node.AllocElement
663
                 -> Maybe Node.AllocElement
664
bestAllocElement a Nothing = a
665
bestAllocElement Nothing b = b
666
bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
667
  if ascore < bscore then a else b
668

    
669
-- | Update current Allocation solution and failure stats with new
670
-- elements.
671
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
672
concatAllocs as (Bad reason) = as { asFailures = reason : asFailures as }
673

    
674
concatAllocs as (Ok ns) =
675
  let -- Choose the old or new solution, based on the cluster score
676
    cntok = asAllocs as
677
    osols = asSolution as
678
    nsols = bestAllocElement osols (Just ns)
679
    nsuc = cntok + 1
680
    -- Note: we force evaluation of nsols here in order to keep the
681
    -- memory profile low - we know that we will need nsols for sure
682
    -- in the next cycle, so we force evaluation of nsols, since the
683
    -- foldl' in the caller will only evaluate the tuple, but not the
684
    -- elements of the tuple
685
  in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
686

    
687
-- | Sums two 'AllocSolution' structures.
688
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
689
sumAllocs (AllocSolution aFails aAllocs aSols aLog)
690
          (AllocSolution bFails bAllocs bSols bLog) =
691
  -- note: we add b first, since usually it will be smaller; when
692
  -- fold'ing, a will grow and grow whereas b is the per-group
693
  -- result, hence smaller
694
  let nFails  = bFails ++ aFails
695
      nAllocs = aAllocs + bAllocs
696
      nSols   = bestAllocElement aSols bSols
697
      nLog    = bLog ++ aLog
698
  in AllocSolution nFails nAllocs nSols nLog
699

    
700
-- | Given a solution, generates a reasonable description for it.
701
describeSolution :: AllocSolution -> String
702
describeSolution as =
703
  let fcnt = asFailures as
704
      sols = asSolution as
705
      freasons =
706
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
707
        filter ((> 0) . snd) . collapseFailures $ fcnt
708
  in case sols of
709
     Nothing -> "No valid allocation solutions, failure reasons: " ++
710
                (if null fcnt then "unknown reasons" else freasons)
711
     Just (_, _, nodes, cv) ->
712
         printf ("score: %.8f, successes %d, failures %d (%s)" ++
713
                 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
714
               (intercalate "/" . map Node.name $ nodes)
715

    
716
-- | Annotates a solution with the appropriate string.
717
annotateSolution :: AllocSolution -> AllocSolution
718
annotateSolution as = as { asLog = describeSolution as : asLog as }
719

    
720
-- | Reverses an evacuation solution.
721
--
722
-- Rationale: we always concat the results to the top of the lists, so
723
-- for proper jobset execution, we should reverse all lists.
724
reverseEvacSolution :: EvacSolution -> EvacSolution
725
reverseEvacSolution (EvacSolution f m o) =
726
  EvacSolution (reverse f) (reverse m) (reverse o)
727

    
728
-- | Generate the valid node allocation singles or pairs for a new instance.
729
genAllocNodes :: Group.List        -- ^ Group list
730
              -> Node.List         -- ^ The node map
731
              -> Int               -- ^ The number of nodes required
732
              -> Bool              -- ^ Whether to drop or not
733
                                   -- unallocable nodes
734
              -> Result AllocNodes -- ^ The (monadic) result
735
genAllocNodes gl nl count drop_unalloc =
736
  let filter_fn = if drop_unalloc
737
                    then filter (Group.isAllocable .
738
                                 flip Container.find gl . Node.group)
739
                    else id
740
      all_nodes = filter_fn $ getOnline nl
741
      all_pairs = [(Node.idx p,
742
                    [Node.idx s | s <- all_nodes,
743
                                       Node.idx p /= Node.idx s,
744
                                       Node.group p == Node.group s]) |
745
                   p <- all_nodes]
746
  in case count of
747
       1 -> Ok (Left (map Node.idx all_nodes))
748
       2 -> Ok (Right (filter (not . null . snd) all_pairs))
749
       _ -> Bad "Unsupported number of nodes, only one or two  supported"
750

    
751
-- | Try to allocate an instance on the cluster.
752
tryAlloc :: (Monad m) =>
753
            Node.List         -- ^ The node list
754
         -> Instance.List     -- ^ The instance list
755
         -> Instance.Instance -- ^ The instance to allocate
756
         -> AllocNodes        -- ^ The allocation targets
757
         -> m AllocSolution   -- ^ Possible solution list
758
tryAlloc _  _ _    (Right []) = fail "Not enough online nodes"
759
tryAlloc nl _ inst (Right ok_pairs) =
760
  let psols = parMap rwhnf (\(p, ss) ->
761
                              foldl' (\cstate ->
762
                                        concatAllocs cstate .
763
                                        allocateOnPair nl inst p)
764
                              emptyAllocSolution ss) ok_pairs
765
      sols = foldl' sumAllocs emptyAllocSolution psols
766
  in return $ annotateSolution sols
767

    
768
tryAlloc _  _ _    (Left []) = fail "No online nodes"
769
tryAlloc nl _ inst (Left all_nodes) =
770
  let sols = foldl' (\cstate ->
771
                       concatAllocs cstate . allocateOnSingle nl inst
772
                    ) emptyAllocSolution all_nodes
773
  in return $ annotateSolution sols
774

    
775
-- | Given a group/result, describe it as a nice (list of) messages.
776
solutionDescription :: (Group.Group, Result AllocSolution)
777
                    -> [String]
778
solutionDescription (grp, result) =
779
  case result of
780
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
781
    Bad message -> [printf "Group %s: error %s" gname message]
782
  where gname = Group.name grp
783
        pol = allocPolicyToRaw (Group.allocPolicy grp)
784

    
785
-- | From a list of possibly bad and possibly empty solutions, filter
786
-- only the groups with a valid result. Note that the result will be
787
-- reversed compared to the original list.
788
filterMGResults :: [(Group.Group, Result AllocSolution)]
789
                -> [(Group.Group, AllocSolution)]
790
filterMGResults = foldl' fn []
791
  where unallocable = not . Group.isAllocable
792
        fn accu (grp, rasol) =
793
          case rasol of
794
            Bad _ -> accu
795
            Ok sol | isNothing (asSolution sol) -> accu
796
                   | unallocable grp -> accu
797
                   | otherwise -> (grp, sol):accu
798

    
799
-- | Sort multigroup results based on policy and score.
800
sortMGResults :: [(Group.Group, AllocSolution)]
801
              -> [(Group.Group, AllocSolution)]
802
sortMGResults sols =
803
  let extractScore (_, _, _, x) = x
804
      solScore (grp, sol) = (Group.allocPolicy grp,
805
                             (extractScore . fromJust . asSolution) sol)
806
  in sortBy (comparing solScore) sols
807

    
808
-- | Removes node groups which can't accommodate the instance
809
filterValidGroups :: [(Group.Group, (Node.List, Instance.List))]
810
                  -> Instance.Instance
811
                  -> ([(Group.Group, (Node.List, Instance.List))], [String])
812
filterValidGroups [] _ = ([], [])
813
filterValidGroups (ng:ngs) inst =
814
  let (valid_ngs, msgs) = filterValidGroups ngs inst
815
      hasNetwork nic = case Nic.network nic of
816
        Just net -> net `elem` Group.networks (fst ng)
817
        Nothing -> True
818
      hasRequiredNetworks = all hasNetwork (Instance.nics inst)
819
  in if hasRequiredNetworks
820
      then (ng:valid_ngs, msgs)
821
      else (valid_ngs,
822
            ("group " ++ Group.name (fst ng) ++
823
             " is not connected to a network required by instance " ++
824
             Instance.name inst):msgs)
825

    
826
-- | Finds the best group for an instance on a multi-group cluster.
827
--
828
-- Only solutions in @preferred@ and @last_resort@ groups will be
829
-- accepted as valid, and additionally if the allowed groups parameter
830
-- is not null then allocation will only be run for those group
831
-- indices.
832
findBestAllocGroup :: Group.List           -- ^ The group list
833
                   -> Node.List            -- ^ The node list
834
                   -> Instance.List        -- ^ The instance list
835
                   -> Maybe [Gdx]          -- ^ The allowed groups
836
                   -> Instance.Instance    -- ^ The instance to allocate
837
                   -> Int                  -- ^ Required number of nodes
838
                   -> Result (Group.Group, AllocSolution, [String])
839
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
840
  let groups_by_idx = splitCluster mgnl mgil
841
      groups = map (\(gid, d) -> (Container.find gid mggl, d)) groups_by_idx
842
      groups' = maybe groups
843
                (\gs -> filter ((`elem` gs) . Group.idx . fst) groups)
844
                allowed_gdxs
845
      (groups'', filter_group_msgs) = filterValidGroups groups' inst
846
      sols = map (\(gr, (nl, il)) ->
847
                   (gr, genAllocNodes mggl nl cnt False >>=
848
                        tryAlloc nl il inst))
849
             groups''::[(Group.Group, Result AllocSolution)]
850
      all_msgs = filter_group_msgs ++ concatMap solutionDescription sols
851
      goodSols = filterMGResults sols
852
      sortedSols = sortMGResults goodSols
853
  in case sortedSols of
854
       [] -> Bad $ if null groups'
855
                     then "no groups for evacuation: allowed groups was" ++
856
                          show allowed_gdxs ++ ", all groups: " ++
857
                          show (map fst groups)
858
                     else intercalate ", " all_msgs
859
       (final_group, final_sol):_ -> return (final_group, final_sol, all_msgs)
860

    
861
-- | Try to allocate an instance on a multi-group cluster.
862
tryMGAlloc :: Group.List           -- ^ The group list
863
           -> Node.List            -- ^ The node list
864
           -> Instance.List        -- ^ The instance list
865
           -> Instance.Instance    -- ^ The instance to allocate
866
           -> Int                  -- ^ Required number of nodes
867
           -> Result AllocSolution -- ^ Possible solution list
868
tryMGAlloc mggl mgnl mgil inst cnt = do
869
  (best_group, solution, all_msgs) <-
870
      findBestAllocGroup mggl mgnl mgil Nothing inst cnt
871
  let group_name = Group.name best_group
872
      selmsg = "Selected group: " ++ group_name
873
  return $ solution { asLog = selmsg:all_msgs }
874

    
875
-- | Calculate the new instance list after allocation solution.
876
updateIl :: Instance.List           -- ^ The original instance list
877
         -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
878
         -> Instance.List           -- ^ The updated instance list
879
updateIl il Nothing = il
880
updateIl il (Just (_, xi, _, _)) = Container.add (Container.size il) xi il
881

    
882
-- | Extract the the new node list from the allocation solution.
883
extractNl :: Node.List               -- ^ The original node list
884
          -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
885
          -> Node.List               -- ^ The new node list
886
extractNl nl Nothing = nl
887
extractNl _ (Just (xnl, _, _, _)) = xnl
888

    
889
-- | Try to allocate a list of instances on a multi-group cluster.
890
allocList :: Group.List                  -- ^ The group list
891
          -> Node.List                   -- ^ The node list
892
          -> Instance.List               -- ^ The instance list
893
          -> [(Instance.Instance, Int)]  -- ^ The instance to allocate
894
          -> AllocSolutionList           -- ^ Possible solution list
895
          -> Result (Node.List, Instance.List,
896
                     AllocSolutionList)  -- ^ The final solution list
897
allocList _  nl il [] result = Ok (nl, il, result)
898
allocList gl nl il ((xi, xicnt):xies) result = do
899
  ares <- tryMGAlloc gl nl il xi xicnt
900
  let sol = asSolution ares
901
      nl' = extractNl nl sol
902
      il' = updateIl il sol
903
  allocList gl nl' il' xies ((xi, ares):result)
904

    
905
-- | Function which fails if the requested mode is change secondary.
906
--
907
-- This is useful since except DRBD, no other disk template can
908
-- execute change secondary; thus, we can just call this function
909
-- instead of always checking for secondary mode. After the call to
910
-- this function, whatever mode we have is just a primary change.
911
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
912
failOnSecondaryChange ChangeSecondary dt =
913
  fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++
914
         "' can't execute change secondary"
915
failOnSecondaryChange _ _ = return ()
916

    
917
-- | Run evacuation for a single instance.
918
--
919
-- /Note:/ this function should correctly execute both intra-group
920
-- evacuations (in all modes) and inter-group evacuations (in the
921
-- 'ChangeAll' mode). Of course, this requires that the correct list
922
-- of target nodes is passed.
923
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
924
                 -> Instance.List     -- ^ Instance list (cluster-wide)
925
                 -> EvacMode          -- ^ The evacuation mode
926
                 -> Instance.Instance -- ^ The instance to be evacuated
927
                 -> Gdx               -- ^ The group we're targetting
928
                 -> [Ndx]             -- ^ The list of available nodes
929
                                      -- for allocation
930
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
931
nodeEvacInstance nl il mode inst@(Instance.Instance
932
                                  {Instance.diskTemplate = dt@DTDiskless})
933
                 gdx avail_nodes =
934
                   failOnSecondaryChange mode dt >>
935
                   evacOneNodeOnly nl il inst gdx avail_nodes
936

    
937
nodeEvacInstance _ _ _ (Instance.Instance
938
                        {Instance.diskTemplate = DTPlain}) _ _ =
939
                  fail "Instances of type plain cannot be relocated"
940

    
941
nodeEvacInstance _ _ _ (Instance.Instance
942
                        {Instance.diskTemplate = DTFile}) _ _ =
943
                  fail "Instances of type file cannot be relocated"
944

    
945
nodeEvacInstance nl il mode inst@(Instance.Instance
946
                                  {Instance.diskTemplate = dt@DTSharedFile})
947
                 gdx avail_nodes =
948
                   failOnSecondaryChange mode dt >>
949
                   evacOneNodeOnly nl il inst gdx avail_nodes
950

    
951
nodeEvacInstance nl il mode inst@(Instance.Instance
952
                                  {Instance.diskTemplate = dt@DTBlock})
953
                 gdx avail_nodes =
954
                   failOnSecondaryChange mode dt >>
955
                   evacOneNodeOnly nl il inst gdx avail_nodes
956

    
957
nodeEvacInstance nl il mode inst@(Instance.Instance
958
                                  {Instance.diskTemplate = dt@DTRbd})
959
                 gdx avail_nodes =
960
                   failOnSecondaryChange mode dt >>
961
                   evacOneNodeOnly nl il inst gdx avail_nodes
962

    
963
nodeEvacInstance nl il mode inst@(Instance.Instance
964
                                  {Instance.diskTemplate = dt@DTExt})
965
                 gdx avail_nodes =
966
                   failOnSecondaryChange mode dt >>
967
                   evacOneNodeOnly nl il inst gdx avail_nodes
968

    
969
nodeEvacInstance nl il ChangePrimary
970
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
971
                 _ _ =
972
  do
973
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
974
    let idx = Instance.idx inst
975
        il' = Container.add idx inst' il
976
        ops = iMoveToJob nl' il' idx Failover
977
    return (nl', il', ops)
978

    
979
nodeEvacInstance nl il ChangeSecondary
980
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
981
                 gdx avail_nodes =
982
  evacOneNodeOnly nl il inst gdx avail_nodes
983

    
984
-- The algorithm for ChangeAll is as follows:
985
--
986
-- * generate all (primary, secondary) node pairs for the target groups
987
-- * for each pair, execute the needed moves (r:s, f, r:s) and compute
988
--   the final node list state and group score
989
-- * select the best choice via a foldl that uses the same Either
990
--   String solution as the ChangeSecondary mode
991
nodeEvacInstance nl il ChangeAll
992
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
993
                 gdx avail_nodes =
994
  do
995
    let no_nodes = Left "no nodes available"
996
        node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
997
    (nl', il', ops, _) <-
998
        annotateResult "Can't find any good nodes for relocation" .
999
        eitherToResult $
1000
        foldl'
1001
        (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
1002
                          Bad msg ->
1003
                              case accu of
1004
                                Right _ -> accu
1005
                                -- we don't need more details (which
1006
                                -- nodes, etc.) as we only selected
1007
                                -- this group if we can allocate on
1008
                                -- it, hence failures will not
1009
                                -- propagate out of this fold loop
1010
                                Left _ -> Left $ "Allocation failed: " ++ msg
1011
                          Ok result@(_, _, _, new_cv) ->
1012
                              let new_accu = Right result in
1013
                              case accu of
1014
                                Left _ -> new_accu
1015
                                Right (_, _, _, old_cv) ->
1016
                                    if old_cv < new_cv
1017
                                    then accu
1018
                                    else new_accu
1019
        ) no_nodes node_pairs
1020

    
1021
    return (nl', il', ops)
1022

    
1023
-- | Generic function for changing one node of an instance.
1024
--
1025
-- This is similar to 'nodeEvacInstance' but will be used in a few of
1026
-- its sub-patterns. It folds the inner function 'evacOneNodeInner'
1027
-- over the list of available nodes, which results in the best choice
1028
-- for relocation.
1029
evacOneNodeOnly :: Node.List         -- ^ The node list (cluster-wide)
1030
                -> Instance.List     -- ^ Instance list (cluster-wide)
1031
                -> Instance.Instance -- ^ The instance to be evacuated
1032
                -> Gdx               -- ^ The group we're targetting
1033
                -> [Ndx]             -- ^ The list of available nodes
1034
                                      -- for allocation
1035
                -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1036
evacOneNodeOnly nl il inst gdx avail_nodes = do
1037
  op_fn <- case Instance.mirrorType inst of
1038
             MirrorNone -> Bad "Can't relocate/evacuate non-mirrored instances"
1039
             MirrorInternal -> Ok ReplaceSecondary
1040
             MirrorExternal -> Ok FailoverToAny
1041
  (nl', inst', _, ndx) <- annotateResult "Can't find any good node" .
1042
                          eitherToResult $
1043
                          foldl' (evacOneNodeInner nl inst gdx op_fn)
1044
                          (Left "no nodes available") avail_nodes
1045
  let idx = Instance.idx inst
1046
      il' = Container.add idx inst' il
1047
      ops = iMoveToJob nl' il' idx (op_fn ndx)
1048
  return (nl', il', ops)
1049

    
1050
-- | Inner fold function for changing one node of an instance.
1051
--
1052
-- Depending on the instance disk template, this will either change
1053
-- the secondary (for DRBD) or the primary node (for shared
1054
-- storage). However, the operation is generic otherwise.
1055
--
1056
-- The running solution is either a @Left String@, which means we
1057
-- don't have yet a working solution, or a @Right (...)@, which
1058
-- represents a valid solution; it holds the modified node list, the
1059
-- modified instance (after evacuation), the score of that solution,
1060
-- and the new secondary node index.
1061
evacOneNodeInner :: Node.List         -- ^ Cluster node list
1062
                 -> Instance.Instance -- ^ Instance being evacuated
1063
                 -> Gdx               -- ^ The group index of the instance
1064
                 -> (Ndx -> IMove)    -- ^ Operation constructor
1065
                 -> EvacInnerState    -- ^ Current best solution
1066
                 -> Ndx               -- ^ Node we're evaluating as target
1067
                 -> EvacInnerState    -- ^ New best solution
1068
evacOneNodeInner nl inst gdx op_fn accu ndx =
1069
  case applyMove nl inst (op_fn ndx) of
1070
    Bad fm -> let fail_msg = "Node " ++ Container.nameOf nl ndx ++
1071
                             " failed: " ++ show fm
1072
              in either (const $ Left fail_msg) (const accu) accu
1073
    Ok (nl', inst', _, _) ->
1074
      let nodes = Container.elems nl'
1075
          -- The fromJust below is ugly (it can fail nastily), but
1076
          -- at this point we should have any internal mismatches,
1077
          -- and adding a monad here would be quite involved
1078
          grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1079
          new_cv = compCVNodes grpnodes
1080
          new_accu = Right (nl', inst', new_cv, ndx)
1081
      in case accu of
1082
           Left _ -> new_accu
1083
           Right (_, _, old_cv, _) ->
1084
             if old_cv < new_cv
1085
               then accu
1086
               else new_accu
1087

    
1088
-- | Compute result of changing all nodes of a DRBD instance.
1089
--
1090
-- Given the target primary and secondary node (which might be in a
1091
-- different group or not), this function will 'execute' all the
1092
-- required steps and assuming all operations succceed, will return
1093
-- the modified node and instance lists, the opcodes needed for this
1094
-- and the new group score.
1095
evacDrbdAllInner :: Node.List         -- ^ Cluster node list
1096
                 -> Instance.List     -- ^ Cluster instance list
1097
                 -> Instance.Instance -- ^ The instance to be moved
1098
                 -> Gdx               -- ^ The target group index
1099
                                      -- (which can differ from the
1100
                                      -- current group of the
1101
                                      -- instance)
1102
                 -> (Ndx, Ndx)        -- ^ Tuple of new
1103
                                      -- primary\/secondary nodes
1104
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
1105
evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
1106
  let primary = Container.find (Instance.pNode inst) nl
1107
      idx = Instance.idx inst
1108
  -- if the primary is offline, then we first failover
1109
  (nl1, inst1, ops1) <-
1110
    if Node.offline primary
1111
      then do
1112
        (nl', inst', _, _) <-
1113
          annotateResult "Failing over to the secondary" .
1114
          opToResult $ applyMove nl inst Failover
1115
        return (nl', inst', [Failover])
1116
      else return (nl, inst, [])
1117
  let (o1, o2, o3) = (ReplaceSecondary t_pdx,
1118
                      Failover,
1119
                      ReplaceSecondary t_sdx)
1120
  -- we now need to execute a replace secondary to the future
1121
  -- primary node
1122
  (nl2, inst2, _, _) <-
1123
    annotateResult "Changing secondary to new primary" .
1124
    opToResult $
1125
    applyMove nl1 inst1 o1
1126
  let ops2 = o1:ops1
1127
  -- we now execute another failover, the primary stays fixed now
1128
  (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" .
1129
                        opToResult $ applyMove nl2 inst2 o2
1130
  let ops3 = o2:ops2
1131
  -- and finally another replace secondary, to the final secondary
1132
  (nl4, inst4, _, _) <-
1133
    annotateResult "Changing secondary to final secondary" .
1134
    opToResult $
1135
    applyMove nl3 inst3 o3
1136
  let ops4 = o3:ops3
1137
      il' = Container.add idx inst4 il
1138
      ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1139
  let nodes = Container.elems nl4
1140
      -- The fromJust below is ugly (it can fail nastily), but
1141
      -- at this point we should have any internal mismatches,
1142
      -- and adding a monad here would be quite involved
1143
      grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1144
      new_cv = compCVNodes grpnodes
1145
  return (nl4, il', ops, new_cv)
1146

    
1147
-- | Computes the nodes in a given group which are available for
1148
-- allocation.
1149
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1150
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
1151
                    -> Gdx            -- ^ The group for which we
1152
                                      -- query the nodes
1153
                    -> Result [Ndx]   -- ^ List of available node indices
1154
availableGroupNodes group_nodes excl_ndx gdx = do
1155
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1156
                 Ok (lookup gdx group_nodes)
1157
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1158
  return avail_nodes
1159

    
1160
-- | Updates the evac solution with the results of an instance
1161
-- evacuation.
1162
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1163
                   -> Idx
1164
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1165
                   -> (Node.List, Instance.List, EvacSolution)
1166
updateEvacSolution (nl, il, es) idx (Bad msg) =
1167
  (nl, il, es { esFailed = (idx, msg):esFailed es})
1168
updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1169
  (nl, il, es { esMoved = new_elem:esMoved es
1170
              , esOpCodes = opcodes:esOpCodes es })
1171
    where inst = Container.find idx il
1172
          new_elem = (idx,
1173
                      instancePriGroup nl inst,
1174
                      Instance.allNodes inst)
1175

    
1176
-- | Node-evacuation IAllocator mode main function.
1177
tryNodeEvac :: Group.List    -- ^ The cluster groups
1178
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
1179
            -> Instance.List -- ^ Instance list (cluster-wide)
1180
            -> EvacMode      -- ^ The evacuation mode
1181
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
1182
            -> Result (Node.List, Instance.List, EvacSolution)
1183
tryNodeEvac _ ini_nl ini_il mode idxs =
1184
  let evac_ndx = nodesToEvacuate ini_il mode idxs
1185
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1186
      excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1187
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1188
                                           (Container.elems nl))) $
1189
                  splitCluster ini_nl ini_il
1190
      (fin_nl, fin_il, esol) =
1191
        foldl' (\state@(nl, il, _) inst ->
1192
                  let gdx = instancePriGroup nl inst
1193
                      pdx = Instance.pNode inst in
1194
                  updateEvacSolution state (Instance.idx inst) $
1195
                  availableGroupNodes group_ndx
1196
                    (IntSet.insert pdx excl_ndx) gdx >>=
1197
                      nodeEvacInstance nl il mode inst gdx
1198
               )
1199
        (ini_nl, ini_il, emptyEvacSolution)
1200
        (map (`Container.find` ini_il) idxs)
1201
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1202

    
1203
-- | Change-group IAllocator mode main function.
1204
--
1205
-- This is very similar to 'tryNodeEvac', the only difference is that
1206
-- we don't choose as target group the current instance group, but
1207
-- instead:
1208
--
1209
--   1. at the start of the function, we compute which are the target
1210
--   groups; either no groups were passed in, in which case we choose
1211
--   all groups out of which we don't evacuate instance, or there were
1212
--   some groups passed, in which case we use those
1213
--
1214
--   2. for each instance, we use 'findBestAllocGroup' to choose the
1215
--   best group to hold the instance, and then we do what
1216
--   'tryNodeEvac' does, except for this group instead of the current
1217
--   instance group.
1218
--
1219
-- Note that the correct behaviour of this function relies on the
1220
-- function 'nodeEvacInstance' to be able to do correctly both
1221
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1222
tryChangeGroup :: Group.List    -- ^ The cluster groups
1223
               -> Node.List     -- ^ The node list (cluster-wide)
1224
               -> Instance.List -- ^ Instance list (cluster-wide)
1225
               -> [Gdx]         -- ^ Target groups; if empty, any
1226
                                -- groups not being evacuated
1227
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
1228
               -> Result (Node.List, Instance.List, EvacSolution)
1229
tryChangeGroup gl ini_nl ini_il gdxs idxs =
1230
  let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1231
                             flip Container.find ini_il) idxs
1232
      target_gdxs = (if null gdxs
1233
                       then Container.keys gl
1234
                       else gdxs) \\ evac_gdxs
1235
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1236
      excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1237
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1238
                                           (Container.elems nl))) $
1239
                  splitCluster ini_nl ini_il
1240
      (fin_nl, fin_il, esol) =
1241
        foldl' (\state@(nl, il, _) inst ->
1242
                  let solution = do
1243
                        let ncnt = Instance.requiredNodes $
1244
                                   Instance.diskTemplate inst
1245
                        (grp, _, _) <- findBestAllocGroup gl nl il
1246
                                       (Just target_gdxs) inst ncnt
1247
                        let gdx = Group.idx grp
1248
                        av_nodes <- availableGroupNodes group_ndx
1249
                                    excl_ndx gdx
1250
                        nodeEvacInstance nl il ChangeAll inst gdx av_nodes
1251
                  in updateEvacSolution state (Instance.idx inst) solution
1252
               )
1253
        (ini_nl, ini_il, emptyEvacSolution)
1254
        (map (`Container.find` ini_il) idxs)
1255
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1256

    
1257
-- | Standard-sized allocation method.
1258
--
1259
-- This places instances of the same size on the cluster until we're
1260
-- out of space. The result will be a list of identically-sized
1261
-- instances.
1262
iterateAlloc :: AllocMethod
1263
iterateAlloc nl il limit newinst allocnodes ixes cstats =
1264
  let depth = length ixes
1265
      newname = printf "new-%d" depth::String
1266
      newidx = Container.size il
1267
      newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1268
      newlimit = fmap (flip (-) 1) limit
1269
  in case tryAlloc nl il newi2 allocnodes of
1270
       Bad s -> Bad s
1271
       Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
1272
         let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1273
         case sols3 of
1274
           Nothing -> newsol
1275
           Just (xnl, xi, _, _) ->
1276
             if limit == Just 0
1277
               then newsol
1278
               else iterateAlloc xnl (Container.add newidx xi il)
1279
                      newlimit newinst allocnodes (xi:ixes)
1280
                      (totalResources xnl:cstats)
1281

    
1282
-- | Tiered allocation method.
1283
--
1284
-- This places instances on the cluster, and decreases the spec until
1285
-- we can allocate again. The result will be a list of decreasing
1286
-- instance specs.
1287
tieredAlloc :: AllocMethod
1288
tieredAlloc nl il limit newinst allocnodes ixes cstats =
1289
  case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1290
    Bad s -> Bad s
1291
    Ok (errs, nl', il', ixes', cstats') ->
1292
      let newsol = Ok (errs, nl', il', ixes', cstats')
1293
          ixes_cnt = length ixes'
1294
          (stop, newlimit) = case limit of
1295
                               Nothing -> (False, Nothing)
1296
                               Just n -> (n <= ixes_cnt,
1297
                                            Just (n - ixes_cnt)) in
1298
      if stop then newsol else
1299
          case Instance.shrinkByType newinst . fst . last $
1300
               sortBy (comparing snd) errs of
1301
            Bad _ -> newsol
1302
            Ok newinst' -> tieredAlloc nl' il' newlimit
1303
                           newinst' allocnodes ixes' cstats'
1304

    
1305
-- * Formatting functions
1306

    
1307
-- | Given the original and final nodes, computes the relocation description.
1308
computeMoves :: Instance.Instance -- ^ The instance to be moved
1309
             -> String -- ^ The instance name
1310
             -> IMove  -- ^ The move being performed
1311
             -> String -- ^ New primary
1312
             -> String -- ^ New secondary
1313
             -> (String, [String])
1314
                -- ^ Tuple of moves and commands list; moves is containing
1315
                -- either @/f/@ for failover or @/r:name/@ for replace
1316
                -- secondary, while the command list holds gnt-instance
1317
                -- commands (without that prefix), e.g \"@failover instance1@\"
1318
computeMoves i inam mv c d =
1319
  case mv of
1320
    Failover -> ("f", [mig])
1321
    FailoverToAny _ -> (printf "fa:%s" c, [mig_any])
1322
    FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1323
    ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1324
    ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1325
    ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1326
  where morf = if Instance.isRunning i then "migrate" else "failover"
1327
        mig = printf "%s -f %s" morf inam::String
1328
        mig_any = printf "%s -f -n %s %s" morf c inam::String
1329
        rep n = printf "replace-disks -n %s %s" n inam::String
1330

    
1331
-- | Converts a placement to string format.
1332
printSolutionLine :: Node.List     -- ^ The node list
1333
                  -> Instance.List -- ^ The instance list
1334
                  -> Int           -- ^ Maximum node name length
1335
                  -> Int           -- ^ Maximum instance name length
1336
                  -> Placement     -- ^ The current placement
1337
                  -> Int           -- ^ The index of the placement in
1338
                                   -- the solution
1339
                  -> (String, [String])
1340
printSolutionLine nl il nmlen imlen plc pos =
1341
  let pmlen = (2*nmlen + 1)
1342
      (i, p, s, mv, c) = plc
1343
      old_sec = Instance.sNode inst
1344
      inst = Container.find i il
1345
      inam = Instance.alias inst
1346
      npri = Node.alias $ Container.find p nl
1347
      nsec = Node.alias $ Container.find s nl
1348
      opri = Node.alias $ Container.find (Instance.pNode inst) nl
1349
      osec = Node.alias $ Container.find old_sec nl
1350
      (moves, cmds) =  computeMoves inst inam mv npri nsec
1351
      -- FIXME: this should check instead/also the disk template
1352
      ostr = if old_sec == Node.noSecondary
1353
               then printf "%s" opri::String
1354
               else printf "%s:%s" opri osec::String
1355
      nstr = if s == Node.noSecondary
1356
               then printf "%s" npri::String
1357
               else printf "%s:%s" npri nsec::String
1358
  in (printf "  %3d. %-*s %-*s => %-*s %12.8f a=%s"
1359
      pos imlen inam pmlen ostr pmlen nstr c moves,
1360
      cmds)
1361

    
1362
-- | Return the instance and involved nodes in an instance move.
1363
--
1364
-- Note that the output list length can vary, and is not required nor
1365
-- guaranteed to be of any specific length.
1366
involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
1367
                               -- the instance from its index; note
1368
                               -- that this /must/ be the original
1369
                               -- instance list, so that we can
1370
                               -- retrieve the old nodes
1371
              -> Placement     -- ^ The placement we're investigating,
1372
                               -- containing the new nodes and
1373
                               -- instance index
1374
              -> [Ndx]         -- ^ Resulting list of node indices
1375
involvedNodes il plc =
1376
  let (i, np, ns, _, _) = plc
1377
      inst = Container.find i il
1378
  in nub $ [np, ns] ++ Instance.allNodes inst
1379

    
1380
-- | Inner function for splitJobs, that either appends the next job to
1381
-- the current jobset, or starts a new jobset.
1382
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1383
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1384
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1385
  | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1386
  | otherwise = ([n]:cjs, ndx)
1387

    
1388
-- | Break a list of moves into independent groups. Note that this
1389
-- will reverse the order of jobs.
1390
splitJobs :: [MoveJob] -> [JobSet]
1391
splitJobs = fst . foldl mergeJobs ([], [])
1392

    
1393
-- | Given a list of commands, prefix them with @gnt-instance@ and
1394
-- also beautify the display a little.
1395
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1396
formatJob jsn jsl (sn, (_, _, _, cmds)) =
1397
  let out =
1398
        printf "  echo job %d/%d" jsn sn:
1399
        printf "  check":
1400
        map ("  gnt-instance " ++) cmds
1401
  in if sn == 1
1402
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1403
       else out
1404

    
1405
-- | Given a list of commands, prefix them with @gnt-instance@ and
1406
-- also beautify the display a little.
1407
formatCmds :: [JobSet] -> String
1408
formatCmds =
1409
  unlines .
1410
  concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1411
                           (zip [1..] js)) .
1412
  zip [1..]
1413

    
1414
-- | Print the node list.
1415
printNodes :: Node.List -> [String] -> String
1416
printNodes nl fs =
1417
  let fields = case fs of
1418
                 [] -> Node.defaultFields
1419
                 "+":rest -> Node.defaultFields ++ rest
1420
                 _ -> fs
1421
      snl = sortBy (comparing Node.idx) (Container.elems nl)
1422
      (header, isnum) = unzip $ map Node.showHeader fields
1423
  in printTable "" header (map (Node.list fields) snl) isnum
1424

    
1425
-- | Print the instance list.
1426
printInsts :: Node.List -> Instance.List -> String
1427
printInsts nl il =
1428
  let sil = sortBy (comparing Instance.idx) (Container.elems il)
1429
      helper inst = [ if Instance.isRunning inst then "R" else " "
1430
                    , Instance.name inst
1431
                    , Container.nameOf nl (Instance.pNode inst)
1432
                    , let sdx = Instance.sNode inst
1433
                      in if sdx == Node.noSecondary
1434
                           then  ""
1435
                           else Container.nameOf nl sdx
1436
                    , if Instance.autoBalance inst then "Y" else "N"
1437
                    , printf "%3d" $ Instance.vcpus inst
1438
                    , printf "%5d" $ Instance.mem inst
1439
                    , printf "%5d" $ Instance.dsk inst `div` 1024
1440
                    , printf "%5.3f" lC
1441
                    , printf "%5.3f" lM
1442
                    , printf "%5.3f" lD
1443
                    , printf "%5.3f" lN
1444
                    ]
1445
          where DynUtil lC lM lD lN = Instance.util inst
1446
      header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1447
               , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1448
      isnum = False:False:False:False:False:repeat True
1449
  in printTable "" header (map helper sil) isnum
1450

    
1451
-- | Shows statistics for a given node list.
1452
printStats :: String -> Node.List -> String
1453
printStats lp nl =
1454
  let dcvs = compDetailedCV $ Container.elems nl
1455
      (weights, names) = unzip detailedCVInfo
1456
      hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1457
      header = [ "Field", "Value", "Weight" ]
1458
      formatted = map (\(w, h, val) ->
1459
                         [ h
1460
                         , printf "%.8f" val
1461
                         , printf "x%.2f" w
1462
                         ]) hd
1463
  in printTable lp header formatted $ False:repeat True
1464

    
1465
-- | Convert a placement into a list of OpCodes (basically a job).
1466
iMoveToJob :: Node.List        -- ^ The node list; only used for node
1467
                               -- names, so any version is good
1468
                               -- (before or after the operation)
1469
           -> Instance.List    -- ^ The instance list; also used for
1470
                               -- names only
1471
           -> Idx              -- ^ The index of the instance being
1472
                               -- moved
1473
           -> IMove            -- ^ The actual move to be described
1474
           -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
1475
                               -- the given move
1476
iMoveToJob nl il idx move =
1477
  let inst = Container.find idx il
1478
      iname = Instance.name inst
1479
      lookNode  n = case mkNonEmpty (Container.nameOf nl n) of
1480
                      -- FIXME: convert htools codebase to non-empty strings
1481
                      Bad msg -> error $ "Empty node name for idx " ++
1482
                                 show n ++ ": " ++ msg ++ "??"
1483
                      Ok ne -> Just ne
1484
      opF = OpCodes.OpInstanceMigrate
1485
              { OpCodes.opInstanceName        = iname
1486
              , OpCodes.opMigrationMode       = Nothing -- default
1487
              , OpCodes.opOldLiveMode         = Nothing -- default as well
1488
              , OpCodes.opTargetNode          = Nothing -- this is drbd
1489
              , OpCodes.opAllowRuntimeChanges = False
1490
              , OpCodes.opIgnoreIpolicy       = False
1491
              , OpCodes.opMigrationCleanup    = False
1492
              , OpCodes.opIallocator          = Nothing
1493
              , OpCodes.opAllowFailover       = True }
1494
      opFA n = opF { OpCodes.opTargetNode = lookNode n } -- not drbd
1495
      opR n = OpCodes.OpInstanceReplaceDisks
1496
                { OpCodes.opInstanceName     = iname
1497
                , OpCodes.opEarlyRelease     = False
1498
                , OpCodes.opIgnoreIpolicy    = False
1499
                , OpCodes.opReplaceDisksMode = OpCodes.ReplaceNewSecondary
1500
                , OpCodes.opReplaceDisksList = []
1501
                , OpCodes.opRemoteNode       = lookNode n
1502
                , OpCodes.opIallocator       = Nothing
1503
                }
1504
  in case move of
1505
       Failover -> [ opF ]
1506
       FailoverToAny np -> [ opFA np ]
1507
       ReplacePrimary np -> [ opF, opR np, opF ]
1508
       ReplaceSecondary ns -> [ opR ns ]
1509
       ReplaceAndFailover np -> [ opR np, opF ]
1510
       FailoverAndReplace ns -> [ opF, opR ns ]
1511

    
1512
-- * Node group functions
1513

    
1514
-- | Computes the group of an instance.
1515
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1516
instanceGroup nl i =
1517
  let sidx = Instance.sNode i
1518
      pnode = Container.find (Instance.pNode i) nl
1519
      snode = if sidx == Node.noSecondary
1520
              then pnode
1521
              else Container.find sidx nl
1522
      pgroup = Node.group pnode
1523
      sgroup = Node.group snode
1524
  in if pgroup /= sgroup
1525
       then fail ("Instance placed accross two node groups, primary " ++
1526
                  show pgroup ++ ", secondary " ++ show sgroup)
1527
       else return pgroup
1528

    
1529
-- | Computes the group of an instance per the primary node.
1530
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1531
instancePriGroup nl i =
1532
  let pnode = Container.find (Instance.pNode i) nl
1533
  in  Node.group pnode
1534

    
1535
-- | Compute the list of badly allocated instances (split across node
1536
-- groups).
1537
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1538
findSplitInstances nl =
1539
  filter (not . isOk . instanceGroup nl) . Container.elems
1540

    
1541
-- | Splits a cluster into the component node groups.
1542
splitCluster :: Node.List -> Instance.List ->
1543
                [(Gdx, (Node.List, Instance.List))]
1544
splitCluster nl il =
1545
  let ngroups = Node.computeGroups (Container.elems nl)
1546
  in map (\(gdx, nodes) ->
1547
           let nidxs = map Node.idx nodes
1548
               nodes' = zip nidxs nodes
1549
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1550
           in (gdx, (Container.fromList nodes', instances))) ngroups
1551

    
1552
-- | Compute the list of nodes that are to be evacuated, given a list
1553
-- of instances and an evacuation mode.
1554
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1555
                -> EvacMode      -- ^ The evacuation mode we're using
1556
                -> [Idx]         -- ^ List of instance indices being evacuated
1557
                -> IntSet.IntSet -- ^ Set of node indices
1558
nodesToEvacuate il mode =
1559
  IntSet.delete Node.noSecondary .
1560
  foldl' (\ns idx ->
1561
            let i = Container.find idx il
1562
                pdx = Instance.pNode i
1563
                sdx = Instance.sNode i
1564
                dt = Instance.diskTemplate i
1565
                withSecondary = case dt of
1566
                                  DTDrbd8 -> IntSet.insert sdx ns
1567
                                  _ -> ns
1568
            in case mode of
1569
                 ChangePrimary   -> IntSet.insert pdx ns
1570
                 ChangeSecondary -> withSecondary
1571
                 ChangeAll       -> IntSet.insert pdx withSecondary
1572
         ) IntSet.empty