Statistics
| Branch: | Tag: | Revision:

root / src / Ganeti / HTools / Cluster.hs @ c32c4e4d

History | View | Annotate | Download (70.2 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011, 2012, 2013 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
  (
31
    -- * Types
32
    AllocSolution(..)
33
  , EvacSolution(..)
34
  , Table(..)
35
  , CStats(..)
36
  , AllocNodes
37
  , AllocResult
38
  , AllocMethod
39
  , AllocSolutionList
40
  -- * Generic functions
41
  , totalResources
42
  , computeAllocationDelta
43
  -- * First phase functions
44
  , computeBadItems
45
  -- * Second phase functions
46
  , printSolutionLine
47
  , formatCmds
48
  , involvedNodes
49
  , splitJobs
50
  -- * Display functions
51
  , printNodes
52
  , printInsts
53
  -- * Balacing functions
54
  , checkMove
55
  , doNextBalance
56
  , tryBalance
57
  , compCV
58
  , compCVNodes
59
  , compDetailedCV
60
  , printStats
61
  , iMoveToJob
62
  -- * IAllocator functions
63
  , genAllocNodes
64
  , tryAlloc
65
  , tryMGAlloc
66
  , tryNodeEvac
67
  , tryChangeGroup
68
  , collapseFailures
69
  , allocList
70
  -- * Allocation functions
71
  , iterateAlloc
72
  , tieredAlloc
73
  -- * Node group functions
74
  , instanceGroup
75
  , findSplitInstances
76
  , splitCluster
77
  ) where
78

    
79
import Control.Applicative (liftA2)
80
import Control.Arrow ((&&&))
81
import qualified Data.IntSet as IntSet
82
import Data.List
83
import Data.Maybe (fromJust, fromMaybe, isJust, isNothing)
84
import Data.Ord (comparing)
85
import Text.Printf (printf)
86

    
87
import Ganeti.BasicTypes
88
import qualified Ganeti.HTools.Container as Container
89
import qualified Ganeti.HTools.Instance as Instance
90
import qualified Ganeti.HTools.Nic as Nic
91
import qualified Ganeti.HTools.Node as Node
92
import qualified Ganeti.HTools.Group as Group
93
import Ganeti.HTools.Types
94
import Ganeti.Compat
95
import qualified Ganeti.OpCodes as OpCodes
96
import Ganeti.Utils
97
import Ganeti.Types (mkNonEmpty)
98

    
99
-- * Types
100

    
101
-- | Allocation\/relocation solution.
102
data AllocSolution = AllocSolution
103
  { asFailures :: [FailMode]              -- ^ Failure counts
104
  , asAllocs   :: Int                     -- ^ Good allocation count
105
  , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
106
  , asLog      :: [String]                -- ^ Informational messages
107
  }
108

    
109
-- | Node evacuation/group change iallocator result type. This result
110
-- type consists of actual opcodes (a restricted subset) that are
111
-- transmitted back to Ganeti.
112
data EvacSolution = EvacSolution
113
  { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
114
  , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
115
                                      -- relocated
116
  , esOpCodes :: [[OpCodes.OpCode]]   -- ^ List of jobs
117
  } deriving (Show)
118

    
119
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
120
type AllocResult = (FailStats, Node.List, Instance.List,
121
                    [Instance.Instance], [CStats])
122

    
123
-- | Type alias for easier handling.
124
type AllocSolutionList = [(Instance.Instance, AllocSolution)]
125

    
126
-- | A type denoting the valid allocation mode/pairs.
127
--
128
-- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
129
-- for a two-node allocation, this will be a @Right [('Ndx',
130
-- ['Ndx'])]@. In the latter case, the list is basically an
131
-- association list, grouped by primary node and holding the potential
132
-- secondary nodes in the sub-list.
133
type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
134

    
135
-- | The empty solution we start with when computing allocations.
136
emptyAllocSolution :: AllocSolution
137
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
138
                                   , asSolution = Nothing, asLog = [] }
139

    
140
-- | The empty evac solution.
141
emptyEvacSolution :: EvacSolution
142
emptyEvacSolution = EvacSolution { esMoved = []
143
                                 , esFailed = []
144
                                 , esOpCodes = []
145
                                 }
146

    
147
-- | The complete state for the balancing solution.
148
data Table = Table Node.List Instance.List Score [Placement]
149
             deriving (Show)
150

    
151
-- | Cluster statistics data type.
152
data CStats = CStats
153
  { csFmem :: Integer -- ^ Cluster free mem
154
  , csFdsk :: Integer -- ^ Cluster free disk
155
  , csFspn :: Integer -- ^ Cluster free spindles
156
  , csAmem :: Integer -- ^ Cluster allocatable mem
157
  , csAdsk :: Integer -- ^ Cluster allocatable disk
158
  , csAcpu :: Integer -- ^ Cluster allocatable cpus
159
  , csMmem :: Integer -- ^ Max node allocatable mem
160
  , csMdsk :: Integer -- ^ Max node allocatable disk
161
  , csMcpu :: Integer -- ^ Max node allocatable cpu
162
  , csImem :: Integer -- ^ Instance used mem
163
  , csIdsk :: Integer -- ^ Instance used disk
164
  , csIspn :: Integer -- ^ Instance used spindles
165
  , csIcpu :: Integer -- ^ Instance used cpu
166
  , csTmem :: Double  -- ^ Cluster total mem
167
  , csTdsk :: Double  -- ^ Cluster total disk
168
  , csTspn :: Double  -- ^ Cluster total spindles
169
  , csTcpu :: Double  -- ^ Cluster total cpus
170
  , csVcpu :: Integer -- ^ Cluster total virtual cpus
171
  , csNcpu :: Double  -- ^ Equivalent to 'csIcpu' but in terms of
172
                      -- physical CPUs, i.e. normalised used phys CPUs
173
  , csXmem :: Integer -- ^ Unnacounted for mem
174
  , csNmem :: Integer -- ^ Node own memory
175
  , csScore :: Score  -- ^ The cluster score
176
  , csNinst :: Int    -- ^ The total number of instances
177
  } deriving (Show)
178

    
179
-- | A simple type for allocation functions.
180
type AllocMethod =  Node.List           -- ^ Node list
181
                 -> Instance.List       -- ^ Instance list
182
                 -> Maybe Int           -- ^ Optional allocation limit
183
                 -> Instance.Instance   -- ^ Instance spec for allocation
184
                 -> AllocNodes          -- ^ Which nodes we should allocate on
185
                 -> [Instance.Instance] -- ^ Allocated instances
186
                 -> [CStats]            -- ^ Running cluster stats
187
                 -> Result AllocResult  -- ^ Allocation result
188

    
189
-- | A simple type for the running solution of evacuations.
190
type EvacInnerState =
191
  Either String (Node.List, Instance.Instance, Score, Ndx)
192

    
193
-- * Utility functions
194

    
195
-- | Verifies the N+1 status and return the affected nodes.
196
verifyN1 :: [Node.Node] -> [Node.Node]
197
verifyN1 = filter Node.failN1
198

    
199
{-| Computes the pair of bad nodes and instances.
200

    
201
The bad node list is computed via a simple 'verifyN1' check, and the
202
bad instance list is the list of primary and secondary instances of
203
those nodes.
204

    
205
-}
206
computeBadItems :: Node.List -> Instance.List ->
207
                   ([Node.Node], [Instance.Instance])
208
computeBadItems nl il =
209
  let bad_nodes = verifyN1 $ getOnline nl
210
      bad_instances = map (`Container.find` il) .
211
                      sort . nub $
212
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
213
  in
214
    (bad_nodes, bad_instances)
215

    
216
-- | Extracts the node pairs for an instance. This can fail if the
217
-- instance is single-homed. FIXME: this needs to be improved,
218
-- together with the general enhancement for handling non-DRBD moves.
219
instanceNodes :: Node.List -> Instance.Instance ->
220
                 (Ndx, Ndx, Node.Node, Node.Node)
221
instanceNodes nl inst =
222
  let old_pdx = Instance.pNode inst
223
      old_sdx = Instance.sNode inst
224
      old_p = Container.find old_pdx nl
225
      old_s = Container.find old_sdx nl
226
  in (old_pdx, old_sdx, old_p, old_s)
227

    
228
-- | Zero-initializer for the CStats type.
229
emptyCStats :: CStats
230
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
231

    
232
-- | Update stats with data from a new node.
233
updateCStats :: CStats -> Node.Node -> CStats
234
updateCStats cs node =
235
  let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
236
               csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
237
               csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
238
               csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
239
               csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
240
               csVcpu = x_vcpu, csNcpu = x_ncpu,
241
               csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst,
242
               csFspn = x_fspn, csIspn = x_ispn, csTspn = x_tspn
243
             }
244
        = cs
245
      inc_amem = Node.fMem node - Node.rMem node
246
      inc_amem' = if inc_amem > 0 then inc_amem else 0
247
      inc_adsk = Node.availDisk node
248
      inc_imem = truncate (Node.tMem node) - Node.nMem node
249
                 - Node.xMem node - Node.fMem node
250
      inc_icpu = Node.uCpu node
251
      inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
252
      inc_ispn = Node.tSpindles node - Node.fSpindles node
253
      inc_vcpu = Node.hiCpu node
254
      inc_acpu = Node.availCpu node
255
      inc_ncpu = fromIntegral (Node.uCpu node) /
256
                 iPolicyVcpuRatio (Node.iPolicy node)
257
  in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
258
        , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
259
        , csFspn = x_fspn + fromIntegral (Node.fSpindles node)
260
        , csAmem = x_amem + fromIntegral inc_amem'
261
        , csAdsk = x_adsk + fromIntegral inc_adsk
262
        , csAcpu = x_acpu + fromIntegral inc_acpu
263
        , csMmem = max x_mmem (fromIntegral inc_amem')
264
        , csMdsk = max x_mdsk (fromIntegral inc_adsk)
265
        , csMcpu = max x_mcpu (fromIntegral inc_acpu)
266
        , csImem = x_imem + fromIntegral inc_imem
267
        , csIdsk = x_idsk + fromIntegral inc_idsk
268
        , csIspn = x_ispn + fromIntegral inc_ispn
269
        , csIcpu = x_icpu + fromIntegral inc_icpu
270
        , csTmem = x_tmem + Node.tMem node
271
        , csTdsk = x_tdsk + Node.tDsk node
272
        , csTspn = x_tspn + fromIntegral (Node.tSpindles node)
273
        , csTcpu = x_tcpu + Node.tCpu node
274
        , csVcpu = x_vcpu + fromIntegral inc_vcpu
275
        , csNcpu = x_ncpu + inc_ncpu
276
        , csXmem = x_xmem + fromIntegral (Node.xMem node)
277
        , csNmem = x_nmem + fromIntegral (Node.nMem node)
278
        , csNinst = x_ninst + length (Node.pList node)
279
        }
280

    
281
-- | Compute the total free disk and memory in the cluster.
282
totalResources :: Node.List -> CStats
283
totalResources nl =
284
  let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
285
  in cs { csScore = compCV nl }
286

    
287
-- | Compute the delta between two cluster state.
288
--
289
-- This is used when doing allocations, to understand better the
290
-- available cluster resources. The return value is a triple of the
291
-- current used values, the delta that was still allocated, and what
292
-- was left unallocated.
293
computeAllocationDelta :: CStats -> CStats -> AllocStats
294
computeAllocationDelta cini cfin =
295
  let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu,
296
              csNcpu = i_ncpu, csIspn = i_ispn } = cini
297
      CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
298
              csTmem = t_mem, csTdsk = t_dsk, csVcpu = f_vcpu,
299
              csNcpu = f_ncpu, csTcpu = f_tcpu,
300
              csIspn = f_ispn, csTspn = t_spn } = cfin
301
      rini = AllocInfo { allocInfoVCpus = fromIntegral i_icpu
302
                       , allocInfoNCpus = i_ncpu
303
                       , allocInfoMem   = fromIntegral i_imem
304
                       , allocInfoDisk  = fromIntegral i_idsk
305
                       , allocInfoSpn   = fromIntegral i_ispn
306
                       }
307
      rfin = AllocInfo { allocInfoVCpus = fromIntegral (f_icpu - i_icpu)
308
                       , allocInfoNCpus = f_ncpu - i_ncpu
309
                       , allocInfoMem   = fromIntegral (f_imem - i_imem)
310
                       , allocInfoDisk  = fromIntegral (f_idsk - i_idsk)
311
                       , allocInfoSpn   = fromIntegral (f_ispn - i_ispn)
312
                       }
313
      runa = AllocInfo { allocInfoVCpus = fromIntegral (f_vcpu - f_icpu)
314
                       , allocInfoNCpus = f_tcpu - f_ncpu
315
                       , allocInfoMem   = truncate t_mem - fromIntegral f_imem
316
                       , allocInfoDisk  = truncate t_dsk - fromIntegral f_idsk
317
                       , allocInfoSpn   = truncate t_spn - fromIntegral f_ispn
318
                       }
319
  in (rini, rfin, runa)
320

    
321
-- | The names and weights of the individual elements in the CV list.
322
detailedCVInfo :: [(Double, String)]
323
detailedCVInfo = [ (1,  "free_mem_cv")
324
                 , (1,  "free_disk_cv")
325
                 , (1,  "n1_cnt")
326
                 , (1,  "reserved_mem_cv")
327
                 , (4,  "offline_all_cnt")
328
                 , (16, "offline_pri_cnt")
329
                 , (1,  "vcpu_ratio_cv")
330
                 , (1,  "cpu_load_cv")
331
                 , (1,  "mem_load_cv")
332
                 , (1,  "disk_load_cv")
333
                 , (1,  "net_load_cv")
334
                 , (2,  "pri_tags_score")
335
                 , (1,  "spindles_cv")
336
                 ]
337

    
338
-- | Holds the weights used by 'compCVNodes' for each metric.
339
detailedCVWeights :: [Double]
340
detailedCVWeights = map fst detailedCVInfo
341

    
342
-- | Compute the mem and disk covariance.
343
compDetailedCV :: [Node.Node] -> [Double]
344
compDetailedCV all_nodes =
345
  let (offline, nodes) = partition Node.offline all_nodes
346
      mem_l = map Node.pMem nodes
347
      dsk_l = map Node.pDsk nodes
348
      -- metric: memory covariance
349
      mem_cv = stdDev mem_l
350
      -- metric: disk covariance
351
      dsk_cv = stdDev dsk_l
352
      -- metric: count of instances living on N1 failing nodes
353
      n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
354
                                                 length (Node.pList n)) .
355
                 filter Node.failN1 $ nodes :: Double
356
      res_l = map Node.pRem nodes
357
      -- metric: reserved memory covariance
358
      res_cv = stdDev res_l
359
      -- offline instances metrics
360
      offline_ipri = sum . map (length . Node.pList) $ offline
361
      offline_isec = sum . map (length . Node.sList) $ offline
362
      -- metric: count of instances on offline nodes
363
      off_score = fromIntegral (offline_ipri + offline_isec)::Double
364
      -- metric: count of primary instances on offline nodes (this
365
      -- helps with evacuation/failover of primary instances on
366
      -- 2-node clusters with one node offline)
367
      off_pri_score = fromIntegral offline_ipri::Double
368
      cpu_l = map Node.pCpu nodes
369
      -- metric: covariance of vcpu/pcpu ratio
370
      cpu_cv = stdDev cpu_l
371
      -- metrics: covariance of cpu, memory, disk and network load
372
      (c_load, m_load, d_load, n_load) =
373
        unzip4 $ map (\n ->
374
                      let DynUtil c1 m1 d1 n1 = Node.utilLoad n
375
                          DynUtil c2 m2 d2 n2 = Node.utilPool n
376
                      in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes
377
      -- metric: conflicting instance count
378
      pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
379
      pri_tags_score = fromIntegral pri_tags_inst::Double
380
      -- metric: spindles %
381
      spindles_cv = map (\n -> Node.instSpindles n / Node.hiSpindles n) nodes
382
  in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
383
     , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
384
     , pri_tags_score, stdDev spindles_cv ]
385

    
386
-- | Compute the /total/ variance.
387
compCVNodes :: [Node.Node] -> Double
388
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
389

    
390
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
391
compCV :: Node.List -> Double
392
compCV = compCVNodes . Container.elems
393

    
394
-- | Compute online nodes from a 'Node.List'.
395
getOnline :: Node.List -> [Node.Node]
396
getOnline = filter (not . Node.offline) . Container.elems
397

    
398
-- * Balancing functions
399

    
400
-- | Compute best table. Note that the ordering of the arguments is important.
401
compareTables :: Table -> Table -> Table
402
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
403
  if a_cv > b_cv then b else a
404

    
405
-- | Applies an instance move to a given node list and instance.
406
applyMove :: Node.List -> Instance.Instance
407
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
408
-- Failover (f)
409
applyMove nl inst Failover =
410
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
411
      int_p = Node.removePri old_p inst
412
      int_s = Node.removeSec old_s inst
413
      new_nl = do -- Maybe monad
414
        new_p <- Node.addPriEx (Node.offline old_p) int_s inst
415
        new_s <- Node.addSec int_p inst old_sdx
416
        let new_inst = Instance.setBoth inst old_sdx old_pdx
417
        return (Container.addTwo old_pdx new_s old_sdx new_p nl,
418
                new_inst, old_sdx, old_pdx)
419
  in new_nl
420

    
421
-- Failover to any (fa)
422
applyMove nl inst (FailoverToAny new_pdx) = do
423
  let (old_pdx, old_sdx, old_pnode, _) = instanceNodes nl inst
424
      new_pnode = Container.find new_pdx nl
425
      force_failover = Node.offline old_pnode
426
  new_pnode' <- Node.addPriEx force_failover new_pnode inst
427
  let old_pnode' = Node.removePri old_pnode inst
428
      inst' = Instance.setPri inst new_pdx
429
      nl' = Container.addTwo old_pdx old_pnode' new_pdx new_pnode' nl
430
  return (nl', inst', new_pdx, old_sdx)
431

    
432
-- Replace the primary (f:, r:np, f)
433
applyMove nl inst (ReplacePrimary new_pdx) =
434
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
435
      tgt_n = Container.find new_pdx nl
436
      int_p = Node.removePri old_p inst
437
      int_s = Node.removeSec old_s inst
438
      force_p = Node.offline old_p
439
      new_nl = do -- Maybe monad
440
                  -- check that the current secondary can host the instance
441
                  -- during the migration
442
        tmp_s <- Node.addPriEx force_p int_s inst
443
        let tmp_s' = Node.removePri tmp_s inst
444
        new_p <- Node.addPriEx force_p tgt_n inst
445
        new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
446
        let new_inst = Instance.setPri inst new_pdx
447
        return (Container.add new_pdx new_p $
448
                Container.addTwo old_pdx int_p old_sdx new_s nl,
449
                new_inst, new_pdx, old_sdx)
450
  in new_nl
451

    
452
-- Replace the secondary (r:ns)
453
applyMove nl inst (ReplaceSecondary new_sdx) =
454
  let old_pdx = Instance.pNode inst
455
      old_sdx = Instance.sNode inst
456
      old_s = Container.find old_sdx nl
457
      tgt_n = Container.find new_sdx nl
458
      int_s = Node.removeSec old_s inst
459
      force_s = Node.offline old_s
460
      new_inst = Instance.setSec inst new_sdx
461
      new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
462
               \new_s -> return (Container.addTwo new_sdx
463
                                 new_s old_sdx int_s nl,
464
                                 new_inst, old_pdx, new_sdx)
465
  in new_nl
466

    
467
-- Replace the secondary and failover (r:np, f)
468
applyMove nl inst (ReplaceAndFailover new_pdx) =
469
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
470
      tgt_n = Container.find new_pdx nl
471
      int_p = Node.removePri old_p inst
472
      int_s = Node.removeSec old_s inst
473
      force_s = Node.offline old_s
474
      new_nl = do -- Maybe monad
475
        new_p <- Node.addPri tgt_n inst
476
        new_s <- Node.addSecEx force_s int_p inst new_pdx
477
        let new_inst = Instance.setBoth inst new_pdx old_pdx
478
        return (Container.add new_pdx new_p $
479
                Container.addTwo old_pdx new_s old_sdx int_s nl,
480
                new_inst, new_pdx, old_pdx)
481
  in new_nl
482

    
483
-- Failver and replace the secondary (f, r:ns)
484
applyMove nl inst (FailoverAndReplace new_sdx) =
485
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
486
      tgt_n = Container.find new_sdx nl
487
      int_p = Node.removePri old_p inst
488
      int_s = Node.removeSec old_s inst
489
      force_p = Node.offline old_p
490
      new_nl = do -- Maybe monad
491
        new_p <- Node.addPriEx force_p int_s inst
492
        new_s <- Node.addSecEx force_p tgt_n inst old_sdx
493
        let new_inst = Instance.setBoth inst old_sdx new_sdx
494
        return (Container.add new_sdx new_s $
495
                Container.addTwo old_sdx new_p old_pdx int_p nl,
496
                new_inst, old_sdx, new_sdx)
497
  in new_nl
498

    
499
-- | Tries to allocate an instance on one given node.
500
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
501
                 -> OpResult Node.AllocElement
502
allocateOnSingle nl inst new_pdx =
503
  let p = Container.find new_pdx nl
504
      new_inst = Instance.setBoth inst new_pdx Node.noSecondary
505
  in do
506
    Instance.instMatchesPolicy inst (Node.iPolicy p) (Node.exclStorage p)
507
    new_p <- Node.addPri p inst
508
    let new_nl = Container.add new_pdx new_p nl
509
        new_score = compCV new_nl
510
    return (new_nl, new_inst, [new_p], new_score)
511

    
512
-- | Tries to allocate an instance on a given pair of nodes.
513
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
514
               -> OpResult Node.AllocElement
515
allocateOnPair nl inst new_pdx new_sdx =
516
  let tgt_p = Container.find new_pdx nl
517
      tgt_s = Container.find new_sdx nl
518
  in do
519
    Instance.instMatchesPolicy inst (Node.iPolicy tgt_p)
520
      (Node.exclStorage tgt_p)
521
    new_p <- Node.addPri tgt_p inst
522
    new_s <- Node.addSec tgt_s inst new_pdx
523
    let new_inst = Instance.setBoth inst new_pdx new_sdx
524
        new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
525
    return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
526

    
527
-- | Tries to perform an instance move and returns the best table
528
-- between the original one and the new one.
529
checkSingleStep :: Table -- ^ The original table
530
                -> Instance.Instance -- ^ The instance to move
531
                -> Table -- ^ The current best table
532
                -> IMove -- ^ The move to apply
533
                -> Table -- ^ The final best table
534
checkSingleStep ini_tbl target cur_tbl move =
535
  let Table ini_nl ini_il _ ini_plc = ini_tbl
536
      tmp_resu = applyMove ini_nl target move
537
  in case tmp_resu of
538
       Bad _ -> cur_tbl
539
       Ok (upd_nl, new_inst, pri_idx, sec_idx) ->
540
         let tgt_idx = Instance.idx target
541
             upd_cvar = compCV upd_nl
542
             upd_il = Container.add tgt_idx new_inst ini_il
543
             upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
544
             upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
545
         in compareTables cur_tbl upd_tbl
546

    
547
-- | Given the status of the current secondary as a valid new node and
548
-- the current candidate target node, generate the possible moves for
549
-- a instance.
550
possibleMoves :: MirrorType -- ^ The mirroring type of the instance
551
              -> Bool       -- ^ Whether the secondary node is a valid new node
552
              -> Bool       -- ^ Whether we can change the primary node
553
              -> Ndx        -- ^ Target node candidate
554
              -> [IMove]    -- ^ List of valid result moves
555

    
556
possibleMoves MirrorNone _ _ _ = []
557

    
558
possibleMoves MirrorExternal _ False _ = []
559

    
560
possibleMoves MirrorExternal _ True tdx =
561
  [ FailoverToAny tdx ]
562

    
563
possibleMoves MirrorInternal _ False tdx =
564
  [ ReplaceSecondary tdx ]
565

    
566
possibleMoves MirrorInternal True True tdx =
567
  [ ReplaceSecondary tdx
568
  , ReplaceAndFailover tdx
569
  , ReplacePrimary tdx
570
  , FailoverAndReplace tdx
571
  ]
572

    
573
possibleMoves MirrorInternal False True tdx =
574
  [ ReplaceSecondary tdx
575
  , ReplaceAndFailover tdx
576
  ]
577

    
578
-- | Compute the best move for a given instance.
579
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
580
                  -> Bool              -- ^ Whether disk moves are allowed
581
                  -> Bool              -- ^ Whether instance moves are allowed
582
                  -> Table             -- ^ Original table
583
                  -> Instance.Instance -- ^ Instance to move
584
                  -> Table             -- ^ Best new table for this instance
585
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
586
  let opdx = Instance.pNode target
587
      osdx = Instance.sNode target
588
      bad_nodes = [opdx, osdx]
589
      nodes = filter (`notElem` bad_nodes) nodes_idx
590
      mir_type = Instance.mirrorType target
591
      use_secondary = elem osdx nodes_idx && inst_moves
592
      aft_failover = if mir_type == MirrorInternal && use_secondary
593
                       -- if drbd and allowed to failover
594
                       then checkSingleStep ini_tbl target ini_tbl Failover
595
                       else ini_tbl
596
      all_moves =
597
        if disk_moves
598
          then concatMap (possibleMoves mir_type use_secondary inst_moves)
599
               nodes
600
          else []
601
    in
602
      -- iterate over the possible nodes for this instance
603
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
604

    
605
-- | Compute the best next move.
606
checkMove :: [Ndx]               -- ^ Allowed target node indices
607
          -> Bool                -- ^ Whether disk moves are allowed
608
          -> Bool                -- ^ Whether instance moves are allowed
609
          -> Table               -- ^ The current solution
610
          -> [Instance.Instance] -- ^ List of instances still to move
611
          -> Table               -- ^ The new solution
612
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
613
  let Table _ _ _ ini_plc = ini_tbl
614
      -- we're using rwhnf from the Control.Parallel.Strategies
615
      -- package; we don't need to use rnf as that would force too
616
      -- much evaluation in single-threaded cases, and in
617
      -- multi-threaded case the weak head normal form is enough to
618
      -- spark the evaluation
619
      tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
620
                             inst_moves ini_tbl)
621
               victims
622
      -- iterate over all instances, computing the best move
623
      best_tbl = foldl' compareTables ini_tbl tables
624
      Table _ _ _ best_plc = best_tbl
625
  in if length best_plc == length ini_plc
626
       then ini_tbl -- no advancement
627
       else best_tbl
628

    
629
-- | Check if we are allowed to go deeper in the balancing.
630
doNextBalance :: Table     -- ^ The starting table
631
              -> Int       -- ^ Remaining length
632
              -> Score     -- ^ Score at which to stop
633
              -> Bool      -- ^ The resulting table and commands
634
doNextBalance ini_tbl max_rounds min_score =
635
  let Table _ _ ini_cv ini_plc = ini_tbl
636
      ini_plc_len = length ini_plc
637
  in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
638

    
639
-- | Run a balance move.
640
tryBalance :: Table       -- ^ The starting table
641
           -> Bool        -- ^ Allow disk moves
642
           -> Bool        -- ^ Allow instance moves
643
           -> Bool        -- ^ Only evacuate moves
644
           -> Score       -- ^ Min gain threshold
645
           -> Score       -- ^ Min gain
646
           -> Maybe Table -- ^ The resulting table and commands
647
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
648
    let Table ini_nl ini_il ini_cv _ = ini_tbl
649
        all_inst = Container.elems ini_il
650
        all_nodes = Container.elems ini_nl
651
        (offline_nodes, online_nodes) = partition Node.offline all_nodes
652
        all_inst' = if evac_mode
653
                      then let bad_nodes = map Node.idx offline_nodes
654
                           in filter (any (`elem` bad_nodes) .
655
                                          Instance.allNodes) all_inst
656
                      else all_inst
657
        reloc_inst = filter (\i -> Instance.movable i &&
658
                                   Instance.autoBalance i) all_inst'
659
        node_idx = map Node.idx online_nodes
660
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
661
        (Table _ _ fin_cv _) = fin_tbl
662
    in
663
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
664
      then Just fin_tbl -- this round made success, return the new table
665
      else Nothing
666

    
667
-- * Allocation functions
668

    
669
-- | Build failure stats out of a list of failures.
670
collapseFailures :: [FailMode] -> FailStats
671
collapseFailures flst =
672
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
673
            [minBound..maxBound]
674

    
675
-- | Compares two Maybe AllocElement and chooses the best score.
676
bestAllocElement :: Maybe Node.AllocElement
677
                 -> Maybe Node.AllocElement
678
                 -> Maybe Node.AllocElement
679
bestAllocElement a Nothing = a
680
bestAllocElement Nothing b = b
681
bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
682
  if ascore < bscore then a else b
683

    
684
-- | Update current Allocation solution and failure stats with new
685
-- elements.
686
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
687
concatAllocs as (Bad reason) = as { asFailures = reason : asFailures as }
688

    
689
concatAllocs as (Ok ns) =
690
  let -- Choose the old or new solution, based on the cluster score
691
    cntok = asAllocs as
692
    osols = asSolution as
693
    nsols = bestAllocElement osols (Just ns)
694
    nsuc = cntok + 1
695
    -- Note: we force evaluation of nsols here in order to keep the
696
    -- memory profile low - we know that we will need nsols for sure
697
    -- in the next cycle, so we force evaluation of nsols, since the
698
    -- foldl' in the caller will only evaluate the tuple, but not the
699
    -- elements of the tuple
700
  in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
701

    
702
-- | Sums two 'AllocSolution' structures.
703
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
704
sumAllocs (AllocSolution aFails aAllocs aSols aLog)
705
          (AllocSolution bFails bAllocs bSols bLog) =
706
  -- note: we add b first, since usually it will be smaller; when
707
  -- fold'ing, a will grow and grow whereas b is the per-group
708
  -- result, hence smaller
709
  let nFails  = bFails ++ aFails
710
      nAllocs = aAllocs + bAllocs
711
      nSols   = bestAllocElement aSols bSols
712
      nLog    = bLog ++ aLog
713
  in AllocSolution nFails nAllocs nSols nLog
714

    
715
-- | Given a solution, generates a reasonable description for it.
716
describeSolution :: AllocSolution -> String
717
describeSolution as =
718
  let fcnt = asFailures as
719
      sols = asSolution as
720
      freasons =
721
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
722
        filter ((> 0) . snd) . collapseFailures $ fcnt
723
  in case sols of
724
     Nothing -> "No valid allocation solutions, failure reasons: " ++
725
                (if null fcnt then "unknown reasons" else freasons)
726
     Just (_, _, nodes, cv) ->
727
         printf ("score: %.8f, successes %d, failures %d (%s)" ++
728
                 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
729
               (intercalate "/" . map Node.name $ nodes)
730

    
731
-- | Annotates a solution with the appropriate string.
732
annotateSolution :: AllocSolution -> AllocSolution
733
annotateSolution as = as { asLog = describeSolution as : asLog as }
734

    
735
-- | Reverses an evacuation solution.
736
--
737
-- Rationale: we always concat the results to the top of the lists, so
738
-- for proper jobset execution, we should reverse all lists.
739
reverseEvacSolution :: EvacSolution -> EvacSolution
740
reverseEvacSolution (EvacSolution f m o) =
741
  EvacSolution (reverse f) (reverse m) (reverse o)
742

    
743
-- | Generate the valid node allocation singles or pairs for a new instance.
744
genAllocNodes :: Group.List        -- ^ Group list
745
              -> Node.List         -- ^ The node map
746
              -> Int               -- ^ The number of nodes required
747
              -> Bool              -- ^ Whether to drop or not
748
                                   -- unallocable nodes
749
              -> Result AllocNodes -- ^ The (monadic) result
750
genAllocNodes gl nl count drop_unalloc =
751
  let filter_fn = if drop_unalloc
752
                    then filter (Group.isAllocable .
753
                                 flip Container.find gl . Node.group)
754
                    else id
755
      all_nodes = filter_fn $ getOnline nl
756
      all_pairs = [(Node.idx p,
757
                    [Node.idx s | s <- all_nodes,
758
                                       Node.idx p /= Node.idx s,
759
                                       Node.group p == Node.group s]) |
760
                   p <- all_nodes]
761
  in case count of
762
       1 -> Ok (Left (map Node.idx all_nodes))
763
       2 -> Ok (Right (filter (not . null . snd) all_pairs))
764
       _ -> Bad "Unsupported number of nodes, only one or two  supported"
765

    
766
-- | Try to allocate an instance on the cluster.
767
tryAlloc :: (Monad m) =>
768
            Node.List         -- ^ The node list
769
         -> Instance.List     -- ^ The instance list
770
         -> Instance.Instance -- ^ The instance to allocate
771
         -> AllocNodes        -- ^ The allocation targets
772
         -> m AllocSolution   -- ^ Possible solution list
773
tryAlloc _  _ _    (Right []) = fail "Not enough online nodes"
774
tryAlloc nl _ inst (Right ok_pairs) =
775
  let psols = parMap rwhnf (\(p, ss) ->
776
                              foldl' (\cstate ->
777
                                        concatAllocs cstate .
778
                                        allocateOnPair nl inst p)
779
                              emptyAllocSolution ss) ok_pairs
780
      sols = foldl' sumAllocs emptyAllocSolution psols
781
  in return $ annotateSolution sols
782

    
783
tryAlloc _  _ _    (Left []) = fail "No online nodes"
784
tryAlloc nl _ inst (Left all_nodes) =
785
  let sols = foldl' (\cstate ->
786
                       concatAllocs cstate . allocateOnSingle nl inst
787
                    ) emptyAllocSolution all_nodes
788
  in return $ annotateSolution sols
789

    
790
-- | Given a group/result, describe it as a nice (list of) messages.
791
solutionDescription :: (Group.Group, Result AllocSolution)
792
                    -> [String]
793
solutionDescription (grp, result) =
794
  case result of
795
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
796
    Bad message -> [printf "Group %s: error %s" gname message]
797
  where gname = Group.name grp
798
        pol = allocPolicyToRaw (Group.allocPolicy grp)
799

    
800
-- | From a list of possibly bad and possibly empty solutions, filter
801
-- only the groups with a valid result. Note that the result will be
802
-- reversed compared to the original list.
803
filterMGResults :: [(Group.Group, Result AllocSolution)]
804
                -> [(Group.Group, AllocSolution)]
805
filterMGResults = foldl' fn []
806
  where unallocable = not . Group.isAllocable
807
        fn accu (grp, rasol) =
808
          case rasol of
809
            Bad _ -> accu
810
            Ok sol | isNothing (asSolution sol) -> accu
811
                   | unallocable grp -> accu
812
                   | otherwise -> (grp, sol):accu
813

    
814
-- | Sort multigroup results based on policy and score.
815
sortMGResults :: [(Group.Group, AllocSolution)]
816
              -> [(Group.Group, AllocSolution)]
817
sortMGResults sols =
818
  let extractScore (_, _, _, x) = x
819
      solScore (grp, sol) = (Group.allocPolicy grp,
820
                             (extractScore . fromJust . asSolution) sol)
821
  in sortBy (comparing solScore) sols
822

    
823
-- | Removes node groups which can't accommodate the instance
824
filterValidGroups :: [(Group.Group, (Node.List, Instance.List))]
825
                  -> Instance.Instance
826
                  -> ([(Group.Group, (Node.List, Instance.List))], [String])
827
filterValidGroups [] _ = ([], [])
828
filterValidGroups (ng:ngs) inst =
829
  let (valid_ngs, msgs) = filterValidGroups ngs inst
830
      hasNetwork nic = case Nic.network nic of
831
        Just net -> net `elem` Group.networks (fst ng)
832
        Nothing -> True
833
      hasRequiredNetworks = all hasNetwork (Instance.nics inst)
834
  in if hasRequiredNetworks
835
      then (ng:valid_ngs, msgs)
836
      else (valid_ngs,
837
            ("group " ++ Group.name (fst ng) ++
838
             " is not connected to a network required by instance " ++
839
             Instance.name inst):msgs)
840

    
841
-- | Finds the best group for an instance on a multi-group cluster.
842
--
843
-- Only solutions in @preferred@ and @last_resort@ groups will be
844
-- accepted as valid, and additionally if the allowed groups parameter
845
-- is not null then allocation will only be run for those group
846
-- indices.
847
findBestAllocGroup :: Group.List           -- ^ The group list
848
                   -> Node.List            -- ^ The node list
849
                   -> Instance.List        -- ^ The instance list
850
                   -> Maybe [Gdx]          -- ^ The allowed groups
851
                   -> Instance.Instance    -- ^ The instance to allocate
852
                   -> Int                  -- ^ Required number of nodes
853
                   -> Result (Group.Group, AllocSolution, [String])
854
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
855
  let groups_by_idx = splitCluster mgnl mgil
856
      groups = map (\(gid, d) -> (Container.find gid mggl, d)) groups_by_idx
857
      groups' = maybe groups
858
                (\gs -> filter ((`elem` gs) . Group.idx . fst) groups)
859
                allowed_gdxs
860
      (groups'', filter_group_msgs) = filterValidGroups groups' inst
861
      sols = map (\(gr, (nl, il)) ->
862
                   (gr, genAllocNodes mggl nl cnt False >>=
863
                        tryAlloc nl il inst))
864
             groups''::[(Group.Group, Result AllocSolution)]
865
      all_msgs = filter_group_msgs ++ concatMap solutionDescription sols
866
      goodSols = filterMGResults sols
867
      sortedSols = sortMGResults goodSols
868
  in case sortedSols of
869
       [] -> Bad $ if null groups'
870
                     then "no groups for evacuation: allowed groups was" ++
871
                          show allowed_gdxs ++ ", all groups: " ++
872
                          show (map fst groups)
873
                     else intercalate ", " all_msgs
874
       (final_group, final_sol):_ -> return (final_group, final_sol, all_msgs)
875

    
876
-- | Try to allocate an instance on a multi-group cluster.
877
tryMGAlloc :: Group.List           -- ^ The group list
878
           -> Node.List            -- ^ The node list
879
           -> Instance.List        -- ^ The instance list
880
           -> Instance.Instance    -- ^ The instance to allocate
881
           -> Int                  -- ^ Required number of nodes
882
           -> Result AllocSolution -- ^ Possible solution list
883
tryMGAlloc mggl mgnl mgil inst cnt = do
884
  (best_group, solution, all_msgs) <-
885
      findBestAllocGroup mggl mgnl mgil Nothing inst cnt
886
  let group_name = Group.name best_group
887
      selmsg = "Selected group: " ++ group_name
888
  return $ solution { asLog = selmsg:all_msgs }
889

    
890
-- | Calculate the new instance list after allocation solution.
891
updateIl :: Instance.List           -- ^ The original instance list
892
         -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
893
         -> Instance.List           -- ^ The updated instance list
894
updateIl il Nothing = il
895
updateIl il (Just (_, xi, _, _)) = Container.add (Container.size il) xi il
896

    
897
-- | Extract the the new node list from the allocation solution.
898
extractNl :: Node.List               -- ^ The original node list
899
          -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
900
          -> Node.List               -- ^ The new node list
901
extractNl nl Nothing = nl
902
extractNl _ (Just (xnl, _, _, _)) = xnl
903

    
904
-- | Try to allocate a list of instances on a multi-group cluster.
905
allocList :: Group.List                  -- ^ The group list
906
          -> Node.List                   -- ^ The node list
907
          -> Instance.List               -- ^ The instance list
908
          -> [(Instance.Instance, Int)]  -- ^ The instance to allocate
909
          -> AllocSolutionList           -- ^ Possible solution list
910
          -> Result (Node.List, Instance.List,
911
                     AllocSolutionList)  -- ^ The final solution list
912
allocList _  nl il [] result = Ok (nl, il, result)
913
allocList gl nl il ((xi, xicnt):xies) result = do
914
  ares <- tryMGAlloc gl nl il xi xicnt
915
  let sol = asSolution ares
916
      nl' = extractNl nl sol
917
      il' = updateIl il sol
918
  allocList gl nl' il' xies ((xi, ares):result)
919

    
920
-- | Function which fails if the requested mode is change secondary.
921
--
922
-- This is useful since except DRBD, no other disk template can
923
-- execute change secondary; thus, we can just call this function
924
-- instead of always checking for secondary mode. After the call to
925
-- this function, whatever mode we have is just a primary change.
926
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
927
failOnSecondaryChange ChangeSecondary dt =
928
  fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++
929
         "' can't execute change secondary"
930
failOnSecondaryChange _ _ = return ()
931

    
932
-- | Run evacuation for a single instance.
933
--
934
-- /Note:/ this function should correctly execute both intra-group
935
-- evacuations (in all modes) and inter-group evacuations (in the
936
-- 'ChangeAll' mode). Of course, this requires that the correct list
937
-- of target nodes is passed.
938
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
939
                 -> Instance.List     -- ^ Instance list (cluster-wide)
940
                 -> EvacMode          -- ^ The evacuation mode
941
                 -> Instance.Instance -- ^ The instance to be evacuated
942
                 -> Gdx               -- ^ The group we're targetting
943
                 -> [Ndx]             -- ^ The list of available nodes
944
                                      -- for allocation
945
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
946
nodeEvacInstance nl il mode inst@(Instance.Instance
947
                                  {Instance.diskTemplate = dt@DTDiskless})
948
                 gdx avail_nodes =
949
                   failOnSecondaryChange mode dt >>
950
                   evacOneNodeOnly nl il inst gdx avail_nodes
951

    
952
nodeEvacInstance _ _ _ (Instance.Instance
953
                        {Instance.diskTemplate = DTPlain}) _ _ =
954
                  fail "Instances of type plain cannot be relocated"
955

    
956
nodeEvacInstance _ _ _ (Instance.Instance
957
                        {Instance.diskTemplate = DTFile}) _ _ =
958
                  fail "Instances of type file cannot be relocated"
959

    
960
nodeEvacInstance nl il mode inst@(Instance.Instance
961
                                  {Instance.diskTemplate = dt@DTSharedFile})
962
                 gdx avail_nodes =
963
                   failOnSecondaryChange mode dt >>
964
                   evacOneNodeOnly nl il inst gdx avail_nodes
965

    
966
nodeEvacInstance nl il mode inst@(Instance.Instance
967
                                  {Instance.diskTemplate = dt@DTBlock})
968
                 gdx avail_nodes =
969
                   failOnSecondaryChange mode dt >>
970
                   evacOneNodeOnly nl il inst gdx avail_nodes
971

    
972
nodeEvacInstance nl il mode inst@(Instance.Instance
973
                                  {Instance.diskTemplate = dt@DTRbd})
974
                 gdx avail_nodes =
975
                   failOnSecondaryChange mode dt >>
976
                   evacOneNodeOnly nl il inst gdx avail_nodes
977

    
978
nodeEvacInstance nl il mode inst@(Instance.Instance
979
                                  {Instance.diskTemplate = dt@DTExt})
980
                 gdx avail_nodes =
981
                   failOnSecondaryChange mode dt >>
982
                   evacOneNodeOnly nl il inst gdx avail_nodes
983

    
984
nodeEvacInstance nl il ChangePrimary
985
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
986
                 _ _ =
987
  do
988
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
989
    let idx = Instance.idx inst
990
        il' = Container.add idx inst' il
991
        ops = iMoveToJob nl' il' idx Failover
992
    return (nl', il', ops)
993

    
994
nodeEvacInstance nl il ChangeSecondary
995
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
996
                 gdx avail_nodes =
997
  evacOneNodeOnly nl il inst gdx avail_nodes
998

    
999
-- The algorithm for ChangeAll is as follows:
1000
--
1001
-- * generate all (primary, secondary) node pairs for the target groups
1002
-- * for each pair, execute the needed moves (r:s, f, r:s) and compute
1003
--   the final node list state and group score
1004
-- * select the best choice via a foldl that uses the same Either
1005
--   String solution as the ChangeSecondary mode
1006
nodeEvacInstance nl il ChangeAll
1007
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
1008
                 gdx avail_nodes =
1009
  do
1010
    let no_nodes = Left "no nodes available"
1011
        node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
1012
    (nl', il', ops, _) <-
1013
        annotateResult "Can't find any good nodes for relocation" .
1014
        eitherToResult $
1015
        foldl'
1016
        (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
1017
                          Bad msg ->
1018
                              case accu of
1019
                                Right _ -> accu
1020
                                -- we don't need more details (which
1021
                                -- nodes, etc.) as we only selected
1022
                                -- this group if we can allocate on
1023
                                -- it, hence failures will not
1024
                                -- propagate out of this fold loop
1025
                                Left _ -> Left $ "Allocation failed: " ++ msg
1026
                          Ok result@(_, _, _, new_cv) ->
1027
                              let new_accu = Right result in
1028
                              case accu of
1029
                                Left _ -> new_accu
1030
                                Right (_, _, _, old_cv) ->
1031
                                    if old_cv < new_cv
1032
                                    then accu
1033
                                    else new_accu
1034
        ) no_nodes node_pairs
1035

    
1036
    return (nl', il', ops)
1037

    
1038
-- | Generic function for changing one node of an instance.
1039
--
1040
-- This is similar to 'nodeEvacInstance' but will be used in a few of
1041
-- its sub-patterns. It folds the inner function 'evacOneNodeInner'
1042
-- over the list of available nodes, which results in the best choice
1043
-- for relocation.
1044
evacOneNodeOnly :: Node.List         -- ^ The node list (cluster-wide)
1045
                -> Instance.List     -- ^ Instance list (cluster-wide)
1046
                -> Instance.Instance -- ^ The instance to be evacuated
1047
                -> Gdx               -- ^ The group we're targetting
1048
                -> [Ndx]             -- ^ The list of available nodes
1049
                                      -- for allocation
1050
                -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1051
evacOneNodeOnly nl il inst gdx avail_nodes = do
1052
  op_fn <- case Instance.mirrorType inst of
1053
             MirrorNone -> Bad "Can't relocate/evacuate non-mirrored instances"
1054
             MirrorInternal -> Ok ReplaceSecondary
1055
             MirrorExternal -> Ok FailoverToAny
1056
  (nl', inst', _, ndx) <- annotateResult "Can't find any good node" .
1057
                          eitherToResult $
1058
                          foldl' (evacOneNodeInner nl inst gdx op_fn)
1059
                          (Left "no nodes available") avail_nodes
1060
  let idx = Instance.idx inst
1061
      il' = Container.add idx inst' il
1062
      ops = iMoveToJob nl' il' idx (op_fn ndx)
1063
  return (nl', il', ops)
1064

    
1065
-- | Inner fold function for changing one node of an instance.
1066
--
1067
-- Depending on the instance disk template, this will either change
1068
-- the secondary (for DRBD) or the primary node (for shared
1069
-- storage). However, the operation is generic otherwise.
1070
--
1071
-- The running solution is either a @Left String@, which means we
1072
-- don't have yet a working solution, or a @Right (...)@, which
1073
-- represents a valid solution; it holds the modified node list, the
1074
-- modified instance (after evacuation), the score of that solution,
1075
-- and the new secondary node index.
1076
evacOneNodeInner :: Node.List         -- ^ Cluster node list
1077
                 -> Instance.Instance -- ^ Instance being evacuated
1078
                 -> Gdx               -- ^ The group index of the instance
1079
                 -> (Ndx -> IMove)    -- ^ Operation constructor
1080
                 -> EvacInnerState    -- ^ Current best solution
1081
                 -> Ndx               -- ^ Node we're evaluating as target
1082
                 -> EvacInnerState    -- ^ New best solution
1083
evacOneNodeInner nl inst gdx op_fn accu ndx =
1084
  case applyMove nl inst (op_fn ndx) of
1085
    Bad fm -> let fail_msg = "Node " ++ Container.nameOf nl ndx ++
1086
                             " failed: " ++ show fm
1087
              in either (const $ Left fail_msg) (const accu) accu
1088
    Ok (nl', inst', _, _) ->
1089
      let nodes = Container.elems nl'
1090
          -- The fromJust below is ugly (it can fail nastily), but
1091
          -- at this point we should have any internal mismatches,
1092
          -- and adding a monad here would be quite involved
1093
          grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1094
          new_cv = compCVNodes grpnodes
1095
          new_accu = Right (nl', inst', new_cv, ndx)
1096
      in case accu of
1097
           Left _ -> new_accu
1098
           Right (_, _, old_cv, _) ->
1099
             if old_cv < new_cv
1100
               then accu
1101
               else new_accu
1102

    
1103
-- | Compute result of changing all nodes of a DRBD instance.
1104
--
1105
-- Given the target primary and secondary node (which might be in a
1106
-- different group or not), this function will 'execute' all the
1107
-- required steps and assuming all operations succceed, will return
1108
-- the modified node and instance lists, the opcodes needed for this
1109
-- and the new group score.
1110
evacDrbdAllInner :: Node.List         -- ^ Cluster node list
1111
                 -> Instance.List     -- ^ Cluster instance list
1112
                 -> Instance.Instance -- ^ The instance to be moved
1113
                 -> Gdx               -- ^ The target group index
1114
                                      -- (which can differ from the
1115
                                      -- current group of the
1116
                                      -- instance)
1117
                 -> (Ndx, Ndx)        -- ^ Tuple of new
1118
                                      -- primary\/secondary nodes
1119
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
1120
evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
1121
  let primary = Container.find (Instance.pNode inst) nl
1122
      idx = Instance.idx inst
1123
  -- if the primary is offline, then we first failover
1124
  (nl1, inst1, ops1) <-
1125
    if Node.offline primary
1126
      then do
1127
        (nl', inst', _, _) <-
1128
          annotateResult "Failing over to the secondary" .
1129
          opToResult $ applyMove nl inst Failover
1130
        return (nl', inst', [Failover])
1131
      else return (nl, inst, [])
1132
  let (o1, o2, o3) = (ReplaceSecondary t_pdx,
1133
                      Failover,
1134
                      ReplaceSecondary t_sdx)
1135
  -- we now need to execute a replace secondary to the future
1136
  -- primary node
1137
  (nl2, inst2, _, _) <-
1138
    annotateResult "Changing secondary to new primary" .
1139
    opToResult $
1140
    applyMove nl1 inst1 o1
1141
  let ops2 = o1:ops1
1142
  -- we now execute another failover, the primary stays fixed now
1143
  (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" .
1144
                        opToResult $ applyMove nl2 inst2 o2
1145
  let ops3 = o2:ops2
1146
  -- and finally another replace secondary, to the final secondary
1147
  (nl4, inst4, _, _) <-
1148
    annotateResult "Changing secondary to final secondary" .
1149
    opToResult $
1150
    applyMove nl3 inst3 o3
1151
  let ops4 = o3:ops3
1152
      il' = Container.add idx inst4 il
1153
      ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1154
  let nodes = Container.elems nl4
1155
      -- The fromJust below is ugly (it can fail nastily), but
1156
      -- at this point we should have any internal mismatches,
1157
      -- and adding a monad here would be quite involved
1158
      grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1159
      new_cv = compCVNodes grpnodes
1160
  return (nl4, il', ops, new_cv)
1161

    
1162
-- | Computes the nodes in a given group which are available for
1163
-- allocation.
1164
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1165
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
1166
                    -> Gdx            -- ^ The group for which we
1167
                                      -- query the nodes
1168
                    -> Result [Ndx]   -- ^ List of available node indices
1169
availableGroupNodes group_nodes excl_ndx gdx = do
1170
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1171
                 Ok (lookup gdx group_nodes)
1172
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1173
  return avail_nodes
1174

    
1175
-- | Updates the evac solution with the results of an instance
1176
-- evacuation.
1177
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1178
                   -> Idx
1179
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1180
                   -> (Node.List, Instance.List, EvacSolution)
1181
updateEvacSolution (nl, il, es) idx (Bad msg) =
1182
  (nl, il, es { esFailed = (idx, msg):esFailed es})
1183
updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1184
  (nl, il, es { esMoved = new_elem:esMoved es
1185
              , esOpCodes = opcodes:esOpCodes es })
1186
    where inst = Container.find idx il
1187
          new_elem = (idx,
1188
                      instancePriGroup nl inst,
1189
                      Instance.allNodes inst)
1190

    
1191
-- | Node-evacuation IAllocator mode main function.
1192
tryNodeEvac :: Group.List    -- ^ The cluster groups
1193
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
1194
            -> Instance.List -- ^ Instance list (cluster-wide)
1195
            -> EvacMode      -- ^ The evacuation mode
1196
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
1197
            -> Result (Node.List, Instance.List, EvacSolution)
1198
tryNodeEvac _ ini_nl ini_il mode idxs =
1199
  let evac_ndx = nodesToEvacuate ini_il mode idxs
1200
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1201
      excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1202
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1203
                                           (Container.elems nl))) $
1204
                  splitCluster ini_nl ini_il
1205
      (fin_nl, fin_il, esol) =
1206
        foldl' (\state@(nl, il, _) inst ->
1207
                  let gdx = instancePriGroup nl inst
1208
                      pdx = Instance.pNode inst in
1209
                  updateEvacSolution state (Instance.idx inst) $
1210
                  availableGroupNodes group_ndx
1211
                    (IntSet.insert pdx excl_ndx) gdx >>=
1212
                      nodeEvacInstance nl il mode inst gdx
1213
               )
1214
        (ini_nl, ini_il, emptyEvacSolution)
1215
        (map (`Container.find` ini_il) idxs)
1216
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1217

    
1218
-- | Change-group IAllocator mode main function.
1219
--
1220
-- This is very similar to 'tryNodeEvac', the only difference is that
1221
-- we don't choose as target group the current instance group, but
1222
-- instead:
1223
--
1224
--   1. at the start of the function, we compute which are the target
1225
--   groups; either no groups were passed in, in which case we choose
1226
--   all groups out of which we don't evacuate instance, or there were
1227
--   some groups passed, in which case we use those
1228
--
1229
--   2. for each instance, we use 'findBestAllocGroup' to choose the
1230
--   best group to hold the instance, and then we do what
1231
--   'tryNodeEvac' does, except for this group instead of the current
1232
--   instance group.
1233
--
1234
-- Note that the correct behaviour of this function relies on the
1235
-- function 'nodeEvacInstance' to be able to do correctly both
1236
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1237
tryChangeGroup :: Group.List    -- ^ The cluster groups
1238
               -> Node.List     -- ^ The node list (cluster-wide)
1239
               -> Instance.List -- ^ Instance list (cluster-wide)
1240
               -> [Gdx]         -- ^ Target groups; if empty, any
1241
                                -- groups not being evacuated
1242
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
1243
               -> Result (Node.List, Instance.List, EvacSolution)
1244
tryChangeGroup gl ini_nl ini_il gdxs idxs =
1245
  let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1246
                             flip Container.find ini_il) idxs
1247
      target_gdxs = (if null gdxs
1248
                       then Container.keys gl
1249
                       else gdxs) \\ evac_gdxs
1250
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1251
      excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1252
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1253
                                           (Container.elems nl))) $
1254
                  splitCluster ini_nl ini_il
1255
      (fin_nl, fin_il, esol) =
1256
        foldl' (\state@(nl, il, _) inst ->
1257
                  let solution = do
1258
                        let ncnt = Instance.requiredNodes $
1259
                                   Instance.diskTemplate inst
1260
                        (grp, _, _) <- findBestAllocGroup gl nl il
1261
                                       (Just target_gdxs) inst ncnt
1262
                        let gdx = Group.idx grp
1263
                        av_nodes <- availableGroupNodes group_ndx
1264
                                    excl_ndx gdx
1265
                        nodeEvacInstance nl il ChangeAll inst gdx av_nodes
1266
                  in updateEvacSolution state (Instance.idx inst) solution
1267
               )
1268
        (ini_nl, ini_il, emptyEvacSolution)
1269
        (map (`Container.find` ini_il) idxs)
1270
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1271

    
1272
-- | Standard-sized allocation method.
1273
--
1274
-- This places instances of the same size on the cluster until we're
1275
-- out of space. The result will be a list of identically-sized
1276
-- instances.
1277
iterateAlloc :: AllocMethod
1278
iterateAlloc nl il limit newinst allocnodes ixes cstats =
1279
  let depth = length ixes
1280
      newname = printf "new-%d" depth::String
1281
      newidx = Container.size il
1282
      newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1283
      newlimit = fmap (flip (-) 1) limit
1284
  in case tryAlloc nl il newi2 allocnodes of
1285
       Bad s -> Bad s
1286
       Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
1287
         let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1288
         case sols3 of
1289
           Nothing -> newsol
1290
           Just (xnl, xi, _, _) ->
1291
             if limit == Just 0
1292
               then newsol
1293
               else iterateAlloc xnl (Container.add newidx xi il)
1294
                      newlimit newinst allocnodes (xi:ixes)
1295
                      (totalResources xnl:cstats)
1296

    
1297
-- | Predicate whether shrinking a single resource can lead to a valid
1298
-- allocation.
1299
sufficesShrinking :: (Instance.Instance -> AllocSolution) -> Instance.Instance
1300
                     -> FailMode  -> Maybe Instance.Instance
1301
sufficesShrinking allocFn inst fm =
1302
  case dropWhile (isNothing . asSolution . fst)
1303
       . takeWhile (liftA2 (||) (elem fm . asFailures . fst)
1304
                                (isJust . asSolution . fst))
1305
       . map (allocFn &&& id) $
1306
       iterateOk (`Instance.shrinkByType` fm) inst
1307
  of x:_ -> Just . snd $ x
1308
     _ -> Nothing
1309

    
1310
-- | Tiered allocation method.
1311
--
1312
-- This places instances on the cluster, and decreases the spec until
1313
-- we can allocate again. The result will be a list of decreasing
1314
-- instance specs.
1315
tieredAlloc :: AllocMethod
1316
tieredAlloc nl il limit newinst allocnodes ixes cstats =
1317
  case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1318
    Bad s -> Bad s
1319
    Ok (errs, nl', il', ixes', cstats') ->
1320
      let newsol = Ok (errs, nl', il', ixes', cstats')
1321
          ixes_cnt = length ixes'
1322
          (stop, newlimit) = case limit of
1323
                               Nothing -> (False, Nothing)
1324
                               Just n -> (n <= ixes_cnt,
1325
                                            Just (n - ixes_cnt))
1326
          sortedErrs = map fst $ sortBy (comparing snd) errs
1327
          suffShrink = sufficesShrinking (fromMaybe emptyAllocSolution
1328
                                          . flip (tryAlloc nl' il') allocnodes)
1329
                       newinst
1330
          bigSteps = filter isJust . map suffShrink . reverse $ sortedErrs
1331
      in if stop then newsol else
1332
          case bigSteps of
1333
            Just newinst':_ -> tieredAlloc nl' il' newlimit
1334
                               newinst' allocnodes ixes' cstats'
1335
            _ -> case Instance.shrinkByType newinst . last $ sortedErrs of
1336
                   Bad _ -> newsol
1337
                   Ok newinst' -> tieredAlloc nl' il' newlimit
1338
                                  newinst' allocnodes ixes' cstats'
1339

    
1340
-- * Formatting functions
1341

    
1342
-- | Given the original and final nodes, computes the relocation description.
1343
computeMoves :: Instance.Instance -- ^ The instance to be moved
1344
             -> String -- ^ The instance name
1345
             -> IMove  -- ^ The move being performed
1346
             -> String -- ^ New primary
1347
             -> String -- ^ New secondary
1348
             -> (String, [String])
1349
                -- ^ Tuple of moves and commands list; moves is containing
1350
                -- either @/f/@ for failover or @/r:name/@ for replace
1351
                -- secondary, while the command list holds gnt-instance
1352
                -- commands (without that prefix), e.g \"@failover instance1@\"
1353
computeMoves i inam mv c d =
1354
  case mv of
1355
    Failover -> ("f", [mig])
1356
    FailoverToAny _ -> (printf "fa:%s" c, [mig_any])
1357
    FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1358
    ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1359
    ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1360
    ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1361
  where morf = if Instance.isRunning i then "migrate" else "failover"
1362
        mig = printf "%s -f %s" morf inam::String
1363
        mig_any = printf "%s -f -n %s %s" morf c inam::String
1364
        rep n = printf "replace-disks -n %s %s" n inam::String
1365

    
1366
-- | Converts a placement to string format.
1367
printSolutionLine :: Node.List     -- ^ The node list
1368
                  -> Instance.List -- ^ The instance list
1369
                  -> Int           -- ^ Maximum node name length
1370
                  -> Int           -- ^ Maximum instance name length
1371
                  -> Placement     -- ^ The current placement
1372
                  -> Int           -- ^ The index of the placement in
1373
                                   -- the solution
1374
                  -> (String, [String])
1375
printSolutionLine nl il nmlen imlen plc pos =
1376
  let pmlen = (2*nmlen + 1)
1377
      (i, p, s, mv, c) = plc
1378
      old_sec = Instance.sNode inst
1379
      inst = Container.find i il
1380
      inam = Instance.alias inst
1381
      npri = Node.alias $ Container.find p nl
1382
      nsec = Node.alias $ Container.find s nl
1383
      opri = Node.alias $ Container.find (Instance.pNode inst) nl
1384
      osec = Node.alias $ Container.find old_sec nl
1385
      (moves, cmds) =  computeMoves inst inam mv npri nsec
1386
      -- FIXME: this should check instead/also the disk template
1387
      ostr = if old_sec == Node.noSecondary
1388
               then printf "%s" opri::String
1389
               else printf "%s:%s" opri osec::String
1390
      nstr = if s == Node.noSecondary
1391
               then printf "%s" npri::String
1392
               else printf "%s:%s" npri nsec::String
1393
  in (printf "  %3d. %-*s %-*s => %-*s %12.8f a=%s"
1394
      pos imlen inam pmlen ostr pmlen nstr c moves,
1395
      cmds)
1396

    
1397
-- | Return the instance and involved nodes in an instance move.
1398
--
1399
-- Note that the output list length can vary, and is not required nor
1400
-- guaranteed to be of any specific length.
1401
involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
1402
                               -- the instance from its index; note
1403
                               -- that this /must/ be the original
1404
                               -- instance list, so that we can
1405
                               -- retrieve the old nodes
1406
              -> Placement     -- ^ The placement we're investigating,
1407
                               -- containing the new nodes and
1408
                               -- instance index
1409
              -> [Ndx]         -- ^ Resulting list of node indices
1410
involvedNodes il plc =
1411
  let (i, np, ns, _, _) = plc
1412
      inst = Container.find i il
1413
  in nub $ [np, ns] ++ Instance.allNodes inst
1414

    
1415
-- | Inner function for splitJobs, that either appends the next job to
1416
-- the current jobset, or starts a new jobset.
1417
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1418
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1419
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1420
  | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1421
  | otherwise = ([n]:cjs, ndx)
1422

    
1423
-- | Break a list of moves into independent groups. Note that this
1424
-- will reverse the order of jobs.
1425
splitJobs :: [MoveJob] -> [JobSet]
1426
splitJobs = fst . foldl mergeJobs ([], [])
1427

    
1428
-- | Given a list of commands, prefix them with @gnt-instance@ and
1429
-- also beautify the display a little.
1430
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1431
formatJob jsn jsl (sn, (_, _, _, cmds)) =
1432
  let out =
1433
        printf "  echo job %d/%d" jsn sn:
1434
        printf "  check":
1435
        map ("  gnt-instance " ++) cmds
1436
  in if sn == 1
1437
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1438
       else out
1439

    
1440
-- | Given a list of commands, prefix them with @gnt-instance@ and
1441
-- also beautify the display a little.
1442
formatCmds :: [JobSet] -> String
1443
formatCmds =
1444
  unlines .
1445
  concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1446
                           (zip [1..] js)) .
1447
  zip [1..]
1448

    
1449
-- | Print the node list.
1450
printNodes :: Node.List -> [String] -> String
1451
printNodes nl fs =
1452
  let fields = case fs of
1453
                 [] -> Node.defaultFields
1454
                 "+":rest -> Node.defaultFields ++ rest
1455
                 _ -> fs
1456
      snl = sortBy (comparing Node.idx) (Container.elems nl)
1457
      (header, isnum) = unzip $ map Node.showHeader fields
1458
  in printTable "" header (map (Node.list fields) snl) isnum
1459

    
1460
-- | Print the instance list.
1461
printInsts :: Node.List -> Instance.List -> String
1462
printInsts nl il =
1463
  let sil = sortBy (comparing Instance.idx) (Container.elems il)
1464
      helper inst = [ if Instance.isRunning inst then "R" else " "
1465
                    , Instance.name inst
1466
                    , Container.nameOf nl (Instance.pNode inst)
1467
                    , let sdx = Instance.sNode inst
1468
                      in if sdx == Node.noSecondary
1469
                           then  ""
1470
                           else Container.nameOf nl sdx
1471
                    , if Instance.autoBalance inst then "Y" else "N"
1472
                    , printf "%3d" $ Instance.vcpus inst
1473
                    , printf "%5d" $ Instance.mem inst
1474
                    , printf "%5d" $ Instance.dsk inst `div` 1024
1475
                    , printf "%5.3f" lC
1476
                    , printf "%5.3f" lM
1477
                    , printf "%5.3f" lD
1478
                    , printf "%5.3f" lN
1479
                    ]
1480
          where DynUtil lC lM lD lN = Instance.util inst
1481
      header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1482
               , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1483
      isnum = False:False:False:False:False:repeat True
1484
  in printTable "" header (map helper sil) isnum
1485

    
1486
-- | Shows statistics for a given node list.
1487
printStats :: String -> Node.List -> String
1488
printStats lp nl =
1489
  let dcvs = compDetailedCV $ Container.elems nl
1490
      (weights, names) = unzip detailedCVInfo
1491
      hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1492
      header = [ "Field", "Value", "Weight" ]
1493
      formatted = map (\(w, h, val) ->
1494
                         [ h
1495
                         , printf "%.8f" val
1496
                         , printf "x%.2f" w
1497
                         ]) hd
1498
  in printTable lp header formatted $ False:repeat True
1499

    
1500
-- | Convert a placement into a list of OpCodes (basically a job).
1501
iMoveToJob :: Node.List        -- ^ The node list; only used for node
1502
                               -- names, so any version is good
1503
                               -- (before or after the operation)
1504
           -> Instance.List    -- ^ The instance list; also used for
1505
                               -- names only
1506
           -> Idx              -- ^ The index of the instance being
1507
                               -- moved
1508
           -> IMove            -- ^ The actual move to be described
1509
           -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
1510
                               -- the given move
1511
iMoveToJob nl il idx move =
1512
  let inst = Container.find idx il
1513
      iname = Instance.name inst
1514
      lookNode  n = case mkNonEmpty (Container.nameOf nl n) of
1515
                      -- FIXME: convert htools codebase to non-empty strings
1516
                      Bad msg -> error $ "Empty node name for idx " ++
1517
                                 show n ++ ": " ++ msg ++ "??"
1518
                      Ok ne -> Just ne
1519
      opF = OpCodes.OpInstanceMigrate
1520
              { OpCodes.opInstanceName        = iname
1521
              , OpCodes.opInstanceUuid        = Nothing
1522
              , OpCodes.opMigrationMode       = Nothing -- default
1523
              , OpCodes.opOldLiveMode         = Nothing -- default as well
1524
              , OpCodes.opTargetNode          = Nothing -- this is drbd
1525
              , OpCodes.opTargetNodeUuid      = Nothing
1526
              , OpCodes.opAllowRuntimeChanges = False
1527
              , OpCodes.opIgnoreIpolicy       = False
1528
              , OpCodes.opMigrationCleanup    = False
1529
              , OpCodes.opIallocator          = Nothing
1530
              , OpCodes.opAllowFailover       = True }
1531
      opFA n = opF { OpCodes.opTargetNode = lookNode n } -- not drbd
1532
      opR n = OpCodes.OpInstanceReplaceDisks
1533
                { OpCodes.opInstanceName     = iname
1534
                , OpCodes.opInstanceUuid     = Nothing
1535
                , OpCodes.opEarlyRelease     = False
1536
                , OpCodes.opIgnoreIpolicy    = False
1537
                , OpCodes.opReplaceDisksMode = OpCodes.ReplaceNewSecondary
1538
                , OpCodes.opReplaceDisksList = []
1539
                , OpCodes.opRemoteNode       = lookNode n
1540
                , OpCodes.opRemoteNodeUuid   = Nothing
1541
                , OpCodes.opIallocator       = Nothing
1542
                }
1543
  in case move of
1544
       Failover -> [ opF ]
1545
       FailoverToAny np -> [ opFA np ]
1546
       ReplacePrimary np -> [ opF, opR np, opF ]
1547
       ReplaceSecondary ns -> [ opR ns ]
1548
       ReplaceAndFailover np -> [ opR np, opF ]
1549
       FailoverAndReplace ns -> [ opF, opR ns ]
1550

    
1551
-- * Node group functions
1552

    
1553
-- | Computes the group of an instance.
1554
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1555
instanceGroup nl i =
1556
  let sidx = Instance.sNode i
1557
      pnode = Container.find (Instance.pNode i) nl
1558
      snode = if sidx == Node.noSecondary
1559
              then pnode
1560
              else Container.find sidx nl
1561
      pgroup = Node.group pnode
1562
      sgroup = Node.group snode
1563
  in if pgroup /= sgroup
1564
       then fail ("Instance placed accross two node groups, primary " ++
1565
                  show pgroup ++ ", secondary " ++ show sgroup)
1566
       else return pgroup
1567

    
1568
-- | Computes the group of an instance per the primary node.
1569
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1570
instancePriGroup nl i =
1571
  let pnode = Container.find (Instance.pNode i) nl
1572
  in  Node.group pnode
1573

    
1574
-- | Compute the list of badly allocated instances (split across node
1575
-- groups).
1576
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1577
findSplitInstances nl =
1578
  filter (not . isOk . instanceGroup nl) . Container.elems
1579

    
1580
-- | Splits a cluster into the component node groups.
1581
splitCluster :: Node.List -> Instance.List ->
1582
                [(Gdx, (Node.List, Instance.List))]
1583
splitCluster nl il =
1584
  let ngroups = Node.computeGroups (Container.elems nl)
1585
  in map (\(gdx, nodes) ->
1586
           let nidxs = map Node.idx nodes
1587
               nodes' = zip nidxs nodes
1588
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1589
           in (gdx, (Container.fromList nodes', instances))) ngroups
1590

    
1591
-- | Compute the list of nodes that are to be evacuated, given a list
1592
-- of instances and an evacuation mode.
1593
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1594
                -> EvacMode      -- ^ The evacuation mode we're using
1595
                -> [Idx]         -- ^ List of instance indices being evacuated
1596
                -> IntSet.IntSet -- ^ Set of node indices
1597
nodesToEvacuate il mode =
1598
  IntSet.delete Node.noSecondary .
1599
  foldl' (\ns idx ->
1600
            let i = Container.find idx il
1601
                pdx = Instance.pNode i
1602
                sdx = Instance.sNode i
1603
                dt = Instance.diskTemplate i
1604
                withSecondary = case dt of
1605
                                  DTDrbd8 -> IntSet.insert sdx ns
1606
                                  _ -> ns
1607
            in case mode of
1608
                 ChangePrimary   -> IntSet.insert pdx ns
1609
                 ChangeSecondary -> withSecondary
1610
                 ChangeAll       -> IntSet.insert pdx withSecondary
1611
         ) IntSet.empty