src/Ganeti/HTools/Cluster.hs @ 0cc3d742

{-| Implementation of cluster-wide logic.

This module holds all pure cluster-logic; I\/O related functionality
goes into the /Main/ module for the individual binaries.

-}

{-

Copyright (C) 2009, 2010, 2011, 2012, 2013 Google Inc.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.

-}

module Ganeti.HTools.Cluster
  (
    -- * Types
    AllocSolution(..)
  , EvacSolution(..)
  , Table(..)
  , CStats(..)
  , AllocNodes
  , AllocResult
  , AllocMethod
  , AllocSolutionList
  -- * Generic functions
  , totalResources
  , computeAllocationDelta
  -- * First phase functions
  , computeBadItems
  -- * Second phase functions
  , printSolutionLine
  , formatCmds
  , involvedNodes
  , splitJobs
  -- * Display functions
  , printNodes
  , printInsts
  -- * Balancing functions
  , checkMove
  , doNextBalance
  , tryBalance
  , compCV
  , compCVNodes
  , compDetailedCV
  , printStats
  , iMoveToJob
  -- * IAllocator functions
  , genAllocNodes
  , tryAlloc
  , tryMGAlloc
  , tryNodeEvac
  , tryChangeGroup
  , collapseFailures
  , allocList
  -- * Allocation functions
  , iterateAlloc
  , tieredAlloc
  -- * Node group functions
  , instanceGroup
  , findSplitInstances
  , splitCluster
  ) where

import qualified Data.IntSet as IntSet
import Data.List
import Data.Maybe (fromJust, isNothing)
import Data.Ord (comparing)
import Text.Printf (printf)

import Ganeti.BasicTypes
import qualified Ganeti.HTools.Container as Container
import qualified Ganeti.HTools.Instance as Instance
import qualified Ganeti.HTools.Nic as Nic
import qualified Ganeti.HTools.Node as Node
import qualified Ganeti.HTools.Group as Group
import Ganeti.HTools.Types
import Ganeti.Compat
import qualified Ganeti.OpCodes as OpCodes
import Ganeti.Utils
import Ganeti.Types (mkNonEmpty)

-- * Types

-- | Allocation\/relocation solution.
data AllocSolution = AllocSolution
  { asFailures :: [FailMode]              -- ^ Failure counts
  , asAllocs   :: Int                     -- ^ Good allocation count
  , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
  , asLog      :: [String]                -- ^ Informational messages
  }

-- | Node evacuation/group change iallocator result type. This result
-- type consists of actual opcodes (a restricted subset) that are
-- transmitted back to Ganeti.
data EvacSolution = EvacSolution
  { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
  , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
                                      -- relocated
  , esOpCodes :: [[OpCodes.OpCode]]   -- ^ List of jobs
  } deriving (Show)

-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
type AllocResult = (FailStats, Node.List, Instance.List,
                    [Instance.Instance], [CStats])

-- | Type alias for easier handling.
type AllocSolutionList = [(Instance.Instance, AllocSolution)]

-- | A type denoting the valid allocation mode/pairs.
--
-- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
-- for a two-node allocation, this will be a @Right [('Ndx',
-- ['Ndx'])]@. In the latter case, the list is basically an
-- association list, grouped by primary node and holding the potential
-- secondary nodes in the sub-list.
type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
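
-- Illustrative sketch (added by the editor, not part of the original
-- source): the two shapes an 'AllocNodes' value can take. For a
-- one-node allocation, a plain list of candidate node indices; for a
-- two-node allocation, an association list of primary candidates with
-- their possible secondaries. The concrete indices are made up.
exampleSingleAllocNodes :: AllocNodes
exampleSingleAllocNodes = Left [0, 1, 2]

exampleDualAllocNodes :: AllocNodes
exampleDualAllocNodes = Right [(0, [1, 2]), (1, [0, 2]), (2, [0, 1])]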

-- | The empty solution we start with when computing allocations.
emptyAllocSolution :: AllocSolution
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
                                   , asSolution = Nothing, asLog = [] }

-- | The empty evac solution.
emptyEvacSolution :: EvacSolution
emptyEvacSolution = EvacSolution { esMoved = []
                                 , esFailed = []
                                 , esOpCodes = []
                                 }

-- | The complete state for the balancing solution.
data Table = Table Node.List Instance.List Score [Placement]
             deriving (Show)

-- | Cluster statistics data type.
data CStats = CStats
  { csFmem :: Integer -- ^ Cluster free mem
  , csFdsk :: Integer -- ^ Cluster free disk
  , csFspn :: Integer -- ^ Cluster free spindles
  , csAmem :: Integer -- ^ Cluster allocatable mem
  , csAdsk :: Integer -- ^ Cluster allocatable disk
  , csAcpu :: Integer -- ^ Cluster allocatable cpus
  , csMmem :: Integer -- ^ Max node allocatable mem
  , csMdsk :: Integer -- ^ Max node allocatable disk
  , csMcpu :: Integer -- ^ Max node allocatable cpu
  , csImem :: Integer -- ^ Instance used mem
  , csIdsk :: Integer -- ^ Instance used disk
  , csIspn :: Integer -- ^ Instance used spindles
  , csIcpu :: Integer -- ^ Instance used cpu
  , csTmem :: Double  -- ^ Cluster total mem
  , csTdsk :: Double  -- ^ Cluster total disk
  , csTspn :: Double  -- ^ Cluster total spindles
  , csTcpu :: Double  -- ^ Cluster total cpus
  , csVcpu :: Integer -- ^ Cluster total virtual cpus
  , csNcpu :: Double  -- ^ Equivalent to 'csIcpu' but in terms of
                      -- physical CPUs, i.e. normalised used phys CPUs
  , csXmem :: Integer -- ^ Unaccounted for mem
  , csNmem :: Integer -- ^ Node own memory
  , csScore :: Score  -- ^ The cluster score
  , csNinst :: Int    -- ^ The total number of instances
  } deriving (Show)

-- | A simple type for allocation functions.
type AllocMethod =  Node.List           -- ^ Node list
                 -> Instance.List       -- ^ Instance list
                 -> Maybe Int           -- ^ Optional allocation limit
                 -> Instance.Instance   -- ^ Instance spec for allocation
                 -> AllocNodes          -- ^ Which nodes we should allocate on
                 -> [Instance.Instance] -- ^ Allocated instances
                 -> [CStats]            -- ^ Running cluster stats
                 -> Result AllocResult  -- ^ Allocation result

-- | A simple type for the running solution of evacuations.
type EvacInnerState =
  Either String (Node.List, Instance.Instance, Score, Ndx)

-- * Utility functions

-- | Verifies the N+1 status and returns the affected nodes.
verifyN1 :: [Node.Node] -> [Node.Node]
verifyN1 = filter Node.failN1

{-| Computes the pair of bad nodes and instances.

The bad node list is computed via a simple 'verifyN1' check, and the
bad instance list is the list of primary and secondary instances of
those nodes.

-}
computeBadItems :: Node.List -> Instance.List ->
                   ([Node.Node], [Instance.Instance])
computeBadItems nl il =
  let bad_nodes = verifyN1 $ getOnline nl
      bad_instances = map (`Container.find` il) .
                      sort . nub $
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
  in
    (bad_nodes, bad_instances)
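
-- Illustrative sketch (added by the editor, not part of the original
-- source): a first-phase summary built on 'computeBadItems', counting
-- the N+1-failing nodes and the instances that would have to move off
-- them.
describeBadItems :: Node.List -> Instance.List -> String
describeBadItems nl il =
  let (bad_nodes, bad_instances) = computeBadItems nl il
  in printf "%d N+1-failing node(s), %d affected instance(s)"
       (length bad_nodes) (length bad_instances)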

-- | Extracts the node pairs for an instance. This can fail if the
-- instance is single-homed. FIXME: this needs to be improved,
-- together with the general enhancement for handling non-DRBD moves.
instanceNodes :: Node.List -> Instance.Instance ->
                 (Ndx, Ndx, Node.Node, Node.Node)
instanceNodes nl inst =
  let old_pdx = Instance.pNode inst
      old_sdx = Instance.sNode inst
      old_p = Container.find old_pdx nl
      old_s = Container.find old_sdx nl
  in (old_pdx, old_sdx, old_p, old_s)

-- | Zero-initializer for the CStats type.
emptyCStats :: CStats
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

-- | Update stats with data from a new node.
updateCStats :: CStats -> Node.Node -> CStats
updateCStats cs node =
  let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
               csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
               csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
               csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
               csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
               csVcpu = x_vcpu, csNcpu = x_ncpu,
               csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst,
               csFspn = x_fspn, csIspn = x_ispn, csTspn = x_tspn
             }
        = cs
      inc_amem = Node.fMem node - Node.rMem node
      inc_amem' = if inc_amem > 0 then inc_amem else 0
      inc_adsk = Node.availDisk node
      inc_imem = truncate (Node.tMem node) - Node.nMem node
                 - Node.xMem node - Node.fMem node
      inc_icpu = Node.uCpu node
      inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
      inc_ispn = Node.tSpindles node - Node.fSpindles node
      inc_vcpu = Node.hiCpu node
      inc_acpu = Node.availCpu node
      inc_ncpu = fromIntegral (Node.uCpu node) /
                 iPolicyVcpuRatio (Node.iPolicy node)
  in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
        , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
        , csFspn = x_fspn + fromIntegral (Node.fSpindles node)
        , csAmem = x_amem + fromIntegral inc_amem'
        , csAdsk = x_adsk + fromIntegral inc_adsk
        , csAcpu = x_acpu + fromIntegral inc_acpu
        , csMmem = max x_mmem (fromIntegral inc_amem')
        , csMdsk = max x_mdsk (fromIntegral inc_adsk)
        , csMcpu = max x_mcpu (fromIntegral inc_acpu)
        , csImem = x_imem + fromIntegral inc_imem
        , csIdsk = x_idsk + fromIntegral inc_idsk
        , csIspn = x_ispn + fromIntegral inc_ispn
        , csIcpu = x_icpu + fromIntegral inc_icpu
        , csTmem = x_tmem + Node.tMem node
        , csTdsk = x_tdsk + Node.tDsk node
        , csTspn = x_tspn + fromIntegral (Node.tSpindles node)
        , csTcpu = x_tcpu + Node.tCpu node
        , csVcpu = x_vcpu + fromIntegral inc_vcpu
        , csNcpu = x_ncpu + inc_ncpu
        , csXmem = x_xmem + fromIntegral (Node.xMem node)
        , csNmem = x_nmem + fromIntegral (Node.nMem node)
        , csNinst = x_ninst + length (Node.pList node)
        }

-- | Compute the total free disk and memory in the cluster.
totalResources :: Node.List -> CStats
totalResources nl =
  let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
  in cs { csScore = compCV nl }

-- | Compute the delta between two cluster states.
--
-- This is used when doing allocations, to understand better the
-- available cluster resources. The return value is a triple of the
-- current used values, the delta that was still allocated, and what
-- was left unallocated.
computeAllocationDelta :: CStats -> CStats -> AllocStats
computeAllocationDelta cini cfin =
  let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu,
              csNcpu = i_ncpu, csIspn = i_ispn } = cini
      CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
              csTmem = t_mem, csTdsk = t_dsk, csVcpu = f_vcpu,
              csNcpu = f_ncpu, csTcpu = f_tcpu,
              csIspn = f_ispn, csTspn = t_spn } = cfin
      rini = AllocInfo { allocInfoVCpus = fromIntegral i_icpu
                       , allocInfoNCpus = i_ncpu
                       , allocInfoMem   = fromIntegral i_imem
                       , allocInfoDisk  = fromIntegral i_idsk
                       , allocInfoSpn   = fromIntegral i_ispn
                       }
      rfin = AllocInfo { allocInfoVCpus = fromIntegral (f_icpu - i_icpu)
                       , allocInfoNCpus = f_ncpu - i_ncpu
                       , allocInfoMem   = fromIntegral (f_imem - i_imem)
                       , allocInfoDisk  = fromIntegral (f_idsk - i_idsk)
                       , allocInfoSpn   = fromIntegral (f_ispn - i_ispn)
                       }
      runa = AllocInfo { allocInfoVCpus = fromIntegral (f_vcpu - f_icpu)
                       , allocInfoNCpus = f_tcpu - f_ncpu
                       , allocInfoMem   = truncate t_mem - fromIntegral f_imem
                       , allocInfoDisk  = truncate t_dsk - fromIntegral f_idsk
                       , allocInfoSpn   = truncate t_spn - fromIntegral f_ispn
                       }
  in (rini, rfin, runa)
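
-- Illustrative sketch (added by the editor, not part of the original
-- source): the intended usage of 'computeAllocationDelta' is to
-- snapshot 'totalResources' before and after an allocation round and
-- report what was already used, what the round consumed, and what
-- remains unallocated.
allocationReport :: Node.List -> Node.List -> AllocStats
allocationReport nl_before nl_after =
  computeAllocationDelta (totalResources nl_before) (totalResources nl_after)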

-- | The names and weights of the individual elements in the CV list.
detailedCVInfo :: [(Double, String)]
detailedCVInfo = [ (1,  "free_mem_cv")
                 , (1,  "free_disk_cv")
                 , (1,  "n1_cnt")
                 , (1,  "reserved_mem_cv")
                 , (4,  "offline_all_cnt")
                 , (16, "offline_pri_cnt")
                 , (1,  "vcpu_ratio_cv")
                 , (1,  "cpu_load_cv")
                 , (1,  "mem_load_cv")
                 , (1,  "disk_load_cv")
                 , (1,  "net_load_cv")
                 , (2,  "pri_tags_score")
                 , (1,  "spindles_cv")
                 ]

-- | Holds the weights used by 'compCVNodes' for each metric.
detailedCVWeights :: [Double]
detailedCVWeights = map fst detailedCVInfo

-- | Compute the list of detailed per-metric scores (memory, disk,
-- N+1, offline, load and spindle statistics) over the given nodes.
compDetailedCV :: [Node.Node] -> [Double]
compDetailedCV all_nodes =
  let (offline, nodes) = partition Node.offline all_nodes
      mem_l = map Node.pMem nodes
      dsk_l = map Node.pDsk nodes
      -- metric: memory covariance
      mem_cv = stdDev mem_l
      -- metric: disk covariance
      dsk_cv = stdDev dsk_l
      -- metric: count of instances living on N1 failing nodes
      n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
                                                 length (Node.pList n)) .
                 filter Node.failN1 $ nodes :: Double
      res_l = map Node.pRem nodes
      -- metric: reserved memory covariance
      res_cv = stdDev res_l
      -- offline instances metrics
      offline_ipri = sum . map (length . Node.pList) $ offline
      offline_isec = sum . map (length . Node.sList) $ offline
      -- metric: count of instances on offline nodes
      off_score = fromIntegral (offline_ipri + offline_isec)::Double
      -- metric: count of primary instances on offline nodes (this
      -- helps with evacuation/failover of primary instances on
      -- 2-node clusters with one node offline)
      off_pri_score = fromIntegral offline_ipri::Double
      cpu_l = map Node.pCpu nodes
      -- metric: covariance of vcpu/pcpu ratio
      cpu_cv = stdDev cpu_l
      -- metrics: covariance of cpu, memory, disk and network load
      (c_load, m_load, d_load, n_load) =
        unzip4 $ map (\n ->
                      let DynUtil c1 m1 d1 n1 = Node.utilLoad n
                          DynUtil c2 m2 d2 n2 = Node.utilPool n
                      in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes
      -- metric: conflicting instance count
      pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
      pri_tags_score = fromIntegral pri_tags_inst::Double
      -- metric: spindles %
      spindles_cv = map (\n -> Node.instSpindles n / Node.hiSpindles n) nodes
  in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
     , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
     , pri_tags_score, stdDev spindles_cv ]

-- | Compute the /total/ variance.
compCVNodes :: [Node.Node] -> Double
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV

-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
compCV :: Node.List -> Double
compCV = compCVNodes . Container.elems

-- | Compute online nodes from a 'Node.List'.
getOnline :: Node.List -> [Node.Node]
getOnline = filter (not . Node.offline) . Container.elems
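
-- Illustrative sketch (added by the editor, not part of the original
-- source): pairing 'detailedCVInfo' with 'compDetailedCV' gives the
-- named, weighted metric values; summing the weighted values is
-- exactly what 'compCVNodes' computes.
namedMetrics :: [Node.Node] -> [(String, Double)]
namedMetrics nodes =
  zipWith (\(w, name) v -> (name, w * v)) detailedCVInfo (compDetailedCV nodes)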

-- * Balancing functions

-- | Compute best table. Note that the ordering of the arguments is important.
compareTables :: Table -> Table -> Table
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
  if a_cv > b_cv then b else a

-- | Applies an instance move to a given node list and instance.
applyMove :: Node.List -> Instance.Instance
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
-- Failover (f)
applyMove nl inst Failover =
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      new_nl = do -- Maybe monad
        new_p <- Node.addPriEx (Node.offline old_p) int_s inst
        new_s <- Node.addSec int_p inst old_sdx
        let new_inst = Instance.setBoth inst old_sdx old_pdx
        return (Container.addTwo old_pdx new_s old_sdx new_p nl,
                new_inst, old_sdx, old_pdx)
  in new_nl

-- Failover to any (fa)
applyMove nl inst (FailoverToAny new_pdx) = do
  let (old_pdx, old_sdx, old_pnode, _) = instanceNodes nl inst
      new_pnode = Container.find new_pdx nl
      force_failover = Node.offline old_pnode
  new_pnode' <- Node.addPriEx force_failover new_pnode inst
  let old_pnode' = Node.removePri old_pnode inst
      inst' = Instance.setPri inst new_pdx
      nl' = Container.addTwo old_pdx old_pnode' new_pdx new_pnode' nl
  return (nl', inst', new_pdx, old_sdx)

-- Replace the primary (f:, r:np, f)
applyMove nl inst (ReplacePrimary new_pdx) =
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
      tgt_n = Container.find new_pdx nl
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      force_p = Node.offline old_p
      new_nl = do -- Maybe monad
                  -- check that the current secondary can host the instance
                  -- during the migration
        tmp_s <- Node.addPriEx force_p int_s inst
        let tmp_s' = Node.removePri tmp_s inst
        new_p <- Node.addPriEx force_p tgt_n inst
        new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
        let new_inst = Instance.setPri inst new_pdx
        return (Container.add new_pdx new_p $
                Container.addTwo old_pdx int_p old_sdx new_s nl,
                new_inst, new_pdx, old_sdx)
  in new_nl

-- Replace the secondary (r:ns)
applyMove nl inst (ReplaceSecondary new_sdx) =
  let old_pdx = Instance.pNode inst
      old_sdx = Instance.sNode inst
      old_s = Container.find old_sdx nl
      tgt_n = Container.find new_sdx nl
      int_s = Node.removeSec old_s inst
      force_s = Node.offline old_s
      new_inst = Instance.setSec inst new_sdx
      new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
               \new_s -> return (Container.addTwo new_sdx
                                 new_s old_sdx int_s nl,
                                 new_inst, old_pdx, new_sdx)
  in new_nl

-- Replace the secondary and failover (r:np, f)
applyMove nl inst (ReplaceAndFailover new_pdx) =
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
      tgt_n = Container.find new_pdx nl
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      force_s = Node.offline old_s
      new_nl = do -- Maybe monad
        new_p <- Node.addPri tgt_n inst
        new_s <- Node.addSecEx force_s int_p inst new_pdx
        let new_inst = Instance.setBoth inst new_pdx old_pdx
        return (Container.add new_pdx new_p $
                Container.addTwo old_pdx new_s old_sdx int_s nl,
                new_inst, new_pdx, old_pdx)
  in new_nl

-- Failover and replace the secondary (f, r:ns)
applyMove nl inst (FailoverAndReplace new_sdx) =
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
      tgt_n = Container.find new_sdx nl
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      force_p = Node.offline old_p
      new_nl = do -- Maybe monad
        new_p <- Node.addPriEx force_p int_s inst
        new_s <- Node.addSecEx force_p tgt_n inst old_sdx
        let new_inst = Instance.setBoth inst old_sdx new_sdx
        return (Container.add new_sdx new_s $
                Container.addTwo old_sdx new_p old_pdx int_p nl,
                new_inst, old_sdx, new_sdx)
  in new_nl
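
-- Illustrative sketch (added by the editor, not part of the original
-- source): applying a plain failover and extracting the resulting
-- cluster score; 'applyMove' fails (via 'OpResult') when the target
-- node cannot take the instance under the N+1 checks.
scoreAfterFailover :: Node.List -> Instance.Instance -> Maybe Score
scoreAfterFailover nl inst =
  case applyMove nl inst Failover of
    Ok (nl', _, _, _) -> Just (compCV nl')
    Bad _             -> Nothing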

-- | Tries to allocate an instance on one given node.
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
                 -> OpResult Node.AllocElement
allocateOnSingle nl inst new_pdx =
  let p = Container.find new_pdx nl
      new_inst = Instance.setBoth inst new_pdx Node.noSecondary
  in do
    Instance.instMatchesPolicy inst (Node.iPolicy p) (Node.exclStorage p)
    new_p <- Node.addPri p inst
    let new_nl = Container.add new_pdx new_p nl
        new_score = compCV new_nl
    return (new_nl, new_inst, [new_p], new_score)

-- | Tries to allocate an instance on a given pair of nodes.
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
               -> OpResult Node.AllocElement
allocateOnPair nl inst new_pdx new_sdx =
  let tgt_p = Container.find new_pdx nl
      tgt_s = Container.find new_sdx nl
  in do
    Instance.instMatchesPolicy inst (Node.iPolicy tgt_p)
      (Node.exclStorage tgt_p)
    new_p <- Node.addPri tgt_p inst
    new_s <- Node.addSec tgt_s inst new_pdx
    let new_inst = Instance.setBoth inst new_pdx new_sdx
        new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
    return (new_nl, new_inst, [new_p, new_s], compCV new_nl)

-- | Tries to perform an instance move and returns the best table
-- between the original one and the new one.
checkSingleStep :: Table -- ^ The original table
                -> Instance.Instance -- ^ The instance to move
                -> Table -- ^ The current best table
                -> IMove -- ^ The move to apply
                -> Table -- ^ The final best table
checkSingleStep ini_tbl target cur_tbl move =
  let Table ini_nl ini_il _ ini_plc = ini_tbl
      tmp_resu = applyMove ini_nl target move
  in case tmp_resu of
       Bad _ -> cur_tbl
       Ok (upd_nl, new_inst, pri_idx, sec_idx) ->
         let tgt_idx = Instance.idx target
             upd_cvar = compCV upd_nl
             upd_il = Container.add tgt_idx new_inst ini_il
             upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
             upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
         in compareTables cur_tbl upd_tbl

-- | Given the status of the current secondary as a valid new node and
-- the current candidate target node, generate the possible moves for
-- an instance.
possibleMoves :: MirrorType -- ^ The mirroring type of the instance
              -> Bool       -- ^ Whether the secondary node is a valid new node
              -> Bool       -- ^ Whether we can change the primary node
              -> Ndx        -- ^ Target node candidate
              -> [IMove]    -- ^ List of valid result moves

possibleMoves MirrorNone _ _ _ = []

possibleMoves MirrorExternal _ False _ = []

possibleMoves MirrorExternal _ True tdx =
  [ FailoverToAny tdx ]

possibleMoves MirrorInternal _ False tdx =
  [ ReplaceSecondary tdx ]

possibleMoves MirrorInternal True True tdx =
  [ ReplaceSecondary tdx
  , ReplaceAndFailover tdx
  , ReplacePrimary tdx
  , FailoverAndReplace tdx
  ]

possibleMoves MirrorInternal False True tdx =
  [ ReplaceSecondary tdx
  , ReplaceAndFailover tdx
  ]
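
-- Illustrative sketch (added by the editor, not part of the original
-- source): for a DRBD ('MirrorInternal') instance whose secondary is a
-- valid target and whose primary may change, all four move types are
-- generated for a candidate node (index 3 here is made up):
--
-- > possibleMoves MirrorInternal True True 3
-- > == [ ReplaceSecondary 3, ReplaceAndFailover 3
-- >    , ReplacePrimary 3, FailoverAndReplace 3 ]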

-- | Compute the best move for a given instance.
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
                  -> Bool              -- ^ Whether disk moves are allowed
                  -> Bool              -- ^ Whether instance moves are allowed
                  -> Table             -- ^ Original table
                  -> Instance.Instance -- ^ Instance to move
                  -> Table             -- ^ Best new table for this instance
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
  let opdx = Instance.pNode target
      osdx = Instance.sNode target
      bad_nodes = [opdx, osdx]
      nodes = filter (`notElem` bad_nodes) nodes_idx
      mir_type = Instance.mirrorType target
      use_secondary = elem osdx nodes_idx && inst_moves
      aft_failover = if mir_type == MirrorInternal && use_secondary
                       -- if drbd and allowed to failover
                       then checkSingleStep ini_tbl target ini_tbl Failover
                       else ini_tbl
      all_moves =
        if disk_moves
          then concatMap (possibleMoves mir_type use_secondary inst_moves)
               nodes
          else []
    in
      -- iterate over the possible nodes for this instance
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves

-- | Compute the best next move.
checkMove :: [Ndx]               -- ^ Allowed target node indices
          -> Bool                -- ^ Whether disk moves are allowed
          -> Bool                -- ^ Whether instance moves are allowed
          -> Table               -- ^ The current solution
          -> [Instance.Instance] -- ^ List of instances still to move
          -> Table               -- ^ The new solution
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
  let Table _ _ _ ini_plc = ini_tbl
      -- we're using rwhnf from the Control.Parallel.Strategies
      -- package; we don't need to use rnf as that would force too
      -- much evaluation in single-threaded cases, and in
      -- multi-threaded case the weak head normal form is enough to
      -- spark the evaluation
      tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
                             inst_moves ini_tbl)
               victims
      -- iterate over all instances, computing the best move
      best_tbl = foldl' compareTables ini_tbl tables
      Table _ _ _ best_plc = best_tbl
  in if length best_plc == length ini_plc
       then ini_tbl -- no advancement
       else best_tbl

-- | Check if we are allowed to go deeper in the balancing.
doNextBalance :: Table     -- ^ The starting table
              -> Int       -- ^ Remaining length
              -> Score     -- ^ Score at which to stop
              -> Bool      -- ^ Whether we can continue balancing
doNextBalance ini_tbl max_rounds min_score =
  let Table _ _ ini_cv ini_plc = ini_tbl
      ini_plc_len = length ini_plc
  in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score

-- | Run a balance move.
tryBalance :: Table       -- ^ The starting table
           -> Bool        -- ^ Allow disk moves
           -> Bool        -- ^ Allow instance moves
           -> Bool        -- ^ Only evacuate moves
           -> Score       -- ^ Min gain threshold
           -> Score       -- ^ Min gain
           -> Maybe Table -- ^ The resulting table and commands
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
    let Table ini_nl ini_il ini_cv _ = ini_tbl
        all_inst = Container.elems ini_il
        all_nodes = Container.elems ini_nl
        (offline_nodes, online_nodes) = partition Node.offline all_nodes
        all_inst' = if evac_mode
                      then let bad_nodes = map Node.idx offline_nodes
                           in filter (any (`elem` bad_nodes) .
                                          Instance.allNodes) all_inst
                      else all_inst
        reloc_inst = filter (\i -> Instance.movable i &&
                                   Instance.autoBalance i) all_inst'
        node_idx = map Node.idx online_nodes
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
        (Table _ _ fin_cv _) = fin_tbl
    in
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
      then Just fin_tbl -- this round was successful, return the new table
      else Nothing
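
-- Illustrative sketch (added by the editor, not part of the original
-- source): a minimal driver loop in the style of the hbal binary,
-- repeatedly applying 'tryBalance' while 'doNextBalance' allows. The
-- move options and threshold values passed here are made-up defaults,
-- not the real hbal ones.
balanceLoop :: Table -> Int -> Score -> Table
balanceLoop tbl max_rounds min_score
  | doNextBalance tbl max_rounds min_score =
      case tryBalance tbl True True False 1e-3 1e-2 of
        Just tbl' -> balanceLoop tbl' max_rounds min_score
        Nothing   -> tbl
  | otherwise = tbl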

-- * Allocation functions

-- | Build failure stats out of a list of failures.
collapseFailures :: [FailMode] -> FailStats
collapseFailures flst =
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
            [minBound..maxBound]

-- | Compares two Maybe AllocElement and chooses the best score.
bestAllocElement :: Maybe Node.AllocElement
                 -> Maybe Node.AllocElement
                 -> Maybe Node.AllocElement
bestAllocElement a Nothing = a
bestAllocElement Nothing b = b
bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
  if ascore < bscore then a else b

-- | Update current Allocation solution and failure stats with new
-- elements.
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
concatAllocs as (Bad reason) = as { asFailures = reason : asFailures as }

concatAllocs as (Ok ns) =
  let -- Choose the old or new solution, based on the cluster score
    cntok = asAllocs as
    osols = asSolution as
    nsols = bestAllocElement osols (Just ns)
    nsuc = cntok + 1
    -- Note: we force evaluation of nsols here in order to keep the
    -- memory profile low - we know that we will need nsols for sure
    -- in the next cycle, so we force evaluation of nsols, since the
    -- foldl' in the caller will only evaluate the tuple, but not the
    -- elements of the tuple
  in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }

-- | Sums two 'AllocSolution' structures.
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
sumAllocs (AllocSolution aFails aAllocs aSols aLog)
          (AllocSolution bFails bAllocs bSols bLog) =
  -- note: we add b first, since usually it will be smaller; when
  -- fold'ing, a will grow and grow whereas b is the per-group
  -- result, hence smaller
  let nFails  = bFails ++ aFails
      nAllocs = aAllocs + bAllocs
      nSols   = bestAllocElement aSols bSols
      nLog    = bLog ++ aLog
  in AllocSolution nFails nAllocs nSols nLog

-- | Given a solution, generates a reasonable description for it.
describeSolution :: AllocSolution -> String
describeSolution as =
  let fcnt = asFailures as
      sols = asSolution as
      freasons =
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
        filter ((> 0) . snd) . collapseFailures $ fcnt
  in case sols of
     Nothing -> "No valid allocation solutions, failure reasons: " ++
                (if null fcnt then "unknown reasons" else freasons)
     Just (_, _, nodes, cv) ->
         printf ("score: %.8f, successes %d, failures %d (%s)" ++
                 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
               (intercalate "/" . map Node.name $ nodes)

-- | Annotates a solution with the appropriate string.
annotateSolution :: AllocSolution -> AllocSolution
annotateSolution as = as { asLog = describeSolution as : asLog as }

-- | Reverses an evacuation solution.
--
-- Rationale: we always concat the results to the top of the lists, so
-- for proper jobset execution, we should reverse all lists.
reverseEvacSolution :: EvacSolution -> EvacSolution
reverseEvacSolution (EvacSolution f m o) =
  EvacSolution (reverse f) (reverse m) (reverse o)

-- | Generate the valid node allocation singles or pairs for a new instance.
genAllocNodes :: Group.List        -- ^ Group list
              -> Node.List         -- ^ The node map
              -> Int               -- ^ The number of nodes required
              -> Bool              -- ^ Whether to drop unallocable nodes
              -> Result AllocNodes -- ^ The (monadic) result
genAllocNodes gl nl count drop_unalloc =
  let filter_fn = if drop_unalloc
                    then filter (Group.isAllocable .
                                 flip Container.find gl . Node.group)
                    else id
      all_nodes = filter_fn $ getOnline nl
      all_pairs = [(Node.idx p,
                    [Node.idx s | s <- all_nodes,
                                       Node.idx p /= Node.idx s,
                                       Node.group p == Node.group s]) |
                   p <- all_nodes]
  in case count of
       1 -> Ok (Left (map Node.idx all_nodes))
       2 -> Ok (Right (filter (not . null . snd) all_pairs))
       _ -> Bad "Unsupported number of nodes, only one or two supported"

-- | Try to allocate an instance on the cluster.
tryAlloc :: (Monad m) =>
            Node.List         -- ^ The node list
         -> Instance.List     -- ^ The instance list
         -> Instance.Instance -- ^ The instance to allocate
         -> AllocNodes        -- ^ The allocation targets
         -> m AllocSolution   -- ^ Possible solution list
tryAlloc _  _ _    (Right []) = fail "Not enough online nodes"
tryAlloc nl _ inst (Right ok_pairs) =
  let psols = parMap rwhnf (\(p, ss) ->
                              foldl' (\cstate ->
                                        concatAllocs cstate .
                                        allocateOnPair nl inst p)
                              emptyAllocSolution ss) ok_pairs
      sols = foldl' sumAllocs emptyAllocSolution psols
  in return $ annotateSolution sols

tryAlloc _  _ _    (Left []) = fail "No online nodes"
tryAlloc nl _ inst (Left all_nodes) =
  let sols = foldl' (\cstate ->
                       concatAllocs cstate . allocateOnSingle nl inst
                    ) emptyAllocSolution all_nodes
  in return $ annotateSolution sols
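
-- Illustrative sketch (added by the editor, not part of the original
-- source): a single-group allocation of a two-node (DRBD-style)
-- instance, chaining 'genAllocNodes' into 'tryAlloc' in the 'Result'
-- monad.
allocateTwoNode :: Group.List -> Node.List -> Instance.List
                -> Instance.Instance -> Result AllocSolution
allocateTwoNode gl nl il inst =
  genAllocNodes gl nl 2 True >>= tryAlloc nl il inst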

-- | Given a group/result, describe it as a nice (list of) messages.
solutionDescription :: (Group.Group, Result AllocSolution)
                    -> [String]
solutionDescription (grp, result) =
  case result of
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
    Bad message -> [printf "Group %s: error %s" gname message]
  where gname = Group.name grp
        pol = allocPolicyToRaw (Group.allocPolicy grp)

-- | From a list of possibly bad and possibly empty solutions, filter
-- only the groups with a valid result. Note that the result will be
-- reversed compared to the original list.
filterMGResults :: [(Group.Group, Result AllocSolution)]
                -> [(Group.Group, AllocSolution)]
filterMGResults = foldl' fn []
  where unallocable = not . Group.isAllocable
        fn accu (grp, rasol) =
          case rasol of
            Bad _ -> accu
            Ok sol | isNothing (asSolution sol) -> accu
                   | unallocable grp -> accu
                   | otherwise -> (grp, sol):accu

-- | Sort multigroup results based on policy and score.
sortMGResults :: [(Group.Group, AllocSolution)]
              -> [(Group.Group, AllocSolution)]
sortMGResults sols =
  let extractScore (_, _, _, x) = x
      solScore (grp, sol) = (Group.allocPolicy grp,
                             (extractScore . fromJust . asSolution) sol)
  in sortBy (comparing solScore) sols

-- | Removes node groups which can't accommodate the instance.
filterValidGroups :: [(Group.Group, (Node.List, Instance.List))]
                  -> Instance.Instance
                  -> ([(Group.Group, (Node.List, Instance.List))], [String])
filterValidGroups [] _ = ([], [])
filterValidGroups (ng:ngs) inst =
  let (valid_ngs, msgs) = filterValidGroups ngs inst
      hasNetwork nic = case Nic.network nic of
        Just net -> net `elem` Group.networks (fst ng)
        Nothing -> True
      hasRequiredNetworks = all hasNetwork (Instance.nics inst)
  in if hasRequiredNetworks
      then (ng:valid_ngs, msgs)
      else (valid_ngs,
            ("group " ++ Group.name (fst ng) ++
             " is not connected to a network required by instance " ++
             Instance.name inst):msgs)

-- | Finds the best group for an instance on a multi-group cluster.
--
-- Only solutions in @preferred@ and @last_resort@ groups will be
-- accepted as valid, and additionally if the allowed groups parameter
-- is not null then allocation will only be run for those group
-- indices.
findBestAllocGroup :: Group.List           -- ^ The group list
                   -> Node.List            -- ^ The node list
                   -> Instance.List        -- ^ The instance list
                   -> Maybe [Gdx]          -- ^ The allowed groups
                   -> Instance.Instance    -- ^ The instance to allocate
                   -> Int                  -- ^ Required number of nodes
                   -> Result (Group.Group, AllocSolution, [String])
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
  let groups_by_idx = splitCluster mgnl mgil
      groups = map (\(gid, d) -> (Container.find gid mggl, d)) groups_by_idx
      groups' = maybe groups
                (\gs -> filter ((`elem` gs) . Group.idx . fst) groups)
                allowed_gdxs
      (groups'', filter_group_msgs) = filterValidGroups groups' inst
      sols = map (\(gr, (nl, il)) ->
                   (gr, genAllocNodes mggl nl cnt False >>=
                        tryAlloc nl il inst))
             groups''::[(Group.Group, Result AllocSolution)]
      all_msgs = filter_group_msgs ++ concatMap solutionDescription sols
      goodSols = filterMGResults sols
      sortedSols = sortMGResults goodSols
  in case sortedSols of
       [] -> Bad $ if null groups'
                     then "no groups for evacuation: allowed groups were " ++
                          show allowed_gdxs ++ ", all groups: " ++
                          show (map fst groups)
                     else intercalate ", " all_msgs
       (final_group, final_sol):_ -> return (final_group, final_sol, all_msgs)

-- | Try to allocate an instance on a multi-group cluster.
tryMGAlloc :: Group.List           -- ^ The group list
           -> Node.List            -- ^ The node list
           -> Instance.List        -- ^ The instance list
           -> Instance.Instance    -- ^ The instance to allocate
           -> Int                  -- ^ Required number of nodes
           -> Result AllocSolution -- ^ Possible solution list
tryMGAlloc mggl mgnl mgil inst cnt = do
  (best_group, solution, all_msgs) <-
      findBestAllocGroup mggl mgnl mgil Nothing inst cnt
  let group_name = Group.name best_group
      selmsg = "Selected group: " ++ group_name
  return $ solution { asLog = selmsg:all_msgs }

-- | Calculate the new instance list after an allocation solution.
updateIl :: Instance.List           -- ^ The original instance list
         -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
         -> Instance.List           -- ^ The updated instance list
updateIl il Nothing = il
updateIl il (Just (_, xi, _, _)) = Container.add (Container.size il) xi il

-- | Extract the new node list from the allocation solution.
extractNl :: Node.List               -- ^ The original node list
          -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
          -> Node.List               -- ^ The new node list
extractNl nl Nothing = nl
extractNl _ (Just (xnl, _, _, _)) = xnl

-- | Try to allocate a list of instances on a multi-group cluster.
allocList :: Group.List                  -- ^ The group list
          -> Node.List                   -- ^ The node list
          -> Instance.List               -- ^ The instance list
          -> [(Instance.Instance, Int)]  -- ^ The instances to allocate
          -> AllocSolutionList           -- ^ Possible solution list
          -> Result (Node.List, Instance.List,
                     AllocSolutionList)  -- ^ The final solution list
allocList _  nl il [] result = Ok (nl, il, result)
allocList gl nl il ((xi, xicnt):xies) result = do
  ares <- tryMGAlloc gl nl il xi xicnt
  let sol = asSolution ares
      nl' = extractNl nl sol
      il' = updateIl il sol
  allocList gl nl' il' xies ((xi, ares):result)
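
-- Illustrative sketch (added by the editor, not part of the original
-- source): allocating a batch of two-node instances on a multi-group
-- cluster via 'allocList', starting from an empty solution list.
allocateBatch :: Group.List -> Node.List -> Instance.List
              -> [Instance.Instance]
              -> Result (Node.List, Instance.List, AllocSolutionList)
allocateBatch gl nl il insts =
  allocList gl nl il [(i, 2) | i <- insts] []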

-- | Function which fails if the requested mode is change secondary.
--
-- This is useful since, except for DRBD, no other disk template can
-- execute a change secondary; thus, we can just call this function
-- instead of always checking for secondary mode. After the call to
-- this function, whatever mode we have is just a primary change.
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
failOnSecondaryChange ChangeSecondary dt =
  fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++
         "' can't execute change secondary"
failOnSecondaryChange _ _ = return ()

-- | Run evacuation for a single instance.
--
-- /Note:/ this function should correctly execute both intra-group
-- evacuations (in all modes) and inter-group evacuations (in the
-- 'ChangeAll' mode). Of course, this requires that the correct list
-- of target nodes is passed.
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
                 -> Instance.List     -- ^ Instance list (cluster-wide)
                 -> EvacMode          -- ^ The evacuation mode
                 -> Instance.Instance -- ^ The instance to be evacuated
                 -> Gdx               -- ^ The group we're targeting
                 -> [Ndx]             -- ^ The list of available nodes
                                      -- for allocation
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTDiskless})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

nodeEvacInstance _ _ _ (Instance.Instance
                        {Instance.diskTemplate = DTPlain}) _ _ =
                  fail "Instances of type plain cannot be relocated"

nodeEvacInstance _ _ _ (Instance.Instance
                        {Instance.diskTemplate = DTFile}) _ _ =
                  fail "Instances of type file cannot be relocated"

nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTSharedFile})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTBlock})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTRbd})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTExt})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

nodeEvacInstance nl il ChangePrimary
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
                 _ _ =
  do
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
    let idx = Instance.idx inst
        il' = Container.add idx inst' il
        ops = iMoveToJob nl' il' idx Failover
    return (nl', il', ops)

nodeEvacInstance nl il ChangeSecondary
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
                 gdx avail_nodes =
  evacOneNodeOnly nl il inst gdx avail_nodes

-- The algorithm for ChangeAll is as follows:
--
-- * generate all (primary, secondary) node pairs for the target groups
-- * for each pair, execute the needed moves (r:s, f, r:s) and compute
--   the final node list state and group score
-- * select the best choice via a foldl that uses the same Either
--   String solution as the ChangeSecondary mode
nodeEvacInstance nl il ChangeAll
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
                 gdx avail_nodes =
  do
    let no_nodes = Left "no nodes available"
        node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
    (nl', il', ops, _) <-
        annotateResult "Can't find any good nodes for relocation" .
        eitherToResult $
        foldl'
        (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
                          Bad msg ->
                              case accu of
                                Right _ -> accu
                                -- we don't need more details (which
                                -- nodes, etc.) as we only selected
                                -- this group if we can allocate on
                                -- it, hence failures will not
                                -- propagate out of this fold loop
                                Left _ -> Left $ "Allocation failed: " ++ msg
                          Ok result@(_, _, _, new_cv) ->
                              let new_accu = Right result in
                              case accu of
                                Left _ -> new_accu
                                Right (_, _, _, old_cv) ->
                                    if old_cv < new_cv
                                    then accu
                                    else new_accu
        ) no_nodes node_pairs

    return (nl', il', ops)

-- | Generic function for changing one node of an instance.
--
-- This is similar to 'nodeEvacInstance' but will be used in a few of
-- its sub-patterns. It folds the inner function 'evacOneNodeInner'
-- over the list of available nodes, which results in the best choice
-- for relocation.
evacOneNodeOnly :: Node.List         -- ^ The node list (cluster-wide)
                -> Instance.List     -- ^ Instance list (cluster-wide)
                -> Instance.Instance -- ^ The instance to be evacuated
                -> Gdx               -- ^ The group we're targeting
                -> [Ndx]             -- ^ The list of available nodes
                                     -- for allocation
                -> Result (Node.List, Instance.List, [OpCodes.OpCode])
evacOneNodeOnly nl il inst gdx avail_nodes = do
  op_fn <- case Instance.mirrorType inst of
             MirrorNone -> Bad "Can't relocate/evacuate non-mirrored instances"
             MirrorInternal -> Ok ReplaceSecondary
             MirrorExternal -> Ok FailoverToAny
  (nl', inst', _, ndx) <- annotateResult "Can't find any good node" .
                          eitherToResult $
                          foldl' (evacOneNodeInner nl inst gdx op_fn)
                          (Left "no nodes available") avail_nodes
  let idx = Instance.idx inst
      il' = Container.add idx inst' il
      ops = iMoveToJob nl' il' idx (op_fn ndx)
  return (nl', il', ops)

-- | Inner fold function for changing one node of an instance.
--
-- Depending on the instance disk template, this will either change
-- the secondary (for DRBD) or the primary node (for shared
-- storage). However, the operation is generic otherwise.
--
-- The running solution is either a @Left String@, which means we
-- don't yet have a working solution, or a @Right (...)@, which
-- represents a valid solution; it holds the modified node list, the
-- modified instance (after evacuation), the score of that solution,
-- and the new secondary node index.
evacOneNodeInner :: Node.List         -- ^ Cluster node list
                 -> Instance.Instance -- ^ Instance being evacuated
                 -> Gdx               -- ^ The group index of the instance
                 -> (Ndx -> IMove)    -- ^ Operation constructor
                 -> EvacInnerState    -- ^ Current best solution
                 -> Ndx               -- ^ Node we're evaluating as target
                 -> EvacInnerState    -- ^ New best solution
evacOneNodeInner nl inst gdx op_fn accu ndx =
  case applyMove nl inst (op_fn ndx) of
    Bad fm -> let fail_msg = "Node " ++ Container.nameOf nl ndx ++
                             " failed: " ++ show fm
              in either (const $ Left fail_msg) (const accu) accu
    Ok (nl', inst', _, _) ->
      let nodes = Container.elems nl'
          -- The fromJust below is ugly (it can fail nastily), but
          -- at this point we shouldn't have any internal mismatches,
          -- and adding a monad here would be quite involved
          grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
          new_cv = compCVNodes grpnodes
          new_accu = Right (nl', inst', new_cv, ndx)
      in case accu of
           Left _ -> new_accu
           Right (_, _, old_cv, _) ->
             if old_cv < new_cv
               then accu
               else new_accu

-- | Compute result of changing all nodes of a DRBD instance.
--
-- Given the target primary and secondary node (which might be in a
-- different group or not), this function will 'execute' all the
-- required steps and assuming all operations succeed, will return
-- the modified node and instance lists, the opcodes needed for this
-- and the new group score.
evacDrbdAllInner :: Node.List         -- ^ Cluster node list
                 -> Instance.List     -- ^ Cluster instance list
                 -> Instance.Instance -- ^ The instance to be moved
                 -> Gdx               -- ^ The target group index
                                      -- (which can differ from the
                                      -- current group of the
                                      -- instance)
                 -> (Ndx, Ndx)        -- ^ Tuple of new
                                      -- primary\/secondary nodes
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
  let primary = Container.find (Instance.pNode inst) nl
      idx = Instance.idx inst
  -- if the primary is offline, then we first failover
  (nl1, inst1, ops1) <-
    if Node.offline primary
      then do
        (nl', inst', _, _) <-
          annotateResult "Failing over to the secondary" .
          opToResult $ applyMove nl inst Failover
        return (nl', inst', [Failover])
      else return (nl, inst, [])
  let (o1, o2, o3) = (ReplaceSecondary t_pdx,
                      Failover,
                      ReplaceSecondary t_sdx)
  -- we now need to execute a replace secondary to the future
  -- primary node
  (nl2, inst2, _, _) <-
    annotateResult "Changing secondary to new primary" .
    opToResult $
    applyMove nl1 inst1 o1
  let ops2 = o1:ops1
  -- we now execute another failover, the primary stays fixed now
  (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" .
                        opToResult $ applyMove nl2 inst2 o2
  let ops3 = o2:ops2
  -- and finally another replace secondary, to the final secondary
  (nl4, inst4, _, _) <-
    annotateResult "Changing secondary to final secondary" .
    opToResult $
    applyMove nl3 inst3 o3
  let ops4 = o3:ops3
      il' = Container.add idx inst4 il
      ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
  let nodes = Container.elems nl4
      -- The fromJust below is ugly (it can fail nastily), but
      -- at this point we shouldn't have any internal mismatches,
      -- and adding a monad here would be quite involved
      grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
      new_cv = compCVNodes grpnodes
  return (nl4, il', ops, new_cv)
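
-- Illustrative sketch (added by the editor, not part of the original
-- source): the move sequence that 'evacDrbdAllInner' executes for a
-- target pair (t_pdx, t_sdx), leaving out the initial failover that
-- only happens when the old primary is offline.
drbdChangeAllMoves :: Ndx -> Ndx -> [IMove]
drbdChangeAllMoves t_pdx t_sdx =
  [ ReplaceSecondary t_pdx -- put the future primary in place as secondary
  , Failover               -- then make it the primary
  , ReplaceSecondary t_sdx -- finally put the new secondary in place
  ]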
1159

    
1160
-- | Computes the nodes in a given group which are available for
1161
-- allocation.
1162
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1163
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
1164
                    -> Gdx            -- ^ The group for which we
1165
                                      -- query the nodes
1166
                    -> Result [Ndx]   -- ^ List of available node indices
1167
availableGroupNodes group_nodes excl_ndx gdx = do
1168
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1169
                 Ok (lookup gdx group_nodes)
1170
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1171
  return avail_nodes

-- | Updates the evac solution with the results of an instance
-- evacuation.
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
                   -> Idx
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
                   -> (Node.List, Instance.List, EvacSolution)
updateEvacSolution (nl, il, es) idx (Bad msg) =
  (nl, il, es { esFailed = (idx, msg):esFailed es})
updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
  (nl, il, es { esMoved = new_elem:esMoved es
              , esOpCodes = opcodes:esOpCodes es })
    where inst = Container.find idx il
          new_elem = (idx,
                      instancePriGroup nl inst,
                      Instance.allNodes inst)

-- | Node-evacuation IAllocator mode main function.
tryNodeEvac :: Group.List    -- ^ The cluster groups
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
            -> Instance.List -- ^ Instance list (cluster-wide)
            -> EvacMode      -- ^ The evacuation mode
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
            -> Result (Node.List, Instance.List, EvacSolution)
tryNodeEvac _ ini_nl ini_il mode idxs =
  let evac_ndx = nodesToEvacuate ini_il mode idxs
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
      excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
                                           (Container.elems nl))) $
                  splitCluster ini_nl ini_il
      (fin_nl, fin_il, esol) =
        foldl' (\state@(nl, il, _) inst ->
                  let gdx = instancePriGroup nl inst
                      pdx = Instance.pNode inst in
                  updateEvacSolution state (Instance.idx inst) $
                  availableGroupNodes group_ndx
                    (IntSet.insert pdx excl_ndx) gdx >>=
                      nodeEvacInstance nl il mode inst gdx
               )
        (ini_nl, ini_il, emptyEvacSolution)
        (map (`Container.find` ini_il) idxs)
  in return (fin_nl, fin_il, reverseEvacSolution esol)
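
-- Note (descriptive, added for clarity): for each instance, the nodes
-- excluded from the search are the union of the nodes being
-- evacuated, the offline nodes, and the instance's own primary, so
-- e.g. a 'ChangeSecondary' evacuation can never pick the current
-- primary as the new secondary.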

-- | Change-group IAllocator mode main function.
--
-- This is very similar to 'tryNodeEvac'; the only difference is that
-- we don't choose as target group the current instance group, but
-- instead:
--
--   1. at the start of the function, we compute which are the target
--   groups; either no groups were passed in, in which case we choose
--   all groups out of which we don't evacuate instances, or there were
--   some groups passed, in which case we use those
--
--   2. for each instance, we use 'findBestAllocGroup' to choose the
--   best group to hold the instance, and then we do what
--   'tryNodeEvac' does, except for this group instead of the current
--   instance group.
--
-- Note that the correct behaviour of this function relies on the
-- function 'nodeEvacInstance' to be able to do correctly both
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
tryChangeGroup :: Group.List    -- ^ The cluster groups
               -> Node.List     -- ^ The node list (cluster-wide)
               -> Instance.List -- ^ Instance list (cluster-wide)
               -> [Gdx]         -- ^ Target groups; if empty, any
                                -- groups not being evacuated
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
               -> Result (Node.List, Instance.List, EvacSolution)
tryChangeGroup gl ini_nl ini_il gdxs idxs =
  let evac_gdxs = nub $ map (instancePriGroup ini_nl .
                             flip Container.find ini_il) idxs
      target_gdxs = (if null gdxs
                       then Container.keys gl
                       else gdxs) \\ evac_gdxs
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
      excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
                                           (Container.elems nl))) $
                  splitCluster ini_nl ini_il
      (fin_nl, fin_il, esol) =
        foldl' (\state@(nl, il, _) inst ->
                  let solution = do
                        let ncnt = Instance.requiredNodes $
                                   Instance.diskTemplate inst
                        (grp, _, _) <- findBestAllocGroup gl nl il
                                       (Just target_gdxs) inst ncnt
                        let gdx = Group.idx grp
                        av_nodes <- availableGroupNodes group_ndx
                                    excl_ndx gdx
                        nodeEvacInstance nl il ChangeAll inst gdx av_nodes
                  in updateEvacSolution state (Instance.idx inst) solution
               )
        (ini_nl, ini_il, emptyEvacSolution)
        (map (`Container.find` ini_il) idxs)
  in return (fin_nl, fin_il, reverseEvacSolution esol)
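
-- An illustration of the target-group selection above (hypothetical
-- indices): with cluster groups 0, 1 and 2, no explicitly requested
-- groups, and all evacuated instances currently living in group 0,
--
-- > target_gdxs == ([0, 1, 2] \\ [0]) == [1, 2]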

-- | Standard-sized allocation method.
--
-- This places instances of the same size on the cluster until we're
-- out of space. The result will be a list of identically-sized
-- instances.
iterateAlloc :: AllocMethod
iterateAlloc nl il limit newinst allocnodes ixes cstats =
  let depth = length ixes
      newname = printf "new-%d" depth::String
      newidx = Container.size il
      newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
      newlimit = fmap (flip (-) 1) limit
  in case tryAlloc nl il newi2 allocnodes of
       Bad s -> Bad s
       Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
         let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
         case sols3 of
           Nothing -> newsol
           Just (xnl, xi, _, _) ->
             if limit == Just 0
               then newsol
               else iterateAlloc xnl (Container.add newidx xi il)
                      newlimit newinst allocnodes (xi:ixes)
                      (totalResources xnl:cstats)
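
-- Illustrative note: the successive allocations are named after the
-- current depth, i.e. "new-0", "new-1", ..., and a limit of @Just n@
-- stops the recursion after at most n successful placements:
--
-- > map (printf "new-%d") [0 .. 2 :: Int] == ["new-0", "new-1", "new-2"]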

-- | Tiered allocation method.
--
-- This places instances on the cluster until allocation fails, then
-- decreases the spec and tries again. The result will be a list of
-- decreasing instance specs.
tieredAlloc :: AllocMethod
tieredAlloc nl il limit newinst allocnodes ixes cstats =
  case iterateAlloc nl il limit newinst allocnodes ixes cstats of
    Bad s -> Bad s
    Ok (errs, nl', il', ixes', cstats') ->
      let newsol = Ok (errs, nl', il', ixes', cstats')
          ixes_cnt = length ixes'
          (stop, newlimit) = case limit of
                               Nothing -> (False, Nothing)
                               Just n -> (n <= ixes_cnt,
                                            Just (n - ixes_cnt)) in
      if stop then newsol else
          case Instance.shrinkByType newinst . fst . last $
               sortBy (comparing snd) errs of
            Bad _ -> newsol
            Ok newinst' -> tieredAlloc nl' il' newlimit
                           newinst' allocnodes ixes' cstats'
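
-- Descriptive note (added for clarity): the shrink step above sorts
-- the failure counts and feeds the most frequent failure reason to
-- 'Instance.shrinkByType', so each new tier reduces the resource that
-- actually blocked the previous one; the recursion stops when the
-- spec can be shrunk no further.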

-- * Formatting functions

-- | Given the original and final nodes, computes the relocation description.
computeMoves :: Instance.Instance -- ^ The instance to be moved
             -> String -- ^ The instance name
             -> IMove  -- ^ The move being performed
             -> String -- ^ New primary
             -> String -- ^ New secondary
             -> (String, [String])
                -- ^ Tuple of moves and commands list; the moves string
                -- contains either @/f/@ for failover or @/r:name/@ for
                -- replace secondary, while the command list holds
                -- gnt-instance commands (without that prefix),
                -- e.g. \"@failover instance1@\"
computeMoves i inam mv c d =
  case mv of
    Failover -> ("f", [mig])
    FailoverToAny _ -> (printf "fa:%s" c, [mig_any])
    FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
    ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
    ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
    ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
  where morf = if Instance.isRunning i then "migrate" else "failover"
        mig = printf "%s -f %s" morf inam::String
        mig_any = printf "%s -f -n %s %s" morf c inam::String
        rep n = printf "replace-disks -n %s %s" n inam::String
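
-- For example (hypothetical names), a 'ReplacePrimary' move of a
-- running instance "inst1" towards the new primary "node2" yields:
--
-- > ("f r:node2 f", [ "migrate -f inst1"
-- >                 , "replace-disks -n node2 inst1"
-- >                 , "migrate -f inst1" ])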

-- | Converts a placement to string format.
printSolutionLine :: Node.List     -- ^ The node list
                  -> Instance.List -- ^ The instance list
                  -> Int           -- ^ Maximum node name length
                  -> Int           -- ^ Maximum instance name length
                  -> Placement     -- ^ The current placement
                  -> Int           -- ^ The index of the placement in
                                   -- the solution
                  -> (String, [String])
printSolutionLine nl il nmlen imlen plc pos =
  let pmlen = (2*nmlen + 1)
      (i, p, s, mv, c) = plc
      old_sec = Instance.sNode inst
      inst = Container.find i il
      inam = Instance.alias inst
      npri = Node.alias $ Container.find p nl
      nsec = Node.alias $ Container.find s nl
      opri = Node.alias $ Container.find (Instance.pNode inst) nl
      osec = Node.alias $ Container.find old_sec nl
      (moves, cmds) = computeMoves inst inam mv npri nsec
      -- FIXME: this should check instead/also the disk template
      ostr = if old_sec == Node.noSecondary
               then printf "%s" opri::String
               else printf "%s:%s" opri osec::String
      nstr = if s == Node.noSecondary
               then printf "%s" npri::String
               else printf "%s:%s" npri nsec::String
  in (printf "  %3d. %-*s %-*s => %-*s %12.8f a=%s"
      pos imlen inam pmlen ostr pmlen nstr c moves,
      cmds)
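
-- An illustrative output line (hypothetical data; column widths depend
-- on the name lengths): placement 5 moving the secondary of inst1 from
-- node1:node2 to node1:node3 with a new cluster score of 0.00123456:
--
-- >     5. inst1 node1:node2 => node1:node3   0.00123456 a=r:node3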

-- | Return the nodes involved in an instance move.
--
-- Note that the output list length can vary, and is not required nor
-- guaranteed to be of any specific length.
involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
                               -- the instance from its index; note
                               -- that this /must/ be the original
                               -- instance list, so that we can
                               -- retrieve the old nodes
              -> Placement     -- ^ The placement we're investigating,
                               -- containing the new nodes and
                               -- instance index
              -> [Ndx]         -- ^ Resulting list of node indices
involvedNodes il plc =
  let (i, np, ns, _, _) = plc
      inst = Container.find i il
  in nub $ [np, ns] ++ Instance.allNodes inst

-- | Inner function for splitJobs, that either appends the next job to
-- the current jobset, or starts a new jobset.
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
  | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
  | otherwise = ([n]:cjs, ndx)

-- | Break a list of moves into independent groups. Note that this
-- will reverse the order of jobs.
splitJobs :: [MoveJob] -> [JobSet]
splitJobs = fst . foldl mergeJobs ([], [])
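
-- A small sketch (hypothetical moves, assuming the
-- @([Ndx], Idx, IMove, [String])@ shape of 'MoveJob'): jobs touching
-- disjoint node sets share a jobset, while the overlap on node 1
-- opens a new one; note the reversed order:
--
-- >>> let j0 = ([1, 2], 0, Failover, [])
-- >>> let j1 = ([3, 4], 1, Failover, [])
-- >>> let j2 = ([1, 5], 2, Failover, [])
-- >>> map (map (\(_, idx, _, _) -> idx)) (splitJobs [j0, j1, j2])
-- [[2],[1,0]]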

-- | Pretty-print a single job within a jobset: prefix its commands
-- with @gnt-instance@ and add an @echo@\/@check@ header (plus a
-- jobset header for the first job).
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
formatJob jsn jsl (sn, (_, _, _, cmds)) =
  let out =
        printf "  echo job %d/%d" jsn sn:
        printf "  check":
        map ("  gnt-instance " ++) cmds
  in if sn == 1
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
       else out

-- | Given a list of commands, prefix them with @gnt-instance@ and
-- also beautify the display a little.
formatCmds :: [JobSet] -> String
formatCmds =
  unlines .
  concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
                           (zip [1..] js)) .
  zip [1..]
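
-- Illustrative output (a hypothetical single jobset with one job
-- whose command list is @["failover inst1"]@):
--
-- > echo jobset 1, 1 jobs
-- >   echo job 1/1
-- >   check
-- >   gnt-instance failover inst1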

-- | Print the node list.
printNodes :: Node.List -> [String] -> String
printNodes nl fs =
  let fields = case fs of
                 [] -> Node.defaultFields
                 "+":rest -> Node.defaultFields ++ rest
                 _ -> fs
      snl = sortBy (comparing Node.idx) (Container.elems nl)
      (header, isnum) = unzip $ map Node.showHeader fields
  in printTable "" header (map (Node.list fields) snl) isnum

-- | Print the instance list.
printInsts :: Node.List -> Instance.List -> String
printInsts nl il =
  let sil = sortBy (comparing Instance.idx) (Container.elems il)
      helper inst = [ if Instance.isRunning inst then "R" else " "
                    , Instance.name inst
                    , Container.nameOf nl (Instance.pNode inst)
                    , let sdx = Instance.sNode inst
                      in if sdx == Node.noSecondary
                           then ""
                           else Container.nameOf nl sdx
                    , if Instance.autoBalance inst then "Y" else "N"
                    , printf "%3d" $ Instance.vcpus inst
                    , printf "%5d" $ Instance.mem inst
                    , printf "%5d" $ Instance.dsk inst `div` 1024
                    , printf "%5.3f" lC
                    , printf "%5.3f" lM
                    , printf "%5.3f" lD
                    , printf "%5.3f" lN
                    ]
          where DynUtil lC lM lD lN = Instance.util inst
      header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
               , "vcpu", "mem", "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
      isnum = False:False:False:False:False:repeat True
  in printTable "" header (map helper sil) isnum

-- | Shows statistics for a given node list.
printStats :: String -> Node.List -> String
printStats lp nl =
  let dcvs = compDetailedCV $ Container.elems nl
      (weights, names) = unzip detailedCVInfo
      hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
      header = [ "Field", "Value", "Weight" ]
      formatted = map (\(w, h, val) ->
                         [ h
                         , printf "%.8f" val
                         , printf "x%.2f" w
                         ]) hd
  in printTable lp header formatted $ False:repeat True

-- | Convert a placement into a list of OpCodes (basically a job).
iMoveToJob :: Node.List        -- ^ The node list; only used for node
                               -- names, so any version is good
                               -- (before or after the operation)
           -> Instance.List    -- ^ The instance list; also used for
                               -- names only
           -> Idx              -- ^ The index of the instance being
                               -- moved
           -> IMove            -- ^ The actual move to be described
           -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
                               -- the given move
iMoveToJob nl il idx move =
  let inst = Container.find idx il
      iname = Instance.name inst
      lookNode n = case mkNonEmpty (Container.nameOf nl n) of
                     -- FIXME: convert htools codebase to non-empty strings
                     Bad msg -> error $ "Empty node name for idx " ++
                                show n ++ ": " ++ msg ++ "??"
                     Ok ne -> Just ne
      opF = OpCodes.OpInstanceMigrate
              { OpCodes.opInstanceName        = iname
              , OpCodes.opMigrationMode       = Nothing -- default
              , OpCodes.opOldLiveMode         = Nothing -- default as well
              , OpCodes.opTargetNode          = Nothing -- this is drbd
              , OpCodes.opAllowRuntimeChanges = False
              , OpCodes.opIgnoreIpolicy       = False
              , OpCodes.opMigrationCleanup    = False
              , OpCodes.opIallocator          = Nothing
              , OpCodes.opAllowFailover       = True }
      opFA n = opF { OpCodes.opTargetNode = lookNode n } -- not drbd
      opR n = OpCodes.OpInstanceReplaceDisks
                { OpCodes.opInstanceName     = iname
                , OpCodes.opEarlyRelease     = False
                , OpCodes.opIgnoreIpolicy    = False
                , OpCodes.opReplaceDisksMode = OpCodes.ReplaceNewSecondary
                , OpCodes.opReplaceDisksList = []
                , OpCodes.opRemoteNode       = lookNode n
                , OpCodes.opIallocator       = Nothing
                }
  in case move of
       Failover -> [ opF ]
       FailoverToAny np -> [ opFA np ]
       ReplacePrimary np -> [ opF, opR np, opF ]
       ReplaceSecondary ns -> [ opR ns ]
       ReplaceAndFailover np -> [ opR np, opF ]
       FailoverAndReplace ns -> [ opF, opR ns ]

-- * Node group functions

-- | Computes the group of an instance.
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
instanceGroup nl i =
  let sidx = Instance.sNode i
      pnode = Container.find (Instance.pNode i) nl
      snode = if sidx == Node.noSecondary
              then pnode
              else Container.find sidx nl
      pgroup = Node.group pnode
      sgroup = Node.group snode
  in if pgroup /= sgroup
       then fail ("Instance placed across two node groups, primary " ++
                  show pgroup ++ ", secondary " ++ show sgroup)
       else return pgroup

-- | Computes the group of an instance per the primary node.
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
instancePriGroup nl i =
  let pnode = Container.find (Instance.pNode i) nl
  in  Node.group pnode

-- | Compute the list of badly allocated instances (split across node
-- groups).
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
findSplitInstances nl =
  filter (not . isOk . instanceGroup nl) . Container.elems

-- | Splits a cluster into the component node groups.
splitCluster :: Node.List -> Instance.List ->
                [(Gdx, (Node.List, Instance.List))]
splitCluster nl il =
  let ngroups = Node.computeGroups (Container.elems nl)
  in map (\(gdx, nodes) ->
           let nidxs = map Node.idx nodes
               nodes' = zip nidxs nodes
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
           in (gdx, (Container.fromList nodes', instances))) ngroups

-- | Compute the list of nodes that are to be evacuated, given a list
-- of instances and an evacuation mode.
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
                -> EvacMode      -- ^ The evacuation mode we're using
                -> [Idx]         -- ^ List of instance indices being evacuated
                -> IntSet.IntSet -- ^ Set of node indices
nodesToEvacuate il mode =
  IntSet.delete Node.noSecondary .
  foldl' (\ns idx ->
            let i = Container.find idx il
                pdx = Instance.pNode i
                sdx = Instance.sNode i
                dt = Instance.diskTemplate i
                withSecondary = case dt of
                                  DTDrbd8 -> IntSet.insert sdx ns
                                  _ -> ns
            in case mode of
                 ChangePrimary   -> IntSet.insert pdx ns
                 ChangeSecondary -> withSecondary
                 ChangeAll       -> IntSet.insert pdx withSecondary
         ) IntSet.empty
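
-- An illustrative run (hypothetical indices): for a single DTDrbd8
-- instance with primary node 1 and secondary node 2, the resulting
-- sets are
--
-- > ChangePrimary   -> {1}
-- > ChangeSecondary -> {2}
-- > ChangeAll       -> {1, 2}
--
-- with 'Node.noSecondary' always removed from the final set.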