root / src / Ganeti / HTools / Cluster.hs @ 8106dd64

{-| Implementation of cluster-wide logic.

This module holds all pure cluster-logic; I\/O related functionality
goes into the /Main/ module for the individual binaries.

-}

{-

Copyright (C) 2009, 2010, 2011, 2012, 2013 Google Inc.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.

-}

module Ganeti.HTools.Cluster
  (
    -- * Types
    AllocSolution(..)
  , EvacSolution(..)
  , Table(..)
  , CStats(..)
  , AllocNodes
  , AllocResult
  , AllocMethod
  , AllocSolutionList
  -- * Generic functions
  , totalResources
  , computeAllocationDelta
  -- * First phase functions
  , computeBadItems
  -- * Second phase functions
  , printSolutionLine
  , formatCmds
  , involvedNodes
  , getMoves
  , splitJobs
  -- * Display functions
  , printNodes
  , printInsts
  -- * Balancing functions
  , checkMove
  , doNextBalance
  , tryBalance
  , compCV
  , compCVNodes
  , compDetailedCV
  , printStats
  , iMoveToJob
  -- * IAllocator functions
  , genAllocNodes
  , tryAlloc
  , tryMGAlloc
  , tryNodeEvac
  , tryChangeGroup
  , collapseFailures
  , allocList
  -- * Allocation functions
  , iterateAlloc
  , tieredAlloc
  -- * Node group functions
  , instanceGroup
  , findSplitInstances
  , splitCluster
  ) where

import Control.Applicative (liftA2)
import Control.Arrow ((&&&))
import qualified Data.IntSet as IntSet
import Data.List
import Data.Maybe (fromJust, fromMaybe, isJust, isNothing)
import Data.Ord (comparing)
import Text.Printf (printf)

import Ganeti.BasicTypes
import qualified Ganeti.HTools.Container as Container
import qualified Ganeti.HTools.Instance as Instance
import qualified Ganeti.HTools.Nic as Nic
import qualified Ganeti.HTools.Node as Node
import qualified Ganeti.HTools.Group as Group
import Ganeti.HTools.Types
import Ganeti.Compat
import qualified Ganeti.OpCodes as OpCodes
import Ganeti.Utils
import Ganeti.Types (EvacMode(..), mkNonEmpty)

-- * Types

-- | Allocation\/relocation solution.
data AllocSolution = AllocSolution
  { asFailures :: [FailMode]              -- ^ Failure counts
  , asAllocs   :: Int                     -- ^ Good allocation count
  , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
  , asLog      :: [String]                -- ^ Informational messages
  }

-- | Node evacuation/group change iallocator result type. This result
-- type consists of actual opcodes (a restricted subset) that are
-- transmitted back to Ganeti.
data EvacSolution = EvacSolution
  { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
  , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
                                      -- relocated
  , esOpCodes :: [[OpCodes.OpCode]]   -- ^ List of jobs
  } deriving (Show)

-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
type AllocResult = (FailStats, Node.List, Instance.List,
                    [Instance.Instance], [CStats])

-- | Type alias for easier handling.
type AllocSolutionList = [(Instance.Instance, AllocSolution)]

-- | A type denoting the valid allocation mode/pairs.
--
-- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
-- for a two-node allocation, this will be a @Right [('Ndx',
-- ['Ndx'])]@. In the latter case, the list is basically an
-- association list, grouped by primary node and holding the potential
-- secondary nodes in the sub-list.
type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
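
-- As an illustration (not part of the original code): for a
-- three-node group with indices 0, 1 and 2, a two-node 'AllocNodes'
-- value would look like
--
-- > Right [(0, [1, 2]), (1, [0, 2]), (2, [0, 1])]
--
-- i.e. every candidate primary paired with its same-group peers as
-- candidate secondaries.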

-- | The empty solution we start with when computing allocations.
emptyAllocSolution :: AllocSolution
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
                                   , asSolution = Nothing, asLog = [] }

-- | The empty evac solution.
emptyEvacSolution :: EvacSolution
emptyEvacSolution = EvacSolution { esMoved = []
                                 , esFailed = []
                                 , esOpCodes = []
                                 }

-- | The complete state for the balancing solution.
data Table = Table Node.List Instance.List Score [Placement]
             deriving (Show)

-- | Cluster statistics data type.
data CStats = CStats
  { csFmem :: Integer -- ^ Cluster free mem
  , csFdsk :: Integer -- ^ Cluster free disk
  , csFspn :: Integer -- ^ Cluster free spindles
  , csAmem :: Integer -- ^ Cluster allocatable mem
  , csAdsk :: Integer -- ^ Cluster allocatable disk
  , csAcpu :: Integer -- ^ Cluster allocatable cpus
  , csMmem :: Integer -- ^ Max node allocatable mem
  , csMdsk :: Integer -- ^ Max node allocatable disk
  , csMcpu :: Integer -- ^ Max node allocatable cpu
  , csImem :: Integer -- ^ Instance used mem
  , csIdsk :: Integer -- ^ Instance used disk
  , csIspn :: Integer -- ^ Instance used spindles
  , csIcpu :: Integer -- ^ Instance used cpu
  , csTmem :: Double  -- ^ Cluster total mem
  , csTdsk :: Double  -- ^ Cluster total disk
  , csTspn :: Double  -- ^ Cluster total spindles
  , csTcpu :: Double  -- ^ Cluster total cpus
  , csVcpu :: Integer -- ^ Cluster total virtual cpus
  , csNcpu :: Double  -- ^ Equivalent to 'csIcpu' but in terms of
                      -- physical CPUs, i.e. normalised used phys CPUs
  , csXmem :: Integer -- ^ Unaccounted for mem
  , csNmem :: Integer -- ^ Node own memory
  , csScore :: Score  -- ^ The cluster score
  , csNinst :: Int    -- ^ The total number of instances
  } deriving (Show)

-- | A simple type for allocation functions.
type AllocMethod =  Node.List           -- ^ Node list
                 -> Instance.List       -- ^ Instance list
                 -> Maybe Int           -- ^ Optional allocation limit
                 -> Instance.Instance   -- ^ Instance spec for allocation
                 -> AllocNodes          -- ^ Which nodes we should allocate on
                 -> [Instance.Instance] -- ^ Allocated instances
                 -> [CStats]            -- ^ Running cluster stats
                 -> Result AllocResult  -- ^ Allocation result

-- | A simple type for the running solution of evacuations.
type EvacInnerState =
  Either String (Node.List, Instance.Instance, Score, Ndx)

-- * Utility functions

-- | Verifies the N+1 status and returns the affected nodes.
verifyN1 :: [Node.Node] -> [Node.Node]
verifyN1 = filter Node.failN1

{-| Computes the pair of bad nodes and instances.

The bad node list is computed via a simple 'verifyN1' check, and the
bad instance list is the list of primary and secondary instances of
those nodes.

-}
computeBadItems :: Node.List -> Instance.List ->
                   ([Node.Node], [Instance.Instance])
computeBadItems nl il =
  let bad_nodes = verifyN1 $ getOnline nl
      bad_instances = map (`Container.find` il) .
                      sort . nub $
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
  in
    (bad_nodes, bad_instances)

-- | Extracts the node pairs for an instance. This can fail if the
-- instance is single-homed. FIXME: this needs to be improved,
-- together with the general enhancement for handling non-DRBD moves.
instanceNodes :: Node.List -> Instance.Instance ->
                 (Ndx, Ndx, Node.Node, Node.Node)
instanceNodes nl inst =
  let old_pdx = Instance.pNode inst
      old_sdx = Instance.sNode inst
      old_p = Container.find old_pdx nl
      old_s = Container.find old_sdx nl
  in (old_pdx, old_sdx, old_p, old_s)

-- | Zero-initializer for the CStats type.
emptyCStats :: CStats
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

-- | Update stats with data from a new node.
updateCStats :: CStats -> Node.Node -> CStats
updateCStats cs node =
  let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
               csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
               csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
               csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
               csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
               csVcpu = x_vcpu, csNcpu = x_ncpu,
               csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst,
               csFspn = x_fspn, csIspn = x_ispn, csTspn = x_tspn
             }
        = cs
      inc_amem = Node.fMem node - Node.rMem node
      inc_amem' = if inc_amem > 0 then inc_amem else 0
      inc_adsk = Node.availDisk node
      inc_imem = truncate (Node.tMem node) - Node.nMem node
                 - Node.xMem node - Node.fMem node
      inc_icpu = Node.uCpu node
      inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
      inc_ispn = Node.tSpindles node - Node.fSpindles node
      inc_vcpu = Node.hiCpu node
      inc_acpu = Node.availCpu node
      inc_ncpu = fromIntegral (Node.uCpu node) /
                 iPolicyVcpuRatio (Node.iPolicy node)
  in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
        , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
        , csFspn = x_fspn + fromIntegral (Node.fSpindles node)
        , csAmem = x_amem + fromIntegral inc_amem'
        , csAdsk = x_adsk + fromIntegral inc_adsk
        , csAcpu = x_acpu + fromIntegral inc_acpu
        , csMmem = max x_mmem (fromIntegral inc_amem')
        , csMdsk = max x_mdsk (fromIntegral inc_adsk)
        , csMcpu = max x_mcpu (fromIntegral inc_acpu)
        , csImem = x_imem + fromIntegral inc_imem
        , csIdsk = x_idsk + fromIntegral inc_idsk
        , csIspn = x_ispn + fromIntegral inc_ispn
        , csIcpu = x_icpu + fromIntegral inc_icpu
        , csTmem = x_tmem + Node.tMem node
        , csTdsk = x_tdsk + Node.tDsk node
        , csTspn = x_tspn + fromIntegral (Node.tSpindles node)
        , csTcpu = x_tcpu + Node.tCpu node
        , csVcpu = x_vcpu + fromIntegral inc_vcpu
        , csNcpu = x_ncpu + inc_ncpu
        , csXmem = x_xmem + fromIntegral (Node.xMem node)
        , csNmem = x_nmem + fromIntegral (Node.nMem node)
        , csNinst = x_ninst + length (Node.pList node)
        }

-- | Compute the total free disk and memory in the cluster.
totalResources :: Node.List -> CStats
totalResources nl =
  let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
  in cs { csScore = compCV nl }

-- | Compute the delta between two cluster states.
--
-- This is used when doing allocations, to understand better the
-- available cluster resources. The return value is a triple of the
-- current used values, the delta that was still allocated, and what
-- was left unallocated.
computeAllocationDelta :: CStats -> CStats -> AllocStats
computeAllocationDelta cini cfin =
  let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu,
              csNcpu = i_ncpu, csIspn = i_ispn } = cini
      CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
              csTmem = t_mem, csTdsk = t_dsk, csVcpu = f_vcpu,
              csNcpu = f_ncpu, csTcpu = f_tcpu,
              csIspn = f_ispn, csTspn = t_spn } = cfin
      rini = AllocInfo { allocInfoVCpus = fromIntegral i_icpu
                       , allocInfoNCpus = i_ncpu
                       , allocInfoMem   = fromIntegral i_imem
                       , allocInfoDisk  = fromIntegral i_idsk
                       , allocInfoSpn   = fromIntegral i_ispn
                       }
      rfin = AllocInfo { allocInfoVCpus = fromIntegral (f_icpu - i_icpu)
                       , allocInfoNCpus = f_ncpu - i_ncpu
                       , allocInfoMem   = fromIntegral (f_imem - i_imem)
                       , allocInfoDisk  = fromIntegral (f_idsk - i_idsk)
                       , allocInfoSpn   = fromIntegral (f_ispn - i_ispn)
                       }
      runa = AllocInfo { allocInfoVCpus = fromIntegral (f_vcpu - f_icpu)
                       , allocInfoNCpus = f_tcpu - f_ncpu
                       , allocInfoMem   = truncate t_mem - fromIntegral f_imem
                       , allocInfoDisk  = truncate t_dsk - fromIntegral f_idsk
                       , allocInfoSpn   = truncate t_spn - fromIntegral f_ispn
                       }
  in (rini, rfin, runa)
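
-- A small worked example (hypothetical numbers, not from the original
-- code): if the initial stats report csImem = 4096 and the final ones
-- csImem = 6144, then the middle element of the triple carries the
-- newly allocated memory, allocInfoMem = 6144 - 4096 = 2048, while
-- the last element carries what remains unallocated out of csTmem.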

-- | The names and weights of the individual elements in the CV list.
detailedCVInfo :: [(Double, String)]
detailedCVInfo = [ (1,  "free_mem_cv")
                 , (1,  "free_disk_cv")
                 , (1,  "n1_cnt")
                 , (1,  "reserved_mem_cv")
                 , (4,  "offline_all_cnt")
                 , (16, "offline_pri_cnt")
                 , (1,  "vcpu_ratio_cv")
                 , (1,  "cpu_load_cv")
                 , (1,  "mem_load_cv")
                 , (1,  "disk_load_cv")
                 , (1,  "net_load_cv")
                 , (2,  "pri_tags_score")
                 , (1,  "spindles_cv")
                 ]

-- | Holds the weights used by 'compCVNodes' for each metric.
detailedCVWeights :: [Double]
detailedCVWeights = map fst detailedCVInfo

-- | Compute the detailed list of cluster metrics (memory, disk, N+1
-- status, load and so on), in the order given by 'detailedCVInfo'.
compDetailedCV :: [Node.Node] -> [Double]
compDetailedCV all_nodes =
  let (offline, nodes) = partition Node.offline all_nodes
      mem_l = map Node.pMem nodes
      dsk_l = map Node.pDsk nodes
      -- metric: memory covariance
      mem_cv = stdDev mem_l
      -- metric: disk covariance
      dsk_cv = stdDev dsk_l
      -- metric: count of instances living on N1 failing nodes
      n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
                                                 length (Node.pList n)) .
                 filter Node.failN1 $ nodes :: Double
      res_l = map Node.pRem nodes
      -- metric: reserved memory covariance
      res_cv = stdDev res_l
      -- offline instances metrics
      offline_ipri = sum . map (length . Node.pList) $ offline
      offline_isec = sum . map (length . Node.sList) $ offline
      -- metric: count of instances on offline nodes
      off_score = fromIntegral (offline_ipri + offline_isec)::Double
      -- metric: count of primary instances on offline nodes (this
      -- helps with evacuation/failover of primary instances on
      -- 2-node clusters with one node offline)
      off_pri_score = fromIntegral offline_ipri::Double
      cpu_l = map Node.pCpu nodes
      -- metric: covariance of vcpu/pcpu ratio
      cpu_cv = stdDev cpu_l
      -- metrics: covariance of cpu, memory, disk and network load
      (c_load, m_load, d_load, n_load) =
        unzip4 $ map (\n ->
                      let DynUtil c1 m1 d1 n1 = Node.utilLoad n
                          DynUtil c2 m2 d2 n2 = Node.utilPool n
                      in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes
      -- metric: conflicting instance count
      pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
      pri_tags_score = fromIntegral pri_tags_inst::Double
      -- metric: spindles %
      spindles_cv = map (\n -> Node.instSpindles n / Node.hiSpindles n) nodes
  in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
     , stdDev c_load, stdDev m_load, stdDev d_load, stdDev n_load
     , pri_tags_score, stdDev spindles_cv ]

-- | Compute the /total/ variance.
compCVNodes :: [Node.Node] -> Double
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
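
-- In other words, the cluster score is the weighted sum of the
-- detailed metrics. As a sketch of the weights' effect: a single
-- primary instance left on an offline node contributes
-- 16 * 1 + 4 * 1 = 20 to the score (via "offline_pri_cnt" and
-- "offline_all_cnt"), far outweighing the coefficient-of-variation
-- terms.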

-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
compCV :: Node.List -> Double
compCV = compCVNodes . Container.elems

-- | Compute online nodes from a 'Node.List'.
getOnline :: Node.List -> [Node.Node]
getOnline = filter (not . Node.offline) . Container.elems

-- * Balancing functions

-- | Compute best table. Note that the ordering of the arguments is important.
compareTables :: Table -> Table -> Table
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _) =
  if a_cv > b_cv then b else a
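
-- A usage sketch (mirroring what 'checkMove' does below): folding
-- 'compareTables' over a list of candidate tables yields the one
-- with the lowest, i.e. best, cluster score.
--
-- > foldl' compareTables ini_tbl candidate_tbls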

-- | Applies an instance move to a given node list and instance.
applyMove :: Node.List -> Instance.Instance
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
-- Failover (f)
applyMove nl inst Failover =
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      new_nl = do -- Maybe monad
        new_p <- Node.addPriEx (Node.offline old_p) int_s inst
        new_s <- Node.addSec int_p inst old_sdx
        let new_inst = Instance.setBoth inst old_sdx old_pdx
        return (Container.addTwo old_pdx new_s old_sdx new_p nl,
                new_inst, old_sdx, old_pdx)
  in new_nl

-- Failover to any (fa)
applyMove nl inst (FailoverToAny new_pdx) = do
  let (old_pdx, old_sdx, old_pnode, _) = instanceNodes nl inst
      new_pnode = Container.find new_pdx nl
      force_failover = Node.offline old_pnode
  new_pnode' <- Node.addPriEx force_failover new_pnode inst
  let old_pnode' = Node.removePri old_pnode inst
      inst' = Instance.setPri inst new_pdx
      nl' = Container.addTwo old_pdx old_pnode' new_pdx new_pnode' nl
  return (nl', inst', new_pdx, old_sdx)

-- Replace the primary (f:, r:np, f)
applyMove nl inst (ReplacePrimary new_pdx) =
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
      tgt_n = Container.find new_pdx nl
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      force_p = Node.offline old_p
      new_nl = do -- Maybe monad
                  -- check that the current secondary can host the instance
                  -- during the migration
        tmp_s <- Node.addPriEx force_p int_s inst
        let tmp_s' = Node.removePri tmp_s inst
        new_p <- Node.addPriEx force_p tgt_n inst
        new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
        let new_inst = Instance.setPri inst new_pdx
        return (Container.add new_pdx new_p $
                Container.addTwo old_pdx int_p old_sdx new_s nl,
                new_inst, new_pdx, old_sdx)
  in new_nl

-- Replace the secondary (r:ns)
applyMove nl inst (ReplaceSecondary new_sdx) =
  let old_pdx = Instance.pNode inst
      old_sdx = Instance.sNode inst
      old_s = Container.find old_sdx nl
      tgt_n = Container.find new_sdx nl
      int_s = Node.removeSec old_s inst
      force_s = Node.offline old_s
      new_inst = Instance.setSec inst new_sdx
      new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
               \new_s -> return (Container.addTwo new_sdx
                                 new_s old_sdx int_s nl,
                                 new_inst, old_pdx, new_sdx)
  in new_nl

-- Replace the secondary and failover (r:np, f)
applyMove nl inst (ReplaceAndFailover new_pdx) =
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
      tgt_n = Container.find new_pdx nl
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      force_s = Node.offline old_s
      new_nl = do -- Maybe monad
        new_p <- Node.addPri tgt_n inst
        new_s <- Node.addSecEx force_s int_p inst new_pdx
        let new_inst = Instance.setBoth inst new_pdx old_pdx
        return (Container.add new_pdx new_p $
                Container.addTwo old_pdx new_s old_sdx int_s nl,
                new_inst, new_pdx, old_pdx)
  in new_nl

-- Failover and replace the secondary (f, r:ns)
applyMove nl inst (FailoverAndReplace new_sdx) =
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
      tgt_n = Container.find new_sdx nl
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      force_p = Node.offline old_p
      new_nl = do -- Maybe monad
        new_p <- Node.addPriEx force_p int_s inst
        new_s <- Node.addSecEx force_p tgt_n inst old_sdx
        let new_inst = Instance.setBoth inst old_sdx new_sdx
        return (Container.add new_sdx new_s $
                Container.addTwo old_sdx new_p old_pdx int_p nl,
                new_inst, old_sdx, new_sdx)
  in new_nl

-- | Tries to allocate an instance on one given node.
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
                 -> OpResult Node.AllocElement
allocateOnSingle nl inst new_pdx =
  let p = Container.find new_pdx nl
      new_inst = Instance.setBoth inst new_pdx Node.noSecondary
  in do
    Instance.instMatchesPolicy inst (Node.iPolicy p) (Node.exclStorage p)
    new_p <- Node.addPri p inst
    let new_nl = Container.add new_pdx new_p nl
        new_score = compCV new_nl
    return (new_nl, new_inst, [new_p], new_score)

-- | Tries to allocate an instance on a given pair of nodes.
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
               -> OpResult Node.AllocElement
allocateOnPair nl inst new_pdx new_sdx =
  let tgt_p = Container.find new_pdx nl
      tgt_s = Container.find new_sdx nl
  in do
    Instance.instMatchesPolicy inst (Node.iPolicy tgt_p)
      (Node.exclStorage tgt_p)
    new_p <- Node.addPri tgt_p inst
    new_s <- Node.addSec tgt_s inst new_pdx
    let new_inst = Instance.setBoth inst new_pdx new_sdx
        new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
    return (new_nl, new_inst, [new_p, new_s], compCV new_nl)

-- | Tries to perform an instance move and returns the best table
-- between the original one and the new one.
checkSingleStep :: Table -- ^ The original table
                -> Instance.Instance -- ^ The instance to move
                -> Table -- ^ The current best table
                -> IMove -- ^ The move to apply
                -> Table -- ^ The final best table
checkSingleStep ini_tbl target cur_tbl move =
  let Table ini_nl ini_il _ ini_plc = ini_tbl
      tmp_resu = applyMove ini_nl target move
  in case tmp_resu of
       Bad _ -> cur_tbl
       Ok (upd_nl, new_inst, pri_idx, sec_idx) ->
         let tgt_idx = Instance.idx target
             upd_cvar = compCV upd_nl
             upd_il = Container.add tgt_idx new_inst ini_il
             upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
             upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
         in compareTables cur_tbl upd_tbl

-- | Given the status of the current secondary as a valid new node and
-- the current candidate target node, generate the possible moves for
-- an instance.
possibleMoves :: MirrorType -- ^ The mirroring type of the instance
              -> Bool       -- ^ Whether the secondary node is a valid new node
              -> Bool       -- ^ Whether we can change the primary node
              -> Ndx        -- ^ Target node candidate
              -> [IMove]    -- ^ List of valid result moves

possibleMoves MirrorNone _ _ _ = []

possibleMoves MirrorExternal _ False _ = []

possibleMoves MirrorExternal _ True tdx =
  [ FailoverToAny tdx ]

possibleMoves MirrorInternal _ False tdx =
  [ ReplaceSecondary tdx ]

possibleMoves MirrorInternal True True tdx =
  [ ReplaceSecondary tdx
  , ReplaceAndFailover tdx
  , ReplacePrimary tdx
  , FailoverAndReplace tdx
  ]

possibleMoves MirrorInternal False True tdx =
  [ ReplaceSecondary tdx
  , ReplaceAndFailover tdx
  ]
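
-- For example, a DRBD (MirrorInternal) instance whose primary may not
-- change only gets secondary replacements:
--
-- > possibleMoves MirrorInternal True False 5
-- >   -- yields [ReplaceSecondary 5]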

-- | Compute the best move for a given instance.
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
                  -> Bool              -- ^ Whether disk moves are allowed
                  -> Bool              -- ^ Whether instance moves are allowed
                  -> Table             -- ^ Original table
                  -> Instance.Instance -- ^ Instance to move
                  -> Table             -- ^ Best new table for this instance
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
  let opdx = Instance.pNode target
      osdx = Instance.sNode target
      bad_nodes = [opdx, osdx]
      nodes = filter (`notElem` bad_nodes) nodes_idx
      mir_type = Instance.mirrorType target
      use_secondary = elem osdx nodes_idx && inst_moves
      aft_failover = if mir_type == MirrorInternal && use_secondary
                       -- if drbd and allowed to failover
                       then checkSingleStep ini_tbl target ini_tbl Failover
                       else ini_tbl
      all_moves =
        if disk_moves
          then concatMap (possibleMoves mir_type use_secondary inst_moves)
               nodes
          else []
    in
      -- iterate over the possible nodes for this instance
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves

-- | Compute the best next move.
checkMove :: [Ndx]               -- ^ Allowed target node indices
          -> Bool                -- ^ Whether disk moves are allowed
          -> Bool                -- ^ Whether instance moves are allowed
          -> Table               -- ^ The current solution
          -> [Instance.Instance] -- ^ List of instances still to move
          -> Table               -- ^ The new solution
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
  let Table _ _ _ ini_plc = ini_tbl
      -- we're using rwhnf from the Control.Parallel.Strategies
      -- package; we don't need to use rnf as that would force too
      -- much evaluation in single-threaded cases, and in
      -- multi-threaded case the weak head normal form is enough to
      -- spark the evaluation
      tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
                             inst_moves ini_tbl)
               victims
      -- iterate over all instances, computing the best move
      best_tbl = foldl' compareTables ini_tbl tables
      Table _ _ _ best_plc = best_tbl
  in if length best_plc == length ini_plc
       then ini_tbl -- no advancement
       else best_tbl

-- | Check if we are allowed to go deeper in the balancing.
doNextBalance :: Table     -- ^ The starting table
              -> Int       -- ^ Remaining length
              -> Score     -- ^ Score at which to stop
              -> Bool      -- ^ Whether we can continue balancing
doNextBalance ini_tbl max_rounds min_score =
  let Table _ _ ini_cv ini_plc = ini_tbl
      ini_plc_len = length ini_plc
  in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score

-- | Run a balance move.
tryBalance :: Table       -- ^ The starting table
           -> Bool        -- ^ Allow disk moves
           -> Bool        -- ^ Allow instance moves
           -> Bool        -- ^ Only evacuate moves
           -> Score       -- ^ Min gain threshold
           -> Score       -- ^ Min gain
           -> Maybe Table -- ^ The resulting table and commands
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
    let Table ini_nl ini_il ini_cv _ = ini_tbl
        all_inst = Container.elems ini_il
        all_nodes = Container.elems ini_nl
        (offline_nodes, online_nodes) = partition Node.offline all_nodes
        all_inst' = if evac_mode
                      then let bad_nodes = map Node.idx offline_nodes
                           in filter (any (`elem` bad_nodes) .
                                          Instance.allNodes) all_inst
                      else all_inst
        reloc_inst = filter (\i -> Instance.movable i &&
                                   Instance.autoBalance i) all_inst'
        node_idx = map Node.idx online_nodes
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
        (Table _ _ fin_cv _) = fin_tbl
    in
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
      then Just fin_tbl -- this round was successful, return the new table
      else Nothing
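
-- A sketch of a driver loop over the two functions above (the real
-- hbal binary uses a richer variant with logging and command output);
-- the threshold values here are illustrative only:
--
-- > balanceLoop tbl rounds minScore
-- >   | not (doNextBalance tbl rounds minScore) = tbl
-- >   | otherwise =
-- >       case tryBalance tbl True True False 1e-2 1e-2 of
-- >         Just tbl' -> balanceLoop tbl' rounds minScore
-- >         Nothing   -> tbl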

-- * Allocation functions

-- | Build failure stats out of a list of failures.
collapseFailures :: [FailMode] -> FailStats
collapseFailures flst =
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
            [minBound..maxBound]
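
-- For example, assuming 'FailMode' constructors such as FailMem and
-- FailDisk, the result pairs every constructor with its occurrence
-- count, in enumeration order:
--
-- > collapseFailures [FailMem, FailDisk, FailMem]
-- >   -- yields [(FailMem, 2), (FailDisk, 1), ...rest with count 0]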

-- | Compares two Maybe AllocElement and chooses the best score.
bestAllocElement :: Maybe Node.AllocElement
                 -> Maybe Node.AllocElement
                 -> Maybe Node.AllocElement
bestAllocElement a Nothing = a
bestAllocElement Nothing b = b
bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
  if ascore < bscore then a else b
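
-- Lower scores are better, so given two candidates the element with
-- the smaller score wins; with hypothetical nl/inst/ns values:
--
-- > bestAllocElement (Just (nl, inst, ns, 0.2)) (Just (nl, inst, ns, 0.5))
-- >   -- picks the element with score 0.2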

-- | Update current Allocation solution and failure stats with new
-- elements.
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
concatAllocs as (Bad reason) = as { asFailures = reason : asFailures as }

concatAllocs as (Ok ns) =
  let -- Choose the old or new solution, based on the cluster score
    cntok = asAllocs as
    osols = asSolution as
    nsols = bestAllocElement osols (Just ns)
    nsuc = cntok + 1
    -- Note: we force evaluation of nsols here in order to keep the
    -- memory profile low - we know that we will need nsols for sure
    -- in the next cycle, so we force evaluation of nsols, since the
    -- foldl' in the caller will only evaluate the tuple, but not the
    -- elements of the tuple
  in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }

-- | Sums two 'AllocSolution' structures.
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
sumAllocs (AllocSolution aFails aAllocs aSols aLog)
          (AllocSolution bFails bAllocs bSols bLog) =
  -- note: we add b first, since usually it will be smaller; when
  -- fold'ing, a will grow and grow whereas b is the per-group
  -- result, hence smaller
  let nFails  = bFails ++ aFails
      nAllocs = aAllocs + bAllocs
      nSols   = bestAllocElement aSols bSols
      nLog    = bLog ++ aLog
  in AllocSolution nFails nAllocs nSols nLog

-- | Given a solution, generates a reasonable description for it.
describeSolution :: AllocSolution -> String
describeSolution as =
  let fcnt = asFailures as
      sols = asSolution as
      freasons =
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
        filter ((> 0) . snd) . collapseFailures $ fcnt
  in case sols of
     Nothing -> "No valid allocation solutions, failure reasons: " ++
                (if null fcnt then "unknown reasons" else freasons)
     Just (_, _, nodes, cv) ->
         printf ("score: %.8f, successes %d, failures %d (%s)" ++
                 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
               (intercalate "/" . map Node.name $ nodes)

-- | Annotates a solution with the appropriate string.
annotateSolution :: AllocSolution -> AllocSolution
annotateSolution as = as { asLog = describeSolution as : asLog as }

-- | Reverses an evacuation solution.
--
-- Rationale: we always concat the results to the top of the lists, so
-- for proper jobset execution, we should reverse all lists.
reverseEvacSolution :: EvacSolution -> EvacSolution
reverseEvacSolution (EvacSolution f m o) =
  EvacSolution (reverse f) (reverse m) (reverse o)

-- | Generate the valid node allocation singles or pairs for a new instance.
genAllocNodes :: Group.List        -- ^ Group list
              -> Node.List         -- ^ The node map
              -> Int               -- ^ The number of nodes required
              -> Bool              -- ^ Whether to drop unallocable
                                   -- nodes
              -> Result AllocNodes -- ^ The (monadic) result
genAllocNodes gl nl count drop_unalloc =
  let filter_fn = if drop_unalloc
                    then filter (Group.isAllocable .
                                 flip Container.find gl . Node.group)
                    else id
      all_nodes = filter_fn $ getOnline nl
      all_pairs = [(Node.idx p,
                    [Node.idx s | s <- all_nodes,
                                       Node.idx p /= Node.idx s,
                                       Node.group p == Node.group s]) |
                   p <- all_nodes]
  in case count of
       1 -> Ok (Left (map Node.idx all_nodes))
       2 -> Ok (Right (filter (not . null . snd) all_pairs))
       _ -> Bad "Unsupported number of nodes, only one or two supported"
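
-- Typical use, as in 'findBestAllocGroup' further below: request node
-- pairs and feed them straight into 'tryAlloc':
--
-- > genAllocNodes gl nl 2 False >>= tryAlloc nl il inst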

-- | Try to allocate an instance on the cluster.
tryAlloc :: (Monad m) =>
            Node.List         -- ^ The node list
         -> Instance.List     -- ^ The instance list
         -> Instance.Instance -- ^ The instance to allocate
         -> AllocNodes        -- ^ The allocation targets
         -> m AllocSolution   -- ^ Possible solution list
tryAlloc _  _ _    (Right []) = fail "Not enough online nodes"
tryAlloc nl _ inst (Right ok_pairs) =
  let psols = parMap rwhnf (\(p, ss) ->
                              foldl' (\cstate ->
                                        concatAllocs cstate .
                                        allocateOnPair nl inst p)
                              emptyAllocSolution ss) ok_pairs
      sols = foldl' sumAllocs emptyAllocSolution psols
  in return $ annotateSolution sols

tryAlloc _  _ _    (Left []) = fail "No online nodes"
tryAlloc nl _ inst (Left all_nodes) =
  let sols = foldl' (\cstate ->
                       concatAllocs cstate . allocateOnSingle nl inst
                    ) emptyAllocSolution all_nodes
  in return $ annotateSolution sols

-- | Given a group/result, describe it as a nice (list of) messages.
solutionDescription :: (Group.Group, Result AllocSolution)
                    -> [String]
solutionDescription (grp, result) =
  case result of
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
    Bad message -> [printf "Group %s: error %s" gname message]
  where gname = Group.name grp
        pol = allocPolicyToRaw (Group.allocPolicy grp)

-- | From a list of possibly bad and possibly empty solutions, filter
-- only the groups with a valid result. Note that the result will be
-- reversed compared to the original list.
filterMGResults :: [(Group.Group, Result AllocSolution)]
                -> [(Group.Group, AllocSolution)]
filterMGResults = foldl' fn []
  where unallocable = not . Group.isAllocable
        fn accu (grp, rasol) =
          case rasol of
            Bad _ -> accu
            Ok sol | isNothing (asSolution sol) -> accu
                   | unallocable grp -> accu
                   | otherwise -> (grp, sol):accu

-- | Sort multigroup results based on policy and score.
sortMGResults :: [(Group.Group, AllocSolution)]
              -> [(Group.Group, AllocSolution)]
sortMGResults sols =
  let extractScore (_, _, _, x) = x
      solScore (grp, sol) = (Group.allocPolicy grp,
                             (extractScore . fromJust . asSolution) sol)
  in sortBy (comparing solScore) sols

-- | Removes node groups which can't accommodate the instance.
filterValidGroups :: [(Group.Group, (Node.List, Instance.List))]
                  -> Instance.Instance
                  -> ([(Group.Group, (Node.List, Instance.List))], [String])
filterValidGroups [] _ = ([], [])
filterValidGroups (ng:ngs) inst =
  let (valid_ngs, msgs) = filterValidGroups ngs inst
      hasNetwork nic = case Nic.network nic of
        Just net -> net `elem` Group.networks (fst ng)
        Nothing -> True
      hasRequiredNetworks = all hasNetwork (Instance.nics inst)
  in if hasRequiredNetworks
      then (ng:valid_ngs, msgs)
      else (valid_ngs,
            ("group " ++ Group.name (fst ng) ++
             " is not connected to a network required by instance " ++
             Instance.name inst):msgs)

-- | Finds the best group for an instance on a multi-group cluster.
--
-- Only solutions in @preferred@ and @last_resort@ groups will be
-- accepted as valid, and additionally if the allowed groups parameter
-- is not null then allocation will only be run for those group
-- indices.
findBestAllocGroup :: Group.List           -- ^ The group list
                   -> Node.List            -- ^ The node list
                   -> Instance.List        -- ^ The instance list
                   -> Maybe [Gdx]          -- ^ The allowed groups
                   -> Instance.Instance    -- ^ The instance to allocate
                   -> Int                  -- ^ Required number of nodes
                   -> Result (Group.Group, AllocSolution, [String])
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
  let groups_by_idx = splitCluster mgnl mgil
      groups = map (\(gid, d) -> (Container.find gid mggl, d)) groups_by_idx
      groups' = maybe groups
                (\gs -> filter ((`elem` gs) . Group.idx . fst) groups)
                allowed_gdxs
      (groups'', filter_group_msgs) = filterValidGroups groups' inst
      sols = map (\(gr, (nl, il)) ->
                   (gr, genAllocNodes mggl nl cnt False >>=
                        tryAlloc nl il inst))
             groups''::[(Group.Group, Result AllocSolution)]
      all_msgs = filter_group_msgs ++ concatMap solutionDescription sols
      goodSols = filterMGResults sols
      sortedSols = sortMGResults goodSols
  in case sortedSols of
       [] -> Bad $ if null groups'
                     then "no groups for evacuation: allowed groups were " ++
                          show allowed_gdxs ++ ", all groups: " ++
                          show (map fst groups)
                     else intercalate ", " all_msgs
       (final_group, final_sol):_ -> return (final_group, final_sol, all_msgs)

-- | Try to allocate an instance on a multi-group cluster.
tryMGAlloc :: Group.List           -- ^ The group list
           -> Node.List            -- ^ The node list
           -> Instance.List        -- ^ The instance list
           -> Instance.Instance    -- ^ The instance to allocate
           -> Int                  -- ^ Required number of nodes
           -> Result AllocSolution -- ^ Possible solution list
tryMGAlloc mggl mgnl mgil inst cnt = do
  (best_group, solution, all_msgs) <-
      findBestAllocGroup mggl mgnl mgil Nothing inst cnt
  let group_name = Group.name best_group
      selmsg = "Selected group: " ++ group_name
  return $ solution { asLog = selmsg:all_msgs }

-- | Calculate the new instance list after allocation solution.
updateIl :: Instance.List           -- ^ The original instance list
         -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
         -> Instance.List           -- ^ The updated instance list
updateIl il Nothing = il
updateIl il (Just (_, xi, _, _)) = Container.add (Container.size il) xi il

-- | Extract the new node list from the allocation solution.
extractNl :: Node.List               -- ^ The original node list
          -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
          -> Node.List               -- ^ The new node list
extractNl nl Nothing = nl
extractNl _ (Just (xnl, _, _, _)) = xnl

-- | Try to allocate a list of instances on a multi-group cluster.
allocList :: Group.List                  -- ^ The group list
          -> Node.List                   -- ^ The node list
          -> Instance.List               -- ^ The instance list
          -> [(Instance.Instance, Int)]  -- ^ The instances to allocate
          -> AllocSolutionList           -- ^ Possible solution list
          -> Result (Node.List, Instance.List,
                     AllocSolutionList)  -- ^ The final solution list
allocList _  nl il [] result = Ok (nl, il, result)
allocList gl nl il ((xi, xicnt):xies) result = do
  ares <- tryMGAlloc gl nl il xi xicnt
  let sol = asSolution ares
      nl' = extractNl nl sol
      il' = updateIl il sol
  allocList gl nl' il' xies ((xi, ares):result)

-- | Function which fails if the requested mode is change secondary.
--
-- This is useful since except DRBD, no other disk template can
-- execute change secondary; thus, we can just call this function
-- instead of always checking for secondary mode. After the call to
-- this function, whatever mode we have is just a primary change.
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
failOnSecondaryChange ChangeSecondary dt =
  fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++
         "' can't execute change secondary"
failOnSecondaryChange _ _ = return ()
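
-- E.g. a change-secondary request for a plain-disk instance fails
-- early (assuming 'diskTemplateToRaw' DTPlain yields "plain"):
--
-- > failOnSecondaryChange ChangeSecondary DTPlain
-- >   -- fails with: Instances with disk template 'plain' can't
-- >   -- execute change secondary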

-- | Run evacuation for a single instance.
--
-- /Note:/ this function should correctly execute both intra-group
-- evacuations (in all modes) and inter-group evacuations (in the
-- 'ChangeAll' mode). Of course, this requires that the correct list
-- of target nodes is passed.
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
                 -> Instance.List     -- ^ Instance list (cluster-wide)
                 -> EvacMode          -- ^ The evacuation mode
                 -> Instance.Instance -- ^ The instance to be evacuated
                 -> Gdx               -- ^ The group we're targeting
                 -> [Ndx]             -- ^ The list of available nodes
                                      -- for allocation
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTDiskless})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

nodeEvacInstance _ _ _ (Instance.Instance
                        {Instance.diskTemplate = DTPlain}) _ _ =
                  fail "Instances of type plain cannot be relocated"

nodeEvacInstance _ _ _ (Instance.Instance
                        {Instance.diskTemplate = DTFile}) _ _ =
                  fail "Instances of type file cannot be relocated"

nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTSharedFile})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTBlock})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTRbd})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTExt})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTGluster})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

nodeEvacInstance nl il ChangePrimary
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
                 _ _ =
  do
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
    let idx = Instance.idx inst
        il' = Container.add idx inst' il
        ops = iMoveToJob nl' il' idx Failover
    return (nl', il', ops)

nodeEvacInstance nl il ChangeSecondary
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
                 gdx avail_nodes =
  evacOneNodeOnly nl il inst gdx avail_nodes

-- The algorithm for ChangeAll is as follows:
--
-- * generate all (primary, secondary) node pairs for the target groups
-- * for each pair, execute the needed moves (r:s, f, r:s) and compute
--   the final node list state and group score
-- * select the best choice via a foldl that uses the same Either
--   String solution as the ChangeSecondary mode
nodeEvacInstance nl il ChangeAll
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
                 gdx avail_nodes =
  do
    let no_nodes = Left "no nodes available"
        node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
    (nl', il', ops, _) <-
        annotateResult "Can't find any good nodes for relocation" .
        eitherToResult $
        foldl'
        (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
                          Bad msg ->
                              case accu of
                                Right _ -> accu
                                -- we don't need more details (which
                                -- nodes, etc.) as we only selected
                                -- this group if we can allocate on
                                -- it, hence failures will not
                                -- propagate out of this fold loop
                                Left _ -> Left $ "Allocation failed: " ++ msg
                          Ok result@(_, _, _, new_cv) ->
                              let new_accu = Right result in
                              case accu of
                                Left _ -> new_accu
                                Right (_, _, _, old_cv) ->
                                    if old_cv < new_cv
                                    then accu
                                    else new_accu
        ) no_nodes node_pairs

    return (nl', il', ops)

-- | Generic function for changing one node of an instance.
--
-- This is similar to 'nodeEvacInstance' but will be used in a few of
-- its sub-patterns. It folds the inner function 'evacOneNodeInner'
-- over the list of available nodes, which results in the best choice
-- for relocation.
evacOneNodeOnly :: Node.List         -- ^ The node list (cluster-wide)
                -> Instance.List     -- ^ Instance list (cluster-wide)
                -> Instance.Instance -- ^ The instance to be evacuated
                -> Gdx               -- ^ The group we're targeting
                -> [Ndx]             -- ^ The list of available nodes
                                     -- for allocation
                -> Result (Node.List, Instance.List, [OpCodes.OpCode])
evacOneNodeOnly nl il inst gdx avail_nodes = do
  op_fn <- case Instance.mirrorType inst of
             MirrorNone -> Bad "Can't relocate/evacuate non-mirrored instances"
             MirrorInternal -> Ok ReplaceSecondary
             MirrorExternal -> Ok FailoverToAny
  (nl', inst', _, ndx) <- annotateResult "Can't find any good node" .
                          eitherToResult $
                          foldl' (evacOneNodeInner nl inst gdx op_fn)
                          (Left "no nodes available") avail_nodes
  let idx = Instance.idx inst
      il' = Container.add idx inst' il
      ops = iMoveToJob nl' il' idx (op_fn ndx)
  return (nl', il', ops)

-- | Inner fold function for changing one node of an instance.
--
-- Depending on the instance disk template, this will either change
-- the secondary (for DRBD) or the primary node (for shared
-- storage). However, the operation is generic otherwise.
--
-- The running solution is either a @Left String@, which means we
-- don't have yet a working solution, or a @Right (...)@, which
-- represents a valid solution; it holds the modified node list, the
-- modified instance (after evacuation), the score of that solution,
-- and the new secondary node index.
evacOneNodeInner :: Node.List         -- ^ Cluster node list
                 -> Instance.Instance -- ^ Instance being evacuated
                 -> Gdx               -- ^ The group index of the instance
                 -> (Ndx -> IMove)    -- ^ Operation constructor
                 -> EvacInnerState    -- ^ Current best solution
                 -> Ndx               -- ^ Node we're evaluating as target
                 -> EvacInnerState    -- ^ New best solution
evacOneNodeInner nl inst gdx op_fn accu ndx =
  case applyMove nl inst (op_fn ndx) of
    Bad fm -> let fail_msg = "Node " ++ Container.nameOf nl ndx ++
                             " failed: " ++ show fm
              in either (const $ Left fail_msg) (const accu) accu
    Ok (nl', inst', _, _) ->
      let nodes = Container.elems nl'
          -- The fromJust below is ugly (it can fail nastily), but
          -- at this point we should not have any internal mismatches,
          -- and adding a monad here would be quite involved
          grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
          new_cv = compCVNodes grpnodes
          new_accu = Right (nl', inst', new_cv, ndx)
      in case accu of
           Left _ -> new_accu
           Right (_, _, old_cv, _) ->
             if old_cv < new_cv
               then accu
               else new_accu

    
1110
-- | Compute result of changing all nodes of a DRBD instance.
1111
--
1112
-- Given the target primary and secondary node (which might be in a
1113
-- different group or not), this function will 'execute' all the
1114
-- required steps and assuming all operations succceed, will return
1115
-- the modified node and instance lists, the opcodes needed for this
1116
-- and the new group score.
1117
evacDrbdAllInner :: Node.List         -- ^ Cluster node list
1118
                 -> Instance.List     -- ^ Cluster instance list
1119
                 -> Instance.Instance -- ^ The instance to be moved
1120
                 -> Gdx               -- ^ The target group index
1121
                                      -- (which can differ from the
1122
                                      -- current group of the
1123
                                      -- instance)
1124
                 -> (Ndx, Ndx)        -- ^ Tuple of new
1125
                                      -- primary\/secondary nodes
1126
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
1127
evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
1128
  let primary = Container.find (Instance.pNode inst) nl
1129
      idx = Instance.idx inst
1130
  -- if the primary is offline, then we first failover
1131
  (nl1, inst1, ops1) <-
1132
    if Node.offline primary
1133
      then do
1134
        (nl', inst', _, _) <-
1135
          annotateResult "Failing over to the secondary" .
1136
          opToResult $ applyMove nl inst Failover
1137
        return (nl', inst', [Failover])
1138
      else return (nl, inst, [])
1139
  let (o1, o2, o3) = (ReplaceSecondary t_pdx,
1140
                      Failover,
1141
                      ReplaceSecondary t_sdx)
1142
  -- we now need to execute a replace secondary to the future
1143
  -- primary node
1144
  (nl2, inst2, _, _) <-
1145
    annotateResult "Changing secondary to new primary" .
1146
    opToResult $
1147
    applyMove nl1 inst1 o1
1148
  let ops2 = o1:ops1
1149
  -- we now execute another failover, the primary stays fixed now
1150
  (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" .
1151
                        opToResult $ applyMove nl2 inst2 o2
1152
  let ops3 = o2:ops2
1153
  -- and finally another replace secondary, to the final secondary
1154
  (nl4, inst4, _, _) <-
1155
    annotateResult "Changing secondary to final secondary" .
1156
    opToResult $
1157
    applyMove nl3 inst3 o3
1158
  let ops4 = o3:ops3
1159
      il' = Container.add idx inst4 il
1160
      ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1161
  let nodes = Container.elems nl4
1162
      -- The fromJust below is ugly (it can fail nastily), but
1163
      -- at this point we should have any internal mismatches,
1164
      -- and adding a monad here would be quite involved
1165
      grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1166
      new_cv = compCVNodes grpnodes
1167
  return (nl4, il', ops, new_cv)

-- | Computes the nodes in a given group which are available for
-- allocation.
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
                    -> Gdx            -- ^ The group for which we
                                      -- query the nodes
                    -> Result [Ndx]   -- ^ List of available node indices
availableGroupNodes group_nodes excl_ndx gdx = do
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
                 Ok (lookup gdx group_nodes)
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
  return avail_nodes
1181

    
1182
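-- A small illustration with made-up indices:
--
-- > availableGroupNodes [(0, [1, 2, 3])] (IntSet.fromList [2]) 0
-- >   -- == Ok [1, 3]
-- > availableGroupNodes [(0, [1, 2, 3])] IntSet.empty 7
-- >   -- == Bad "Can't find group with index 7"
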
-- | Updates the evac solution with the results of an instance
-- evacuation.
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
                   -> Idx
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
                   -> (Node.List, Instance.List, EvacSolution)
updateEvacSolution (nl, il, es) idx (Bad msg) =
  (nl, il, es { esFailed = (idx, msg):esFailed es})
updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
  (nl, il, es { esMoved = new_elem:esMoved es
              , esOpCodes = opcodes:esOpCodes es })
    where inst = Container.find idx il
          new_elem = (idx,
                      instancePriGroup nl inst,
                      Instance.allNodes inst)

-- | Node-evacuation IAllocator mode main function.
tryNodeEvac :: Group.List    -- ^ The cluster groups
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
            -> Instance.List -- ^ Instance list (cluster-wide)
            -> EvacMode      -- ^ The evacuation mode
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
            -> Result (Node.List, Instance.List, EvacSolution)
tryNodeEvac _ ini_nl ini_il mode idxs =
  let evac_ndx = nodesToEvacuate ini_il mode idxs
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
      excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
                                           (Container.elems nl))) $
                  splitCluster ini_nl ini_il
      (fin_nl, fin_il, esol) =
        foldl' (\state@(nl, il, _) inst ->
                  let gdx = instancePriGroup nl inst
                      pdx = Instance.pNode inst in
                  updateEvacSolution state (Instance.idx inst) $
                  availableGroupNodes group_ndx
                    (IntSet.insert pdx excl_ndx) gdx >>=
                      nodeEvacInstance nl il mode inst gdx
               )
        (ini_nl, ini_il, emptyEvacSolution)
        (map (`Container.find` ini_il) idxs)
  in return (fin_nl, fin_il, reverseEvacSolution esol)

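-- A hypothetical invocation (@gl@, @nl@ and @il@ being group, node and
-- instance lists loaded elsewhere; the instance indices are made up):
--
-- > case tryNodeEvac gl nl il ChangeAll [3, 7] of
-- >   Ok (_, _, esol) -> (esMoved esol, esFailed esol)
-- >   Bad msg -> error msg
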
-- | Change-group IAllocator mode main function.
--
-- This is very similar to 'tryNodeEvac', the only difference is that
-- we don't choose as target group the current instance group, but
-- instead:
--
--   1. at the start of the function, we compute which are the target
--   groups; either no groups were passed in, in which case we choose
--   all groups out of which we don't evacuate instances, or there were
--   some groups passed, in which case we use those
--
--   2. for each instance, we use 'findBestAllocGroup' to choose the
--   best group to hold the instance, and then we do what
--   'tryNodeEvac' does, except for this group instead of the current
--   instance group.
--
-- Note that the correct behaviour of this function relies on the
-- function 'nodeEvacInstance' to be able to do correctly both
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
tryChangeGroup :: Group.List    -- ^ The cluster groups
               -> Node.List     -- ^ The node list (cluster-wide)
               -> Instance.List -- ^ Instance list (cluster-wide)
               -> [Gdx]         -- ^ Target groups; if empty, any
                                -- groups not being evacuated
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
               -> Result (Node.List, Instance.List, EvacSolution)
tryChangeGroup gl ini_nl ini_il gdxs idxs =
  let evac_gdxs = nub $ map (instancePriGroup ini_nl .
                             flip Container.find ini_il) idxs
      target_gdxs = (if null gdxs
                       then Container.keys gl
                       else gdxs) \\ evac_gdxs
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
      excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
                                           (Container.elems nl))) $
                  splitCluster ini_nl ini_il
      (fin_nl, fin_il, esol) =
        foldl' (\state@(nl, il, _) inst ->
                  let solution = do
                        let ncnt = Instance.requiredNodes $
                                   Instance.diskTemplate inst
                        (grp, _, _) <- findBestAllocGroup gl nl il
                                       (Just target_gdxs) inst ncnt
                        let gdx = Group.idx grp
                        av_nodes <- availableGroupNodes group_ndx
                                    excl_ndx gdx
                        nodeEvacInstance nl il ChangeAll inst gdx av_nodes
                  in updateEvacSolution state (Instance.idx inst) solution
               )
        (ini_nl, ini_il, emptyEvacSolution)
        (map (`Container.find` ini_il) idxs)
  in return (fin_nl, fin_il, reverseEvacSolution esol)

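-- For example (hypothetical indices), to move instances 3 and 7 into
-- whichever of groups 1 and 2 scores best:
--
-- > tryChangeGroup gl nl il [1, 2] [3, 7]
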
-- | Standard-sized allocation method.
--
-- This places instances of the same size on the cluster until we're
-- out of space. The result will be a list of identically-sized
-- instances.
iterateAlloc :: AllocMethod
iterateAlloc nl il limit newinst allocnodes ixes cstats =
  let depth = length ixes
      newname = printf "new-%d" depth::String
      newidx = Container.size il
      newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
      newlimit = fmap (flip (-) 1) limit
  in case tryAlloc nl il newi2 allocnodes of
       Bad s -> Bad s
       Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
         let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
         case sols3 of
           Nothing -> newsol
           Just (xnl, xi, _, _) ->
             if limit == Just 0
               then newsol
               else iterateAlloc xnl (Container.add newidx xi il)
                      newlimit newinst allocnodes (xi:ixes)
                      (totalResources xnl:cstats)

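-- A sketch of a typical call (@template_inst@ is an instance spec
-- built elsewhere), placing at most 10 copies and counting how many
-- actually fit:
--
-- > do allocnodes <- genAllocNodes gl nl 2 True
-- >    (_, _, _, placed, _) <- iterateAlloc nl il (Just 10)
-- >                                         template_inst allocnodes [] []
-- >    return (length placed)
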
-- | Predicate whether shrinking a single resource can lead to a valid
-- allocation.
sufficesShrinking :: (Instance.Instance -> AllocSolution) -> Instance.Instance
                     -> FailMode -> Maybe Instance.Instance
sufficesShrinking allocFn inst fm =
  -- Shrink the instance repeatedly along the failing resource and
  -- pair each candidate with its trial allocation; consider
  -- candidates for as long as they still fail on 'fm' or already
  -- have a solution, then return the first one that allocates.
  case dropWhile (isNothing . asSolution . fst)
       . takeWhile (liftA2 (||) (elem fm . asFailures . fst)
                                (isJust . asSolution . fst))
       . map (allocFn &&& id) $
       iterateOk (`Instance.shrinkByType` fm) inst
  of x:_ -> Just . snd $ x
     _ -> Nothing

-- | Tiered allocation method.
--
-- This places instances on the cluster, and decreases the spec until
-- we can allocate again. The result will be a list of decreasing
-- instance specs.
tieredAlloc :: AllocMethod
tieredAlloc nl il limit newinst allocnodes ixes cstats =
  case iterateAlloc nl il limit newinst allocnodes ixes cstats of
    Bad s -> Bad s
    Ok (errs, nl', il', ixes', cstats') ->
      let newsol = Ok (errs, nl', il', ixes', cstats')
          ixes_cnt = length ixes'
          (stop, newlimit) = case limit of
                               Nothing -> (False, Nothing)
                               Just n -> (n <= ixes_cnt,
                                            Just (n - ixes_cnt))
          sortedErrs = map fst $ sortBy (comparing snd) errs
          suffShrink = sufficesShrinking (fromMaybe emptyAllocSolution
                                          . flip (tryAlloc nl' il') allocnodes)
                       newinst
          bigSteps = filter isJust . map suffShrink . reverse $ sortedErrs
      in if stop then newsol else
          case bigSteps of
            Just newinst':_ -> tieredAlloc nl' il' newlimit
                               newinst' allocnodes ixes' cstats'
            _ -> case Instance.shrinkByType newinst . last $ sortedErrs of
                   Bad _ -> newsol
                   Ok newinst' -> tieredAlloc nl' il' newlimit
                                  newinst' allocnodes ixes' cstats'

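-- The call shape is the same as for 'iterateAlloc'; only the result
-- differs, e.g. a hypothetical run without a limit:
--
-- > tieredAlloc nl il Nothing template_inst allocnodes [] []
--
-- might place a few instances at the full spec, then progressively
-- smaller ones (shrinking along the failing resources) until no
-- shrinking helps.
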
-- * Formatting functions

-- | Given the original and final nodes, computes the relocation description.
computeMoves :: Instance.Instance -- ^ The instance to be moved
             -> String -- ^ The instance name
             -> IMove  -- ^ The move being performed
             -> String -- ^ New primary
             -> String -- ^ New secondary
             -> (String, [String])
                -- ^ Tuple of moves and commands list; the moves string
                -- contains either @/f/@ for failover or @/r:name/@ for
                -- replace secondary, while the command list holds
                -- gnt-instance commands (without that prefix),
                -- e.g. \"@failover instance1@\"
computeMoves i inam mv c d =
  case mv of
    Failover -> ("f", [mig])
    FailoverToAny _ -> (printf "fa:%s" c, [mig_any])
    FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
    ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
    ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
    ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
  where morf = if Instance.isRunning i then "migrate" else "failover"
        mig = printf "%s -f %s" morf inam::String
        mig_any = printf "%s -f -n %s %s" morf c inam::String
        rep n = printf "replace-disks -n %s %s" n inam::String

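-- For a running instance @inst1@ undergoing a 'ReplacePrimary' move to
-- a node aliased @nodeC@ (names invented), this computes:
--
-- > ("f r:nodeC f", [ "migrate -f inst1"
-- >                 , "replace-disks -n nodeC inst1"
-- >                 , "migrate -f inst1" ])
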
-- | Converts a placement to string format.
printSolutionLine :: Node.List     -- ^ The node list
                  -> Instance.List -- ^ The instance list
                  -> Int           -- ^ Maximum node name length
                  -> Int           -- ^ Maximum instance name length
                  -> Placement     -- ^ The current placement
                  -> Int           -- ^ The index of the placement in
                                   -- the solution
                  -> (String, [String])
printSolutionLine nl il nmlen imlen plc pos =
  let pmlen = (2*nmlen + 1)
      (i, p, s, mv, c) = plc
      old_sec = Instance.sNode inst
      inst = Container.find i il
      inam = Instance.alias inst
      npri = Node.alias $ Container.find p nl
      nsec = Node.alias $ Container.find s nl
      opri = Node.alias $ Container.find (Instance.pNode inst) nl
      osec = Node.alias $ Container.find old_sec nl
      (moves, cmds) = computeMoves inst inam mv npri nsec
      -- FIXME: this should check instead/also the disk template
      ostr = if old_sec == Node.noSecondary
               then printf "%s" opri::String
               else printf "%s:%s" opri osec::String
      nstr = if s == Node.noSecondary
               then printf "%s" npri::String
               else printf "%s:%s" npri nsec::String
  in (printf "  %3d. %-*s %-*s => %-*s %12.8f a=%s"
      pos imlen inam pmlen ostr pmlen nstr c moves,
      cmds)

-- | Return the instance and involved nodes in an instance move.
--
-- Note that the output list length can vary, and is not required nor
-- guaranteed to be of any specific length.
involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
                               -- the instance from its index; note
                               -- that this /must/ be the original
                               -- instance list, so that we can
                               -- retrieve the old nodes
              -> Placement     -- ^ The placement we're investigating,
                               -- containing the new nodes and
                               -- instance index
              -> [Ndx]         -- ^ Resulting list of node indices
involvedNodes il plc =
  let (i, np, ns, _, _) = plc
      inst = Container.find i il
  in nub . filter (>= 0) $ [np, ns] ++ Instance.allNodes inst

-- | From two adjacent cluster tables get the list of moves that
-- transitions from one to the other.
getMoves :: (Table, Table) -> [MoveJob]
getMoves (Table _ initial_il _ initial_plc, Table final_nl _ _ final_plc) =
  let
    plctoMoves (plc@(idx, p, s, mv, _)) =
      let inst = Container.find idx initial_il
          inst_name = Instance.name inst
          affected = involvedNodes initial_il plc
          np = Node.alias $ Container.find p final_nl
          ns = Node.alias $ Container.find s final_nl
          (_, cmds) = computeMoves inst inst_name mv np ns
      in (affected, idx, mv, cmds)
  in map plctoMoves . reverse . drop (length initial_plc) $ reverse final_plc

-- | Inner function for splitJobs, that either appends the next job to
-- the current jobset, or starts a new jobset.
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
  | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
  | otherwise = ([n]:cjs, ndx)

-- | Break a list of moves into independent groups. Note that this
-- will reverse the order of jobs.
splitJobs :: [MoveJob] -> [JobSet]
splitJobs = fst . foldl mergeJobs ([], [])

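-- The grouping criterion is node overlap: consecutive moves touching
-- disjoint node sets share a jobset and can run in parallel. With
-- invented node lists (other fields elided):
--
-- > splitJobs [([1,2], ...), ([3,4], ...), ([2,5], ...)]
--
-- puts the first two moves in one jobset, while the third, which
-- reuses node 2, starts a new one.
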
-- | Given a list of commands, prefix them with @gnt-instance@ and
-- also beautify the display a little.
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
formatJob jsn jsl (sn, (_, _, _, cmds)) =
  let out =
        printf "  echo job %d/%d" jsn sn:
        printf "  check":
        map ("  gnt-instance " ++) cmds
  in if sn == 1
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
       else out

-- | Given a list of commands, prefix them with @gnt-instance@ and
-- also beautify the display a little.
formatCmds :: [JobSet] -> String
formatCmds =
  unlines .
  concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
                           (zip [1..] js)) .
  zip [1..]

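-- For a single jobset holding one failover job, the output looks
-- roughly like (instance name invented):
--
-- > echo jobset 1, 1 jobs
-- >   echo job 1/1
-- >   check
-- >   gnt-instance failover -f inst1
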
-- | Print the node list.
printNodes :: Node.List -> [String] -> String
printNodes nl fs =
  let fields = case fs of
                 [] -> Node.defaultFields
                 "+":rest -> Node.defaultFields ++ rest
                 _ -> fs
      snl = sortBy (comparing Node.idx) (Container.elems nl)
      (header, isnum) = unzip $ map Node.showHeader fields
  in printTable "" header (map (Node.list fields) snl) isnum

-- | Print the instance list.
printInsts :: Node.List -> Instance.List -> String
printInsts nl il =
  let sil = sortBy (comparing Instance.idx) (Container.elems il)
      helper inst = [ if Instance.isRunning inst then "R" else " "
                    , Instance.name inst
                    , Container.nameOf nl (Instance.pNode inst)
                    , let sdx = Instance.sNode inst
                      in if sdx == Node.noSecondary
                           then ""
                           else Container.nameOf nl sdx
                    , if Instance.autoBalance inst then "Y" else "N"
                    , printf "%3d" $ Instance.vcpus inst
                    , printf "%5d" $ Instance.mem inst
                    , printf "%5d" $ Instance.dsk inst `div` 1024
                    , printf "%5.3f" lC
                    , printf "%5.3f" lM
                    , printf "%5.3f" lD
                    , printf "%5.3f" lN
                    ]
          where DynUtil lC lM lD lN = Instance.util inst
      header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
               , "vcpu", "mem", "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
      isnum = False:False:False:False:False:repeat True
  in printTable "" header (map helper sil) isnum

-- | Shows statistics for a given node list.
printStats :: String -> Node.List -> String
printStats lp nl =
  let dcvs = compDetailedCV $ Container.elems nl
      (weights, names) = unzip detailedCVInfo
      hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
      header = [ "Field", "Value", "Weight" ]
      formatted = map (\(w, h, val) ->
                         [ h
                         , printf "%.8f" val
                         , printf "x%.2f" w
                         ]) hd
  in printTable lp header formatted $ False:repeat True

-- | Convert a placement into a list of OpCodes (basically a job).
iMoveToJob :: Node.List        -- ^ The node list; only used for node
                               -- names, so any version is good
                               -- (before or after the operation)
           -> Instance.List    -- ^ The instance list; also used for
                               -- names only
           -> Idx              -- ^ The index of the instance being
                               -- moved
           -> IMove            -- ^ The actual move to be described
           -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
                               -- the given move
iMoveToJob nl il idx move =
  let inst = Container.find idx il
      iname = Instance.name inst
      lookNode n = case mkNonEmpty (Container.nameOf nl n) of
                     -- FIXME: convert htools codebase to non-empty strings
                     Bad msg -> error $ "Empty node name for idx " ++
                                show n ++ ": " ++ msg ++ "??"
                     Ok ne -> Just ne
      opF = OpCodes.OpInstanceMigrate
              { OpCodes.opInstanceName        = iname
              , OpCodes.opInstanceUuid        = Nothing
              , OpCodes.opMigrationMode       = Nothing -- default
              , OpCodes.opOldLiveMode         = Nothing -- default as well
              , OpCodes.opTargetNode          = Nothing -- this is drbd
              , OpCodes.opTargetNodeUuid      = Nothing
              , OpCodes.opAllowRuntimeChanges = False
              , OpCodes.opIgnoreIpolicy       = False
              , OpCodes.opMigrationCleanup    = False
              , OpCodes.opIallocator          = Nothing
              , OpCodes.opAllowFailover       = True }
      opFA n = opF { OpCodes.opTargetNode = lookNode n } -- not drbd
      opR n = OpCodes.OpInstanceReplaceDisks
                { OpCodes.opInstanceName     = iname
                , OpCodes.opInstanceUuid     = Nothing
                , OpCodes.opEarlyRelease     = False
                , OpCodes.opIgnoreIpolicy    = False
                , OpCodes.opReplaceDisksMode = OpCodes.ReplaceNewSecondary
                , OpCodes.opReplaceDisksList = []
                , OpCodes.opRemoteNode       = lookNode n
                , OpCodes.opRemoteNodeUuid   = Nothing
                , OpCodes.opIallocator       = Nothing
                }
  in case move of
       Failover -> [ opF ]
       FailoverToAny np -> [ opFA np ]
       ReplacePrimary np -> [ opF, opR np, opF ]
       ReplaceSecondary ns -> [ opR ns ]
       ReplaceAndFailover np -> [ opR np, opF ]
       FailoverAndReplace ns -> [ opF, opR ns ]

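-- A hypothetical use, turning a single move into its job:
--
-- > iMoveToJob nl il 3 (ReplaceSecondary 5)
-- >   -- a one-element list holding an OpInstanceReplaceDisks opcode
-- >   -- whose remote node is the name of node index 5
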
-- * Node group functions

-- | Computes the group of an instance.
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
instanceGroup nl i =
  let sidx = Instance.sNode i
      pnode = Container.find (Instance.pNode i) nl
      snode = if sidx == Node.noSecondary
              then pnode
              else Container.find sidx nl
      pgroup = Node.group pnode
      sgroup = Node.group snode
  in if pgroup /= sgroup
       then fail ("Instance placed across two node groups, primary " ++
                  show pgroup ++ ", secondary " ++ show sgroup)
       else return pgroup

-- | Computes the group of an instance per the primary node.
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
instancePriGroup nl i =
  let pnode = Container.find (Instance.pNode i) nl
  in Node.group pnode

-- | Compute the list of badly allocated instances (split across node
-- groups).
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
findSplitInstances nl =
  filter (not . isOk . instanceGroup nl) . Container.elems

-- | Splits a cluster into the component node groups.
splitCluster :: Node.List -> Instance.List ->
                [(Gdx, (Node.List, Instance.List))]
splitCluster nl il =
  let ngroups = Node.computeGroups (Container.elems nl)
  in map (\(gdx, nodes) ->
           let nidxs = map Node.idx nodes
               nodes' = zip nidxs nodes
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
           in (gdx, (Container.fromList nodes', instances))) ngroups

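-- Note that an instance lands in the group of its /primary/ node, so
-- per-group sizes can be inspected with e.g.:
--
-- > [ (gdx, Container.size gnl, Container.size gil)
-- > | (gdx, (gnl, gil)) <- splitCluster nl il ]
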
-- | Compute the list of nodes that are to be evacuated, given a list
-- of instances and an evacuation mode.
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
                -> EvacMode      -- ^ The evacuation mode we're using
                -> [Idx]         -- ^ List of instance indices being evacuated
                -> IntSet.IntSet -- ^ Set of node indices
nodesToEvacuate il mode =
  IntSet.delete Node.noSecondary .
  foldl' (\ns idx ->
            let i = Container.find idx il
                pdx = Instance.pNode i
                sdx = Instance.sNode i
                dt = Instance.diskTemplate i
                withSecondary = case dt of
                                  DTDrbd8 -> IntSet.insert sdx ns
                                  _ -> ns
            in case mode of
                 ChangePrimary   -> IntSet.insert pdx ns
                 ChangeSecondary -> withSecondary
                 ChangeAll       -> IntSet.insert pdx withSecondary
         ) IntSet.empty
         ) IntSet.empty