Statistics
| Branch: | Tag: | Revision:

root / htools / Ganeti / HTools / Program / Hcheck.hs @ 81bcbbd3

History | View | Annotate | Download (12.3 kB)

1
{-| Cluster checker.
2

    
3
-}
4

    
5
{-
6

    
7
Copyright (C) 2012 Google Inc.
8

    
9
This program is free software; you can redistribute it and/or modify
10
it under the terms of the GNU General Public License as published by
11
the Free Software Foundation; either version 2 of the License, or
12
(at your option) any later version.
13

    
14
This program is distributed in the hope that it will be useful, but
15
WITHOUT ANY WARRANTY; without even the implied warranty of
16
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
General Public License for more details.
18

    
19
You should have received a copy of the GNU Gene52al Public License
20
along with this program; if not, write to the Free Software
21
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22
02110-1301, USA.
23

    
24
-}
25

    
26
module Ganeti.HTools.Program.Hcheck (main, options) where
27

    
28
import Control.Monad
29
import Data.List (transpose)
30
import System.Exit
31
import System.IO
32
import Text.Printf (printf)
33

    
34
import qualified Ganeti.HTools.Container as Container
35
import qualified Ganeti.HTools.Cluster as Cluster
36
import qualified Ganeti.HTools.Group as Group
37
import qualified Ganeti.HTools.Node as Node
38
import qualified Ganeti.HTools.Instance as Instance
39

    
40
import qualified Ganeti.HTools.Program.Hbal as Hbal
41

    
42
import Ganeti.HTools.CLI
43
import Ganeti.HTools.ExtLoader
44
import Ganeti.HTools.Loader
45
import Ganeti.HTools.Types
46

    
47
-- | Options list and functions.
48
options :: [OptType]
49
options =
50
  [ oDataFile
51
  , oDiskMoves
52
  , oDynuFile
53
  , oEvacMode
54
  , oExInst
55
  , oExTags
56
  , oIAllocSrc
57
  , oInstMoves
58
  , oLuxiSocket
59
  , oMachineReadable
60
  , oMaxCpu
61
  , oMaxSolLength
62
  , oMinDisk
63
  , oMinGain
64
  , oMinGainLim
65
  , oMinScore
66
  , oNoSimulation
67
  , oOfflineNode
68
  , oQuiet
69
  , oRapiMaster
70
  , oSelInst
71
  , oShowHelp
72
  , oShowVer
73
  , oVerbose
74
  ]
75

    
76
-- | Check phase - are we before (initial) or after rebalance.
77
data Phase = Initial
78
           | Rebalanced
79

    
80
-- | Level of presented statistics.
81
data Level = GroupLvl
82
           | ClusterLvl
83

    
84
-- | Prefix for machine readable names.
85
htcPrefix :: String
86
htcPrefix = "HCHECK"
87

    
88
-- | Data showed both per group and per cluster.
89
commonData :: [(String, String)]
90
commonData =[ ("N1_FAIL", "Nodes not N+1 happy")
91
            , ("CONFLICT_TAGS", "Nodes with conflicting instances")
92
            , ("OFFLINE_PRI", "Instances with primary on an offline node")
93
            , ("OFFLINE_SEC", "Instances with seondary on an offline node")
94
            ]
95

    
96
-- | Data showed per group.
97
groupData :: [(String, String)]
98
groupData = commonData ++ [("SCORE", "Group score")]
99

    
100
-- | Data showed per cluster.
101
clusterData :: [(String, String)]
102
clusterData = commonData ++
103
              [ ("NEED_REBALANCE", "Cluster is not healthy")
104
              , ("CAN_REBALANCE", "Possible to run rebalance")
105
              ]
106

    
107
-- | Phase-specific prefix for machine readable version.
108
phasePrefix :: Phase -> String
109
phasePrefix Initial = "INIT"
110
phasePrefix Rebalanced = "FINAL"
111

    
112
-- | Level-specific prefix for machine readable version.
113
levelPrefix :: Level -> String
114
levelPrefix GroupLvl = "GROUP"
115
levelPrefix ClusterLvl = "CLUSTER"
116

    
117
-- | Machine-readable keys to show depending on given level.
118
keysData :: Level -> [String]
119
keysData GroupLvl = map fst groupData
120
keysData ClusterLvl = map fst clusterData
121

    
122
-- | Description of phases for human readable version.
123
phaseDescr :: Phase -> String
124
phaseDescr Initial = "initially"
125
phaseDescr Rebalanced = "after rebalancing"
126

    
127
-- | Description to show depending on given level.
128
descrData :: Level -> [String]
129
descrData GroupLvl = map snd groupData
130
descrData ClusterLvl = map snd clusterData
131

    
132
-- | Human readable prefix for statistics.
133
phaseLevelDescr :: Phase -> Level -> Maybe String -> String
134
phaseLevelDescr phase GroupLvl (Just name) =
135
    printf "Statistics for group %s %s\n" name $ phaseDescr phase
136
phaseLevelDescr phase GroupLvl Nothing =
137
    printf "Statistics for group %s\n" $ phaseDescr phase
138
phaseLevelDescr phase ClusterLvl _ =
139
    printf "Cluster statistics %s\n" $ phaseDescr phase
140

    
141
-- | Format a list of key, value as a shell fragment.
142
printKeysHTC :: [(String, String)] -> IO ()
143
printKeysHTC = printKeys htcPrefix
144

    
145
-- | Prepare string from boolean value.
146
printBool :: Bool    -- ^ Whether the result should be machine readable
147
          -> Bool    -- ^ Value to be converted to string
148
          -> String
149
printBool True True = "1"
150
printBool True False = "0"
151
printBool False b = show b
152

    
153
-- | Print mapping from group idx to group uuid (only in machine
154
-- readable mode).
155
printGroupsMappings :: Group.List -> IO ()
156
printGroupsMappings gl = do
157
    let extract_vals = \g -> (printf "GROUP_UUID_%d" $ Group.idx g :: String,
158
                              Group.uuid g)
159
        printpairs = map extract_vals (Container.elems gl)
160
    printKeysHTC printpairs
161

    
162
-- | Prepare a single key given a certain level and phase of simulation.
163
prepareKey :: Level -> Phase -> Maybe String -> String -> String
164
prepareKey level phase Nothing suffix =
165
  printf "%s_%s_%s" (phasePrefix phase) (levelPrefix level) suffix
166
prepareKey level phase (Just idx) suffix =
167
  printf "%s_%s_%s_%s" (phasePrefix phase) (levelPrefix level) idx suffix
168

    
169
-- | Print all the statistics for given level and phase.
170
printStats :: Int            -- ^ Verbosity level
171
           -> Bool           -- ^ If the output should be machine readable
172
           -> Level          -- ^ Level on which we are printing
173
           -> Phase          -- ^ Current phase of simulation
174
           -> [String]       -- ^ Values to print
175
           -> Maybe String   -- ^ Additional data for groups
176
           -> IO ()
177
printStats _ True level phase values gidx = do
178
  let keys = map (prepareKey level phase gidx) (keysData level)
179
  printKeysHTC $ zip keys values
180

    
181
printStats verbose False level phase values name = do
182
  let prefix = phaseLevelDescr phase level name
183
      descr = descrData level
184
  unless (verbose == 0) $ do
185
    printf "\n%s" prefix :: IO ()
186
    mapM_ (\(a,b) -> printf "    %s: %s\n" a b) (zip descr values)
187

    
188
-- | Extract name or idx from group.
189
extractGroupData :: Bool -> Group.Group -> String
190
extractGroupData True grp = show $ Group.idx grp
191
extractGroupData False grp = Group.name grp
192

    
193
-- | Prepare values for group.
194
prepareGroupValues :: [Int] -> Double -> [String]
195
prepareGroupValues stats score =
196
  map show stats ++ [printf "%.8f" score]
197

    
198
-- | Prepare values for cluster.
199
prepareClusterValues :: Bool -> [Int] -> [Bool] -> [String]
200
prepareClusterValues machineread stats bstats =
201
  map show stats ++ map (printBool machineread) bstats
202

    
203
-- | Print all the statistics on a group level.
204
printGroupStats :: Int -> Bool -> Phase -> Group.Group -> [Int] -> Double
205
                -> IO ()
206
printGroupStats verbose machineread phase grp stats score = do
207
  let values = prepareGroupValues stats score
208
      extradata = extractGroupData machineread grp
209
  printStats verbose machineread GroupLvl phase values (Just extradata)
210

    
211
-- | Print all the statistics on a cluster (global) level.
212
printClusterStats :: Int -> Bool -> Phase -> [Int] -> Bool -> Bool -> IO ()
213
printClusterStats verbose machineread phase stats needhbal canhbal = do
214
  let values = prepareClusterValues machineread stats [needhbal, canhbal]
215
  printStats verbose machineread ClusterLvl phase values Nothing
216

    
217
-- | Check if any of cluster metrics is non-zero.
218
clusterNeedsRebalance :: [Int] -> Bool
219
clusterNeedsRebalance stats = sum stats > 0
220

    
221
{- | Check group for N+1 hapiness, conflicts of primaries on nodes and
222
instances residing on offline nodes.
223

    
224
-}
225
perGroupChecks :: Int -> Bool -> Phase -> Group.List
226
               -> (Gdx, (Node.List, Instance.List)) -> IO ([Int])
227
perGroupChecks verbose machineread phase gl (gidx, (nl, il)) = do
228
  let grp = Container.find gidx gl
229
      offnl = filter Node.offline (Container.elems nl)
230
      n1violated = length $ fst $ Cluster.computeBadItems nl il
231
      conflicttags = length $ filter (>0)
232
                     (map Node.conflictingPrimaries (Container.elems nl))
233
      offline_pri = sum . map length $ map Node.pList offnl
234
      offline_sec = length $ map Node.sList offnl
235
      score = Cluster.compCV nl
236
      groupstats = [ n1violated
237
                   , conflicttags
238
                   , offline_pri
239
                   , offline_sec
240
                   ]
241
  printGroupStats verbose machineread phase grp groupstats score
242
  return groupstats
243

    
244
-- | Use Hbal's iterateDepth to simulate group rebalance.
245
executeSimulation :: Options -> Cluster.Table -> Double
246
                  -> Gdx -> Node.List -> Instance.List
247
                  -> IO (Gdx, (Node.List, Instance.List))
248
executeSimulation opts ini_tbl min_cv gidx nl il = do
249
  let imlen = maximum . map (length . Instance.alias) $ Container.elems il
250
      nmlen = maximum . map (length . Node.alias) $ Container.elems nl
251

    
252
  (fin_tbl, _) <- Hbal.iterateDepth False ini_tbl
253
                                    (optMaxLength opts)
254
                                    (optDiskMoves opts)
255
                                    (optInstMoves opts)
256
                                    nmlen imlen [] min_cv
257
                                    (optMinGainLim opts) (optMinGain opts)
258
                                    (optEvacMode opts)
259

    
260
  let (Cluster.Table fin_nl fin_il _ _) = fin_tbl
261
  return (gidx, (fin_nl, fin_il))
262

    
263
-- | Simulate group rebalance if group's score is not good
264
maybeSimulateGroupRebalance :: Options -> (Gdx, (Node.List, Instance.List))
265
                            -> IO (Gdx, (Node.List, Instance.List))
266
maybeSimulateGroupRebalance opts (gidx, (nl, il)) = do
267
  let ini_cv = Cluster.compCV nl
268
      ini_tbl = Cluster.Table nl il ini_cv []
269
      min_cv = optMinScore opts
270
  if ini_cv < min_cv
271
    then return (gidx, (nl, il))
272
    else executeSimulation opts ini_tbl min_cv gidx nl il
273

    
274
-- | Decide whether to simulate rebalance.
275
maybeSimulateRebalance :: Bool             -- ^ Whether to simulate rebalance
276
                       -> Options          -- ^ Command line options
277
                       -> [(Gdx, (Node.List, Instance.List))] -- ^ Group data
278
                       -> IO [(Gdx, (Node.List, Instance.List))]
279
maybeSimulateRebalance True opts cluster =
280
    mapM (maybeSimulateGroupRebalance opts) cluster
281
maybeSimulateRebalance False _ cluster = return cluster
282

    
283
-- | Prints the final @OK@ marker in machine readable output.
284
printFinalHTC :: Bool -> IO ()
285
printFinalHTC = printFinal htcPrefix
286

    
287
-- | Main function.
288
main :: Options -> [String] -> IO ()
289
main opts args = do
290
  unless (null args) $ do
291
         hPutStrLn stderr "Error: this program doesn't take any arguments."
292
         exitWith $ ExitFailure 1
293

    
294
  let verbose = optVerbose opts
295
      machineread = optMachineReadable opts
296
      nosimulation = optNoSimulation opts
297

    
298
  (ClusterData gl fixed_nl ilf _ _) <- loadExternalData opts
299
  nlf <- setNodeStatus opts fixed_nl
300

    
301
  let splitinstances = Cluster.findSplitInstances nlf ilf
302
      splitcluster = Cluster.splitCluster nlf ilf
303

    
304
  when machineread $ printGroupsMappings gl
305

    
306
  groupsstats <- mapM (perGroupChecks verbose machineread Initial gl)
307
                 splitcluster
308
  let clusterstats = map sum (transpose groupsstats) :: [Int]
309
      needrebalance = clusterNeedsRebalance clusterstats
310
      canrebalance = length splitinstances == 0
311
  printClusterStats verbose machineread Initial clusterstats needrebalance
312
                    canrebalance
313

    
314
  when nosimulation $ do
315
    unless (verbose == 0 || machineread) $
316
      putStrLn "Running in no-simulation mode. Exiting."
317

    
318
  when (length splitinstances > 0) $ do
319
    unless (verbose == 0 || machineread) $
320
       putStrLn "Split instances found, simulation of re-balancing\
321
                \ not possible"
322

    
323
  unless needrebalance $ do
324
    unless (verbose == 0 || machineread) $
325
      putStrLn "No need to rebalance cluster, no problems found. Exiting."
326

    
327
  let exitOK = nosimulation || not needrebalance
328
      simulate = not nosimulation && length splitinstances == 0
329
                 && needrebalance
330

    
331
  rebalancedcluster <- maybeSimulateRebalance simulate opts splitcluster
332

    
333
  when (simulate || machineread) $ do
334
    newgroupstats <- mapM (perGroupChecks verbose machineread Rebalanced gl)
335
                     rebalancedcluster
336
    -- We do not introduce new split instances during rebalance
337
    let newsplitinstances = splitinstances
338
        newclusterstats = map sum (transpose newgroupstats) :: [Int]
339
        newneedrebalance = clusterNeedsRebalance clusterstats
340
        newcanrebalance = length newsplitinstances == 0
341

    
342
    printClusterStats verbose machineread Rebalanced newclusterstats
343
                           newneedrebalance newcanrebalance
344

    
345
  printFinalHTC machineread
346

    
347
  unless exitOK $ exitWith $ ExitFailure 1