Statistics
| Branch: | Tag: | Revision:

root / htools / Ganeti / HTools / Program / Hcheck.hs @ d64acbb5

History | View | Annotate | Download (12.3 kB)

1
{-| Cluster checker.
2

    
3
-}
4

    
5
{-
6

    
7
Copyright (C) 2012 Google Inc.
8

    
9
This program is free software; you can redistribute it and/or modify
10
it under the terms of the GNU General Public License as published by
11
the Free Software Foundation; either version 2 of the License, or
12
(at your option) any later version.
13

    
14
This program is distributed in the hope that it will be useful, but
15
WITHOUT ANY WARRANTY; without even the implied warranty of
16
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
General Public License for more details.
18

    
19
You should have received a copy of the GNU Gene52al Public License
20
along with this program; if not, write to the Free Software
21
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22
02110-1301, USA.
23

    
24
-}
25

    
26
module Ganeti.HTools.Program.Hcheck (main, options) where
27

    
28
import Control.Monad
29
import List (transpose)
30
import System.Exit
31
import System.IO
32
import Text.Printf (printf)
33

    
34
import qualified Ganeti.HTools.Container as Container
35
import qualified Ganeti.HTools.Cluster as Cluster
36
import qualified Ganeti.HTools.Group as Group
37
import qualified Ganeti.HTools.Node as Node
38
import qualified Ganeti.HTools.Instance as Instance
39

    
40
import qualified Ganeti.HTools.Program.Hbal as Hbal
41

    
42
import Ganeti.HTools.CLI
43
import Ganeti.HTools.ExtLoader
44
import Ganeti.HTools.Loader
45
import Ganeti.HTools.Types
46

    
47
-- | Options list and functions.
48
options :: [OptType]
49
options =
50
  [ oDataFile
51
  , oDiskMoves
52
  , oDynuFile
53
  , oEvacMode
54
  , oExInst
55
  , oExTags
56
  , oIAllocSrc
57
  , oInstMoves
58
  , oLuxiSocket
59
  , oMachineReadable
60
  , oMaxCpu
61
  , oMaxSolLength
62
  , oMinDisk
63
  , oMinGain
64
  , oMinGainLim
65
  , oMinScore
66
  , oNoSimulation
67
  , oOfflineNode
68
  , oQuiet
69
  , oRapiMaster
70
  , oSelInst
71
  , oShowHelp
72
  , oShowVer
73
  , oVerbose
74
  ]
75

    
76
-- | Check phase - are we before (initial) or after rebalance.
77
data Phase = Initial
78
           | Rebalanced
79

    
80
-- | Level of presented statistics.
81
data Level = GroupLvl
82
           | ClusterLvl
83

    
84
-- | Prefix for machine readable names.
85
htcPrefix :: String
86
htcPrefix = "HCHECK"
87

    
88
-- | Data showed both per group and per cluster.
89
commonData :: [(String, String)]
90
commonData =[ ("N1_FAIL", "Nodes not N+1 happy")
91
            , ("CONFLICT_TAGS", "Nodes with conflicting instances")
92
            , ("OFFLINE_PRI", "Instances with primary on an offline node")
93
            , ("OFFLINE_SEC", "Instances with seondary on an offline node")
94
            ]
95

    
96
-- | Data showed per group.
97
groupData :: [(String, String)]
98
groupData = commonData ++ [("SCORE", "Group score")]
99

    
100
-- | Data showed per cluster.
101
clusterData :: [(String, String)]
102
clusterData = commonData ++
103
              [ ("NEED_REBALANCE", "Cluster is not healthy")
104
              , ("CAN_REBALANCE", "Possible to run rebalance")
105
              ]
106

    
107
-- | Phase-specific prefix for machine readable version.
108
phasePrefix :: Phase -> String
109
phasePrefix Initial = "INIT"
110
phasePrefix Rebalanced = "FINAL"
111

    
112
-- | Level-specific prefix for machine readable version.
113
levelPrefix :: Level -> String
114
levelPrefix GroupLvl = "GROUP"
115
levelPrefix ClusterLvl = "CLUSTER"
116

    
117
-- | Machine-readable keys to show depending on given level.
118
keysData :: Level -> [String]
119
keysData GroupLvl = map fst groupData
120
keysData ClusterLvl = map fst clusterData
121

    
122
-- | Description of phases for human readable version.
123
phaseDescr :: Phase -> String
124
phaseDescr Initial = "initially"
125
phaseDescr Rebalanced = "after rebalancing"
126

    
127
-- | Description to show depending on given level.
128
descrData :: Level -> [String]
129
descrData GroupLvl = map snd groupData
130
descrData ClusterLvl = map snd clusterData
131

    
132
-- | Human readable prefix for statistics.
133
phaseLevelDescr :: Phase -> Level -> Maybe String -> String
134
phaseLevelDescr phase GroupLvl (Just name) =
135
    printf "Statistics for group %s %s\n" name $ phaseDescr phase
136
phaseLevelDescr phase GroupLvl Nothing =
137
    printf "Statistics for group %s\n" $ phaseDescr phase
138
phaseLevelDescr phase ClusterLvl _ =
139
    printf "Cluster statistics %s\n" $ phaseDescr phase
140

    
141
-- | Format a list of key, value as a shell fragment.
142
printKeysHTC :: [(String, String)] -> IO ()
143
printKeysHTC = printKeys htcPrefix
144

    
145
-- | Prepare string from boolean value.
146
printBool :: Bool    -- ^ Whether the result should be machine readable
147
          -> Bool    -- ^ Value to be converted to string
148
          -> String
149
printBool True True = "1"
150
printBool True False = "0"
151
printBool False b = show b
152

    
153
-- | Print mapping from group idx to group uuid (only in machine readable mode).
154
printGroupsMappings :: Group.List -> IO ()
155
printGroupsMappings gl = do
156
    let extract_vals = \g -> (printf "GROUP_UUID_%d" $ Group.idx g :: String,
157
                              printf "%s" $ Group.uuid g :: String)
158
        printpairs = map extract_vals (Container.elems gl)
159
    printKeysHTC printpairs
160

    
161
-- | Prepare a single key given a certain level and phase of simulation.
162
prepareKey :: Level -> Phase -> Maybe String -> String -> String
163
prepareKey level phase Nothing suffix =
164
  printf "%s_%s_%s" (phasePrefix phase) (levelPrefix level) suffix
165
prepareKey level phase (Just idx) suffix =
166
  printf "%s_%s_%s_%s" (phasePrefix phase) (levelPrefix level) idx suffix
167

    
168
-- | Print all the statistics for given level and phase.
169
printStats :: Int            -- ^ Verbosity level
170
           -> Bool           -- ^ If the output should be machine readable
171
           -> Level          -- ^ Level on which we are printing
172
           -> Phase          -- ^ Current phase of simulation
173
           -> [String]       -- ^ Values to print
174
           -> Maybe String   -- ^ Additional data for groups
175
           -> IO ()
176
printStats _ True level phase values gidx = do
177
  let keys = map (prepareKey level phase gidx) (keysData level)
178
  printKeysHTC $ zip keys values
179

    
180
printStats verbose False level phase values name = do
181
  let prefix = phaseLevelDescr phase level name
182
      descr = descrData level
183
  unless (verbose == 0) $ do
184
    printf "\n%s" prefix :: IO ()
185
    mapM_ (\(a,b) -> printf "    %s: %s\n" a b) (zip descr values)
186

    
187
-- | Extract name or idx from group.
188
extractGroupData :: Bool -> Group.Group -> String
189
extractGroupData True grp = printf "%d" $ Group.idx grp
190
extractGroupData False grp = Group.name grp
191

    
192
-- | Prepare values for group.
193
prepareGroupValues :: [Int] -> Double -> [String]
194
prepareGroupValues stats score =
195
  map (printf "%d") stats ++ [printf "%.8f" score]
196

    
197
-- | Prepare values for cluster.
198
prepareClusterValues :: Bool -> [Int] -> [Bool] -> [String]
199
prepareClusterValues machineread stats bstats =
200
  map (printf "%d")  stats ++ map (printBool machineread) bstats
201

    
202
-- | Print all the statistics on a group level.
203
printGroupStats :: Int -> Bool -> Phase -> Group.Group -> [Int] -> Double
204
                -> IO ()
205
printGroupStats verbose machineread phase grp stats score = do
206
  let values = prepareGroupValues stats score
207
      extradata = extractGroupData machineread grp
208
  printStats verbose machineread GroupLvl phase values (Just extradata)
209

    
210
-- | Print all the statistics on a cluster (global) level.
211
printClusterStats :: Int -> Bool -> Phase -> [Int] -> Bool -> Bool -> IO ()
212
printClusterStats verbose machineread phase stats needhbal canhbal = do
213
  let values = prepareClusterValues machineread stats [needhbal, canhbal]
214
  printStats verbose machineread ClusterLvl phase values Nothing
215

    
216
-- | Check if any of cluster metrics is non-zero.
217
clusterNeedsRebalance :: [Int] -> Bool
218
clusterNeedsRebalance stats = sum stats > 0
219

    
220
{- | Check group for N+1 hapiness, conflicts of primaries on nodes and
221
instances residing on offline nodes.
222

    
223
-}
224
perGroupChecks :: Int -> Bool -> Phase -> Group.List ->
225
                  (Gdx, (Node.List, Instance.List)) -> IO ([Int])
226
perGroupChecks verbose machineread phase gl (gidx, (nl, il)) = do
227
  let grp = Container.find gidx gl
228
      offnl = filter Node.offline (Container.elems nl)
229
      n1violated = length $ fst $ Cluster.computeBadItems nl il
230
      conflicttags = length $ filter (>0)
231
                     (map Node.conflictingPrimaries (Container.elems nl))
232
      offline_pri = sum . map length $ map Node.pList offnl
233
      offline_sec = length $ map Node.sList offnl
234
      score = Cluster.compCV nl
235
      groupstats = [ n1violated
236
                   , conflicttags
237
                   , offline_pri
238
                   , offline_sec
239
                   ]
240
  printGroupStats verbose machineread phase grp groupstats score
241
  return groupstats
242

    
243
-- | Use Hbal's iterateDepth to simulate group rebalance.
244
executeSimulation :: Options -> Cluster.Table -> Double ->
245
                     Gdx -> Node.List -> Instance.List ->
246
                     IO ( (Gdx, (Node.List, Instance.List)) )
247
executeSimulation opts ini_tbl min_cv gidx nl il = do
248
  let imlen = maximum . map (length . Instance.alias) $ Container.elems il
249
      nmlen = maximum . map (length . Node.alias) $ Container.elems nl
250

    
251
  (fin_tbl, _) <- Hbal.iterateDepth False ini_tbl
252
                                    (optMaxLength opts)
253
                                    (optDiskMoves opts)
254
                                    (optInstMoves opts)
255
                                    nmlen imlen [] min_cv
256
                                    (optMinGainLim opts) (optMinGain opts)
257
                                    (optEvacMode opts)
258

    
259
  let (Cluster.Table fin_nl fin_il _ _) = fin_tbl
260
  return (gidx, (fin_nl, fin_il))
261

    
262
-- | Simulate group rebalance if group's score is not good
263
maybeSimulateGroupRebalance :: Options -> (Gdx, (Node.List, Instance.List))
264
                               -> IO ((Gdx, (Node.List, Instance.List)))
265
maybeSimulateGroupRebalance opts (gidx, (nl, il)) = do
266
  let ini_cv = Cluster.compCV nl
267
      ini_tbl = Cluster.Table nl il ini_cv []
268
      min_cv = optMinScore opts
269
  if (ini_cv < min_cv)
270
    then return (gidx, (nl, il))
271
    else executeSimulation opts ini_tbl min_cv gidx nl il
272

    
273
-- | Decide whether to simulate rebalance.
274
maybeSimulateRebalance :: Bool             -- ^ Whether to simulate rebalance
275
                       -> Options          -- ^ Command line options
276
                       -> [(Gdx, (Node.List, Instance.List))] -- ^ Group data
277
                       -> IO([(Gdx, (Node.List, Instance.List))])
278
maybeSimulateRebalance True opts cluster =
279
    mapM (maybeSimulateGroupRebalance opts) cluster
280
maybeSimulateRebalance False _ cluster = return cluster
281

    
282
-- | Prints the final @OK@ marker in machine readable output.
283
printFinalHTC :: Bool -> IO ()
284
printFinalHTC = printFinal htcPrefix
285

    
286
-- | Main function.
287
main :: Options -> [String] -> IO ()
288
main opts args = do
289
  unless (null args) $ do
290
         hPutStrLn stderr "Error: this program doesn't take any arguments."
291
         exitWith $ ExitFailure 1
292

    
293
  let verbose = optVerbose opts
294
      machineread = optMachineReadable opts
295
      nosimulation = optNoSimulation opts
296

    
297
  (ClusterData gl fixed_nl ilf _ _) <- loadExternalData opts
298
  nlf <- setNodeStatus opts fixed_nl
299

    
300
  let splitinstances = Cluster.findSplitInstances nlf ilf
301
      splitcluster = Cluster.splitCluster nlf ilf
302

    
303
  when machineread $ printGroupsMappings gl
304

    
305
  groupsstats <- mapM (perGroupChecks verbose machineread Initial gl) splitcluster
306
  let clusterstats = map sum (transpose groupsstats) :: [Int]
307
      needrebalance = clusterNeedsRebalance clusterstats
308
      canrebalance = length splitinstances == 0
309
  printClusterStats verbose machineread Initial clusterstats needrebalance
310
                    canrebalance
311

    
312
  when nosimulation $ do
313
    unless (verbose == 0 || machineread) $
314
      printf "Running in no-simulation mode. Exiting.\n"
315

    
316
  when (length splitinstances > 0) $ do
317
    unless (verbose == 0 || machineread) $
318
       printf "Split instances found, simulation of re-balancing not possible\n"
319

    
320
  unless needrebalance $ do
321
    unless (verbose == 0 || machineread) $
322
      printf "No need to rebalance cluster, no problems found. Exiting.\n"
323

    
324
  let exitOK = nosimulation || not needrebalance
325
      simulate = not nosimulation && length splitinstances == 0 && needrebalance
326

    
327
  rebalancedcluster <- maybeSimulateRebalance simulate opts splitcluster
328

    
329
  when (simulate || machineread) $ do
330
    newgroupstats <- mapM (perGroupChecks verbose machineread Rebalanced gl)
331
                     rebalancedcluster
332
    -- We do not introduce new split instances during rebalance
333
    let newsplitinstances = splitinstances
334
        newclusterstats = map sum (transpose newgroupstats) :: [Int]
335
        newneedrebalance = clusterNeedsRebalance clusterstats
336
        newcanrebalance = length newsplitinstances == 0
337

    
338
    printClusterStats verbose machineread Rebalanced newclusterstats
339
                           newneedrebalance newcanrebalance
340

    
341
  printFinalHTC machineread
342

    
343
  unless exitOK $ exitWith $ ExitFailure 1