Statistics
| Branch: | Tag: | Revision:

root / htools / Ganeti / HTools / Program / Hcheck.hs @ b5b19d5e

History | View | Annotate | Download (10.6 kB)

1
{-| Cluster checker.
2

    
3
-}
4

    
5
{-
6

    
7
Copyright (C) 2012 Google Inc.
8

    
9
This program is free software; you can redistribute it and/or modify
10
it under the terms of the GNU General Public License as published by
11
the Free Software Foundation; either version 2 of the License, or
12
(at your option) any later version.
13

    
14
This program is distributed in the hope that it will be useful, but
15
WITHOUT ANY WARRANTY; without even the implied warranty of
16
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
General Public License for more details.
18

    
19
You should have received a copy of the GNU Gene52al Public License
20
along with this program; if not, write to the Free Software
21
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22
02110-1301, USA.
23

    
24
-}
25

    
26
module Ganeti.HTools.Program.Hcheck (main, options) where
27

    
28
import Control.Monad
29
import List (transpose)
30
import System.Exit
31
import System.IO
32
import Text.Printf (printf)
33

    
34
import qualified Ganeti.HTools.Container as Container
35
import qualified Ganeti.HTools.Cluster as Cluster
36
import qualified Ganeti.HTools.Group as Group
37
import qualified Ganeti.HTools.Node as Node
38
import qualified Ganeti.HTools.Instance as Instance
39

    
40
import qualified Ganeti.HTools.Program.Hbal as Hbal
41

    
42
import Ganeti.HTools.CLI
43
import Ganeti.HTools.ExtLoader
44
import Ganeti.HTools.Loader
45
import Ganeti.HTools.Types
46

    
47
-- | Options list and functions.
48
options :: [OptType]
49
options =
50
  [ oDataFile
51
  , oDiskMoves
52
  , oDynuFile
53
  , oEvacMode
54
  , oExInst
55
  , oExTags
56
  , oIAllocSrc
57
  , oInstMoves
58
  , oLuxiSocket
59
  , oMachineReadable
60
  , oMaxCpu
61
  , oMaxSolLength
62
  , oMinDisk
63
  , oMinGain
64
  , oMinGainLim
65
  , oMinScore
66
  , oNoSimulation
67
  , oOfflineNode
68
  , oQuiet
69
  , oRapiMaster
70
  , oSelInst
71
  , oShowHelp
72
  , oShowVer
73
  , oVerbose
74
  ]
75

    
76
-- | Check phase - are we before (initial) or after rebalance.
77
data Phase = Initial
78
           | Rebalanced
79

    
80
-- | Level of presented statistics.
81
data Level = GroupLvl
82
           | ClusterLvl
83

    
84
-- | Prefix for machine readable names
85
htcPrefix :: String
86
htcPrefix = "HCHECK"
87

    
88
-- | Data showed both per group and per cluster.
89
commonData :: [(String, String)]
90
commonData =[ ("N1_FAIL", "Nodes not N+1 happy")
91
            , ("CONFLICT_TAGS", "Nodes with conflicting instances")
92
            , ("OFFLINE_PRI", "Instances with primary on an offline node")
93
            , ("OFFLINE_SEC", "Instances with seondary on an offline node")
94
            ]
95

    
96
-- | Data showed per group.
97
groupData :: [(String, String)]
98
groupData = commonData ++ [("SCORE", "Group score")]
99

    
100
-- | Data showed per cluster.
101
clusterData :: [(String, String)]
102
clusterData = commonData ++
103
              [ ("NEED_REBALANCE", "Cluster is not healthy")
104
              , ("CAN_REBALANCE", "Possible to run rebalance")
105
              ]
106

    
107
-- | Phase-specific prefix for machine readable version.
108
phasePrefix :: Phase -> String
109
phasePrefix Initial = "INIT"
110
phasePrefix Rebalanced = "FINAL"
111

    
112
-- | Level-specific prefix for machine readable version.
113
levelPrefix :: Level -> String
114
levelPrefix GroupLvl = "GROUP"
115
levelPrefix ClusterLvl = "CLUSTER"
116

    
117
-- | Description of phases for human readable version.
118
phaseDescr :: Phase -> String
119
phaseDescr Initial = "initially"
120
phaseDescr Rebalanced = "after rebalancing"
121

    
122

    
123
-- | Format a list of key, value as a shell fragment.
124
printKeysHTC :: [(String, String)] -> IO ()
125
printKeysHTC = printKeys htcPrefix
126

    
127
-- | Prepare string from boolean value.
128
printBool :: Bool    -- ^ Whether the result should be machine readable
129
          -> Bool    -- ^ Value to be converted to string
130
          -> String
131
printBool True True = "1"
132
printBool True False = "0"
133
printBool False b = show b
134

    
135
-- | Print mapping from group idx to group uuid (only in machine readable mode).
136
printGroupsMappings :: Group.List -> IO ()
137
printGroupsMappings gl = do
138
    let extract_vals = \g -> (printf "GROUP_UUID_%d" $ Group.idx g :: String,
139
                              printf "%s" $ Group.uuid g :: String)
140
        printpairs = map extract_vals (Container.elems gl)
141
    printKeysHTC printpairs
142

    
143
-- | Print all the statistics on a group level.
144
printGroupStats :: Int -> Bool -> Phase -> Group.Group -> [Int] -> Double -> IO ()
145
printGroupStats _ True phase grp stats score = do
146
  let printstats = map (printf "%d") stats ++ [printf "%.8f" score] :: [String]
147
      printkeys = map (printf "%s_%s_%d_%s"
148
                                  (phasePrefix phase)
149
                                  (levelPrefix GroupLvl)
150
                                  (Group.idx grp))
151
                       (map fst groupData) :: [String]
152
  printKeysHTC (zip printkeys printstats)
153

    
154
printGroupStats verbose False phase grp stats score = do
155
  let printstats = map (printf "%d") stats ++ [printf "%.8f" score] :: [String]
156

    
157
  unless (verbose == 0) $ do
158
    printf "\nStatistics for group %s %s\n"
159
               (Group.name grp) (phaseDescr phase) :: IO ()
160
    mapM_ (\(a,b) -> printf "    %s: %s\n" (snd a) b :: IO ())
161
          (zip groupData printstats)
162

    
163
-- | Print all the statistics on a cluster (global) level.
164
printClusterStats :: Int -> Bool -> Phase -> [Int] -> Bool -> Bool -> IO ()
165
printClusterStats _ True phase stats needrebal canrebal = do
166
  let printstats = map (printf "%d") stats ++
167
                   map (printBool True) [needrebal, canrebal]
168
      printkeys = map (printf "%s_%s_%s"
169
                              (phasePrefix phase)
170
                              (levelPrefix ClusterLvl))
171
                      (map fst clusterData) :: [String]
172
  printKeysHTC (zip printkeys printstats)
173

    
174
printClusterStats verbose False phase stats needrebal canrebal = do
175
  let printstats = map (printf "%d") stats ++
176
                   map (printBool False) [needrebal, canrebal]
177
  unless (verbose == 0) $ do
178
      printf "\nCluster statistics %s\n" (phaseDescr phase) :: IO ()
179
      mapM_ (\(a,b) -> printf "    %s: %s\n" (snd a) b :: IO ())
180
            (zip clusterData printstats)
181

    
182
-- | Check if any of cluster metrics is non-zero.
183
clusterNeedsRebalance :: [Int] -> Bool
184
clusterNeedsRebalance stats = sum stats > 0
185

    
186
{- | Check group for N+1 hapiness, conflicts of primaries on nodes and
187
instances residing on offline nodes.
188

    
189
-}
190
perGroupChecks :: Int -> Bool -> Phase -> Group.List ->
191
                  (Gdx, (Node.List, Instance.List)) -> IO ([Int])
192
perGroupChecks verbose machineread phase gl (gidx, (nl, il)) = do
193
  let grp = Container.find gidx gl
194
      offnl = filter Node.offline (Container.elems nl)
195
      n1violated = length $ fst $ Cluster.computeBadItems nl il
196
      conflicttags = length $ filter (>0)
197
                     (map Node.conflictingPrimaries (Container.elems nl))
198
      offline_pri = sum . map length $ map Node.pList offnl
199
      offline_sec = length $ map Node.sList offnl
200
      score = Cluster.compCV nl
201
      groupstats = [ n1violated
202
                   , conflicttags
203
                   , offline_pri
204
                   , offline_sec
205
                   ]
206
  printGroupStats verbose machineread phase grp groupstats score
207
  return groupstats
208

    
209
-- | Use Hbal's iterateDepth to simulate group rebalance.
210
simulateRebalance :: Options ->
211
                     (Gdx, (Node.List, Instance.List)) ->
212
                     IO ( (Gdx, (Node.List, Instance.List)) )
213
simulateRebalance opts (gidx, (nl, il)) = do
214
  let ini_cv = Cluster.compCV nl
215
      ini_tbl = Cluster.Table nl il ini_cv []
216
      min_cv = optMinScore opts
217

    
218

    
219
  if (ini_cv < min_cv)
220
    then return (gidx, (nl, il))
221
    else do
222
      let imlen = maximum . map (length . Instance.alias) $ Container.elems il
223
          nmlen = maximum . map (length . Node.alias) $ Container.elems nl
224

    
225
      (fin_tbl, _) <- Hbal.iterateDepth False ini_tbl
226
                                        (optMaxLength opts)
227
                                        (optDiskMoves opts)
228
                                        (optInstMoves opts)
229
                                        nmlen imlen [] min_cv
230
                                        (optMinGainLim opts) (optMinGain opts)
231
                                        (optEvacMode opts)
232

    
233
      let (Cluster.Table fin_nl fin_il _ _) = fin_tbl
234
      return (gidx, (fin_nl, fin_il))
235

    
236
-- | Decide whether to simulate rebalance.
237
maybeSimulateRebalance :: Bool             -- ^ Whether to simulate rebalance
238
                       -> Options          -- ^ Command line options
239
                       -> [(Gdx, (Node.List, Instance.List))] -- ^ Group data
240
                       -> IO([(Gdx, (Node.List, Instance.List))])
241
maybeSimulateRebalance True opts cluster =
242
    mapM (simulateRebalance opts) cluster
243
maybeSimulateRebalance False _ cluster = return cluster
244

    
245
-- | Prints the final @OK@ marker in machine readable output.
246
printFinalHTC :: Bool -> IO ()
247
printFinalHTC = printFinal htcPrefix
248

    
249
-- | Main function.
250
main :: Options -> [String] -> IO ()
251
main opts args = do
252
  unless (null args) $ do
253
         hPutStrLn stderr "Error: this program doesn't take any arguments."
254
         exitWith $ ExitFailure 1
255

    
256
  let verbose = optVerbose opts
257
      machineread = optMachineReadable opts
258
      nosimulation = optNoSimulation opts
259

    
260
  (ClusterData gl fixed_nl ilf _ _) <- loadExternalData opts
261
  nlf <- setNodeStatus opts fixed_nl
262

    
263
  let splitinstances = Cluster.findSplitInstances nlf ilf
264
      splitcluster = Cluster.splitCluster nlf ilf
265

    
266
  when machineread $ printGroupsMappings gl
267

    
268
  groupsstats <- mapM (perGroupChecks verbose machineread Initial gl) splitcluster
269
  let clusterstats = map sum (transpose groupsstats) :: [Int]
270
      needrebalance = clusterNeedsRebalance clusterstats
271
      canrebalance = length splitinstances == 0
272
  printClusterStats verbose machineread Initial clusterstats needrebalance canrebalance
273

    
274
  when nosimulation $ do
275
    unless (verbose == 0 || machineread) $
276
      printf "Running in no-simulation mode. Exiting.\n"
277

    
278
  when (length splitinstances > 0) $ do
279
    unless (verbose == 0 || machineread) $
280
       printf "Split instances found, simulation of re-balancing not possible\n"
281

    
282
  unless needrebalance $ do
283
    unless (verbose == 0 || machineread) $
284
      printf "No need to rebalance cluster, no problems found. Exiting.\n"
285

    
286
  let exitOK = nosimulation || not needrebalance
287
      simulate = not nosimulation && length splitinstances == 0 && needrebalance
288

    
289
  rebalancedcluster <- maybeSimulateRebalance simulate opts splitcluster
290

    
291
  when (simulate || machineread) $ do
292
    newgroupstats <- mapM (perGroupChecks verbose machineread Rebalanced gl)
293
                     rebalancedcluster
294
    -- We do not introduce new split instances during rebalance
295
    let newsplitinstances = splitinstances
296
        newclusterstats = map sum (transpose newgroupstats) :: [Int]
297
        newneedrebalance = clusterNeedsRebalance clusterstats
298
        newcanrebalance = length newsplitinstances == 0
299

    
300
    printClusterStats verbose machineread Rebalanced newclusterstats
301
                           newneedrebalance newcanrebalance
302

    
303
  printFinalHTC machineread
304

    
305
  unless exitOK $ exitWith $ ExitFailure 1