root / htools / Ganeti / HTools / Program / Hcheck.hs @ e60fa4af
History | View | Annotate | Download (8.7 kB)
1 |
{-| Cluster checker. |
---|---|
2 |
|
3 |
-} |
4 |
|
5 |
{- |
6 |
|
7 |
Copyright (C) 2012 Google Inc. |
8 |
|
9 |
This program is free software; you can redistribute it and/or modify |
10 |
it under the terms of the GNU General Public License as published by |
11 |
the Free Software Foundation; either version 2 of the License, or |
12 |
(at your option) any later version. |
13 |
|
14 |
This program is distributed in the hope that it will be useful, but |
15 |
WITHOUT ANY WARRANTY; without even the implied warranty of |
16 |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
17 |
General Public License for more details. |
18 |
|
19 |
You should have received a copy of the GNU General Public License |
20 |
along with this program; if not, write to the Free Software |
21 |
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA |
22 |
02110-1301, USA. |
23 |
|
24 |
-} |
25 |
|
26 |
module Ganeti.HTools.Program.Hcheck (main, options) where |
27 |
|
28 |
import Control.Monad |
29 |
import List (transpose) |
30 |
import System.Exit |
31 |
import System.IO |
32 |
import Text.Printf (printf) |
33 |
|
34 |
import qualified Ganeti.HTools.Container as Container |
35 |
import qualified Ganeti.HTools.Cluster as Cluster |
36 |
import qualified Ganeti.HTools.Node as Node |
37 |
import qualified Ganeti.HTools.Instance as Instance |
38 |
|
39 |
import qualified Ganeti.HTools.Program.Hbal as Hbal |
40 |
|
41 |
import Ganeti.HTools.CLI |
42 |
import Ganeti.HTools.ExtLoader |
43 |
import Ganeti.HTools.Loader |
44 |
import Ganeti.HTools.Types |
45 |
|
46 |
-- | Command-line options accepted by this program.
options :: [OptType]
options =
  [ oDataFile, oDiskMoves, oDynuFile, oEvacMode
  , oExInst, oExTags, oIAllocSrc, oInstMoves
  , oLuxiSocket, oMachineReadable, oMaxCpu, oMaxSolLength
  , oMinDisk, oMinGain, oMinGainLim, oMinScore
  , oNoSimulation, oOfflineNode, oQuiet, oRapiMaster
  , oSelInst, oShowHelp, oShowVer, oVerbose
  ]
74 |
|
75 |
-- | Point in the run at which a check is made: on the cluster as loaded
-- ('Initial') or on the outcome of the simulated rebalance ('Rebalanced').
data Phase
  = Initial
  | Rebalanced
78 |
|
79 |
-- | Scope of a set of statistics: a single node group ('GroupLvl') or the
-- whole cluster ('ClusterLvl').
data Level
  = GroupLvl
  | ClusterLvl
82 |
|
83 |
-- | Program-wide prefix handed to 'printKeys' and 'printFinal' when
-- emitting machine readable output.
htcPrefix :: String
htcPrefix = "HCHECK"
86 |
|
87 |
-- | Key component identifying the phase in machine readable output.
phasePrefix :: Phase -> String
phasePrefix phase =
  case phase of
    Initial    -> "INIT"
    Rebalanced -> "FINAL"
91 |
|
92 |
-- | Wording used for the phase in human readable headings.
phaseDescription :: Phase -> String
phaseDescription phase =
  case phase of
    Initial    -> "initially"
    Rebalanced -> "after rebalancing"
96 |
|
97 |
-- | Key component identifying the statistics level in machine readable
-- output.
levelPrefix :: Level -> String
levelPrefix level =
  case level of
    GroupLvl   -> "GROUP"
    ClusterLvl -> "CLUSTER"
101 |
|
102 |
-- | Statistics shown both per group and per cluster.
--
-- Each entry pairs the key suffix used in machine readable output with the
-- description used in human readable output.  The entries are zipped
-- positionally with the statistics list built by 'perGroupChecks', so the
-- order here must match the order of that list.
commonData :: [(String, String)]
commonData =
  [ ("N1_FAIL", "Nodes not N+1 happy")
  , ("CONFLICT_TAGS", "Nodes with conflicting instances")
  , ("OFFLINE_PRI", "Instances with primary on an offline node")
  , ("OFFLINE_SEC", "Instances with secondary on an offline node")
  ]
109 |
|
110 |
-- | Statistics shown per group: the common entries followed by the group
-- score.
groupData :: [(String, String)]
groupData = commonData ++ [scoreEntry]
  where
    scoreEntry = ("SCORE", "Group score")
113 |
|
114 |
-- | Statistics shown per cluster: the common entries followed by the
-- overall health indicator.
clusterData :: [(String, String)]
clusterData = commonData ++ [healthEntry]
  where
    healthEntry = ("NEED_REBALANCE", "Cluster is not healthy")
117 |
|
118 |
-- | Emit a list of (key, value) pairs through 'printKeys', using
-- 'htcPrefix' as the prefix.
printKeysHTC :: [(String, String)] -> IO ()
printKeysHTC keyvals = printKeys htcPrefix keyvals
121 |
|
122 |
-- | Report the statistics of one node group.
--
-- In machine readable mode every statistic is emitted through
-- 'printKeysHTC' under a key built from the phase prefix, the group level
-- prefix, the group index and the statistic name; the verbosity is ignored.
-- Otherwise a human readable listing is printed, unless the verbosity is 0,
-- in which case nothing is printed.
printGroupStats :: Int -> Bool -> Phase -> Gdx -> [Int] -> Double -> IO ()
printGroupStats verbose machineread phase gidx stats score
  | machineread = printKeysHTC (zip keys values)
  | verbose == 0 = return ()
  | otherwise = do
      printf "\nStatistics for group %d %s\n" gidx (phaseDescription phase)
        :: IO ()
      forM_ (zip groupData values) $ \((_, descr), value) ->
        printf " %s: %s\n" descr value :: IO ()
  where
    -- Integer statistics first, then the score; same order as 'groupData'.
    values :: [String]
    values = map (printf "%d") stats ++ [printf "%.8f" score]

    keys :: [String]
    keys =
      [ printf "%s_%s_%d_%s" (phasePrefix phase) (levelPrefix GroupLvl)
               gidx name
      | (name, _) <- groupData
      ]
141 |
|
142 |
-- | Report the cluster-wide statistics and tell whether any problem exists.
--
-- The result is 'True' when the sum of all statistics is positive.  In
-- machine readable mode the statistics and that sum are emitted through
-- 'printKeysHTC'; otherwise a human readable listing (ending with the
-- boolean health flag) is printed unless the verbosity is 0.
printClusterStats :: Int -> Bool -> Phase -> [Int] -> IO Bool
printClusterStats verbose machineread phase stats = do
  if machineread
    then printKeysHTC (zip keys (map showInt (stats ++ [problems])))
    else unless (verbose == 0) $ do
      printf "\nCluster statistics %s\n" (phaseDescription phase) :: IO ()
      forM_ (zip clusterData (map showInt stats ++ [show unhealthy])) $
        \((_, descr), value) -> printf " %s: %s\n" descr value :: IO ()
  return unhealthy
  where
    problems :: Int
    problems = sum stats

    unhealthy :: Bool
    unhealthy = problems > 0

    showInt :: Int -> String
    showInt = printf "%d"

    keys :: [String]
    keys =
      [ printf "%s_%s_%s" (phasePrefix phase) (levelPrefix ClusterLvl) name
      | (name, _) <- clusterData
      ]
163 |
|
164 |
{- | Check a group for N+1 happiness, conflicts of primaries on nodes and
instances residing on offline nodes.

The statistics are printed via 'printGroupStats' and returned in the order
N+1 failures, nodes with conflicting primaries, instances with their primary
on an offline node, instances with their secondary on an offline node.  This
is the order of 'commonData'.

-}
perGroupChecks :: Int -> Bool -> Phase -> (Gdx, (Node.List, Instance.List))
               -> IO [Int]
perGroupChecks verbose machineread phase (gidx, (nl, il)) = do
  let nodes = Container.elems nl
      offnl = filter Node.offline nodes
      n1violated = length . fst $ Cluster.computeBadItems nl il
      conflicttags = length . filter (> 0) $
                     map Node.conflictingPrimaries nodes
      offline_pri = sum $ map (length . Node.pList) offnl
      -- Count the instances, not the offline nodes: sum the lengths of the
      -- per-node secondary lists, exactly as done for the primaries above.
      offline_sec = sum $ map (length . Node.sList) offnl
      score = Cluster.compCV nl
      groupstats = [ n1violated
                   , conflicttags
                   , offline_pri
                   , offline_sec
                   ]
  printGroupStats verbose machineread phase gidx groupstats score
  return groupstats
185 |
|
186 |
-- | Use Hbal's iterateDepth to simulate group rebalance.
--
-- When the group's initial score ('Cluster.compCV') is already below the
-- minimum score from the options, the group is returned unchanged.
-- Otherwise 'Hbal.iterateDepth' is run with the move limits and gain
-- thresholds taken from the options, and the node and instance lists of the
-- resulting table are returned under the same group index.
simulateRebalance :: Options
                  -> (Gdx, (Node.List, Instance.List))
                  -> IO (Gdx, (Node.List, Instance.List))
simulateRebalance opts (gidx, (nl, il)) = do
  let ini_cv = Cluster.compCV nl
      ini_tbl = Cluster.Table nl il ini_cv []
      min_cv = optMinScore opts

  if ini_cv < min_cv
    then return (gidx, (nl, il))
    else do
      -- Widest alias among instances/nodes; seeded with 0 so that a group
      -- without instances (or nodes) cannot hit 'maximum' on an empty list.
      let widest = maximum . (0 :) . map length
          imlen = widest . map Instance.alias $ Container.elems il
          nmlen = widest . map Node.alias $ Container.elems nl

      (fin_tbl, _) <- Hbal.iterateDepth False ini_tbl
                      (optMaxLength opts)
                      (optDiskMoves opts)
                      (optInstMoves opts)
                      nmlen imlen [] min_cv
                      (optMinGainLim opts) (optMinGain opts)
                      (optEvacMode opts)

      let (Cluster.Table fin_nl fin_il _ _) = fin_tbl
      return (gidx, (fin_nl, fin_il))
212 |
|
213 |
-- | Emit the closing status marker of the machine readable output through
-- 'printFinal', using 'htcPrefix' as the prefix.
printFinalHTC :: Bool -> IO ()
printFinalHTC machineread = printFinal htcPrefix machineread
216 |
|
217 |
-- | Main function.
--
-- Loads the cluster data, reports the per-group and cluster-wide statistics
-- for the initial state and, unless one of the early exits below applies,
-- simulates a rebalance of every group and reports the statistics again.
--
-- Exit status: failure (1) when command-line arguments are given or when
-- split instances are found; success otherwise.
main :: Options -> [String] -> IO ()
main opts args = do
  unless (null args) $ do
    hPutStrLn stderr "Error: this program doesn't take any arguments."
    exitWith $ ExitFailure 1

  let verbose = optVerbose opts
      machineread = optMachineReadable opts
      nosimulation = optNoSimulation opts

  (ClusterData _ fixed_nl ilf _ _) <- loadExternalData opts
  nlf <- setNodeStatus opts fixed_nl

  let splitinstances = Cluster.findSplitInstances nlf ilf
      splitcluster = Cluster.splitCluster nlf ilf

  -- Initial state: per-group statistics, summed column-wise for the cluster.
  groupsstats <- mapM (perGroupChecks verbose machineread Initial) splitcluster
  let clusterstats = map sum (transpose groupsstats) :: [Int]
  needrebalance <- printClusterStats verbose machineread Initial clusterstats

  -- Early exit: simulation disabled by the user.
  when nosimulation $ do
    unless (verbose == 0 || machineread) $
      printf "Running in no-simulation mode. Exiting.\n"
    printFinalHTC machineread
    exitWith ExitSuccess

  -- Early exit: split instances make the simulation impossible.
  when (not (null splitinstances)) $ do
    unless (verbose == 0 || machineread) $
      printf "Split instances found, simulation of re-balancing not possible\n"
    exitWith $ ExitFailure 1

  -- Early exit: nothing was flagged in the initial statistics.
  unless needrebalance $ do
    unless (verbose == 0 || machineread) $
      printf "No need to rebalance cluster, no problems found. Exiting.\n"
    printFinalHTC machineread
    exitWith ExitSuccess

  -- Simulated state: rebalance each group and report the statistics again.
  rebalancedcluster <- mapM (simulateRebalance opts) splitcluster
  newgroupstats <- mapM (perGroupChecks verbose machineread Rebalanced)
                     rebalancedcluster
  let newclusterstats = map sum (transpose newgroupstats) :: [Int]
  _ <- printClusterStats verbose machineread Rebalanced newclusterstats

  printFinalHTC machineread