root / htools / Ganeti / HTools / Program / Hcheck.hs @ 707cd3d7
History | View | Annotate | Download (11.6 kB)
1 |
{-| Cluster checker. |
---|---|
2 |
|
3 |
-} |
4 |
|
5 |
{- |
6 |
|
7 |
Copyright (C) 2012 Google Inc. |
8 |
|
9 |
This program is free software; you can redistribute it and/or modify |
10 |
it under the terms of the GNU General Public License as published by |
11 |
the Free Software Foundation; either version 2 of the License, or |
12 |
(at your option) any later version. |
13 |
|
14 |
This program is distributed in the hope that it will be useful, but |
15 |
WITHOUT ANY WARRANTY; without even the implied warranty of |
16 |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
17 |
General Public License for more details. |
18 |
|
19 |
You should have received a copy of the GNU Gene52al Public License |
20 |
along with this program; if not, write to the Free Software |
21 |
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA |
22 |
02110-1301, USA. |
23 |
|
24 |
-} |
25 |
|
26 |
module Ganeti.HTools.Program.Hcheck |
27 |
( main |
28 |
, options |
29 |
, arguments |
30 |
) where |
31 |
|
32 |
import Control.Monad |
33 |
import Data.List (transpose) |
34 |
import System.Exit |
35 |
import Text.Printf (printf) |
36 |
|
37 |
import qualified Ganeti.HTools.Container as Container |
38 |
import qualified Ganeti.HTools.Cluster as Cluster |
39 |
import qualified Ganeti.HTools.Group as Group |
40 |
import qualified Ganeti.HTools.Node as Node |
41 |
import qualified Ganeti.HTools.Instance as Instance |
42 |
|
43 |
import qualified Ganeti.HTools.Program.Hbal as Hbal |
44 |
|
45 |
import Ganeti.Common |
46 |
import Ganeti.HTools.CLI |
47 |
import Ganeti.HTools.ExtLoader |
48 |
import Ganeti.HTools.Loader |
49 |
import Ganeti.HTools.Types |
50 |
import Ganeti.Utils |
51 |
|
52 |
-- | Options list and functions. |
53 |
options :: [OptType] |
54 |
options = |
55 |
[ oDataFile |
56 |
, oDiskMoves |
57 |
, oDynuFile |
58 |
, oEvacMode |
59 |
, oExInst |
60 |
, oExTags |
61 |
, oIAllocSrc |
62 |
, oInstMoves |
63 |
, oLuxiSocket |
64 |
, oMachineReadable |
65 |
, oMaxCpu |
66 |
, oMaxSolLength |
67 |
, oMinDisk |
68 |
, oMinGain |
69 |
, oMinGainLim |
70 |
, oMinScore |
71 |
, oNoSimulation |
72 |
, oOfflineNode |
73 |
, oQuiet |
74 |
, oRapiMaster |
75 |
, oSelInst |
76 |
, oVerbose |
77 |
] |
78 |
|
79 |
-- | The list of arguments supported by the program. |
80 |
arguments :: [ArgCompletion] |
81 |
arguments = [] |
82 |
|
83 |
-- | Check phase - are we before (initial) or after rebalance. |
84 |
data Phase = Initial |
85 |
| Rebalanced |
86 |
|
87 |
-- | Level of presented statistics. |
88 |
data Level = GroupLvl |
89 |
| ClusterLvl |
90 |
|
91 |
-- | A type alias for a group index and node\/instance lists. |
92 |
type GroupInfo = (Gdx, (Node.List, Instance.List)) |
93 |
|
94 |
-- | A type alias for group stats. |
95 |
type GroupStats = ((Group.Group, Double), [Int]) |
96 |
|
97 |
-- | Prefix for machine readable names. |
98 |
htcPrefix :: String |
99 |
htcPrefix = "HCHECK" |
100 |
|
101 |
-- | Data showed both per group and per cluster. |
102 |
commonData :: [(String, String)] |
103 |
commonData =[ ("N1_FAIL", "Nodes not N+1 happy") |
104 |
, ("CONFLICT_TAGS", "Nodes with conflicting instances") |
105 |
, ("OFFLINE_PRI", "Instances having the primary node offline") |
106 |
, ("OFFLINE_SEC", "Instances having a secondary node offline") |
107 |
] |
108 |
|
109 |
-- | Data showed per group. |
110 |
groupData :: [(String, String)] |
111 |
groupData = commonData ++ [("SCORE", "Group score")] |
112 |
|
113 |
-- | Data showed per cluster. |
114 |
clusterData :: [(String, String)] |
115 |
clusterData = commonData ++ |
116 |
[ ("NEED_REBALANCE", "Cluster is not healthy") ] |
117 |
|
118 |
-- | Phase-specific prefix for machine readable version. |
119 |
phasePrefix :: Phase -> String |
120 |
phasePrefix Initial = "INIT" |
121 |
phasePrefix Rebalanced = "FINAL" |
122 |
|
123 |
-- | Level-specific prefix for machine readable version. |
124 |
levelPrefix :: Level -> String |
125 |
levelPrefix GroupLvl = "GROUP" |
126 |
levelPrefix ClusterLvl = "CLUSTER" |
127 |
|
128 |
-- | Machine-readable keys to show depending on given level. |
129 |
keysData :: Level -> [String] |
130 |
keysData GroupLvl = map fst groupData |
131 |
keysData ClusterLvl = map fst clusterData |
132 |
|
133 |
-- | Description of phases for human readable version. |
134 |
phaseDescr :: Phase -> String |
135 |
phaseDescr Initial = "initially" |
136 |
phaseDescr Rebalanced = "after rebalancing" |
137 |
|
138 |
-- | Description to show depending on given level. |
139 |
descrData :: Level -> [String] |
140 |
descrData GroupLvl = map snd groupData |
141 |
descrData ClusterLvl = map snd clusterData |
142 |
|
143 |
-- | Human readable prefix for statistics. |
144 |
phaseLevelDescr :: Phase -> Level -> Maybe String -> String |
145 |
phaseLevelDescr phase GroupLvl (Just name) = |
146 |
printf "Statistics for group %s %s\n" name $ phaseDescr phase |
147 |
phaseLevelDescr phase GroupLvl Nothing = |
148 |
printf "Statistics for group %s\n" $ phaseDescr phase |
149 |
phaseLevelDescr phase ClusterLvl _ = |
150 |
printf "Cluster statistics %s\n" $ phaseDescr phase |
151 |
|
152 |
-- | Format a list of key, value as a shell fragment. |
153 |
printKeysHTC :: [(String, String)] -> IO () |
154 |
printKeysHTC = printKeys htcPrefix |
155 |
|
156 |
-- | Prepare string from boolean value. |
157 |
printBool :: Bool -- ^ Whether the result should be machine readable |
158 |
-> Bool -- ^ Value to be converted to string |
159 |
-> String |
160 |
printBool True True = "1" |
161 |
printBool True False = "0" |
162 |
printBool False b = show b |
163 |
|
164 |
-- | Print mapping from group idx to group uuid (only in machine |
165 |
-- readable mode). |
166 |
printGroupsMappings :: Group.List -> IO () |
167 |
printGroupsMappings gl = do |
168 |
let extract_vals g = (printf "GROUP_UUID_%d" $ Group.idx g :: String, |
169 |
Group.uuid g) |
170 |
printpairs = map extract_vals (Container.elems gl) |
171 |
printKeysHTC printpairs |
172 |
|
173 |
-- | Prepare a single key given a certain level and phase of simulation. |
174 |
prepareKey :: Level -> Phase -> Maybe String -> String -> String |
175 |
prepareKey level phase Nothing suffix = |
176 |
printf "%s_%s_%s" (phasePrefix phase) (levelPrefix level) suffix |
177 |
prepareKey level phase (Just idx) suffix = |
178 |
printf "%s_%s_%s_%s" (phasePrefix phase) (levelPrefix level) idx suffix |
179 |
|
180 |
-- | Print all the statistics for given level and phase. |
181 |
printStats :: Int -- ^ Verbosity level |
182 |
-> Bool -- ^ If the output should be machine readable |
183 |
-> Level -- ^ Level on which we are printing |
184 |
-> Phase -- ^ Current phase of simulation |
185 |
-> [String] -- ^ Values to print |
186 |
-> Maybe String -- ^ Additional data for groups |
187 |
-> IO () |
188 |
printStats _ True level phase values gidx = do |
189 |
let keys = map (prepareKey level phase gidx) (keysData level) |
190 |
printKeysHTC $ zip keys values |
191 |
|
192 |
printStats verbose False level phase values name = do |
193 |
let prefix = phaseLevelDescr phase level name |
194 |
descr = descrData level |
195 |
unless (verbose == 0) $ do |
196 |
putStrLn "" |
197 |
putStr prefix |
198 |
mapM_ (uncurry (printf " %s: %s\n")) (zip descr values) |
199 |
|
200 |
-- | Extract name or idx from group. |
201 |
extractGroupData :: Bool -> Group.Group -> String |
202 |
extractGroupData True grp = show $ Group.idx grp |
203 |
extractGroupData False grp = Group.name grp |
204 |
|
205 |
-- | Prepare values for group. |
206 |
prepareGroupValues :: [Int] -> Double -> [String] |
207 |
prepareGroupValues stats score = |
208 |
map show stats ++ [printf "%.8f" score] |
209 |
|
210 |
-- | Prepare values for cluster. |
211 |
prepareClusterValues :: Bool -> [Int] -> [Bool] -> [String] |
212 |
prepareClusterValues machineread stats bstats = |
213 |
map show stats ++ map (printBool machineread) bstats |
214 |
|
215 |
-- | Print all the statistics on a group level. |
216 |
printGroupStats :: Int -> Bool -> Phase -> GroupStats -> IO () |
217 |
printGroupStats verbose machineread phase ((grp, score), stats) = do |
218 |
let values = prepareGroupValues stats score |
219 |
extradata = extractGroupData machineread grp |
220 |
printStats verbose machineread GroupLvl phase values (Just extradata) |
221 |
|
222 |
-- | Print all the statistics on a cluster (global) level. |
223 |
printClusterStats :: Int -> Bool -> Phase -> [Int] -> Bool -> IO () |
224 |
printClusterStats verbose machineread phase stats needhbal = do |
225 |
let values = prepareClusterValues machineread stats [needhbal] |
226 |
printStats verbose machineread ClusterLvl phase values Nothing |
227 |
|
228 |
-- | Check if any of cluster metrics is non-zero. |
229 |
clusterNeedsRebalance :: [Int] -> Bool |
230 |
clusterNeedsRebalance stats = sum stats > 0 |
231 |
|
232 |
{- | Check group for N+1 hapiness, conflicts of primaries on nodes and |
233 |
instances residing on offline nodes. |
234 |
|
235 |
-} |
236 |
perGroupChecks :: Group.List -> GroupInfo -> GroupStats |
237 |
perGroupChecks gl (gidx, (nl, il)) = |
238 |
let grp = Container.find gidx gl |
239 |
offnl = filter Node.offline (Container.elems nl) |
240 |
n1violated = length . fst $ Cluster.computeBadItems nl il |
241 |
conflicttags = length $ filter (>0) |
242 |
(map Node.conflictingPrimaries (Container.elems nl)) |
243 |
offline_pri = sum . map length $ map Node.pList offnl |
244 |
offline_sec = length $ map Node.sList offnl |
245 |
score = Cluster.compCV nl |
246 |
groupstats = [ n1violated |
247 |
, conflicttags |
248 |
, offline_pri |
249 |
, offline_sec |
250 |
] |
251 |
in ((grp, score), groupstats) |
252 |
|
253 |
-- | Use Hbal's iterateDepth to simulate group rebalance. |
254 |
executeSimulation :: Options -> Cluster.Table -> Double |
255 |
-> Gdx -> Node.List -> Instance.List |
256 |
-> IO GroupInfo |
257 |
executeSimulation opts ini_tbl min_cv gidx nl il = do |
258 |
let imlen = maximum . map (length . Instance.alias) $ Container.elems il |
259 |
nmlen = maximum . map (length . Node.alias) $ Container.elems nl |
260 |
|
261 |
(fin_tbl, _) <- Hbal.iterateDepth False ini_tbl |
262 |
(optMaxLength opts) |
263 |
(optDiskMoves opts) |
264 |
(optInstMoves opts) |
265 |
nmlen imlen [] min_cv |
266 |
(optMinGainLim opts) (optMinGain opts) |
267 |
(optEvacMode opts) |
268 |
|
269 |
let (Cluster.Table fin_nl fin_il _ _) = fin_tbl |
270 |
return (gidx, (fin_nl, fin_il)) |
271 |
|
272 |
-- | Simulate group rebalance if group's score is not good |
273 |
maybeSimulateGroupRebalance :: Options -> GroupInfo -> IO GroupInfo |
274 |
maybeSimulateGroupRebalance opts (gidx, (nl, il)) = do |
275 |
let ini_cv = Cluster.compCV nl |
276 |
ini_tbl = Cluster.Table nl il ini_cv [] |
277 |
min_cv = optMinScore opts |
278 |
if ini_cv < min_cv |
279 |
then return (gidx, (nl, il)) |
280 |
else executeSimulation opts ini_tbl min_cv gidx nl il |
281 |
|
282 |
-- | Decide whether to simulate rebalance. |
283 |
maybeSimulateRebalance :: Bool -- ^ Whether to simulate rebalance |
284 |
-> Options -- ^ Command line options |
285 |
-> [GroupInfo] -- ^ Group data |
286 |
-> IO [GroupInfo] |
287 |
maybeSimulateRebalance True opts cluster = |
288 |
mapM (maybeSimulateGroupRebalance opts) cluster |
289 |
maybeSimulateRebalance False _ cluster = return cluster |
290 |
|
291 |
-- | Prints the final @OK@ marker in machine readable output. |
292 |
printFinalHTC :: Bool -> IO () |
293 |
printFinalHTC = printFinal htcPrefix |
294 |
|
295 |
-- | Main function. |
296 |
main :: Options -> [String] -> IO () |
297 |
main opts args = do |
298 |
unless (null args) $ exitErr "This program doesn't take any arguments." |
299 |
|
300 |
let verbose = optVerbose opts |
301 |
machineread = optMachineReadable opts |
302 |
nosimulation = optNoSimulation opts |
303 |
|
304 |
(ClusterData gl fixed_nl ilf _ _) <- loadExternalData opts |
305 |
nlf <- setNodeStatus opts fixed_nl |
306 |
|
307 |
let splitcluster = Cluster.splitCluster nlf ilf |
308 |
|
309 |
when machineread $ printGroupsMappings gl |
310 |
|
311 |
let groupsstats = map (perGroupChecks gl) splitcluster |
312 |
clusterstats = map sum . transpose . map snd $ groupsstats |
313 |
needrebalance = clusterNeedsRebalance clusterstats |
314 |
|
315 |
unless (verbose == 0 || machineread) $ |
316 |
if nosimulation |
317 |
then putStrLn "Running in no-simulation mode." |
318 |
else if needrebalance |
319 |
then putStrLn "Cluster needs rebalancing." |
320 |
else putStrLn "No need to rebalance cluster, no problems found." |
321 |
|
322 |
mapM_ (printGroupStats verbose machineread Initial) groupsstats |
323 |
|
324 |
printClusterStats verbose machineread Initial clusterstats needrebalance |
325 |
|
326 |
let exitOK = nosimulation || not needrebalance |
327 |
simulate = not nosimulation && needrebalance |
328 |
|
329 |
rebalancedcluster <- maybeSimulateRebalance simulate opts splitcluster |
330 |
|
331 |
when (simulate || machineread) $ do |
332 |
let newgroupstats = map (perGroupChecks gl) rebalancedcluster |
333 |
newclusterstats = map sum . transpose . map snd $ newgroupstats |
334 |
newneedrebalance = clusterNeedsRebalance clusterstats |
335 |
|
336 |
mapM_ (printGroupStats verbose machineread Rebalanced) newgroupstats |
337 |
|
338 |
printClusterStats verbose machineread Rebalanced newclusterstats |
339 |
newneedrebalance |
340 |
|
341 |
printFinalHTC machineread |
342 |
|
343 |
unless exitOK . exitWith $ ExitFailure 1 |