root / hbal.hs @ 040afc35
History | View | Annotate | Download (11.9 kB)
1 |
{-| Solver for N+1 cluster errors |
---|---|
2 |
|
3 |
-} |
4 |
|
5 |
module Main (main) where |
6 |
|
7 |
import Data.List |
8 |
import Data.Function |
9 |
import Data.Maybe (isJust, fromJust, fromMaybe) |
10 |
import Monad |
11 |
import System |
12 |
import System.IO |
13 |
import System.Console.GetOpt |
14 |
import qualified System |
15 |
|
16 |
import Text.Printf (printf) |
17 |
|
18 |
import qualified Ganeti.HTools.Container as Container |
19 |
import qualified Ganeti.HTools.Cluster as Cluster |
20 |
import qualified Ganeti.HTools.Node as Node |
21 |
import qualified Ganeti.HTools.CLI as CLI |
22 |
import qualified Ganeti.HTools.Rapi as Rapi |
23 |
import qualified Ganeti.HTools.Text as Text |
24 |
|
25 |
import Ganeti.HTools.Utils |
26 |
|
27 |
-- | Command line options structure. |
28 |
data Options = Options |
29 |
{ optShowNodes :: Bool -- ^ Whether to show node status |
30 |
, optShowCmds :: Maybe FilePath -- ^ Whether to show the command list |
31 |
, optOneline :: Bool -- ^ Switch output to a single line |
32 |
, optNodef :: FilePath -- ^ Path to the nodes file |
33 |
, optNodeSet :: Bool -- ^ The nodes have been set by options |
34 |
, optInstf :: FilePath -- ^ Path to the instances file |
35 |
, optInstSet :: Bool -- ^ The insts have been set by options |
36 |
, optMaxLength :: Int -- ^ Stop after this many steps |
37 |
, optMaster :: String -- ^ Collect data from RAPI |
38 |
, optVerbose :: Int -- ^ Verbosity level |
39 |
, optOffline :: [String] -- ^ Names of offline nodes |
40 |
, optMinScore :: Cluster.Score -- ^ The minimum score we aim for |
41 |
, optShowVer :: Bool -- ^ Just show the program version |
42 |
, optShowHelp :: Bool -- ^ Just show the help |
43 |
} deriving Show |
44 |
|
45 |
-- | Default values for the command line options. |
46 |
defaultOptions :: Options |
47 |
defaultOptions = Options |
48 |
{ optShowNodes = False |
49 |
, optShowCmds = Nothing |
50 |
, optOneline = False |
51 |
, optNodef = "nodes" |
52 |
, optNodeSet = False |
53 |
, optInstf = "instances" |
54 |
, optInstSet = False |
55 |
, optMaxLength = -1 |
56 |
, optMaster = "" |
57 |
, optVerbose = 1 |
58 |
, optOffline = [] |
59 |
, optMinScore = 1e-9 |
60 |
, optShowVer = False |
61 |
, optShowHelp = False |
62 |
} |
63 |
|
64 |
-- | Options list and functions |
65 |
options :: [OptDescr (Options -> Options)] |
66 |
options = |
67 |
[ Option ['p'] ["print-nodes"] |
68 |
(NoArg (\ opts -> opts { optShowNodes = True })) |
69 |
"print the final node list" |
70 |
, Option ['C'] ["print-commands"] |
71 |
(OptArg ((\ f opts -> opts { optShowCmds = Just f }) . fromMaybe "-") |
72 |
"FILE") |
73 |
"print the ganeti command list for reaching the solution,\ |
74 |
\if an argument is passed then write the commands to a file named\ |
75 |
\ as such" |
76 |
, Option ['o'] ["oneline"] |
77 |
(NoArg (\ opts -> opts { optOneline = True })) |
78 |
"print the ganeti command list for reaching the solution" |
79 |
, Option ['n'] ["nodes"] |
80 |
(ReqArg (\ f opts -> opts { optNodef = f, optNodeSet = True }) "FILE") |
81 |
"the node list FILE" |
82 |
, Option ['i'] ["instances"] |
83 |
(ReqArg (\ f opts -> opts { optInstf = f, optInstSet = True }) "FILE") |
84 |
"the instance list FILE" |
85 |
, Option ['m'] ["master"] |
86 |
(ReqArg (\ m opts -> opts { optMaster = m }) "ADDRESS") |
87 |
"collect data via RAPI at the given ADDRESS" |
88 |
, Option ['l'] ["max-length"] |
89 |
(ReqArg (\ i opts -> opts { optMaxLength = (read i)::Int }) "N") |
90 |
"cap the solution at this many moves (useful for very unbalanced \ |
91 |
\clusters)" |
92 |
, Option ['v'] ["verbose"] |
93 |
(NoArg (\ opts -> opts { optVerbose = (optVerbose opts) + 1 })) |
94 |
"increase the verbosity level" |
95 |
, Option ['q'] ["quiet"] |
96 |
(NoArg (\ opts -> opts { optVerbose = (optVerbose opts) - 1 })) |
97 |
"decrease the verbosity level" |
98 |
, Option ['O'] ["offline"] |
99 |
(ReqArg (\ n opts -> opts { optOffline = n:optOffline opts }) "NODE") |
100 |
" set node as offline" |
101 |
, Option ['e'] ["min-score"] |
102 |
(ReqArg (\ e opts -> opts { optMinScore = read e }) "EPSILON") |
103 |
" mininum score to aim for" |
104 |
, Option ['V'] ["version"] |
105 |
(NoArg (\ opts -> opts { optShowVer = True})) |
106 |
"show the version of the program" |
107 |
, Option ['h'] ["help"] |
108 |
(NoArg (\ opts -> opts { optShowHelp = True})) |
109 |
"show help" |
110 |
] |
111 |
|
112 |
{- | Start computing the solution at the given depth and recurse until |
113 |
we find a valid solution or we exceed the maximum depth. |
114 |
|
115 |
-} |
116 |
iterateDepth :: Cluster.Table -- ^ The starting table |
117 |
-> Int -- ^ Remaining length |
118 |
-> Cluster.NameList -- ^ Node idx to name list |
119 |
-> Cluster.NameList -- ^ Inst idx to name list |
120 |
-> Int -- ^ Max node name len |
121 |
-> Int -- ^ Max instance name len |
122 |
-> [[String]] -- ^ Current command list |
123 |
-> Bool -- ^ Wheter to be silent |
124 |
-> Cluster.Score -- ^ Score at which to stop |
125 |
-> IO (Cluster.Table, [[String]]) -- ^ The resulting table and |
126 |
-- commands |
127 |
iterateDepth ini_tbl max_rounds ktn kti nmlen imlen |
128 |
cmd_strs oneline min_score = |
129 |
let Cluster.Table ini_nl ini_il ini_cv ini_plc = ini_tbl |
130 |
all_inst = Container.elems ini_il |
131 |
node_idx = map Node.idx . filter (not . Node.offline) $ |
132 |
Container.elems ini_nl |
133 |
fin_tbl = Cluster.checkMove node_idx ini_tbl all_inst |
134 |
(Cluster.Table _ _ fin_cv fin_plc) = fin_tbl |
135 |
ini_plc_len = length ini_plc |
136 |
fin_plc_len = length fin_plc |
137 |
allowed_next = (max_rounds < 0 || length fin_plc < max_rounds) |
138 |
in |
139 |
do |
140 |
let |
141 |
(sol_line, cmds) = Cluster.printSolutionLine ini_il ktn kti |
142 |
nmlen imlen (head fin_plc) fin_plc_len |
143 |
upd_cmd_strs = cmds:cmd_strs |
144 |
unless (oneline || fin_plc_len == ini_plc_len) $ do |
145 |
putStrLn sol_line |
146 |
hFlush stdout |
147 |
(if fin_cv < ini_cv then -- this round made success, try deeper |
148 |
if allowed_next && fin_cv > min_score |
149 |
then iterateDepth fin_tbl max_rounds ktn kti |
150 |
nmlen imlen upd_cmd_strs oneline min_score |
151 |
-- don't go deeper, but return the better solution |
152 |
else return (fin_tbl, upd_cmd_strs) |
153 |
else |
154 |
return (ini_tbl, cmd_strs)) |
155 |
|
156 |
-- | Formats the solution for the oneline display |
157 |
formatOneline :: Double -> Int -> Double -> String |
158 |
formatOneline ini_cv plc_len fin_cv = |
159 |
printf "%.8f %d %.8f %8.3f" ini_cv plc_len fin_cv |
160 |
(if fin_cv == 0 then 1 else (ini_cv / fin_cv)) |
161 |
|
162 |
-- | Main function. |
163 |
main :: IO () |
164 |
main = do |
165 |
cmd_args <- System.getArgs |
166 |
(opts, args) <- CLI.parseOpts cmd_args "hbal" options |
167 |
defaultOptions optShowHelp |
168 |
|
169 |
unless (null args) $ do |
170 |
hPutStrLn stderr "Error: this program doesn't take any arguments." |
171 |
exitWith $ ExitFailure 1 |
172 |
|
173 |
when (optShowVer opts) $ do |
174 |
putStr $ CLI.showVersion "hbal" |
175 |
exitWith ExitSuccess |
176 |
|
177 |
(env_node, env_inst) <- CLI.parseEnv () |
178 |
let nodef = if optNodeSet opts then optNodef opts |
179 |
else env_node |
180 |
instf = if optInstSet opts then optInstf opts |
181 |
else env_inst |
182 |
oneline = optOneline opts |
183 |
verbose = optVerbose opts |
184 |
input_data <- |
185 |
case optMaster opts of |
186 |
"" -> Text.loadData nodef instf |
187 |
host -> Rapi.loadData host |
188 |
|
189 |
let ldresult = input_data >> Cluster.loadData |
190 |
|
191 |
(loaded_nl, il, csf, ktn, kti) <- |
192 |
(case ldresult of |
193 |
Ok x -> return x |
194 |
Bad s -> do |
195 |
printf "Error: failed to load data. Details:\n%s\n" s |
196 |
exitWith $ ExitFailure 1 |
197 |
) |
198 |
let (fix_msgs, fixed_nl) = Cluster.checkData loaded_nl il ktn kti |
199 |
|
200 |
unless (null fix_msgs || verbose == 0) $ do |
201 |
putStrLn "Warning: cluster has inconsistent data:" |
202 |
putStrLn . unlines . map (\s -> printf " - %s" s) $ fix_msgs |
203 |
|
204 |
let offline_names = optOffline opts |
205 |
all_names = snd . unzip $ ktn |
206 |
offline_wrong = filter (\n -> not $ elem n all_names) offline_names |
207 |
offline_indices = fst . unzip . |
208 |
filter (\(_, n) -> elem n offline_names) $ ktn |
209 |
|
210 |
when (length offline_wrong > 0) $ do |
211 |
printf "Wrong node name(s) set as offline: %s\n" |
212 |
(commaJoin offline_wrong) |
213 |
exitWith $ ExitFailure 1 |
214 |
|
215 |
let nl = Container.map (\n -> if elem (Node.idx n) offline_indices |
216 |
then Node.setOffline n True |
217 |
else n) fixed_nl |
218 |
|
219 |
when (Container.size il == 0) $ do |
220 |
(if oneline then |
221 |
putStrLn $ formatOneline 0 0 0 |
222 |
else |
223 |
printf "Cluster is empty, exiting.\n") |
224 |
exitWith ExitSuccess |
225 |
|
226 |
|
227 |
unless oneline $ printf "Loaded %d nodes, %d instances\n" |
228 |
(Container.size nl) |
229 |
(Container.size il) |
230 |
|
231 |
when (length csf > 0 && not oneline && verbose > 1) $ do |
232 |
printf "Note: Stripping common suffix of '%s' from names\n" csf |
233 |
|
234 |
let (bad_nodes, bad_instances) = Cluster.computeBadItems nl il |
235 |
unless (oneline || verbose == 0) $ printf |
236 |
"Initial check done: %d bad nodes, %d bad instances.\n" |
237 |
(length bad_nodes) (length bad_instances) |
238 |
|
239 |
when (length bad_nodes > 0) $ do |
240 |
putStrLn "Cluster is not N+1 happy, continuing but no guarantee \ |
241 |
\that the cluster will end N+1 happy." |
242 |
|
243 |
when (optShowNodes opts) $ |
244 |
do |
245 |
putStrLn "Initial cluster status:" |
246 |
putStrLn $ Cluster.printNodes ktn nl |
247 |
|
248 |
let ini_cv = Cluster.compCV nl |
249 |
ini_tbl = Cluster.Table nl il ini_cv [] |
250 |
min_cv = optMinScore opts |
251 |
|
252 |
when (ini_cv < min_cv) $ do |
253 |
(if oneline then |
254 |
putStrLn $ formatOneline ini_cv 0 ini_cv |
255 |
else printf "Cluster is already well balanced (initial score %.6g,\n\ |
256 |
\minimum score %.6g).\nNothing to do, exiting\n" |
257 |
ini_cv min_cv) |
258 |
exitWith ExitSuccess |
259 |
|
260 |
unless oneline (if verbose > 2 then |
261 |
printf "Initial coefficients: overall %.8f, %s\n" |
262 |
ini_cv (Cluster.printStats nl) |
263 |
else |
264 |
printf "Initial score: %.8f\n" ini_cv) |
265 |
|
266 |
unless oneline $ putStrLn "Trying to minimize the CV..." |
267 |
let mlen_fn = maximum . (map length) . snd . unzip |
268 |
imlen = mlen_fn kti |
269 |
nmlen = mlen_fn ktn |
270 |
|
271 |
(fin_tbl, cmd_strs) <- iterateDepth ini_tbl (optMaxLength opts) |
272 |
ktn kti nmlen imlen [] oneline min_cv |
273 |
let (Cluster.Table fin_nl _ fin_cv fin_plc) = fin_tbl |
274 |
ord_plc = reverse fin_plc |
275 |
sol_msg = if null fin_plc |
276 |
then printf "No solution found\n" |
277 |
else (if verbose > 2 |
278 |
then printf "Final coefficients: overall %.8f, %s\n" |
279 |
fin_cv (Cluster.printStats fin_nl) |
280 |
else printf "Cluster score improved from %.8f to %.8f\n" |
281 |
ini_cv fin_cv |
282 |
) |
283 |
|
284 |
unless oneline $ putStr sol_msg |
285 |
|
286 |
unless (oneline || verbose == 0) $ |
287 |
printf "Solution length=%d\n" (length ord_plc) |
288 |
|
289 |
let cmd_data = Cluster.formatCmds . reverse $ cmd_strs |
290 |
|
291 |
when (isJust $ optShowCmds opts) $ |
292 |
do |
293 |
let out_path = fromJust $ optShowCmds opts |
294 |
putStrLn "" |
295 |
(if out_path == "-" then |
296 |
printf "Commands to run to reach the above solution:\n%s" |
297 |
(unlines . map (" " ++) . |
298 |
filter (/= "check") . |
299 |
lines $ cmd_data) |
300 |
else do |
301 |
writeFile out_path (CLI.shTemplate ++ cmd_data) |
302 |
printf "The commands have been written to file '%s'\n" out_path) |
303 |
|
304 |
when (optShowNodes opts) $ |
305 |
do |
306 |
let (orig_mem, orig_disk) = Cluster.totalResources nl |
307 |
(final_mem, final_disk) = Cluster.totalResources fin_nl |
308 |
putStrLn "" |
309 |
putStrLn "Final cluster status:" |
310 |
putStrLn $ Cluster.printNodes ktn fin_nl |
311 |
when (verbose > 3) $ |
312 |
do |
313 |
printf "Original: mem=%d disk=%d\n" orig_mem orig_disk |
314 |
printf "Final: mem=%d disk=%d\n" final_mem final_disk |
315 |
when oneline $ |
316 |
putStrLn $ formatOneline ini_cv (length ord_plc) fin_cv |