root / htools / Ganeti / HTools / Program / Hbal.hs @ e19ee6e4
History | View | Annotate | Download (16 kB)
1 | 525bfb36 | Iustin Pop | {-| Cluster rebalancer. |
---|---|---|---|
2 | e4f08c46 | Iustin Pop | |
3 | e4f08c46 | Iustin Pop | -} |
4 | e4f08c46 | Iustin Pop | |
5 | e2fa2baf | Iustin Pop | {- |
6 | e2fa2baf | Iustin Pop | |
7 | d5072e4c | Iustin Pop | Copyright (C) 2009, 2010, 2011 Google Inc. |
8 | e2fa2baf | Iustin Pop | |
9 | e2fa2baf | Iustin Pop | This program is free software; you can redistribute it and/or modify |
10 | e2fa2baf | Iustin Pop | it under the terms of the GNU General Public License as published by |
11 | e2fa2baf | Iustin Pop | the Free Software Foundation; either version 2 of the License, or |
12 | e2fa2baf | Iustin Pop | (at your option) any later version. |
13 | e2fa2baf | Iustin Pop | |
14 | e2fa2baf | Iustin Pop | This program is distributed in the hope that it will be useful, but |
15 | e2fa2baf | Iustin Pop | WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | e2fa2baf | Iustin Pop | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
17 | e2fa2baf | Iustin Pop | General Public License for more details. |
18 | e2fa2baf | Iustin Pop | |
19 | e2fa2baf | Iustin Pop | You should have received a copy of the GNU General Public License |
20 | e2fa2baf | Iustin Pop | along with this program; if not, write to the Free Software |
21 | e2fa2baf | Iustin Pop | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA |
22 | e2fa2baf | Iustin Pop | 02110-1301, USA. |
23 | e2fa2baf | Iustin Pop | |
24 | e2fa2baf | Iustin Pop | -} |
25 | e2fa2baf | Iustin Pop | |
26 | 201b6c34 | Iustin Pop | module Ganeti.HTools.Program.Hbal (main) where |
27 | e4f08c46 | Iustin Pop | |
28 | b2245847 | Iustin Pop | import Control.Concurrent (threadDelay) |
29 | b2245847 | Iustin Pop | import Control.Exception (bracket) |
30 | cc532bdd | Iustin Pop | import Control.Monad |
31 | e4f08c46 | Iustin Pop | import Data.List |
32 | 646aa028 | Iustin Pop | import Data.Maybe (isJust, isNothing, fromJust) |
33 | 03cb89f0 | Iustin Pop | import Data.IORef |
34 | 0903280b | Iustin Pop | import System (exitWith, ExitCode(..)) |
35 | e4f08c46 | Iustin Pop | import System.IO |
36 | 03cb89f0 | Iustin Pop | import System.Posix.Process |
37 | 03cb89f0 | Iustin Pop | import System.Posix.Signals |
38 | e4f08c46 | Iustin Pop | import qualified System |
39 | e4f08c46 | Iustin Pop | |
40 | 2795466b | Iustin Pop | import Text.Printf (printf, hPrintf) |
41 | e4f08c46 | Iustin Pop | |
42 | 669d7e3d | Iustin Pop | import qualified Ganeti.HTools.Container as Container |
43 | 669d7e3d | Iustin Pop | import qualified Ganeti.HTools.Cluster as Cluster |
44 | 10ef6b4e | Iustin Pop | import qualified Ganeti.HTools.Group as Group |
45 | ec18dca9 | Iustin Pop | import qualified Ganeti.HTools.Node as Node |
46 | b2245847 | Iustin Pop | import qualified Ganeti.HTools.Instance as Instance |
47 | 040afc35 | Iustin Pop | |
48 | 0427285d | Iustin Pop | import Ganeti.HTools.CLI |
49 | e8f89bb6 | Iustin Pop | import Ganeti.HTools.ExtLoader |
50 | 669d7e3d | Iustin Pop | import Ganeti.HTools.Utils |
51 | 0e8ae201 | Iustin Pop | import Ganeti.HTools.Types |
52 | 4938fa30 | Guido Trotter | import Ganeti.HTools.Loader |
53 | e4f08c46 | Iustin Pop | |
54 | b2245847 | Iustin Pop | import qualified Ganeti.Luxi as L |
55 | b2245847 | Iustin Pop | import Ganeti.Jobs |
56 | b2245847 | Iustin Pop | |
57 | 179c0828 | Iustin Pop | -- | Options list and functions. |
58 | 0427285d | Iustin Pop | options :: [OptType] |
59 | e4f08c46 | Iustin Pop | options = |
60 | 0427285d | Iustin Pop | [ oPrintNodes |
61 | 507fda3f | Iustin Pop | , oPrintInsts |
62 | 0427285d | Iustin Pop | , oPrintCommands |
63 | 16c2369c | Iustin Pop | , oDataFile |
64 | 2e28ac32 | Iustin Pop | , oEvacMode |
65 | 0427285d | Iustin Pop | , oRapiMaster |
66 | 0427285d | Iustin Pop | , oLuxiSocket |
67 | b2245847 | Iustin Pop | , oExecJobs |
68 | 646aa028 | Iustin Pop | , oGroup |
69 | 0427285d | Iustin Pop | , oMaxSolLength |
70 | 0427285d | Iustin Pop | , oVerbose |
71 | 0427285d | Iustin Pop | , oQuiet |
72 | 0427285d | Iustin Pop | , oOfflineNode |
73 | 0427285d | Iustin Pop | , oMinScore |
74 | 0427285d | Iustin Pop | , oMaxCpu |
75 | 0427285d | Iustin Pop | , oMinDisk |
76 | 848b65c9 | Iustin Pop | , oMinGain |
77 | 848b65c9 | Iustin Pop | , oMinGainLim |
78 | c0501c69 | Iustin Pop | , oDiskMoves |
79 | ddef0585 | Guido Trotter | , oSelInst |
80 | e7f7c003 | Guido Trotter | , oInstMoves |
81 | aa8d2e71 | Iustin Pop | , oDynuFile |
82 | 0f15cc76 | Iustin Pop | , oExTags |
83 | fcbf0da4 | Iustin Pop | , oExInst |
84 | 748654f7 | Iustin Pop | , oSaveCluster |
85 | 0427285d | Iustin Pop | , oShowVer |
86 | 0427285d | Iustin Pop | , oShowHelp |
87 | 7ef4d93e | Iustin Pop | ] |
88 | e4f08c46 | Iustin Pop | |
89 | 6dc960bc | Iustin Pop | {- | Start computing the solution at the given depth and recurse until |
90 | 6dc960bc | Iustin Pop | we find a valid solution or we exceed the maximum depth. |
91 | 6dc960bc | Iustin Pop | |
92 | 6dc960bc | Iustin Pop | -} |
93 | 6dc960bc | Iustin Pop | iterateDepth :: Cluster.Table -- ^ The starting table |
94 | 6dc960bc | Iustin Pop | -> Int -- ^ Remaining length |
95 | c0501c69 | Iustin Pop | -> Bool -- ^ Allow disk moves |
96 | e7f7c003 | Guido Trotter | -> Bool -- ^ Allow instance moves |
97 | 6dc960bc | Iustin Pop | -> Int -- ^ Max node name len |
98 | 6dc960bc | Iustin Pop | -> Int -- ^ Max instance name len |
99 | 0e8ae201 | Iustin Pop | -> [MoveJob] -- ^ Current command list |
100 | 92e32d76 | Iustin Pop | -> Score -- ^ Score at which to stop |
101 | 848b65c9 | Iustin Pop | -> Score -- ^ Min gain limit |
102 | 848b65c9 | Iustin Pop | -> Score -- ^ Min score gain |
103 | 2e28ac32 | Iustin Pop | -> Bool -- ^ Enable evacuation mode |
104 | 0e8ae201 | Iustin Pop | -> IO (Cluster.Table, [MoveJob]) -- ^ The resulting table |
105 | 0e8ae201 | Iustin Pop | -- and commands |
106 | e7f7c003 | Guido Trotter | iterateDepth ini_tbl max_rounds disk_moves inst_moves nmlen imlen |
107 | e19ee6e4 | Iustin Pop | cmd_strs min_score mg_limit min_gain evac_mode = |
108 | f25e5aac | Iustin Pop | let Cluster.Table ini_nl ini_il _ _ = ini_tbl |
109 | 5ad86777 | Iustin Pop | allowed_next = Cluster.doNextBalance ini_tbl max_rounds min_score |
110 | 5ad86777 | Iustin Pop | m_fin_tbl = if allowed_next |
111 | e08424a8 | Guido Trotter | then Cluster.tryBalance ini_tbl disk_moves inst_moves |
112 | e08424a8 | Guido Trotter | evac_mode mg_limit min_gain |
113 | 5ad86777 | Iustin Pop | else Nothing |
114 | 6dc960bc | Iustin Pop | in |
115 | f25e5aac | Iustin Pop | case m_fin_tbl of |
116 | f25e5aac | Iustin Pop | Just fin_tbl -> |
117 | f25e5aac | Iustin Pop | do |
118 | f25e5aac | Iustin Pop | let |
119 | f25e5aac | Iustin Pop | (Cluster.Table _ _ _ fin_plc) = fin_tbl |
120 | f25e5aac | Iustin Pop | fin_plc_len = length fin_plc |
121 | 924f9c16 | Iustin Pop | cur_plc@(idx, _, _, move, _) = head fin_plc |
122 | f25e5aac | Iustin Pop | (sol_line, cmds) = Cluster.printSolutionLine ini_nl ini_il |
123 | 0e8ae201 | Iustin Pop | nmlen imlen cur_plc fin_plc_len |
124 | 0e8ae201 | Iustin Pop | afn = Cluster.involvedNodes ini_il cur_plc |
125 | 924f9c16 | Iustin Pop | upd_cmd_strs = (afn, idx, move, cmds):cmd_strs |
126 | e19ee6e4 | Iustin Pop | putStrLn sol_line |
127 | e19ee6e4 | Iustin Pop | hFlush stdout |
128 | e7f7c003 | Guido Trotter | iterateDepth fin_tbl max_rounds disk_moves inst_moves |
129 | e19ee6e4 | Iustin Pop | nmlen imlen upd_cmd_strs min_score |
130 | 848b65c9 | Iustin Pop | mg_limit min_gain evac_mode |
131 | f25e5aac | Iustin Pop | Nothing -> return (ini_tbl, cmd_strs) |
132 | 6dc960bc | Iustin Pop | |
133 | 5dad2589 | Iustin Pop | -- | Displays the cluster stats. |
134 | 5dad2589 | Iustin Pop | printStats :: Node.List -> Node.List -> IO () |
135 | 5dad2589 | Iustin Pop | printStats ini_nl fin_nl = do |
136 | 5dad2589 | Iustin Pop | let ini_cs = Cluster.totalResources ini_nl |
137 | 5dad2589 | Iustin Pop | fin_cs = Cluster.totalResources fin_nl |
138 | 5dad2589 | Iustin Pop | printf "Original: mem=%d disk=%d\n" |
139 | 5dad2589 | Iustin Pop | (Cluster.csFmem ini_cs) (Cluster.csFdsk ini_cs) :: IO () |
140 | 5dad2589 | Iustin Pop | printf "Final: mem=%d disk=%d\n" |
141 | 5dad2589 | Iustin Pop | (Cluster.csFmem fin_cs) (Cluster.csFdsk fin_cs) |
142 | 5dad2589 | Iustin Pop | |
143 | 5dad2589 | Iustin Pop | -- | Saves the rebalance commands to a text file. |
144 | 5dad2589 | Iustin Pop | saveBalanceCommands :: Options -> String -> IO () |
145 | 5dad2589 | Iustin Pop | saveBalanceCommands opts cmd_data = do |
146 | 5dad2589 | Iustin Pop | let out_path = fromJust $ optShowCmds opts |
147 | 5dad2589 | Iustin Pop | putStrLn "" |
148 | 5dad2589 | Iustin Pop | (if out_path == "-" then |
149 | 5dad2589 | Iustin Pop | printf "Commands to run to reach the above solution:\n%s" |
150 | 5dad2589 | Iustin Pop | (unlines . map (" " ++) . |
151 | 5dad2589 | Iustin Pop | filter (/= " check") . |
152 | 5dad2589 | Iustin Pop | lines $ cmd_data) |
153 | 5dad2589 | Iustin Pop | else do |
154 | 5dad2589 | Iustin Pop | writeFile out_path (shTemplate ++ cmd_data) |
155 | 5dad2589 | Iustin Pop | printf "The commands have been written to file '%s'\n" out_path) |
156 | 5dad2589 | Iustin Pop | |
157 | b2245847 | Iustin Pop | -- | Polls a set of jobs at a fixed interval until all are finished |
158 | 179c0828 | Iustin Pop | -- one way or another. |
159 | b2245847 | Iustin Pop | waitForJobs :: L.Client -> [String] -> IO (Result [JobStatus]) |
160 | b2245847 | Iustin Pop | waitForJobs client jids = do |
161 | b2245847 | Iustin Pop | sts <- L.queryJobsStatus client jids |
162 | b2245847 | Iustin Pop | case sts of |
163 | b2245847 | Iustin Pop | Bad x -> return $ Bad x |
164 | 7e98f782 | Iustin Pop | Ok s -> if any (<= JOB_STATUS_RUNNING) s |
165 | b2245847 | Iustin Pop | then do |
166 | b2245847 | Iustin Pop | -- TODO: replace hardcoded value with a better thing |
167 | b2245847 | Iustin Pop | threadDelay (1000000 * 15) |
168 | b2245847 | Iustin Pop | waitForJobs client jids |
169 | b2245847 | Iustin Pop | else return $ Ok s |
170 | b2245847 | Iustin Pop | |
171 | 179c0828 | Iustin Pop | -- | Check that a set of job statuses is all success. |
172 | b2245847 | Iustin Pop | checkJobsStatus :: [JobStatus] -> Bool |
173 | 7e98f782 | Iustin Pop | checkJobsStatus = all (== JOB_STATUS_SUCCESS) |
174 | b2245847 | Iustin Pop | |
175 | 179c0828 | Iustin Pop | -- | Wrapper over execJobSet checking for early termination. |
176 | 23448f82 | Iustin Pop | execWrapper :: String -> Node.List |
177 | 5dad2589 | Iustin Pop | -> Instance.List -> IORef Int -> [JobSet] -> IO Bool |
178 | 23448f82 | Iustin Pop | execWrapper _ _ _ _ [] = return True |
179 | 23448f82 | Iustin Pop | execWrapper master nl il cref alljss = do |
180 | 23448f82 | Iustin Pop | cancel <- readIORef cref |
181 | 23448f82 | Iustin Pop | (if cancel > 0 |
182 | 23448f82 | Iustin Pop | then do |
183 | 23448f82 | Iustin Pop | hPrintf stderr "Exiting early due to user request, %d\ |
184 | 23448f82 | Iustin Pop | \ jobset(s) remaining." (length alljss)::IO () |
185 | 23448f82 | Iustin Pop | return False |
186 | 23448f82 | Iustin Pop | else execJobSet master nl il cref alljss) |
187 | 23448f82 | Iustin Pop | |
188 | 179c0828 | Iustin Pop | -- | Execute an entire jobset. |
189 | 3e4480e0 | Iustin Pop | execJobSet :: String -> Node.List |
190 | 23448f82 | Iustin Pop | -> Instance.List -> IORef Int -> [JobSet] -> IO Bool |
191 | 23448f82 | Iustin Pop | execJobSet _ _ _ _ [] = return True |
192 | 23448f82 | Iustin Pop | execJobSet master nl il cref (js:jss) = do |
193 | b2245847 | Iustin Pop | -- map from jobset (htools list of positions) to [[opcodes]] |
194 | b2245847 | Iustin Pop | let jobs = map (\(_, idx, move, _) -> |
195 | 3e4480e0 | Iustin Pop | Cluster.iMoveToJob nl il idx move) js |
196 | b2245847 | Iustin Pop | let descr = map (\(_, idx, _, _) -> Container.nameOf il idx) js |
197 | b2245847 | Iustin Pop | putStrLn $ "Executing jobset for instances " ++ commaJoin descr |
198 | b2245847 | Iustin Pop | jrs <- bracket (L.getClient master) L.closeClient |
199 | b2245847 | Iustin Pop | (\client -> do |
200 | 683b1ca7 | Iustin Pop | jids <- L.submitManyJobs client jobs |
201 | b2245847 | Iustin Pop | case jids of |
202 | b2245847 | Iustin Pop | Bad x -> return $ Bad x |
203 | b2245847 | Iustin Pop | Ok x -> do |
204 | b2245847 | Iustin Pop | putStrLn $ "Got job IDs " ++ commaJoin x |
205 | b2245847 | Iustin Pop | waitForJobs client x |
206 | b2245847 | Iustin Pop | ) |
207 | b2245847 | Iustin Pop | (case jrs of |
208 | b2245847 | Iustin Pop | Bad x -> do |
209 | b2245847 | Iustin Pop | hPutStrLn stderr $ "Cannot compute job status, aborting: " ++ show x |
210 | 23448f82 | Iustin Pop | return False |
211 | b2245847 | Iustin Pop | Ok x -> if checkJobsStatus x |
212 | 23448f82 | Iustin Pop | then execWrapper master nl il cref jss |
213 | b2245847 | Iustin Pop | else do |
214 | b2245847 | Iustin Pop | hPutStrLn stderr $ "Not all jobs completed successfully: " ++ |
215 | b2245847 | Iustin Pop | show x |
216 | 23448f82 | Iustin Pop | hPutStrLn stderr "Aborting." |
217 | 23448f82 | Iustin Pop | return False) |
218 | b2245847 | Iustin Pop | |
219 | 5dad2589 | Iustin Pop | -- | Executes the jobs, if possible and desired. |
220 | 5dad2589 | Iustin Pop | maybeExecJobs :: Options |
221 | 5dad2589 | Iustin Pop | -> [a] |
222 | 5dad2589 | Iustin Pop | -> Node.List |
223 | 5dad2589 | Iustin Pop | -> Instance.List |
224 | 5dad2589 | Iustin Pop | -> [JobSet] |
225 | 5dad2589 | Iustin Pop | -> IO Bool |
226 | 5dad2589 | Iustin Pop | maybeExecJobs opts ord_plc fin_nl il cmd_jobs = |
227 | 5dad2589 | Iustin Pop | if optExecJobs opts && not (null ord_plc) |
228 | 5dad2589 | Iustin Pop | then (case optLuxi opts of |
229 | 5dad2589 | Iustin Pop | Nothing -> do |
230 | 5dad2589 | Iustin Pop | hPutStrLn stderr "Execution of commands possible only on LUXI" |
231 | 5dad2589 | Iustin Pop | return False |
232 | 5dad2589 | Iustin Pop | Just master -> runJobSet master fin_nl il cmd_jobs) |
233 | 5dad2589 | Iustin Pop | else return True |
234 | 5dad2589 | Iustin Pop | |
235 | 179c0828 | Iustin Pop | -- | Signal handler for graceful termination. |
236 | 543e859d | Iustin Pop | hangleSigInt :: IORef Int -> IO () |
237 | 543e859d | Iustin Pop | hangleSigInt cref = do |
238 | 543e859d | Iustin Pop | writeIORef cref 1 |
239 | 543e859d | Iustin Pop | putStrLn ("Cancel request registered, will exit at" ++ |
240 | 543e859d | Iustin Pop | " the end of the current job set...") |
241 | 543e859d | Iustin Pop | |
242 | 179c0828 | Iustin Pop | -- | Signal handler for immediate termination. |
243 | 543e859d | Iustin Pop | hangleSigTerm :: IORef Int -> IO () |
244 | 543e859d | Iustin Pop | hangleSigTerm cref = do |
245 | 543e859d | Iustin Pop | -- update the cref to 2, just for consistency |
246 | 543e859d | Iustin Pop | writeIORef cref 2 |
247 | 543e859d | Iustin Pop | putStrLn "Double cancel request, exiting now..." |
248 | 543e859d | Iustin Pop | exitImmediately $ ExitFailure 2 |
249 | 03cb89f0 | Iustin Pop | |
250 | 179c0828 | Iustin Pop | -- | Runs a job set with handling of signals. |
251 | 23448f82 | Iustin Pop | runJobSet :: String -> Node.List -> Instance.List -> [JobSet] -> IO Bool |
252 | 03cb89f0 | Iustin Pop | runJobSet master fin_nl il cmd_jobs = do |
253 | 03cb89f0 | Iustin Pop | cref <- newIORef 0 |
254 | 543e859d | Iustin Pop | mapM_ (\(hnd, sig) -> installHandler sig (Catch (hnd cref)) Nothing) |
255 | 543e859d | Iustin Pop | [(hangleSigTerm, softwareTermination), (hangleSigInt, keyboardSignal)] |
256 | 23448f82 | Iustin Pop | execWrapper master fin_nl il cref cmd_jobs |
257 | 03cb89f0 | Iustin Pop | |
258 | 5dad2589 | Iustin Pop | -- | Set node properties based on command line options. |
259 | 5dad2589 | Iustin Pop | setNodesStatus :: Options -> Node.List -> IO Node.List |
260 | 5dad2589 | Iustin Pop | setNodesStatus opts fixed_nl = do |
261 | 4938fa30 | Guido Trotter | let offline_passed = optOffline opts |
262 | db1bcfe8 | Iustin Pop | all_nodes = Container.elems fixed_nl |
263 | 4938fa30 | Guido Trotter | offline_lkp = map (lookupName (map Node.name all_nodes)) offline_passed |
264 | 4938fa30 | Guido Trotter | offline_wrong = filter (not . goodLookupResult) offline_lkp |
265 | 4938fa30 | Guido Trotter | offline_names = map lrContent offline_lkp |
266 | db1bcfe8 | Iustin Pop | offline_indices = map Node.idx $ |
267 | 4938fa30 | Guido Trotter | filter (\n -> Node.name n `elem` offline_names) |
268 | db1bcfe8 | Iustin Pop | all_nodes |
269 | 66d67ad4 | Iustin Pop | m_cpu = optMcpu opts |
270 | 66d67ad4 | Iustin Pop | m_dsk = optMdsk opts |
271 | ec18dca9 | Iustin Pop | |
272 | 4938fa30 | Guido Trotter | when (not (null offline_wrong)) $ do |
273 | 4938fa30 | Guido Trotter | hPrintf stderr "Error: Wrong node name(s) set as offline: %s\n" |
274 | 4938fa30 | Guido Trotter | (commaJoin (map lrContent offline_wrong)) :: IO () |
275 | 3d7cd10b | Iustin Pop | exitWith $ ExitFailure 1 |
276 | 3d7cd10b | Iustin Pop | |
277 | 5182e970 | Iustin Pop | let nm = Container.map (\n -> if Node.idx n `elem` offline_indices |
278 | ec18dca9 | Iustin Pop | then Node.setOffline n True |
279 | a1c6212e | Iustin Pop | else n) fixed_nl |
280 | 646aa028 | Iustin Pop | nlf = Container.map (flip Node.setMdsk m_dsk . flip Node.setMcpu m_cpu) |
281 | 646aa028 | Iustin Pop | nm |
282 | 5dad2589 | Iustin Pop | return nlf |
283 | a30b2f5b | Iustin Pop | |
284 | 5dad2589 | Iustin Pop | -- | Select the target node group. |
285 | 5dad2589 | Iustin Pop | selectGroup :: Options -> Group.List -> Node.List -> Instance.List |
286 | 5dad2589 | Iustin Pop | -> IO (String, (Node.List, Instance.List)) |
287 | 5dad2589 | Iustin Pop | selectGroup opts gl nlf ilf = do |
288 | 646aa028 | Iustin Pop | let ngroups = Cluster.splitCluster nlf ilf |
289 | 646aa028 | Iustin Pop | when (length ngroups > 1 && isNothing (optGroup opts)) $ do |
290 | 646aa028 | Iustin Pop | hPutStrLn stderr "Found multiple node groups:" |
291 | e0c85e08 | Iustin Pop | mapM_ (hPutStrLn stderr . (" " ++) . Group.name . |
292 | d5072e4c | Iustin Pop | flip Container.find gl . fst) ngroups |
293 | 646aa028 | Iustin Pop | hPutStrLn stderr "Aborting." |
294 | 646aa028 | Iustin Pop | exitWith $ ExitFailure 1 |
295 | 646aa028 | Iustin Pop | |
296 | 5dad2589 | Iustin Pop | case optGroup opts of |
297 | 10ef6b4e | Iustin Pop | Nothing -> do |
298 | 10ef6b4e | Iustin Pop | let (gidx, cdata) = head ngroups |
299 | 10ef6b4e | Iustin Pop | grp = Container.find gidx gl |
300 | 10ef6b4e | Iustin Pop | return (Group.name grp, cdata) |
301 | 10ef6b4e | Iustin Pop | Just g -> case Container.findByName gl g of |
302 | 646aa028 | Iustin Pop | Nothing -> do |
303 | 646aa028 | Iustin Pop | hPutStrLn stderr $ "Node group " ++ g ++ |
304 | 646aa028 | Iustin Pop | " not found. Node group list is:" |
305 | 10ef6b4e | Iustin Pop | mapM_ (hPutStrLn stderr . (" " ++) . Group.name ) (Container.elems gl) |
306 | 646aa028 | Iustin Pop | hPutStrLn stderr "Aborting." |
307 | 646aa028 | Iustin Pop | exitWith $ ExitFailure 1 |
308 | 10ef6b4e | Iustin Pop | Just grp -> |
309 | 10ef6b4e | Iustin Pop | case lookup (Group.idx grp) ngroups of |
310 | 10ef6b4e | Iustin Pop | Nothing -> do |
311 | 10ef6b4e | Iustin Pop | -- TODO: while this is unlikely to happen, log here the |
312 | 10ef6b4e | Iustin Pop | -- actual group data to help debugging |
313 | d5072e4c | Iustin Pop | hPutStrLn stderr "Internal failure, missing group idx" |
314 | 10ef6b4e | Iustin Pop | exitWith $ ExitFailure 1 |
315 | 10ef6b4e | Iustin Pop | Just cdata -> return (Group.name grp, cdata) |
316 | 646aa028 | Iustin Pop | |
317 | 5dad2589 | Iustin Pop | -- | Do a few checks on the cluster data. |
318 | e19ee6e4 | Iustin Pop | checkCluster :: Int -> Node.List -> Instance.List -> IO () |
319 | e19ee6e4 | Iustin Pop | checkCluster verbose nl il = do |
320 | 5dad2589 | Iustin Pop | -- nothing to do on an empty cluster |
321 | 5dad2589 | Iustin Pop | when (Container.null il) $ do |
322 | e19ee6e4 | Iustin Pop | printf "Cluster is empty, exiting.\n"::IO () |
323 | 5dad2589 | Iustin Pop | exitWith ExitSuccess |
324 | 5dad2589 | Iustin Pop | |
325 | 5dad2589 | Iustin Pop | -- hbal doesn't currently handle split clusters |
326 | 5dad2589 | Iustin Pop | let split_insts = Cluster.findSplitInstances nl il |
327 | 5dad2589 | Iustin Pop | unless (null split_insts) $ do |
328 | 5dad2589 | Iustin Pop | hPutStrLn stderr "Found instances belonging to multiple node groups:" |
329 | 5dad2589 | Iustin Pop | mapM_ (\i -> hPutStrLn stderr $ " " ++ Instance.name i) split_insts |
330 | 5dad2589 | Iustin Pop | hPutStrLn stderr "Aborting." |
331 | 5dad2589 | Iustin Pop | exitWith $ ExitFailure 1 |
332 | 5dad2589 | Iustin Pop | |
333 | e19ee6e4 | Iustin Pop | printf "Loaded %d nodes, %d instances\n" |
334 | 5dad2589 | Iustin Pop | (Container.size nl) |
335 | e19ee6e4 | Iustin Pop | (Container.size il)::IO () |
336 | 5dad2589 | Iustin Pop | |
337 | 5dad2589 | Iustin Pop | let csf = commonSuffix nl il |
338 | e19ee6e4 | Iustin Pop | when (not (null csf) && verbose > 1) $ |
339 | 5dad2589 | Iustin Pop | printf "Note: Stripping common suffix of '%s' from names\n" csf |
340 | 5dad2589 | Iustin Pop | |
341 | 5dad2589 | Iustin Pop | -- | Do a few checks on the selected group data. |
342 | e19ee6e4 | Iustin Pop | checkGroup :: Int -> String -> Node.List -> Instance.List -> IO () |
343 | e19ee6e4 | Iustin Pop | checkGroup verbose gname nl il = do |
344 | e19ee6e4 | Iustin Pop | printf "Group size %d nodes, %d instances\n" |
345 | e4f08c46 | Iustin Pop | (Container.size nl) |
346 | e19ee6e4 | Iustin Pop | (Container.size il)::IO () |
347 | a0529a64 | Iustin Pop | |
348 | 10ef6b4e | Iustin Pop | putStrLn $ "Selected node group: " ++ gname |
349 | 646aa028 | Iustin Pop | |
350 | e4f08c46 | Iustin Pop | let (bad_nodes, bad_instances) = Cluster.computeBadItems nl il |
351 | e19ee6e4 | Iustin Pop | unless (verbose == 0) $ printf |
352 | 27f96567 | Iustin Pop | "Initial check done: %d bad nodes, %d bad instances.\n" |
353 | e4f08c46 | Iustin Pop | (length bad_nodes) (length bad_instances) |
354 | e4f08c46 | Iustin Pop | |
355 | 9f6dcdea | Iustin Pop | when (length bad_nodes > 0) $ |
356 | 289c3835 | Iustin Pop | putStrLn "Cluster is not N+1 happy, continuing but no guarantee \ |
357 | 289c3835 | Iustin Pop | \that the cluster will end N+1 happy." |
358 | e4f08c46 | Iustin Pop | |
359 | 5dad2589 | Iustin Pop | -- | Check that we actually need to rebalance. |
360 | 5dad2589 | Iustin Pop | checkNeedRebalance :: Options -> Score -> IO () |
361 | 5dad2589 | Iustin Pop | checkNeedRebalance opts ini_cv = do |
362 | 5dad2589 | Iustin Pop | let min_cv = optMinScore opts |
363 | 5dad2589 | Iustin Pop | when (ini_cv < min_cv) $ do |
364 | e19ee6e4 | Iustin Pop | printf "Cluster is already well balanced (initial score %.6g,\n\ |
365 | e19ee6e4 | Iustin Pop | \minimum score %.6g).\nNothing to do, exiting\n" |
366 | e19ee6e4 | Iustin Pop | ini_cv min_cv:: IO () |
367 | 5dad2589 | Iustin Pop | exitWith ExitSuccess |
368 | 5dad2589 | Iustin Pop | |
369 | 5dad2589 | Iustin Pop | -- | Main function. |
370 | 5dad2589 | Iustin Pop | main :: IO () |
371 | 5dad2589 | Iustin Pop | main = do |
372 | 5dad2589 | Iustin Pop | cmd_args <- System.getArgs |
373 | 5dad2589 | Iustin Pop | (opts, args) <- parseOpts cmd_args "hbal" options |
374 | 5dad2589 | Iustin Pop | |
375 | 5dad2589 | Iustin Pop | unless (null args) $ do |
376 | 5dad2589 | Iustin Pop | hPutStrLn stderr "Error: this program doesn't take any arguments." |
377 | 5dad2589 | Iustin Pop | exitWith $ ExitFailure 1 |
378 | 5dad2589 | Iustin Pop | |
379 | e19ee6e4 | Iustin Pop | let verbose = optVerbose opts |
380 | 5dad2589 | Iustin Pop | shownodes = optShowNodes opts |
381 | 5dad2589 | Iustin Pop | showinsts = optShowInsts opts |
382 | 5dad2589 | Iustin Pop | |
383 | 5dad2589 | Iustin Pop | ini_cdata@(ClusterData gl fixed_nl ilf ctags) <- loadExternalData opts |
384 | 5dad2589 | Iustin Pop | |
385 | e19ee6e4 | Iustin Pop | when (verbose > 1) $ |
386 | 5dad2589 | Iustin Pop | putStrLn $ "Loaded cluster tags: " ++ intercalate "," ctags |
387 | 5dad2589 | Iustin Pop | |
388 | 5dad2589 | Iustin Pop | nlf <- setNodesStatus opts fixed_nl |
389 | e19ee6e4 | Iustin Pop | checkCluster verbose nlf ilf |
390 | 5dad2589 | Iustin Pop | |
391 | 5dad2589 | Iustin Pop | maybeSaveData (optSaveCluster opts) "original" "before balancing" ini_cdata |
392 | 5dad2589 | Iustin Pop | |
393 | 5dad2589 | Iustin Pop | (gname, (nl, il)) <- selectGroup opts gl nlf ilf |
394 | 5dad2589 | Iustin Pop | |
395 | e19ee6e4 | Iustin Pop | checkGroup verbose gname nl il |
396 | 5dad2589 | Iustin Pop | |
397 | 33e44f0c | Iustin Pop | maybePrintInsts showinsts "Initial" (Cluster.printInsts nl il) |
398 | 507fda3f | Iustin Pop | |
399 | 417f6b50 | Iustin Pop | maybePrintNodes shownodes "Initial cluster" (Cluster.printNodes nl) |
400 | e4f08c46 | Iustin Pop | |
401 | e4f08c46 | Iustin Pop | let ini_cv = Cluster.compCV nl |
402 | e4f08c46 | Iustin Pop | ini_tbl = Cluster.Table nl il ini_cv [] |
403 | b0517d61 | Iustin Pop | min_cv = optMinScore opts |
404 | b0517d61 | Iustin Pop | |
405 | 5dad2589 | Iustin Pop | checkNeedRebalance opts ini_cv |
406 | b0517d61 | Iustin Pop | |
407 | e19ee6e4 | Iustin Pop | (if verbose > 2 |
408 | e19ee6e4 | Iustin Pop | then printf "Initial coefficients: overall %.8f, %s\n" |
409 | e19ee6e4 | Iustin Pop | ini_cv (Cluster.printStats nl)::IO () |
410 | e19ee6e4 | Iustin Pop | else printf "Initial score: %.8f\n" ini_cv) |
411 | e4f08c46 | Iustin Pop | |
412 | e19ee6e4 | Iustin Pop | putStrLn "Trying to minimize the CV..." |
413 | 14c972c7 | Iustin Pop | let imlen = maximum . map (length . Instance.alias) $ Container.elems il |
414 | 14c972c7 | Iustin Pop | nmlen = maximum . map (length . Node.alias) $ Container.elems nl |
415 | 7dfaafb1 | Iustin Pop | |
416 | 7dfaafb1 | Iustin Pop | (fin_tbl, cmd_strs) <- iterateDepth ini_tbl (optMaxLength opts) |
417 | c0501c69 | Iustin Pop | (optDiskMoves opts) |
418 | e7f7c003 | Guido Trotter | (optInstMoves opts) |
419 | e19ee6e4 | Iustin Pop | nmlen imlen [] min_cv |
420 | 848b65c9 | Iustin Pop | (optMinGainLim opts) (optMinGain opts) |
421 | 848b65c9 | Iustin Pop | (optEvacMode opts) |
422 | 507fda3f | Iustin Pop | let (Cluster.Table fin_nl fin_il fin_cv fin_plc) = fin_tbl |
423 | e4f08c46 | Iustin Pop | ord_plc = reverse fin_plc |
424 | cc532bdd | Iustin Pop | sol_msg = case () of |
425 | cc532bdd | Iustin Pop | _ | null fin_plc -> printf "No solution found\n" |
426 | cc532bdd | Iustin Pop | | verbose > 2 -> |
427 | cc532bdd | Iustin Pop | printf "Final coefficients: overall %.8f, %s\n" |
428 | cc532bdd | Iustin Pop | fin_cv (Cluster.printStats fin_nl) |
429 | cc532bdd | Iustin Pop | | otherwise -> |
430 | cc532bdd | Iustin Pop | printf "Cluster score improved from %.8f to %.8f\n" |
431 | cc532bdd | Iustin Pop | ini_cv fin_cv ::String |
432 | e4f08c46 | Iustin Pop | |
433 | e19ee6e4 | Iustin Pop | putStr sol_msg |
434 | 7eff5b09 | Iustin Pop | |
435 | e19ee6e4 | Iustin Pop | unless (verbose == 0) $ |
436 | 7eff5b09 | Iustin Pop | printf "Solution length=%d\n" (length ord_plc) |
437 | e4f08c46 | Iustin Pop | |
438 | b2245847 | Iustin Pop | let cmd_jobs = Cluster.splitJobs cmd_strs |
439 | e0eb63f0 | Iustin Pop | |
440 | e0eb63f0 | Iustin Pop | when (isJust $ optShowCmds opts) $ |
441 | 5dad2589 | Iustin Pop | saveBalanceCommands opts $ Cluster.formatCmds cmd_jobs |
442 | e0eb63f0 | Iustin Pop | |
443 | 4188449c | Iustin Pop | maybeSaveData (optSaveCluster opts) "balanced" "after balancing" |
444 | 4188449c | Iustin Pop | (ClusterData gl fin_nl fin_il ctags) |
445 | b2245847 | Iustin Pop | |
446 | 33e44f0c | Iustin Pop | maybePrintInsts showinsts "Final" (Cluster.printInsts fin_nl fin_il) |
447 | 507fda3f | Iustin Pop | |
448 | 417f6b50 | Iustin Pop | maybePrintNodes shownodes "Final cluster" (Cluster.printNodes fin_nl) |
449 | 417f6b50 | Iustin Pop | |
450 | 5dad2589 | Iustin Pop | when (verbose > 3) $ printStats nl fin_nl |
451 | 5dad2589 | Iustin Pop | |
452 | 5dad2589 | Iustin Pop | eval <- maybeExecJobs opts ord_plc fin_nl il cmd_jobs |
453 | d5072e4c | Iustin Pop | unless eval (exitWith (ExitFailure 1)) |