7 Copyright (C) 2009, 2010 Google Inc.
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 2 of the License, or
12 (at your option) any later version.
14 This program is distributed in the hope that it will be useful, but
15 WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26 module Main (main) where
28 import Control.Concurrent (threadDelay)
29 import Control.Exception (bracket)
31 import Data.Maybe (isJust, fromJust)
34 import System (exitWith, ExitCode(..))
36 import System.Posix.Process
37 import System.Posix.Signals
38 import qualified System
40 import Text.Printf (printf, hPrintf)
42 import qualified Ganeti.HTools.Container as Container
43 import qualified Ganeti.HTools.Cluster as Cluster
44 import qualified Ganeti.HTools.Node as Node
45 import qualified Ganeti.HTools.Instance as Instance
47 import Ganeti.HTools.CLI
48 import Ganeti.HTools.ExtLoader
49 import Ganeti.HTools.Utils
50 import Ganeti.HTools.Types
52 import Ganeti.HTools.Text (serializeCluster)
54 import qualified Ganeti.Luxi as L
57 -- | Options list and functions
87 {- | Start computing the solution at the given depth and recurse until
88 we find a valid solution or we exceed the maximum depth.
91 iterateDepth :: Cluster.Table -- ^ The starting table
92 -> Int -- ^ Remaining length
93 -> Bool -- ^ Allow disk moves
94 -> Int -- ^ Max node name len
95 -> Int -- ^ Max instance name len
96 -> [MoveJob] -- ^ Current command list
97 -> Bool -- ^ Whether to be silent
98 -> Score -- ^ Score at which to stop
99 -> Score -- ^ Min gain limit
100 -> Score -- ^ Min score gain
101 -> Bool -- ^ Enable evacuation mode
102 -> IO (Cluster.Table, [MoveJob]) -- ^ The resulting table
104 iterateDepth ini_tbl max_rounds disk_moves nmlen imlen
105 cmd_strs oneline min_score mg_limit min_gain evac_mode =
106 let Cluster.Table ini_nl ini_il _ _ = ini_tbl
107 allowed_next = Cluster.doNextBalance ini_tbl max_rounds min_score
108 m_fin_tbl = if allowed_next
109 then Cluster.tryBalance ini_tbl disk_moves evac_mode
117 (Cluster.Table _ _ _ fin_plc) = fin_tbl
118 fin_plc_len = length fin_plc
119 cur_plc@(idx, _, _, move, _) = head fin_plc
120 (sol_line, cmds) = Cluster.printSolutionLine ini_nl ini_il
121 nmlen imlen cur_plc fin_plc_len
122 afn = Cluster.involvedNodes ini_il cur_plc
123 upd_cmd_strs = (afn, idx, move, cmds):cmd_strs
127 iterateDepth fin_tbl max_rounds disk_moves
128 nmlen imlen upd_cmd_strs oneline min_score
129 mg_limit min_gain evac_mode
130 Nothing -> return (ini_tbl, cmd_strs)
132 -- | Formats the solution for the oneline display
133 formatOneline :: Double -> Int -> Double -> String
134 formatOneline ini_cv plc_len fin_cv =
135 printf "%.8f %d %.8f %8.3f" ini_cv plc_len fin_cv
136 (if fin_cv == 0 then 1 else ini_cv / fin_cv)
138 -- | Polls a set of jobs at a fixed interval until all are finished
139 -- one way or another
140 waitForJobs :: L.Client -> [String] -> IO (Result [JobStatus])
141 waitForJobs client jids = do
142 sts <- L.queryJobsStatus client jids
144 Bad x -> return $ Bad x
145 Ok s -> if any (<= JOB_STATUS_RUNNING) s
147 -- TODO: replace hardcoded value with a better thing
148 threadDelay (1000000 * 15)
149 waitForJobs client jids
152 -- | Check that a set of job statuses is all success
153 checkJobsStatus :: [JobStatus] -> Bool
154 checkJobsStatus = all (== JOB_STATUS_SUCCESS)
156 -- | Execute an entire jobset
157 execJobSet :: String -> Node.List
158 -> Instance.List -> IORef Int -> [JobSet] -> IO ()
159 execJobSet _ _ _ _ [] = return ()
160 execJobSet master nl il cref alljss@(js:jss) = do
161 -- map from jobset (htools list of positions) to [[opcodes]]
162 cancel <- readIORef cref
163 when (cancel > 0) $ do
164 putStrLn ("Exiting early due to user request, " ++ show (length alljss) ++
165 " jobset(s) remaining.")
166 exitWith $ ExitFailure 1
168 let jobs = map (\(_, idx, move, _) ->
169 Cluster.iMoveToJob nl il idx move) js
170 let descr = map (\(_, idx, _, _) -> Container.nameOf il idx) js
171 putStrLn $ "Executing jobset for instances " ++ commaJoin descr
172 jrs <- bracket (L.getClient master) L.closeClient
174 jids <- L.submitManyJobs client jobs
176 Bad x -> return $ Bad x
178 putStrLn $ "Got job IDs " ++ commaJoin x
183 hPutStrLn stderr $ "Cannot compute job status, aborting: " ++ show x
185 Ok x -> if checkJobsStatus x
186 then execJobSet master nl il cref jss
188 hPutStrLn stderr $ "Not all jobs completed successfully: " ++
190 hPutStrLn stderr "Aborting.")
192 -- | Signal handler for graceful termination
193 hangleSigInt :: IORef Int -> IO ()
194 hangleSigInt cref = do
196 putStrLn ("Cancel request registered, will exit at" ++
197 " the end of the current job set...")
199 -- | Signal handler for immediate termination
200 hangleSigTerm :: IORef Int -> IO ()
201 hangleSigTerm cref = do
202 -- update the cref to 2, just for consistency
204 putStrLn "Double cancel request, exiting now..."
205 exitImmediately $ ExitFailure 2
207 runJobSet :: String -> Node.List -> Instance.List -> [JobSet] -> IO ()
208 runJobSet master fin_nl il cmd_jobs = do
210 mapM_ (\(hnd, sig) -> installHandler sig (Catch (hnd cref)) Nothing)
211 [(hangleSigTerm, softwareTermination), (hangleSigInt, keyboardSignal)]
212 execJobSet master fin_nl il cref cmd_jobs
217 cmd_args <- System.getArgs
218 (opts, args) <- parseOpts cmd_args "hbal" options
220 unless (null args) $ do
221 hPutStrLn stderr "Error: this program doesn't take any arguments."
222 exitWith $ ExitFailure 1
224 let oneline = optOneline opts
225 verbose = optVerbose opts
226 shownodes = optShowNodes opts
228 (fixed_nl, il, ctags) <- loadExternalData opts
230 let offline_names = optOffline opts
231 all_nodes = Container.elems fixed_nl
232 all_names = concatMap allNames all_nodes
233 offline_wrong = filter (`notElem` all_names) offline_names
234 offline_indices = map Node.idx $
236 Node.name n `elem` offline_names ||
237 Node.alias n `elem` offline_names)
241 csf = commonSuffix fixed_nl il
243 when (length offline_wrong > 0) $ do
244 hPrintf stderr "Wrong node name(s) set as offline: %s\n"
245 (commaJoin offline_wrong) :: IO ()
246 exitWith $ ExitFailure 1
248 let nm = Container.map (\n -> if Node.idx n `elem` offline_indices
249 then Node.setOffline n True
251 nl = Container.map (flip Node.setMdsk m_dsk . flip Node.setMcpu m_cpu)
254 when (not oneline && verbose > 1) $
255 putStrLn $ "Loaded cluster tags: " ++ intercalate "," ctags
257 when (Container.size il == 0) $ do
258 (if oneline then putStrLn $ formatOneline 0 0 0
259 else printf "Cluster is empty, exiting.\n")
262 unless oneline $ printf "Loaded %d nodes, %d instances\n"
266 when (length csf > 0 && not oneline && verbose > 1) $
267 printf "Note: Stripping common suffix of '%s' from names\n" csf
269 let (bad_nodes, bad_instances) = Cluster.computeBadItems nl il
270 unless (oneline || verbose == 0) $ printf
271 "Initial check done: %d bad nodes, %d bad instances.\n"
272 (length bad_nodes) (length bad_instances)
274 when (length bad_nodes > 0) $
275 putStrLn "Cluster is not N+1 happy, continuing but no guarantee \
276 \that the cluster will end N+1 happy."
278 when (optShowInsts opts) $ do
280 putStrLn "Initial instance map:"
281 putStrLn $ Cluster.printInsts nl il
283 when (isJust shownodes) $
285 putStrLn "Initial cluster status:"
286 putStrLn $ Cluster.printNodes nl (fromJust shownodes)
288 let ini_cv = Cluster.compCV nl
289 ini_tbl = Cluster.Table nl il ini_cv []
290 min_cv = optMinScore opts
292 when (ini_cv < min_cv) $ do
294 putStrLn $ formatOneline ini_cv 0 ini_cv
295 else printf "Cluster is already well balanced (initial score %.6g,\n\
296 \minimum score %.6g).\nNothing to do, exiting\n"
300 unless oneline (if verbose > 2 then
301 printf "Initial coefficients: overall %.8f, %s\n"
302 ini_cv (Cluster.printStats nl)
304 printf "Initial score: %.8f\n" ini_cv)
306 unless oneline $ putStrLn "Trying to minimize the CV..."
307 let imlen = maximum . map (length . Instance.alias) $ Container.elems il
308 nmlen = maximum . map (length . Node.alias) $ Container.elems nl
310 (fin_tbl, cmd_strs) <- iterateDepth ini_tbl (optMaxLength opts)
312 nmlen imlen [] oneline min_cv
313 (optMinGainLim opts) (optMinGain opts)
315 let (Cluster.Table fin_nl fin_il fin_cv fin_plc) = fin_tbl
316 ord_plc = reverse fin_plc
317 sol_msg = if null fin_plc
318 then printf "No solution found\n"
320 then printf "Final coefficients: overall %.8f, %s\n"
321 fin_cv (Cluster.printStats fin_nl)
322 else printf "Cluster score improved from %.8f to %.8f\n"
326 unless oneline $ putStr sol_msg
328 unless (oneline || verbose == 0) $
329 printf "Solution length=%d\n" (length ord_plc)
331 let cmd_jobs = Cluster.splitJobs cmd_strs
332 cmd_data = Cluster.formatCmds cmd_jobs
334 when (isJust $ optShowCmds opts) $
336 let out_path = fromJust $ optShowCmds opts
338 (if out_path == "-" then
339 printf "Commands to run to reach the above solution:\n%s"
340 (unlines . map (" " ++) .
341 filter (/= " check") .
344 writeFile out_path (shTemplate ++ cmd_data)
345 printf "The commands have been written to file '%s'\n" out_path)
347 when (isJust $ optSaveCluster opts) $
349 let out_path = fromJust $ optSaveCluster opts
350 adata = serializeCluster fin_nl fin_il
351 writeFile out_path adata
352 printf "The cluster state has been written to file '%s'\n" out_path
354 when (optShowInsts opts) $ do
356 putStrLn "Final instance map:"
357 putStr $ Cluster.printInsts fin_nl fin_il
359 when (isJust shownodes) $
361 let ini_cs = Cluster.totalResources nl
362 fin_cs = Cluster.totalResources fin_nl
364 putStrLn "Final cluster status:"
365 putStrLn $ Cluster.printNodes fin_nl (fromJust shownodes)
368 printf "Original: mem=%d disk=%d\n"
369 (Cluster.csFmem ini_cs) (Cluster.csFdsk ini_cs) :: IO ()
370 printf "Final: mem=%d disk=%d\n"
371 (Cluster.csFmem fin_cs) (Cluster.csFdsk fin_cs)
373 putStrLn $ formatOneline ini_cv (length ord_plc) fin_cv
375 when (optExecJobs opts && not (null ord_plc))
376 (case optLuxi opts of
378 hPutStrLn stderr "Execution of commands possible only on LUXI"
379 exitWith $ ExitFailure 1
380 Just master -> runJobSet master fin_nl il cmd_jobs)