7 Copyright (C) 2009, 2010 Google Inc.
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 2 of the License, or
12 (at your option) any later version.
14 This program is distributed in the hope that it will be useful, but
15 WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26 module Main (main) where
28 import Control.Concurrent (threadDelay)
29 import Control.Exception (bracket)
31 import Data.Maybe (isJust, isNothing, fromJust)
34 import System (exitWith, ExitCode(..))
36 import System.Posix.Process
37 import System.Posix.Signals
38 import qualified System
40 import Text.Printf (printf, hPrintf)
42 import qualified Ganeti.HTools.Container as Container
43 import qualified Ganeti.HTools.Cluster as Cluster
44 import qualified Ganeti.HTools.Group as Group
45 import qualified Ganeti.HTools.Node as Node
46 import qualified Ganeti.HTools.Instance as Instance
48 import Ganeti.HTools.CLI
49 import Ganeti.HTools.ExtLoader
50 import Ganeti.HTools.Utils
51 import Ganeti.HTools.Types
53 import Ganeti.HTools.Text (serializeCluster)
55 import qualified Ganeti.Luxi as L
58 -- | Options list and functions
89 {- | Start computing the solution at the given depth and recurse until
90 we find a valid solution or we exceed the maximum depth.
93 iterateDepth :: Cluster.Table -- ^ The starting table
94 -> Int -- ^ Remaining length
95 -> Bool -- ^ Allow disk moves
96 -> Int -- ^ Max node name len
97 -> Int -- ^ Max instance name len
98 -> [MoveJob] -- ^ Current command list
99 -> Bool -- ^ Whether to be silent
100 -> Score -- ^ Score at which to stop
101 -> Score -- ^ Min gain limit
102 -> Score -- ^ Min score gain
103 -> Bool -- ^ Enable evacuation mode
104 -> IO (Cluster.Table, [MoveJob]) -- ^ The resulting table
106 iterateDepth ini_tbl max_rounds disk_moves nmlen imlen
107 cmd_strs oneline min_score mg_limit min_gain evac_mode =
108 let Cluster.Table ini_nl ini_il _ _ = ini_tbl
109 allowed_next = Cluster.doNextBalance ini_tbl max_rounds min_score
110 m_fin_tbl = if allowed_next
111 then Cluster.tryBalance ini_tbl disk_moves evac_mode
119 (Cluster.Table _ _ _ fin_plc) = fin_tbl
120 fin_plc_len = length fin_plc
121 cur_plc@(idx, _, _, move, _) = head fin_plc
122 (sol_line, cmds) = Cluster.printSolutionLine ini_nl ini_il
123 nmlen imlen cur_plc fin_plc_len
124 afn = Cluster.involvedNodes ini_il cur_plc
125 upd_cmd_strs = (afn, idx, move, cmds):cmd_strs
129 iterateDepth fin_tbl max_rounds disk_moves
130 nmlen imlen upd_cmd_strs oneline min_score
131 mg_limit min_gain evac_mode
132 Nothing -> return (ini_tbl, cmd_strs)
134 -- | Formats the solution for the oneline display
135 formatOneline :: Double -> Int -> Double -> String
136 formatOneline ini_cv plc_len fin_cv =
137 printf "%.8f %d %.8f %8.3f" ini_cv plc_len fin_cv
138 (if fin_cv == 0 then 1 else ini_cv / fin_cv)
140 -- | Polls a set of jobs at a fixed interval until all are finished
141 -- one way or another
142 waitForJobs :: L.Client -> [String] -> IO (Result [JobStatus])
143 waitForJobs client jids = do
144 sts <- L.queryJobsStatus client jids
146 Bad x -> return $ Bad x
147 Ok s -> if any (<= JOB_STATUS_RUNNING) s
149 -- TODO: replace hardcoded value with a better thing
150 threadDelay (1000000 * 15)
151 waitForJobs client jids
154 -- | Check that a set of job statuses is all success
155 checkJobsStatus :: [JobStatus] -> Bool
156 checkJobsStatus = all (== JOB_STATUS_SUCCESS)
158 -- | Execute an entire jobset
159 execJobSet :: String -> Node.List
160 -> Instance.List -> IORef Int -> [JobSet] -> IO ()
161 execJobSet _ _ _ _ [] = return ()
162 execJobSet master nl il cref alljss@(js:jss) = do
163 -- map from jobset (htools list of positions) to [[opcodes]]
164 cancel <- readIORef cref
165 when (cancel > 0) $ do
166 putStrLn ("Exiting early due to user request, " ++ show (length alljss) ++
167 " jobset(s) remaining.")
168 exitWith $ ExitFailure 1
170 let jobs = map (\(_, idx, move, _) ->
171 Cluster.iMoveToJob nl il idx move) js
172 let descr = map (\(_, idx, _, _) -> Container.nameOf il idx) js
173 putStrLn $ "Executing jobset for instances " ++ commaJoin descr
174 jrs <- bracket (L.getClient master) L.closeClient
176 jids <- L.submitManyJobs client jobs
178 Bad x -> return $ Bad x
180 putStrLn $ "Got job IDs " ++ commaJoin x
185 hPutStrLn stderr $ "Cannot compute job status, aborting: " ++ show x
187 Ok x -> if checkJobsStatus x
188 then execJobSet master nl il cref jss
190 hPutStrLn stderr $ "Not all jobs completed successfully: " ++
192 hPutStrLn stderr "Aborting.")
194 -- | Signal handler for graceful termination
195 hangleSigInt :: IORef Int -> IO ()
196 hangleSigInt cref = do
198 putStrLn ("Cancel request registered, will exit at" ++
199 " the end of the current job set...")
201 -- | Signal handler for immediate termination
202 hangleSigTerm :: IORef Int -> IO ()
203 hangleSigTerm cref = do
204 -- update the cref to 2, just for consistency
206 putStrLn "Double cancel request, exiting now..."
207 exitImmediately $ ExitFailure 2
209 runJobSet :: String -> Node.List -> Instance.List -> [JobSet] -> IO ()
210 runJobSet master fin_nl il cmd_jobs = do
212 mapM_ (\(hnd, sig) -> installHandler sig (Catch (hnd cref)) Nothing)
213 [(hangleSigTerm, softwareTermination), (hangleSigInt, keyboardSignal)]
214 execJobSet master fin_nl il cref cmd_jobs
219 cmd_args <- System.getArgs
220 (opts, args) <- parseOpts cmd_args "hbal" options
222 unless (null args) $ do
223 hPutStrLn stderr "Error: this program doesn't take any arguments."
224 exitWith $ ExitFailure 1
226 let oneline = optOneline opts
227 verbose = optVerbose opts
228 shownodes = optShowNodes opts
230 (gl, fixed_nl, ilf, ctags) <- loadExternalData opts
232 let offline_names = optOffline opts
233 all_nodes = Container.elems fixed_nl
234 all_names = concatMap allNames all_nodes
235 offline_wrong = filter (`notElem` all_names) offline_names
236 offline_indices = map Node.idx $
238 Node.name n `elem` offline_names ||
239 Node.alias n `elem` offline_names)
243 csf = commonSuffix fixed_nl ilf
245 when (length offline_wrong > 0) $ do
246 hPrintf stderr "Wrong node name(s) set as offline: %s\n"
247 (commaJoin offline_wrong) :: IO ()
248 exitWith $ ExitFailure 1
250 let nm = Container.map (\n -> if Node.idx n `elem` offline_indices
251 then Node.setOffline n True
253 nlf = Container.map (flip Node.setMdsk m_dsk . flip Node.setMcpu m_cpu)
256 when (not oneline && verbose > 1) $
257 putStrLn $ "Loaded cluster tags: " ++ intercalate "," ctags
259 when (Container.size ilf == 0) $ do
260 (if oneline then putStrLn $ formatOneline 0 0 0
261 else printf "Cluster is empty, exiting.\n")
264 let split_insts = Cluster.findSplitInstances nlf ilf
265 when (not . null $ split_insts) $ do
266 hPutStrLn stderr "Found instances belonging to multiple node groups:"
267 mapM_ (\i -> hPutStrLn stderr $ " " ++ Instance.name i) split_insts
268 hPutStrLn stderr "Aborting."
269 exitWith $ ExitFailure 1
271 let ngroups = Cluster.splitCluster nlf ilf
272 when (length ngroups > 1 && isNothing (optGroup opts)) $ do
273 hPutStrLn stderr "Found multiple node groups:"
274 mapM_ (hPutStrLn stderr . (" " ++) . Group.name .
275 (flip Container.find gl) . fst) ngroups
276 hPutStrLn stderr "Aborting."
277 exitWith $ ExitFailure 1
279 unless oneline $ printf "Loaded %d nodes, %d instances\n"
283 (gname, (nl, il)) <- case optGroup opts of
285 let (gidx, cdata) = head ngroups
286 grp = Container.find gidx gl
287 return (Group.name grp, cdata)
288 Just g -> case Container.findByName gl g of
290 hPutStrLn stderr $ "Node group " ++ g ++
291 " not found. Node group list is:"
292 mapM_ (hPutStrLn stderr . (" " ++) . Group.name ) (Container.elems gl)
293 hPutStrLn stderr "Aborting."
294 exitWith $ ExitFailure 1
296 case lookup (Group.idx grp) ngroups of
298 -- TODO: while this is unlikely to happen, log here the
299 -- actual group data to help debugging
300 hPutStrLn stderr $ "Internal failure, missing group idx"
301 exitWith $ ExitFailure 1
302 Just cdata -> return (Group.name grp, cdata)
304 unless oneline $ printf "Group size %d nodes, %d instances\n"
308 putStrLn $ "Selected node group: " ++ gname
310 when (length csf > 0 && not oneline && verbose > 1) $
311 printf "Note: Stripping common suffix of '%s' from names\n" csf
313 let (bad_nodes, bad_instances) = Cluster.computeBadItems nl il
314 unless (oneline || verbose == 0) $ printf
315 "Initial check done: %d bad nodes, %d bad instances.\n"
316 (length bad_nodes) (length bad_instances)
318 when (length bad_nodes > 0) $
319 putStrLn "Cluster is not N+1 happy, continuing but no guarantee \
320 \that the cluster will end N+1 happy."
322 when (optShowInsts opts) $ do
324 putStrLn "Initial instance map:"
325 putStrLn $ Cluster.printInsts nl il
327 when (isJust shownodes) $
329 putStrLn "Initial cluster status:"
330 putStrLn $ Cluster.printNodes nl (fromJust shownodes)
332 let ini_cv = Cluster.compCV nl
333 ini_tbl = Cluster.Table nl il ini_cv []
334 min_cv = optMinScore opts
336 when (ini_cv < min_cv) $ do
338 putStrLn $ formatOneline ini_cv 0 ini_cv
339 else printf "Cluster is already well balanced (initial score %.6g,\n\
340 \minimum score %.6g).\nNothing to do, exiting\n"
344 unless oneline (if verbose > 2 then
345 printf "Initial coefficients: overall %.8f, %s\n"
346 ini_cv (Cluster.printStats nl)
348 printf "Initial score: %.8f\n" ini_cv)
350 unless oneline $ putStrLn "Trying to minimize the CV..."
351 let imlen = maximum . map (length . Instance.alias) $ Container.elems il
352 nmlen = maximum . map (length . Node.alias) $ Container.elems nl
354 (fin_tbl, cmd_strs) <- iterateDepth ini_tbl (optMaxLength opts)
356 nmlen imlen [] oneline min_cv
357 (optMinGainLim opts) (optMinGain opts)
359 let (Cluster.Table fin_nl fin_il fin_cv fin_plc) = fin_tbl
360 ord_plc = reverse fin_plc
361 sol_msg = if null fin_plc
362 then printf "No solution found\n"
364 then printf "Final coefficients: overall %.8f, %s\n"
365 fin_cv (Cluster.printStats fin_nl)
366 else printf "Cluster score improved from %.8f to %.8f\n"
370 unless oneline $ putStr sol_msg
372 unless (oneline || verbose == 0) $
373 printf "Solution length=%d\n" (length ord_plc)
375 let cmd_jobs = Cluster.splitJobs cmd_strs
376 cmd_data = Cluster.formatCmds cmd_jobs
378 when (isJust $ optShowCmds opts) $
380 let out_path = fromJust $ optShowCmds opts
382 (if out_path == "-" then
383 printf "Commands to run to reach the above solution:\n%s"
384 (unlines . map (" " ++) .
385 filter (/= " check") .
388 writeFile out_path (shTemplate ++ cmd_data)
389 printf "The commands have been written to file '%s'\n" out_path)
391 when (isJust $ optSaveCluster opts) $
393 let out_path = fromJust $ optSaveCluster opts
394 adata = serializeCluster gl fin_nl fin_il
395 writeFile out_path adata
396 printf "The cluster state has been written to file '%s'\n" out_path
398 when (optShowInsts opts) $ do
400 putStrLn "Final instance map:"
401 putStr $ Cluster.printInsts fin_nl fin_il
403 when (isJust shownodes) $
405 let ini_cs = Cluster.totalResources nl
406 fin_cs = Cluster.totalResources fin_nl
408 putStrLn "Final cluster status:"
409 putStrLn $ Cluster.printNodes fin_nl (fromJust shownodes)
412 printf "Original: mem=%d disk=%d\n"
413 (Cluster.csFmem ini_cs) (Cluster.csFdsk ini_cs) :: IO ()
414 printf "Final: mem=%d disk=%d\n"
415 (Cluster.csFmem fin_cs) (Cluster.csFdsk fin_cs)
417 putStrLn $ formatOneline ini_cv (length ord_plc) fin_cv
419 when (optExecJobs opts && not (null ord_plc))
420 (case optLuxi opts of
422 hPutStrLn stderr "Execution of commands possible only on LUXI"
423 exitWith $ ExitFailure 1
424 Just master -> runJobSet master fin_nl il cmd_jobs)