X-Git-Url: https://code.grnet.gr/git/ganeti-local/blobdiff_plain/0903280b99d4b5a4a8e723876d99ca2117fe6c51..f4c7d37aa3d8a81bb403381abdfa9d213271fb9e:/hbal.hs diff --git a/hbal.hs b/hbal.hs index 635fc3c..a83b7fc 100644 --- a/hbal.hs +++ b/hbal.hs @@ -4,7 +4,7 @@ {- -Copyright (C) 2009 Google Inc. +Copyright (C) 2009, 2010 Google Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -28,17 +28,20 @@ module Main (main) where import Control.Concurrent (threadDelay) import Control.Exception (bracket) import Data.List -import Data.Maybe (isJust, fromJust) +import Data.Maybe (isJust, isNothing, fromJust) +import Data.IORef import Monad import System (exitWith, ExitCode(..)) import System.IO +import System.Posix.Process +import System.Posix.Signals import qualified System import Text.Printf (printf, hPrintf) -import Text.JSON (showJSON) import qualified Ganeti.HTools.Container as Container import qualified Ganeti.HTools.Cluster as Cluster +import qualified Ganeti.HTools.Group as Group import qualified Ganeti.HTools.Node as Node import qualified Ganeti.HTools.Instance as Instance @@ -47,8 +50,9 @@ import Ganeti.HTools.ExtLoader import Ganeti.HTools.Utils import Ganeti.HTools.Types +import Ganeti.HTools.Text (serializeCluster) + import qualified Ganeti.Luxi as L -import qualified Ganeti.OpCodes as OpCodes import Ganeti.Jobs -- | Options list and functions @@ -63,6 +67,7 @@ options = , oRapiMaster , oLuxiSocket , oExecJobs + , oGroup , oMaxSolLength , oVerbose , oQuiet @@ -70,10 +75,13 @@ options = , oMinScore , oMaxCpu , oMinDisk + , oMinGain + , oMinGainLim , oDiskMoves , oDynuFile , oExTags , oExInst + , oSaveCluster , oShowVer , oShowHelp ] @@ -90,15 +98,18 @@ iterateDepth :: Cluster.Table -- ^ The starting table -> [MoveJob] -- ^ Current command list -> Bool -- ^ Whether to be silent -> Score -- ^ Score at which to stop + -> Score -- ^ Min gain limit + -> Score -- ^ Min score gain -> Bool -- ^ Enable evacuation mode -> IO (Cluster.Table, [MoveJob]) -- ^ The resulting table -- and commands iterateDepth ini_tbl max_rounds disk_moves nmlen imlen - cmd_strs oneline min_score evac_mode = + cmd_strs oneline min_score mg_limit min_gain evac_mode = let Cluster.Table ini_nl ini_il _ _ = ini_tbl allowed_next = Cluster.doNextBalance ini_tbl max_rounds min_score m_fin_tbl = if allowed_next then Cluster.tryBalance ini_tbl disk_moves evac_mode + mg_limit min_gain else Nothing in case m_fin_tbl of @@ -117,7 +128,7 @@ iterateDepth ini_tbl max_rounds disk_moves nmlen imlen hFlush stdout iterateDepth fin_tbl max_rounds disk_moves nmlen imlen upd_cmd_strs oneline min_score - evac_mode + mg_limit min_gain evac_mode Nothing -> return (ini_tbl, cmd_strs) -- | Formats the solution for the oneline display @@ -126,10 +137,6 @@ formatOneline ini_cv plc_len fin_cv = printf "%.8f %d %.8f %8.3f" ini_cv plc_len fin_cv (if fin_cv == 0 then 1 else ini_cv / fin_cv) --- | Submits a list of jobs and waits for all to finish execution -execJobs :: L.Client -> [[OpCodes.OpCode]] -> IO (Result [String]) -execJobs client = L.submitManyJobs client . showJSON - -- | Polls a set of jobs at a fixed interval until all are finished -- one way or another waitForJobs :: L.Client -> [String] -> IO (Result [JobStatus]) @@ -137,7 +144,7 @@ waitForJobs client jids = do sts <- L.queryJobsStatus client jids case sts of Bad x -> return $ Bad x - Ok s -> if any (<= JobRunning) s + Ok s -> if any (<= JOB_STATUS_RUNNING) s then do -- TODO: replace hardcoded value with a better thing threadDelay (1000000 * 15) @@ -146,21 +153,27 @@ waitForJobs client jids = do -- | Check that a set of job statuses is all success checkJobsStatus :: [JobStatus] -> Bool -checkJobsStatus = all (== JobSuccess) +checkJobsStatus = all (== JOB_STATUS_SUCCESS) -- | Execute an entire jobset -execJobSet :: String -> String -> Node.List - -> Instance.List -> [JobSet] -> IO () -execJobSet _ _ _ _ [] = return () -execJobSet master csf nl il (js:jss) = do +execJobSet :: String -> Node.List + -> Instance.List -> IORef Int -> [JobSet] -> IO () +execJobSet _ _ _ _ [] = return () +execJobSet master nl il cref alljss@(js:jss) = do -- map from jobset (htools list of positions) to [[opcodes]] + cancel <- readIORef cref + when (cancel > 0) $ do + putStrLn ("Exiting early due to user request, " ++ show (length alljss) ++ + " jobset(s) remaining.") + exitWith $ ExitFailure 1 + let jobs = map (\(_, idx, move, _) -> - Cluster.iMoveToJob csf nl il idx move) js + Cluster.iMoveToJob nl il idx move) js let descr = map (\(_, idx, _, _) -> Container.nameOf il idx) js putStrLn $ "Executing jobset for instances " ++ commaJoin descr jrs <- bracket (L.getClient master) L.closeClient (\client -> do - jids <- execJobs client jobs + jids <- L.submitManyJobs client jobs case jids of Bad x -> return $ Bad x Ok x -> do @@ -172,12 +185,34 @@ execJobSet master csf nl il (js:jss) = do hPutStrLn stderr $ "Cannot compute job status, aborting: " ++ show x return () Ok x -> if checkJobsStatus x - then execJobSet master csf nl il jss + then execJobSet master nl il cref jss else do hPutStrLn stderr $ "Not all jobs completed successfully: " ++ show x hPutStrLn stderr "Aborting.") +-- | Signal handler for graceful termination +hangleSigInt :: IORef Int -> IO () +hangleSigInt cref = do + writeIORef cref 1 + putStrLn ("Cancel request registered, will exit at" ++ + " the end of the current job set...") + +-- | Signal handler for immediate termination +hangleSigTerm :: IORef Int -> IO () +hangleSigTerm cref = do + -- update the cref to 2, just for consistency + writeIORef cref 2 + putStrLn "Double cancel request, exiting now..." + exitImmediately $ ExitFailure 2 + +runJobSet :: String -> Node.List -> Instance.List -> [JobSet] -> IO () +runJobSet master fin_nl il cmd_jobs = do + cref <- newIORef 0 + mapM_ (\(hnd, sig) -> installHandler sig (Catch (hnd cref)) Nothing) + [(hangleSigTerm, softwareTermination), (hangleSigInt, keyboardSignal)] + execJobSet master fin_nl il cref cmd_jobs + -- | Main function. main :: IO () main = do @@ -192,41 +227,86 @@ main = do verbose = optVerbose opts shownodes = optShowNodes opts - (fixed_nl, il, ctags, csf) <- loadExternalData opts + (gl, fixed_nl, ilf, ctags) <- loadExternalData opts let offline_names = optOffline opts all_nodes = Container.elems fixed_nl - all_names = map Node.name all_nodes - offline_wrong = filter (flip notElem all_names) offline_names + all_names = concatMap allNames all_nodes + offline_wrong = filter (`notElem` all_names) offline_names offline_indices = map Node.idx $ - filter (\n -> elem (Node.name n) offline_names) + filter (\n -> + Node.name n `elem` offline_names || + Node.alias n `elem` offline_names) all_nodes m_cpu = optMcpu opts m_dsk = optMdsk opts + csf = commonSuffix fixed_nl ilf when (length offline_wrong > 0) $ do hPrintf stderr "Wrong node name(s) set as offline: %s\n" - (commaJoin offline_wrong) + (commaJoin offline_wrong) :: IO () exitWith $ ExitFailure 1 - let nm = Container.map (\n -> if elem (Node.idx n) offline_indices + let nm = Container.map (\n -> if Node.idx n `elem` offline_indices then Node.setOffline n True else n) fixed_nl - nl = Container.map (flip Node.setMdsk m_dsk . flip Node.setMcpu m_cpu) - nm + nlf = Container.map (flip Node.setMdsk m_dsk . flip Node.setMcpu m_cpu) + nm when (not oneline && verbose > 1) $ putStrLn $ "Loaded cluster tags: " ++ intercalate "," ctags - when (Container.size il == 0) $ do + when (Container.size ilf == 0) $ do (if oneline then putStrLn $ formatOneline 0 0 0 else printf "Cluster is empty, exiting.\n") exitWith ExitSuccess + let split_insts = Cluster.findSplitInstances nlf ilf + when (not . null $ split_insts) $ do + hPutStrLn stderr "Found instances belonging to multiple node groups:" + mapM_ (\i -> hPutStrLn stderr $ " " ++ Instance.name i) split_insts + hPutStrLn stderr "Aborting." + exitWith $ ExitFailure 1 + + let ngroups = Cluster.splitCluster nlf ilf + when (length ngroups > 1 && isNothing (optGroup opts)) $ do + hPutStrLn stderr "Found multiple node groups:" + mapM_ (hPutStrLn stderr . (" " ++) . Group.name . + (flip Container.find gl) . fst) ngroups + hPutStrLn stderr "Aborting." + exitWith $ ExitFailure 1 + unless oneline $ printf "Loaded %d nodes, %d instances\n" + (Container.size nlf) + (Container.size ilf) + + (gname, (nl, il)) <- case optGroup opts of + Nothing -> do + let (gidx, cdata) = head ngroups + grp = Container.find gidx gl + return (Group.name grp, cdata) + Just g -> case Container.findByName gl g of + Nothing -> do + hPutStrLn stderr $ "Node group " ++ g ++ + " not found. Node group list is:" + mapM_ (hPutStrLn stderr . (" " ++) . Group.name ) (Container.elems gl) + hPutStrLn stderr "Aborting." + exitWith $ ExitFailure 1 + Just grp -> + case lookup (Group.idx grp) ngroups of + Nothing -> do + -- TODO: while this is unlikely to happen, log here the + -- actual group data to help debugging + hPutStrLn stderr $ "Internal failure, missing group idx" + exitWith $ ExitFailure 1 + Just cdata -> return (Group.name grp, cdata) + + unless oneline $ printf "Group size %d nodes, %d instances\n" (Container.size nl) (Container.size il) + putStrLn $ "Selected node group: " ++ gname + when (length csf > 0 && not oneline && verbose > 1) $ printf "Note: Stripping common suffix of '%s' from names\n" csf @@ -268,12 +348,14 @@ main = do printf "Initial score: %.8f\n" ini_cv) unless oneline $ putStrLn "Trying to minimize the CV..." - let imlen = Container.maxNameLen il - nmlen = Container.maxNameLen nl + let imlen = maximum . map (length . Instance.alias) $ Container.elems il + nmlen = maximum . map (length . Node.alias) $ Container.elems nl (fin_tbl, cmd_strs) <- iterateDepth ini_tbl (optMaxLength opts) (optDiskMoves opts) - nmlen imlen [] oneline min_cv (optEvacMode opts) + nmlen imlen [] oneline min_cv + (optMinGainLim opts) (optMinGain opts) + (optEvacMode opts) let (Cluster.Table fin_nl fin_il fin_cv fin_plc) = fin_tbl ord_plc = reverse fin_plc sol_msg = if null fin_plc @@ -306,12 +388,12 @@ main = do writeFile out_path (shTemplate ++ cmd_data) printf "The commands have been written to file '%s'\n" out_path) - when (optExecJobs opts && not (null ord_plc)) - (case optLuxi opts of - Nothing -> do - hPutStrLn stderr "Execution of commands possible only on LUXI" - exitWith $ ExitFailure 1 - Just master -> execJobSet master csf fin_nl il cmd_jobs) + when (isJust $ optSaveCluster opts) $ + do + let out_path = fromJust $ optSaveCluster opts + adata = serializeCluster gl fin_nl fin_il ctags + writeFile out_path adata + printf "The cluster state has been written to file '%s'\n" out_path when (optShowInsts opts) $ do putStrLn "" @@ -328,8 +410,15 @@ main = do when (verbose > 3) $ do printf "Original: mem=%d disk=%d\n" - (Cluster.csFmem ini_cs) (Cluster.csFdsk ini_cs) + (Cluster.csFmem ini_cs) (Cluster.csFdsk ini_cs) :: IO () printf "Final: mem=%d disk=%d\n" (Cluster.csFmem fin_cs) (Cluster.csFdsk fin_cs) when oneline $ putStrLn $ formatOneline ini_cv (length ord_plc) fin_cv + + when (optExecJobs opts && not (null ord_plc)) + (case optLuxi opts of + Nothing -> do + hPutStrLn stderr "Execution of commands possible only on LUXI" + exitWith $ ExitFailure 1 + Just master -> runJobSet master fin_nl il cmd_jobs)