X-Git-Url: https://code.grnet.gr/git/ganeti-local/blobdiff_plain/f9acea1081fbc252f4d99dbfe5d864e358312015..f4c7d37aa3d8a81bb403381abdfa9d213271fb9e:/hbal.hs diff --git a/hbal.hs b/hbal.hs index f32706c..a83b7fc 100644 --- a/hbal.hs +++ b/hbal.hs @@ -4,7 +4,7 @@ {- -Copyright (C) 2009 Google Inc. +Copyright (C) 2009, 2010 Google Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -28,16 +28,20 @@ module Main (main) where import Control.Concurrent (threadDelay) import Control.Exception (bracket) import Data.List -import Data.Maybe (isJust, fromJust) +import Data.Maybe (isJust, isNothing, fromJust) +import Data.IORef import Monad import System (exitWith, ExitCode(..)) import System.IO +import System.Posix.Process +import System.Posix.Signals import qualified System import Text.Printf (printf, hPrintf) import qualified Ganeti.HTools.Container as Container import qualified Ganeti.HTools.Cluster as Cluster +import qualified Ganeti.HTools.Group as Group import qualified Ganeti.HTools.Node as Node import qualified Ganeti.HTools.Instance as Instance @@ -46,6 +50,8 @@ import Ganeti.HTools.ExtLoader import Ganeti.HTools.Utils import Ganeti.HTools.Types +import Ganeti.HTools.Text (serializeCluster) + import qualified Ganeti.Luxi as L import Ganeti.Jobs @@ -61,6 +67,7 @@ options = , oRapiMaster , oLuxiSocket , oExecJobs + , oGroup , oMaxSolLength , oVerbose , oQuiet @@ -68,10 +75,13 @@ options = , oMinScore , oMaxCpu , oMinDisk + , oMinGain + , oMinGainLim , oDiskMoves , oDynuFile , oExTags , oExInst + , oSaveCluster , oShowVer , oShowHelp ] @@ -88,15 +98,18 @@ iterateDepth :: Cluster.Table -- ^ The starting table -> [MoveJob] -- ^ Current command list -> Bool -- ^ Whether to be silent -> Score -- ^ Score at which to stop + -> Score -- ^ Min gain limit + -> Score -- ^ Min score gain -> Bool -- ^ Enable evacuation mode -> IO (Cluster.Table, [MoveJob]) -- ^ The resulting table -- and commands iterateDepth ini_tbl max_rounds disk_moves nmlen imlen - cmd_strs oneline min_score evac_mode = + cmd_strs oneline min_score mg_limit min_gain evac_mode = let Cluster.Table ini_nl ini_il _ _ = ini_tbl allowed_next = Cluster.doNextBalance ini_tbl max_rounds min_score m_fin_tbl = if allowed_next then Cluster.tryBalance ini_tbl disk_moves evac_mode + mg_limit min_gain else Nothing in case m_fin_tbl of @@ -115,7 +128,7 @@ iterateDepth ini_tbl max_rounds disk_moves nmlen imlen hFlush stdout iterateDepth fin_tbl max_rounds disk_moves nmlen imlen upd_cmd_strs oneline min_score - evac_mode + mg_limit min_gain evac_mode Nothing -> return (ini_tbl, cmd_strs) -- | Formats the solution for the oneline display @@ -144,10 +157,16 @@ checkJobsStatus = all (== JOB_STATUS_SUCCESS) -- | Execute an entire jobset execJobSet :: String -> Node.List - -> Instance.List -> [JobSet] -> IO () -execJobSet _ _ _ [] = return () -execJobSet master nl il (js:jss) = do + -> Instance.List -> IORef Int -> [JobSet] -> IO () +execJobSet _ _ _ _ [] = return () +execJobSet master nl il cref alljss@(js:jss) = do -- map from jobset (htools list of positions) to [[opcodes]] + cancel <- readIORef cref + when (cancel > 0) $ do + putStrLn ("Exiting early due to user request, " ++ show (length alljss) ++ + " jobset(s) remaining.") + exitWith $ ExitFailure 1 + let jobs = map (\(_, idx, move, _) -> Cluster.iMoveToJob nl il idx move) js let descr = map (\(_, idx, _, _) -> Container.nameOf il idx) js @@ -166,12 +185,34 @@ execJobSet master nl il (js:jss) = do hPutStrLn stderr $ "Cannot compute job status, aborting: " ++ show x return () Ok x -> if checkJobsStatus x - then execJobSet master nl il jss + then execJobSet master nl il cref jss else do hPutStrLn stderr $ "Not all jobs completed successfully: " ++ show x hPutStrLn stderr "Aborting.") +-- | Signal handler for graceful termination +hangleSigInt :: IORef Int -> IO () +hangleSigInt cref = do + writeIORef cref 1 + putStrLn ("Cancel request registered, will exit at" ++ + " the end of the current job set...") + +-- | Signal handler for immediate termination +hangleSigTerm :: IORef Int -> IO () +hangleSigTerm cref = do + -- update the cref to 2, just for consistency + writeIORef cref 2 + putStrLn "Double cancel request, exiting now..." + exitImmediately $ ExitFailure 2 + +runJobSet :: String -> Node.List -> Instance.List -> [JobSet] -> IO () +runJobSet master fin_nl il cmd_jobs = do + cref <- newIORef 0 + mapM_ (\(hnd, sig) -> installHandler sig (Catch (hnd cref)) Nothing) + [(hangleSigTerm, softwareTermination), (hangleSigInt, keyboardSignal)] + execJobSet master fin_nl il cref cmd_jobs + -- | Main function. main :: IO () main = do @@ -186,7 +227,7 @@ main = do verbose = optVerbose opts shownodes = optShowNodes opts - (fixed_nl, il, ctags) <- loadExternalData opts + (gl, fixed_nl, ilf, ctags) <- loadExternalData opts let offline_names = optOffline opts all_nodes = Container.elems fixed_nl @@ -199,7 +240,7 @@ main = do all_nodes m_cpu = optMcpu opts m_dsk = optMdsk opts - csf = commonSuffix fixed_nl il + csf = commonSuffix fixed_nl ilf when (length offline_wrong > 0) $ do hPrintf stderr "Wrong node name(s) set as offline: %s\n" @@ -209,21 +250,63 @@ main = do let nm = Container.map (\n -> if Node.idx n `elem` offline_indices then Node.setOffline n True else n) fixed_nl - nl = Container.map (flip Node.setMdsk m_dsk . flip Node.setMcpu m_cpu) - nm + nlf = Container.map (flip Node.setMdsk m_dsk . flip Node.setMcpu m_cpu) + nm when (not oneline && verbose > 1) $ putStrLn $ "Loaded cluster tags: " ++ intercalate "," ctags - when (Container.size il == 0) $ do + when (Container.size ilf == 0) $ do (if oneline then putStrLn $ formatOneline 0 0 0 else printf "Cluster is empty, exiting.\n") exitWith ExitSuccess + let split_insts = Cluster.findSplitInstances nlf ilf + when (not . null $ split_insts) $ do + hPutStrLn stderr "Found instances belonging to multiple node groups:" + mapM_ (\i -> hPutStrLn stderr $ " " ++ Instance.name i) split_insts + hPutStrLn stderr "Aborting." + exitWith $ ExitFailure 1 + + let ngroups = Cluster.splitCluster nlf ilf + when (length ngroups > 1 && isNothing (optGroup opts)) $ do + hPutStrLn stderr "Found multiple node groups:" + mapM_ (hPutStrLn stderr . (" " ++) . Group.name . + (flip Container.find gl) . fst) ngroups + hPutStrLn stderr "Aborting." + exitWith $ ExitFailure 1 + unless oneline $ printf "Loaded %d nodes, %d instances\n" + (Container.size nlf) + (Container.size ilf) + + (gname, (nl, il)) <- case optGroup opts of + Nothing -> do + let (gidx, cdata) = head ngroups + grp = Container.find gidx gl + return (Group.name grp, cdata) + Just g -> case Container.findByName gl g of + Nothing -> do + hPutStrLn stderr $ "Node group " ++ g ++ + " not found. Node group list is:" + mapM_ (hPutStrLn stderr . (" " ++) . Group.name ) (Container.elems gl) + hPutStrLn stderr "Aborting." + exitWith $ ExitFailure 1 + Just grp -> + case lookup (Group.idx grp) ngroups of + Nothing -> do + -- TODO: while this is unlikely to happen, log here the + -- actual group data to help debugging + hPutStrLn stderr $ "Internal failure, missing group idx" + exitWith $ ExitFailure 1 + Just cdata -> return (Group.name grp, cdata) + + unless oneline $ printf "Group size %d nodes, %d instances\n" (Container.size nl) (Container.size il) + putStrLn $ "Selected node group: " ++ gname + when (length csf > 0 && not oneline && verbose > 1) $ printf "Note: Stripping common suffix of '%s' from names\n" csf @@ -265,12 +348,14 @@ main = do printf "Initial score: %.8f\n" ini_cv) unless oneline $ putStrLn "Trying to minimize the CV..." - let imlen = Container.maxNameLen il - nmlen = Container.maxNameLen nl + let imlen = maximum . map (length . Instance.alias) $ Container.elems il + nmlen = maximum . map (length . Node.alias) $ Container.elems nl (fin_tbl, cmd_strs) <- iterateDepth ini_tbl (optMaxLength opts) (optDiskMoves opts) - nmlen imlen [] oneline min_cv (optEvacMode opts) + nmlen imlen [] oneline min_cv + (optMinGainLim opts) (optMinGain opts) + (optEvacMode opts) let (Cluster.Table fin_nl fin_il fin_cv fin_plc) = fin_tbl ord_plc = reverse fin_plc sol_msg = if null fin_plc @@ -303,12 +388,12 @@ main = do writeFile out_path (shTemplate ++ cmd_data) printf "The commands have been written to file '%s'\n" out_path) - when (optExecJobs opts && not (null ord_plc)) - (case optLuxi opts of - Nothing -> do - hPutStrLn stderr "Execution of commands possible only on LUXI" - exitWith $ ExitFailure 1 - Just master -> execJobSet master fin_nl il cmd_jobs) + when (isJust $ optSaveCluster opts) $ + do + let out_path = fromJust $ optSaveCluster opts + adata = serializeCluster gl fin_nl fin_il ctags + writeFile out_path adata + printf "The cluster state has been written to file '%s'\n" out_path when (optShowInsts opts) $ do putStrLn "" @@ -330,3 +415,10 @@ main = do (Cluster.csFmem fin_cs) (Cluster.csFdsk fin_cs) when oneline $ putStrLn $ formatOneline ini_cv (length ord_plc) fin_cv + + when (optExecJobs opts && not (null ord_plc)) + (case optLuxi opts of + Nothing -> do + hPutStrLn stderr "Execution of commands possible only on LUXI" + exitWith $ ExitFailure 1 + Just master -> runJobSet master fin_nl il cmd_jobs)