{-
-Copyright (C) 2009 Google Inc.
+Copyright (C) 2009, 2010 Google Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
import Control.Concurrent (threadDelay)
import Control.Exception (bracket)
import Data.List
-import Data.Maybe (isJust, fromJust)
+import Data.Maybe (isJust, isNothing, fromJust)
+import Data.IORef
import Monad
import System (exitWith, ExitCode(..))
import System.IO
+import System.Posix.Process
+import System.Posix.Signals
import qualified System
import Text.Printf (printf, hPrintf)
-import Text.JSON (showJSON)
import qualified Ganeti.HTools.Container as Container
import qualified Ganeti.HTools.Cluster as Cluster
+import qualified Ganeti.HTools.Group as Group
import qualified Ganeti.HTools.Node as Node
import qualified Ganeti.HTools.Instance as Instance
import Ganeti.HTools.Utils
import Ganeti.HTools.Types
+import Ganeti.HTools.Text (serializeCluster)
+
import qualified Ganeti.Luxi as L
-import qualified Ganeti.OpCodes as OpCodes
import Ganeti.Jobs
-- | Options list and functions
, oRapiMaster
, oLuxiSocket
, oExecJobs
+ , oGroup
, oMaxSolLength
, oVerbose
, oQuiet
, oMinScore
, oMaxCpu
, oMinDisk
+ , oMinGain
+ , oMinGainLim
, oDiskMoves
, oDynuFile
, oExTags
, oExInst
+ , oSaveCluster
, oShowVer
, oShowHelp
]
-> [MoveJob] -- ^ Current command list
-> Bool -- ^ Whether to be silent
-> Score -- ^ Score at which to stop
+ -> Score -- ^ Min gain limit
+ -> Score -- ^ Min score gain
-> Bool -- ^ Enable evacuation mode
-> IO (Cluster.Table, [MoveJob]) -- ^ The resulting table
-- and commands
iterateDepth ini_tbl max_rounds disk_moves nmlen imlen
- cmd_strs oneline min_score evac_mode =
+ cmd_strs oneline min_score mg_limit min_gain evac_mode =
let Cluster.Table ini_nl ini_il _ _ = ini_tbl
allowed_next = Cluster.doNextBalance ini_tbl max_rounds min_score
m_fin_tbl = if allowed_next
then Cluster.tryBalance ini_tbl disk_moves evac_mode
+ mg_limit min_gain
else Nothing
in
case m_fin_tbl of
hFlush stdout
iterateDepth fin_tbl max_rounds disk_moves
nmlen imlen upd_cmd_strs oneline min_score
- evac_mode
+ mg_limit min_gain evac_mode
Nothing -> return (ini_tbl, cmd_strs)
-- | Formats the solution for the oneline display
printf "%.8f %d %.8f %8.3f" ini_cv plc_len fin_cv
(if fin_cv == 0 then 1 else ini_cv / fin_cv)
--- | Submits a list of jobs and waits for all to finish execution
-execJobs :: L.Client -> [[OpCodes.OpCode]] -> IO (Result [String])
-execJobs client = L.submitManyJobs client . showJSON
-
-- | Polls a set of jobs at a fixed interval until all are finished
-- one way or another
waitForJobs :: L.Client -> [String] -> IO (Result [JobStatus])
sts <- L.queryJobsStatus client jids
case sts of
Bad x -> return $ Bad x
- Ok s -> if any (<= JobRunning) s
+ Ok s -> if any (<= JOB_STATUS_RUNNING) s
then do
-- TODO: replace hardcoded value with a better thing
threadDelay (1000000 * 15)
-- | Check that a set of job statuses is all success
checkJobsStatus :: [JobStatus] -> Bool
-checkJobsStatus = all (== JobSuccess)
+checkJobsStatus = all (== JOB_STATUS_SUCCESS)
-- | Execute an entire jobset
execJobSet :: String -> Node.List
- -> Instance.List -> [JobSet] -> IO ()
-execJobSet _ _ _ [] = return ()
-execJobSet master nl il (js:jss) = do
+ -> Instance.List -> IORef Int -> [JobSet] -> IO ()
+execJobSet _ _ _ _ [] = return ()
+execJobSet master nl il cref alljss@(js:jss) = do
-- map from jobset (htools list of positions) to [[opcodes]]
+ cancel <- readIORef cref
+ when (cancel > 0) $ do
+ putStrLn ("Exiting early due to user request, " ++ show (length alljss) ++
+ " jobset(s) remaining.")
+ exitWith $ ExitFailure 1
+
let jobs = map (\(_, idx, move, _) ->
Cluster.iMoveToJob nl il idx move) js
let descr = map (\(_, idx, _, _) -> Container.nameOf il idx) js
putStrLn $ "Executing jobset for instances " ++ commaJoin descr
jrs <- bracket (L.getClient master) L.closeClient
(\client -> do
- jids <- execJobs client jobs
+ jids <- L.submitManyJobs client jobs
case jids of
Bad x -> return $ Bad x
Ok x -> do
hPutStrLn stderr $ "Cannot compute job status, aborting: " ++ show x
return ()
Ok x -> if checkJobsStatus x
- then execJobSet master nl il jss
+ then execJobSet master nl il cref jss
else do
hPutStrLn stderr $ "Not all jobs completed successfully: " ++
show x
hPutStrLn stderr "Aborting.")
+-- | Signal handler for graceful termination
+hangleSigInt :: IORef Int -> IO ()
+hangleSigInt cref = do
+ writeIORef cref 1
+ putStrLn ("Cancel request registered, will exit at" ++
+ " the end of the current job set...")
+
+-- | Signal handler for immediate termination
+hangleSigTerm :: IORef Int -> IO ()
+hangleSigTerm cref = do
+ -- update the cref to 2, just for consistency
+ writeIORef cref 2
+ putStrLn "Double cancel request, exiting now..."
+ exitImmediately $ ExitFailure 2
+
+runJobSet :: String -> Node.List -> Instance.List -> [JobSet] -> IO ()
+runJobSet master fin_nl il cmd_jobs = do
+ cref <- newIORef 0
+ mapM_ (\(hnd, sig) -> installHandler sig (Catch (hnd cref)) Nothing)
+ [(hangleSigTerm, softwareTermination), (hangleSigInt, keyboardSignal)]
+ execJobSet master fin_nl il cref cmd_jobs
+
-- | Main function.
main :: IO ()
main = do
verbose = optVerbose opts
shownodes = optShowNodes opts
- (fixed_nl, il, ctags) <- loadExternalData opts
+ (gl, fixed_nl, ilf, ctags) <- loadExternalData opts
let offline_names = optOffline opts
all_nodes = Container.elems fixed_nl
- all_names = map Node.name all_nodes
+ all_names = concatMap allNames all_nodes
offline_wrong = filter (`notElem` all_names) offline_names
offline_indices = map Node.idx $
- filter (\n -> Node.name n `elem` offline_names)
+ filter (\n ->
+ Node.name n `elem` offline_names ||
+ Node.alias n `elem` offline_names)
all_nodes
m_cpu = optMcpu opts
m_dsk = optMdsk opts
- csf = commonSuffix fixed_nl il
+ csf = commonSuffix fixed_nl ilf
when (length offline_wrong > 0) $ do
hPrintf stderr "Wrong node name(s) set as offline: %s\n"
let nm = Container.map (\n -> if Node.idx n `elem` offline_indices
then Node.setOffline n True
else n) fixed_nl
- nl = Container.map (flip Node.setMdsk m_dsk . flip Node.setMcpu m_cpu)
- nm
+ nlf = Container.map (flip Node.setMdsk m_dsk . flip Node.setMcpu m_cpu)
+ nm
when (not oneline && verbose > 1) $
putStrLn $ "Loaded cluster tags: " ++ intercalate "," ctags
- when (Container.size il == 0) $ do
+ when (Container.size ilf == 0) $ do
(if oneline then putStrLn $ formatOneline 0 0 0
else printf "Cluster is empty, exiting.\n")
exitWith ExitSuccess
+ let split_insts = Cluster.findSplitInstances nlf ilf
+ when (not . null $ split_insts) $ do
+ hPutStrLn stderr "Found instances belonging to multiple node groups:"
+ mapM_ (\i -> hPutStrLn stderr $ " " ++ Instance.name i) split_insts
+ hPutStrLn stderr "Aborting."
+ exitWith $ ExitFailure 1
+
+ let ngroups = Cluster.splitCluster nlf ilf
+ when (length ngroups > 1 && isNothing (optGroup opts)) $ do
+ hPutStrLn stderr "Found multiple node groups:"
+ mapM_ (hPutStrLn stderr . (" " ++) . Group.name .
+ (flip Container.find gl) . fst) ngroups
+ hPutStrLn stderr "Aborting."
+ exitWith $ ExitFailure 1
+
unless oneline $ printf "Loaded %d nodes, %d instances\n"
+ (Container.size nlf)
+ (Container.size ilf)
+
+ (gname, (nl, il)) <- case optGroup opts of
+ Nothing -> do
+ let (gidx, cdata) = head ngroups
+ grp = Container.find gidx gl
+ return (Group.name grp, cdata)
+ Just g -> case Container.findByName gl g of
+ Nothing -> do
+ hPutStrLn stderr $ "Node group " ++ g ++
+ " not found. Node group list is:"
+ mapM_ (hPutStrLn stderr . (" " ++) . Group.name ) (Container.elems gl)
+ hPutStrLn stderr "Aborting."
+ exitWith $ ExitFailure 1
+ Just grp ->
+ case lookup (Group.idx grp) ngroups of
+ Nothing -> do
+ -- TODO: while this is unlikely to happen, log here the
+ -- actual group data to help debugging
+ hPutStrLn stderr $ "Internal failure, missing group idx"
+ exitWith $ ExitFailure 1
+ Just cdata -> return (Group.name grp, cdata)
+
+ unless oneline $ printf "Group size %d nodes, %d instances\n"
(Container.size nl)
(Container.size il)
+ putStrLn $ "Selected node group: " ++ gname
+
when (length csf > 0 && not oneline && verbose > 1) $
printf "Note: Stripping common suffix of '%s' from names\n" csf
printf "Initial score: %.8f\n" ini_cv)
unless oneline $ putStrLn "Trying to minimize the CV..."
- let imlen = Container.maxNameLen il
- nmlen = Container.maxNameLen nl
+ let imlen = maximum . map (length . Instance.alias) $ Container.elems il
+ nmlen = maximum . map (length . Node.alias) $ Container.elems nl
(fin_tbl, cmd_strs) <- iterateDepth ini_tbl (optMaxLength opts)
(optDiskMoves opts)
- nmlen imlen [] oneline min_cv (optEvacMode opts)
+ nmlen imlen [] oneline min_cv
+ (optMinGainLim opts) (optMinGain opts)
+ (optEvacMode opts)
let (Cluster.Table fin_nl fin_il fin_cv fin_plc) = fin_tbl
ord_plc = reverse fin_plc
sol_msg = if null fin_plc
writeFile out_path (shTemplate ++ cmd_data)
printf "The commands have been written to file '%s'\n" out_path)
- when (optExecJobs opts && not (null ord_plc))
- (case optLuxi opts of
- Nothing -> do
- hPutStrLn stderr "Execution of commands possible only on LUXI"
- exitWith $ ExitFailure 1
- Just master -> execJobSet master fin_nl il cmd_jobs)
+ when (isJust $ optSaveCluster opts) $
+ do
+ let out_path = fromJust $ optSaveCluster opts
+ adata = serializeCluster gl fin_nl fin_il ctags
+ writeFile out_path adata
+ printf "The cluster state has been written to file '%s'\n" out_path
when (optShowInsts opts) $ do
putStrLn ""
(Cluster.csFmem fin_cs) (Cluster.csFdsk fin_cs)
when oneline $
putStrLn $ formatOneline ini_cv (length ord_plc) fin_cv
+
+ when (optExecJobs opts && not (null ord_plc))
+ (case optLuxi opts of
+ Nothing -> do
+ hPutStrLn stderr "Execution of commands possible only on LUXI"
+ exitWith $ ExitFailure 1
+ Just master -> runJobSet master fin_nl il cmd_jobs)