Statistics
| Branch: | Tag: | Revision:

root / hbal.hs @ f5b553da

History | View | Annotate | Download (11 kB)

1
{-| Cluster rebalancer
2

    
3
-}
4

    
5
{-
6

    
7
Copyright (C) 2009 Google Inc.
8

    
9
This program is free software; you can redistribute it and/or modify
10
it under the terms of the GNU General Public License as published by
11
the Free Software Foundation; either version 2 of the License, or
12
(at your option) any later version.
13

    
14
This program is distributed in the hope that it will be useful, but
15
WITHOUT ANY WARRANTY; without even the implied warranty of
16
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
General Public License for more details.
18

    
19
You should have received a copy of the GNU General Public License
20
along with this program; if not, write to the Free Software
21
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22
02110-1301, USA.
23

    
24
-}
25

    
26
module Main (main) where
27

    
28
import Control.Concurrent (threadDelay)
29
import Control.Exception (bracket)
30
import Data.List
31
import Data.Function
32
import Data.Maybe (isJust, fromJust)
33
import Monad
34
import System
35
import System.IO
36
import qualified System
37

    
38
import Text.Printf (printf, hPrintf)
39
import Text.JSON (showJSON)
40

    
41
import qualified Ganeti.HTools.Container as Container
42
import qualified Ganeti.HTools.Cluster as Cluster
43
import qualified Ganeti.HTools.Node as Node
44
import qualified Ganeti.HTools.Instance as Instance
45

    
46
import Ganeti.HTools.CLI
47
import Ganeti.HTools.ExtLoader
48
import Ganeti.HTools.Utils
49
import Ganeti.HTools.Types
50

    
51
import qualified Ganeti.Luxi as L
52
import qualified Ganeti.OpCodes as OpCodes
53
import Ganeti.Jobs
54

    
55
-- | Options list and functions
56
options :: [OptType]
57
options =
58
    [ oPrintNodes
59
    , oPrintCommands
60
    , oOneline
61
    , oNodeFile
62
    , oInstFile
63
    , oRapiMaster
64
    , oLuxiSocket
65
    , oExecJobs
66
    , oMaxSolLength
67
    , oVerbose
68
    , oQuiet
69
    , oOfflineNode
70
    , oMinScore
71
    , oMaxCpu
72
    , oMinDisk
73
    , oDiskMoves
74
    , oShowVer
75
    , oShowHelp
76
    ]
77

    
78
{- | Start computing the solution at the given depth and recurse until
79
we find a valid solution or we exceed the maximum depth.
80

    
81
-}
82
iterateDepth :: Cluster.Table    -- ^ The starting table
83
             -> Int              -- ^ Remaining length
84
             -> Bool             -- ^ Allow disk moves
85
             -> Int              -- ^ Max node name len
86
             -> Int              -- ^ Max instance name len
87
             -> [MoveJob]        -- ^ Current command list
88
             -> Bool             -- ^ Whether to be silent
89
             -> Score            -- ^ Score at which to stop
90
             -> IO (Cluster.Table, [MoveJob]) -- ^ The resulting table
91
                                              -- and commands
92
iterateDepth ini_tbl max_rounds disk_moves nmlen imlen
93
             cmd_strs oneline min_score =
94
    let Cluster.Table ini_nl ini_il _ _ = ini_tbl
95
        m_fin_tbl = Cluster.tryBalance ini_tbl max_rounds disk_moves min_score
96
    in
97
      case m_fin_tbl of
98
        Just fin_tbl ->
99
            do
100
              let
101
                  (Cluster.Table _ _ _ fin_plc) = fin_tbl
102
                  fin_plc_len = length fin_plc
103
                  cur_plc@(idx, _, _, move, _) = head fin_plc
104
                  (sol_line, cmds) = Cluster.printSolutionLine ini_nl ini_il
105
                                     nmlen imlen cur_plc fin_plc_len
106
                  afn = Cluster.involvedNodes ini_il cur_plc
107
                  upd_cmd_strs = (afn, idx, move, cmds):cmd_strs
108
              unless oneline $ do
109
                       putStrLn sol_line
110
                       hFlush stdout
111
              iterateDepth fin_tbl max_rounds disk_moves
112
                           nmlen imlen upd_cmd_strs oneline min_score
113
        Nothing -> return (ini_tbl, cmd_strs)
114

    
115
-- | Formats the solution for the oneline display
116
formatOneline :: Double -> Int -> Double -> String
117
formatOneline ini_cv plc_len fin_cv =
118
    printf "%.8f %d %.8f %8.3f" ini_cv plc_len fin_cv
119
               (if fin_cv == 0 then 1 else ini_cv / fin_cv)
120

    
121
-- | Submits a list of jobs and waits for all to finish execution
122
execJobs :: L.Client -> [[OpCodes.OpCode]] -> IO (Result [String])
123
execJobs client = L.submitManyJobs client . showJSON
124

    
125
-- | Polls a set of jobs at a fixed interval until all are finished
126
-- one way or another
127
waitForJobs :: L.Client -> [String] -> IO (Result [JobStatus])
128
waitForJobs client jids = do
129
  sts <- L.queryJobsStatus client jids
130
  case sts of
131
    Bad x -> return $ Bad x
132
    Ok s -> if any (<= JobRunning) s
133
            then do
134
              -- TODO: replace hardcoded value with a better thing
135
              threadDelay (1000000 * 15)
136
              waitForJobs client jids
137
            else return $ Ok s
138

    
139
-- | Check that a set of job statuses is all success
140
checkJobsStatus :: [JobStatus] -> Bool
141
checkJobsStatus = all (== JobSuccess)
142

    
143
-- | Execute an entire jobset
144
execJobSet :: String -> String -> Node.List
145
           -> Instance.List -> [JobSet] -> IO ()
146
execJobSet _      _   _  _  [] = return ()
147
execJobSet master csf nl il (js:jss) = do
148
  -- map from jobset (htools list of positions) to [[opcodes]]
149
  let jobs = map (\(_, idx, move, _) ->
150
                      Cluster.iMoveToJob csf nl il idx move) js
151
  let descr = map (\(_, idx, _, _) -> Container.nameOf il idx) js
152
  putStrLn $ "Executing jobset for instances " ++ commaJoin descr
153
  jrs <- bracket (L.getClient master) L.closeClient
154
         (\client -> do
155
            jids <- execJobs client jobs
156
            case jids of
157
              Bad x -> return $ Bad x
158
              Ok x -> do
159
                putStrLn $ "Got job IDs " ++ commaJoin x
160
                waitForJobs client x
161
         )
162
  (case jrs of
163
     Bad x -> do
164
       hPutStrLn stderr $ "Cannot compute job status, aborting: " ++ show x
165
       return ()
166
     Ok x -> if checkJobsStatus x
167
             then execJobSet master csf nl il jss
168
             else do
169
               hPutStrLn stderr $ "Not all jobs completed successfully: " ++
170
                         show x
171
               hPutStrLn stderr "Aborting.")
172

    
173
-- | Main function.
174
main :: IO ()
175
main = do
176
  cmd_args <- System.getArgs
177
  (opts, args) <- parseOpts cmd_args "hbal" options
178

    
179
  unless (null args) $ do
180
         hPutStrLn stderr "Error: this program doesn't take any arguments."
181
         exitWith $ ExitFailure 1
182

    
183
  let oneline = optOneline opts
184
      verbose = optVerbose opts
185

    
186
  (fixed_nl, il, csf) <- loadExternalData opts
187

    
188
  let offline_names = optOffline opts
189
      all_nodes = Container.elems fixed_nl
190
      all_names = map Node.name all_nodes
191
      offline_wrong = filter (flip notElem all_names) offline_names
192
      offline_indices = map Node.idx $
193
                        filter (\n -> elem (Node.name n) offline_names)
194
                               all_nodes
195
      m_cpu = optMcpu opts
196
      m_dsk = optMdsk opts
197

    
198
  when (length offline_wrong > 0) $ do
199
         hPrintf stderr "Wrong node name(s) set as offline: %s\n"
200
                     (commaJoin offline_wrong)
201
         exitWith $ ExitFailure 1
202

    
203
  let nm = Container.map (\n -> if elem (Node.idx n) offline_indices
204
                                then Node.setOffline n True
205
                                else n) fixed_nl
206
      nl = Container.map (flip Node.setMdsk m_dsk . flip Node.setMcpu m_cpu)
207
           nm
208

    
209
  when (Container.size il == 0) $ do
210
         (if oneline then putStrLn $ formatOneline 0 0 0
211
          else printf "Cluster is empty, exiting.\n")
212
         exitWith ExitSuccess
213

    
214
  unless oneline $ printf "Loaded %d nodes, %d instances\n"
215
             (Container.size nl)
216
             (Container.size il)
217

    
218
  when (length csf > 0 && not oneline && verbose > 1) $
219
       printf "Note: Stripping common suffix of '%s' from names\n" csf
220

    
221
  let (bad_nodes, bad_instances) = Cluster.computeBadItems nl il
222
  unless (oneline || verbose == 0) $ printf
223
             "Initial check done: %d bad nodes, %d bad instances.\n"
224
             (length bad_nodes) (length bad_instances)
225

    
226
  when (length bad_nodes > 0) $
227
         putStrLn "Cluster is not N+1 happy, continuing but no guarantee \
228
                  \that the cluster will end N+1 happy."
229

    
230
  when (optShowNodes opts) $
231
       do
232
         putStrLn "Initial cluster status:"
233
         putStrLn $ Cluster.printNodes nl
234

    
235
  let ini_cv = Cluster.compCV nl
236
      ini_tbl = Cluster.Table nl il ini_cv []
237
      min_cv = optMinScore opts
238

    
239
  when (ini_cv < min_cv) $ do
240
         (if oneline then
241
              putStrLn $ formatOneline ini_cv 0 ini_cv
242
          else printf "Cluster is already well balanced (initial score %.6g,\n\
243
                      \minimum score %.6g).\nNothing to do, exiting\n"
244
                      ini_cv min_cv)
245
         exitWith ExitSuccess
246

    
247
  unless oneline (if verbose > 2 then
248
                      printf "Initial coefficients: overall %.8f, %s\n"
249
                      ini_cv (Cluster.printStats nl)
250
                  else
251
                      printf "Initial score: %.8f\n" ini_cv)
252

    
253
  unless oneline $ putStrLn "Trying to minimize the CV..."
254
  let imlen = Container.maxNameLen il
255
      nmlen = Container.maxNameLen nl
256

    
257
  (fin_tbl, cmd_strs) <- iterateDepth ini_tbl (optMaxLength opts)
258
                         (optDiskMoves opts)
259
                         nmlen imlen [] oneline min_cv
260
  let (Cluster.Table fin_nl _ fin_cv fin_plc) = fin_tbl
261
      ord_plc = reverse fin_plc
262
      sol_msg = if null fin_plc
263
                then printf "No solution found\n"
264
                else if verbose > 2
265
                     then printf "Final coefficients:   overall %.8f, %s\n"
266
                          fin_cv (Cluster.printStats fin_nl)
267
                     else printf "Cluster score improved from %.8f to %.8f\n"
268
                          ini_cv fin_cv
269
                              ::String
270

    
271
  unless oneline $ putStr sol_msg
272

    
273
  unless (oneline || verbose == 0) $
274
         printf "Solution length=%d\n" (length ord_plc)
275

    
276
  let cmd_jobs = Cluster.splitJobs cmd_strs
277
      cmd_data = Cluster.formatCmds cmd_jobs
278

    
279
  when (isJust $ optShowCmds opts) $
280
       do
281
         let out_path = fromJust $ optShowCmds opts
282
         putStrLn ""
283
         (if out_path == "-" then
284
              printf "Commands to run to reach the above solution:\n%s"
285
                     (unlines . map ("  " ++) .
286
                      filter (/= "  check") .
287
                      lines $ cmd_data)
288
          else do
289
            writeFile out_path (shTemplate ++ cmd_data)
290
            printf "The commands have been written to file '%s'\n" out_path)
291

    
292
  when (optExecJobs opts && not (null ord_plc))
293
           (case optLuxi opts of
294
              Nothing -> do
295
                hPutStrLn stderr "Execution of commands possible only on LUXI"
296
                exitWith $ ExitFailure 1
297
              Just master -> execJobSet master csf fin_nl il cmd_jobs)
298

    
299
  when (optShowNodes opts) $
300
       do
301
         let ini_cs = Cluster.totalResources nl
302
             fin_cs = Cluster.totalResources fin_nl
303
         putStrLn ""
304
         putStrLn "Final cluster status:"
305
         putStrLn $ Cluster.printNodes fin_nl
306
         when (verbose > 3) $
307
              do
308
                printf "Original: mem=%d disk=%d\n"
309
                       (Cluster.csFmem ini_cs) (Cluster.csFdsk ini_cs)
310
                printf "Final:    mem=%d disk=%d\n"
311
                       (Cluster.csFmem fin_cs) (Cluster.csFdsk fin_cs)
312
  when oneline $
313
         putStrLn $ formatOneline ini_cv (length ord_plc) fin_cv