Statistics
| Branch: | Tag: | Revision:

root / hbal.hs @ 16c2369c

History | View | Annotate | Download (11.5 kB)

1
{-| Cluster rebalancer
2

    
3
-}
4

    
5
{-
6

    
7
Copyright (C) 2009 Google Inc.
8

    
9
This program is free software; you can redistribute it and/or modify
10
it under the terms of the GNU General Public License as published by
11
the Free Software Foundation; either version 2 of the License, or
12
(at your option) any later version.
13

    
14
This program is distributed in the hope that it will be useful, but
15
WITHOUT ANY WARRANTY; without even the implied warranty of
16
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
General Public License for more details.
18

    
19
You should have received a copy of the GNU General Public License
20
along with this program; if not, write to the Free Software
21
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22
02110-1301, USA.
23

    
24
-}
25

    
26
module Main (main) where
27

    
28
import Control.Concurrent (threadDelay)
29
import Control.Exception (bracket)
30
import Data.List
31
import Data.Function
32
import Data.Maybe (isJust, fromJust)
33
import Monad
34
import System
35
import System.IO
36
import qualified System
37

    
38
import Text.Printf (printf, hPrintf)
39
import Text.JSON (showJSON)
40

    
41
import qualified Ganeti.HTools.Container as Container
42
import qualified Ganeti.HTools.Cluster as Cluster
43
import qualified Ganeti.HTools.Node as Node
44
import qualified Ganeti.HTools.Instance as Instance
45

    
46
import Ganeti.HTools.CLI
47
import Ganeti.HTools.ExtLoader
48
import Ganeti.HTools.Utils
49
import Ganeti.HTools.Types
50

    
51
import qualified Ganeti.Luxi as L
52
import qualified Ganeti.OpCodes as OpCodes
53
import Ganeti.Jobs
54

    
55
-- | Options list and functions
56
options :: [OptType]
57
options =
58
    [ oPrintNodes
59
    , oPrintInsts
60
    , oPrintCommands
61
    , oOneline
62
    , oDataFile
63
    , oRapiMaster
64
    , oLuxiSocket
65
    , oExecJobs
66
    , oMaxSolLength
67
    , oVerbose
68
    , oQuiet
69
    , oOfflineNode
70
    , oMinScore
71
    , oMaxCpu
72
    , oMinDisk
73
    , oDiskMoves
74
    , oDynuFile
75
    , oExTags
76
    , oShowVer
77
    , oShowHelp
78
    ]
79

    
80
{- | Start computing the solution at the given depth and recurse until
81
we find a valid solution or we exceed the maximum depth.
82

    
83
-}
84
iterateDepth :: Cluster.Table    -- ^ The starting table
85
             -> Int              -- ^ Remaining length
86
             -> Bool             -- ^ Allow disk moves
87
             -> Int              -- ^ Max node name len
88
             -> Int              -- ^ Max instance name len
89
             -> [MoveJob]        -- ^ Current command list
90
             -> Bool             -- ^ Whether to be silent
91
             -> Score            -- ^ Score at which to stop
92
             -> IO (Cluster.Table, [MoveJob]) -- ^ The resulting table
93
                                              -- and commands
94
iterateDepth ini_tbl max_rounds disk_moves nmlen imlen
95
             cmd_strs oneline min_score =
96
    let Cluster.Table ini_nl ini_il _ _ = ini_tbl
97
        m_fin_tbl = Cluster.tryBalance ini_tbl max_rounds disk_moves min_score
98
    in
99
      case m_fin_tbl of
100
        Just fin_tbl ->
101
            do
102
              let
103
                  (Cluster.Table _ _ _ fin_plc) = fin_tbl
104
                  fin_plc_len = length fin_plc
105
                  cur_plc@(idx, _, _, move, _) = head fin_plc
106
                  (sol_line, cmds) = Cluster.printSolutionLine ini_nl ini_il
107
                                     nmlen imlen cur_plc fin_plc_len
108
                  afn = Cluster.involvedNodes ini_il cur_plc
109
                  upd_cmd_strs = (afn, idx, move, cmds):cmd_strs
110
              unless oneline $ do
111
                       putStrLn sol_line
112
                       hFlush stdout
113
              iterateDepth fin_tbl max_rounds disk_moves
114
                           nmlen imlen upd_cmd_strs oneline min_score
115
        Nothing -> return (ini_tbl, cmd_strs)
116

    
117
-- | Formats the solution for the oneline display
118
formatOneline :: Double -> Int -> Double -> String
119
formatOneline ini_cv plc_len fin_cv =
120
    printf "%.8f %d %.8f %8.3f" ini_cv plc_len fin_cv
121
               (if fin_cv == 0 then 1 else ini_cv / fin_cv)
122

    
123
-- | Submits a list of jobs and waits for all to finish execution
124
execJobs :: L.Client -> [[OpCodes.OpCode]] -> IO (Result [String])
125
execJobs client = L.submitManyJobs client . showJSON
126

    
127
-- | Polls a set of jobs at a fixed interval until all are finished
128
-- one way or another
129
waitForJobs :: L.Client -> [String] -> IO (Result [JobStatus])
130
waitForJobs client jids = do
131
  sts <- L.queryJobsStatus client jids
132
  case sts of
133
    Bad x -> return $ Bad x
134
    Ok s -> if any (<= JobRunning) s
135
            then do
136
              -- TODO: replace hardcoded value with a better thing
137
              threadDelay (1000000 * 15)
138
              waitForJobs client jids
139
            else return $ Ok s
140

    
141
-- | Check that a set of job statuses is all success
142
checkJobsStatus :: [JobStatus] -> Bool
143
checkJobsStatus = all (== JobSuccess)
144

    
145
-- | Execute an entire jobset
146
execJobSet :: String -> String -> Node.List
147
           -> Instance.List -> [JobSet] -> IO ()
148
execJobSet _      _   _  _  [] = return ()
149
execJobSet master csf nl il (js:jss) = do
150
  -- map from jobset (htools list of positions) to [[opcodes]]
151
  let jobs = map (\(_, idx, move, _) ->
152
                      Cluster.iMoveToJob csf nl il idx move) js
153
  let descr = map (\(_, idx, _, _) -> Container.nameOf il idx) js
154
  putStrLn $ "Executing jobset for instances " ++ commaJoin descr
155
  jrs <- bracket (L.getClient master) L.closeClient
156
         (\client -> do
157
            jids <- execJobs client jobs
158
            case jids of
159
              Bad x -> return $ Bad x
160
              Ok x -> do
161
                putStrLn $ "Got job IDs " ++ commaJoin x
162
                waitForJobs client x
163
         )
164
  (case jrs of
165
     Bad x -> do
166
       hPutStrLn stderr $ "Cannot compute job status, aborting: " ++ show x
167
       return ()
168
     Ok x -> if checkJobsStatus x
169
             then execJobSet master csf nl il jss
170
             else do
171
               hPutStrLn stderr $ "Not all jobs completed successfully: " ++
172
                         show x
173
               hPutStrLn stderr "Aborting.")
174

    
175
-- | Main function.
176
main :: IO ()
177
main = do
178
  cmd_args <- System.getArgs
179
  (opts, args) <- parseOpts cmd_args "hbal" options
180

    
181
  unless (null args) $ do
182
         hPutStrLn stderr "Error: this program doesn't take any arguments."
183
         exitWith $ ExitFailure 1
184

    
185
  let oneline = optOneline opts
186
      verbose = optVerbose opts
187
      shownodes = optShowNodes opts
188

    
189
  (fixed_nl, il, ctags, csf) <- loadExternalData opts
190

    
191
  let offline_names = optOffline opts
192
      all_nodes = Container.elems fixed_nl
193
      all_names = map Node.name all_nodes
194
      offline_wrong = filter (flip notElem all_names) offline_names
195
      offline_indices = map Node.idx $
196
                        filter (\n -> elem (Node.name n) offline_names)
197
                               all_nodes
198
      m_cpu = optMcpu opts
199
      m_dsk = optMdsk opts
200

    
201
  when (length offline_wrong > 0) $ do
202
         hPrintf stderr "Wrong node name(s) set as offline: %s\n"
203
                     (commaJoin offline_wrong)
204
         exitWith $ ExitFailure 1
205

    
206
  let nm = Container.map (\n -> if elem (Node.idx n) offline_indices
207
                                then Node.setOffline n True
208
                                else n) fixed_nl
209
      nl = Container.map (flip Node.setMdsk m_dsk . flip Node.setMcpu m_cpu)
210
           nm
211

    
212
  when (not oneline && verbose > 1) $
213
       putStrLn $ "Loaded cluster tags: " ++ intercalate "," ctags
214

    
215
  when (Container.size il == 0) $ do
216
         (if oneline then putStrLn $ formatOneline 0 0 0
217
          else printf "Cluster is empty, exiting.\n")
218
         exitWith ExitSuccess
219

    
220
  unless oneline $ printf "Loaded %d nodes, %d instances\n"
221
             (Container.size nl)
222
             (Container.size il)
223

    
224
  when (length csf > 0 && not oneline && verbose > 1) $
225
       printf "Note: Stripping common suffix of '%s' from names\n" csf
226

    
227
  let (bad_nodes, bad_instances) = Cluster.computeBadItems nl il
228
  unless (oneline || verbose == 0) $ printf
229
             "Initial check done: %d bad nodes, %d bad instances.\n"
230
             (length bad_nodes) (length bad_instances)
231

    
232
  when (length bad_nodes > 0) $
233
         putStrLn "Cluster is not N+1 happy, continuing but no guarantee \
234
                  \that the cluster will end N+1 happy."
235

    
236
  when (optShowInsts opts) $ do
237
         putStrLn ""
238
         putStrLn "Initial instance map:"
239
         putStrLn $ Cluster.printInsts nl il
240

    
241
  when (isJust shownodes) $
242
       do
243
         putStrLn "Initial cluster status:"
244
         putStrLn $ Cluster.printNodes nl (fromJust shownodes)
245

    
246
  let ini_cv = Cluster.compCV nl
247
      ini_tbl = Cluster.Table nl il ini_cv []
248
      min_cv = optMinScore opts
249

    
250
  when (ini_cv < min_cv) $ do
251
         (if oneline then
252
              putStrLn $ formatOneline ini_cv 0 ini_cv
253
          else printf "Cluster is already well balanced (initial score %.6g,\n\
254
                      \minimum score %.6g).\nNothing to do, exiting\n"
255
                      ini_cv min_cv)
256
         exitWith ExitSuccess
257

    
258
  unless oneline (if verbose > 2 then
259
                      printf "Initial coefficients: overall %.8f, %s\n"
260
                      ini_cv (Cluster.printStats nl)
261
                  else
262
                      printf "Initial score: %.8f\n" ini_cv)
263

    
264
  unless oneline $ putStrLn "Trying to minimize the CV..."
265
  let imlen = Container.maxNameLen il
266
      nmlen = Container.maxNameLen nl
267

    
268
  (fin_tbl, cmd_strs) <- iterateDepth ini_tbl (optMaxLength opts)
269
                         (optDiskMoves opts)
270
                         nmlen imlen [] oneline min_cv
271
  let (Cluster.Table fin_nl fin_il fin_cv fin_plc) = fin_tbl
272
      ord_plc = reverse fin_plc
273
      sol_msg = if null fin_plc
274
                then printf "No solution found\n"
275
                else if verbose > 2
276
                     then printf "Final coefficients:   overall %.8f, %s\n"
277
                          fin_cv (Cluster.printStats fin_nl)
278
                     else printf "Cluster score improved from %.8f to %.8f\n"
279
                          ini_cv fin_cv
280
                              ::String
281

    
282
  unless oneline $ putStr sol_msg
283

    
284
  unless (oneline || verbose == 0) $
285
         printf "Solution length=%d\n" (length ord_plc)
286

    
287
  let cmd_jobs = Cluster.splitJobs cmd_strs
288
      cmd_data = Cluster.formatCmds cmd_jobs
289

    
290
  when (isJust $ optShowCmds opts) $
291
       do
292
         let out_path = fromJust $ optShowCmds opts
293
         putStrLn ""
294
         (if out_path == "-" then
295
              printf "Commands to run to reach the above solution:\n%s"
296
                     (unlines . map ("  " ++) .
297
                      filter (/= "  check") .
298
                      lines $ cmd_data)
299
          else do
300
            writeFile out_path (shTemplate ++ cmd_data)
301
            printf "The commands have been written to file '%s'\n" out_path)
302

    
303
  when (optExecJobs opts && not (null ord_plc))
304
           (case optLuxi opts of
305
              Nothing -> do
306
                hPutStrLn stderr "Execution of commands possible only on LUXI"
307
                exitWith $ ExitFailure 1
308
              Just master -> execJobSet master csf fin_nl il cmd_jobs)
309

    
310
  when (optShowInsts opts) $ do
311
         putStrLn ""
312
         putStrLn "Final instance map:"
313
         putStr $ Cluster.printInsts fin_nl fin_il
314

    
315
  when (isJust shownodes) $
316
       do
317
         let ini_cs = Cluster.totalResources nl
318
             fin_cs = Cluster.totalResources fin_nl
319
         putStrLn ""
320
         putStrLn "Final cluster status:"
321
         putStrLn $ Cluster.printNodes fin_nl (fromJust shownodes)
322
         when (verbose > 3) $
323
              do
324
                printf "Original: mem=%d disk=%d\n"
325
                       (Cluster.csFmem ini_cs) (Cluster.csFdsk ini_cs)
326
                printf "Final:    mem=%d disk=%d\n"
327
                       (Cluster.csFmem fin_cs) (Cluster.csFdsk fin_cs)
328
  when oneline $
329
         putStrLn $ formatOneline ini_cv (length ord_plc) fin_cv