Statistics
| Branch: | Tag: | Revision:

root / hbal.hs @ 94e05c32

History | View | Annotate | Download (11.4 kB)

1
{-| Cluster rebalancer
2

    
3
-}
4

    
5
{-
6

    
7
Copyright (C) 2009 Google Inc.
8

    
9
This program is free software; you can redistribute it and/or modify
10
it under the terms of the GNU General Public License as published by
11
the Free Software Foundation; either version 2 of the License, or
12
(at your option) any later version.
13

    
14
This program is distributed in the hope that it will be useful, but
15
WITHOUT ANY WARRANTY; without even the implied warranty of
16
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
General Public License for more details.
18

    
19
You should have received a copy of the GNU General Public License
20
along with this program; if not, write to the Free Software
21
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22
02110-1301, USA.
23

    
24
-}
25

    
26
module Main (main) where
27

    
28
import Control.Concurrent (threadDelay)
29
import Control.Exception (bracket)
30
import Data.List
31
import Data.Function
32
import Data.Maybe (isJust, fromJust)
33
import Monad
34
import System
35
import System.IO
36
import qualified System
37

    
38
import Text.Printf (printf, hPrintf)
39
import Text.JSON (showJSON)
40

    
41
import qualified Ganeti.HTools.Container as Container
42
import qualified Ganeti.HTools.Cluster as Cluster
43
import qualified Ganeti.HTools.Node as Node
44
import qualified Ganeti.HTools.Instance as Instance
45

    
46
import Ganeti.HTools.CLI
47
import Ganeti.HTools.ExtLoader
48
import Ganeti.HTools.Utils
49
import Ganeti.HTools.Types
50

    
51
import qualified Ganeti.Luxi as L
52
import qualified Ganeti.OpCodes as OpCodes
53
import Ganeti.Jobs
54

    
55
-- | Options list and functions
56
options :: [OptType]
57
options =
58
    [ oPrintNodes
59
    , oPrintInsts
60
    , oPrintCommands
61
    , oOneline
62
    , oNodeFile
63
    , oInstFile
64
    , oRapiMaster
65
    , oLuxiSocket
66
    , oExecJobs
67
    , oMaxSolLength
68
    , oVerbose
69
    , oQuiet
70
    , oOfflineNode
71
    , oMinScore
72
    , oMaxCpu
73
    , oMinDisk
74
    , oDiskMoves
75
    , oDynuFile
76
    , oExTags
77
    , oShowVer
78
    , oShowHelp
79
    ]
80

    
81
{- | Start computing the solution at the given depth and recurse until
82
we find a valid solution or we exceed the maximum depth.
83

    
84
-}
85
iterateDepth :: Cluster.Table    -- ^ The starting table
86
             -> Int              -- ^ Remaining length
87
             -> Bool             -- ^ Allow disk moves
88
             -> Int              -- ^ Max node name len
89
             -> Int              -- ^ Max instance name len
90
             -> [MoveJob]        -- ^ Current command list
91
             -> Bool             -- ^ Whether to be silent
92
             -> Score            -- ^ Score at which to stop
93
             -> IO (Cluster.Table, [MoveJob]) -- ^ The resulting table
94
                                              -- and commands
95
iterateDepth ini_tbl max_rounds disk_moves nmlen imlen
96
             cmd_strs oneline min_score =
97
    let Cluster.Table ini_nl ini_il _ _ = ini_tbl
98
        m_fin_tbl = Cluster.tryBalance ini_tbl max_rounds disk_moves min_score
99
    in
100
      case m_fin_tbl of
101
        Just fin_tbl ->
102
            do
103
              let
104
                  (Cluster.Table _ _ _ fin_plc) = fin_tbl
105
                  fin_plc_len = length fin_plc
106
                  cur_plc@(idx, _, _, move, _) = head fin_plc
107
                  (sol_line, cmds) = Cluster.printSolutionLine ini_nl ini_il
108
                                     nmlen imlen cur_plc fin_plc_len
109
                  afn = Cluster.involvedNodes ini_il cur_plc
110
                  upd_cmd_strs = (afn, idx, move, cmds):cmd_strs
111
              unless oneline $ do
112
                       putStrLn sol_line
113
                       hFlush stdout
114
              iterateDepth fin_tbl max_rounds disk_moves
115
                           nmlen imlen upd_cmd_strs oneline min_score
116
        Nothing -> return (ini_tbl, cmd_strs)
117

    
118
-- | Formats the solution for the oneline display
119
formatOneline :: Double -> Int -> Double -> String
120
formatOneline ini_cv plc_len fin_cv =
121
    printf "%.8f %d %.8f %8.3f" ini_cv plc_len fin_cv
122
               (if fin_cv == 0 then 1 else ini_cv / fin_cv)
123

    
124
-- | Submits a list of jobs and waits for all to finish execution
125
execJobs :: L.Client -> [[OpCodes.OpCode]] -> IO (Result [String])
126
execJobs client = L.submitManyJobs client . showJSON
127

    
128
-- | Polls a set of jobs at a fixed interval until all are finished
129
-- one way or another
130
waitForJobs :: L.Client -> [String] -> IO (Result [JobStatus])
131
waitForJobs client jids = do
132
  sts <- L.queryJobsStatus client jids
133
  case sts of
134
    Bad x -> return $ Bad x
135
    Ok s -> if any (<= JobRunning) s
136
            then do
137
              -- TODO: replace hardcoded value with a better thing
138
              threadDelay (1000000 * 15)
139
              waitForJobs client jids
140
            else return $ Ok s
141

    
142
-- | Check that a set of job statuses is all success
143
checkJobsStatus :: [JobStatus] -> Bool
144
checkJobsStatus = all (== JobSuccess)
145

    
146
-- | Execute an entire jobset
147
execJobSet :: String -> String -> Node.List
148
           -> Instance.List -> [JobSet] -> IO ()
149
execJobSet _      _   _  _  [] = return ()
150
execJobSet master csf nl il (js:jss) = do
151
  -- map from jobset (htools list of positions) to [[opcodes]]
152
  let jobs = map (\(_, idx, move, _) ->
153
                      Cluster.iMoveToJob csf nl il idx move) js
154
  let descr = map (\(_, idx, _, _) -> Container.nameOf il idx) js
155
  putStrLn $ "Executing jobset for instances " ++ commaJoin descr
156
  jrs <- bracket (L.getClient master) L.closeClient
157
         (\client -> do
158
            jids <- execJobs client jobs
159
            case jids of
160
              Bad x -> return $ Bad x
161
              Ok x -> do
162
                putStrLn $ "Got job IDs " ++ commaJoin x
163
                waitForJobs client x
164
         )
165
  (case jrs of
166
     Bad x -> do
167
       hPutStrLn stderr $ "Cannot compute job status, aborting: " ++ show x
168
       return ()
169
     Ok x -> if checkJobsStatus x
170
             then execJobSet master csf nl il jss
171
             else do
172
               hPutStrLn stderr $ "Not all jobs completed successfully: " ++
173
                         show x
174
               hPutStrLn stderr "Aborting.")
175

    
176
-- | Main function.
177
main :: IO ()
178
main = do
179
  cmd_args <- System.getArgs
180
  (opts, args) <- parseOpts cmd_args "hbal" options
181

    
182
  unless (null args) $ do
183
         hPutStrLn stderr "Error: this program doesn't take any arguments."
184
         exitWith $ ExitFailure 1
185

    
186
  let oneline = optOneline opts
187
      verbose = optVerbose opts
188
      shownodes = optShowNodes opts
189

    
190
  (fixed_nl, il, _, csf) <- loadExternalData opts
191

    
192
  let offline_names = optOffline opts
193
      all_nodes = Container.elems fixed_nl
194
      all_names = map Node.name all_nodes
195
      offline_wrong = filter (flip notElem all_names) offline_names
196
      offline_indices = map Node.idx $
197
                        filter (\n -> elem (Node.name n) offline_names)
198
                               all_nodes
199
      m_cpu = optMcpu opts
200
      m_dsk = optMdsk opts
201

    
202
  when (length offline_wrong > 0) $ do
203
         hPrintf stderr "Wrong node name(s) set as offline: %s\n"
204
                     (commaJoin offline_wrong)
205
         exitWith $ ExitFailure 1
206

    
207
  let nm = Container.map (\n -> if elem (Node.idx n) offline_indices
208
                                then Node.setOffline n True
209
                                else n) fixed_nl
210
      nl = Container.map (flip Node.setMdsk m_dsk . flip Node.setMcpu m_cpu)
211
           nm
212

    
213
  when (Container.size il == 0) $ do
214
         (if oneline then putStrLn $ formatOneline 0 0 0
215
          else printf "Cluster is empty, exiting.\n")
216
         exitWith ExitSuccess
217

    
218
  unless oneline $ printf "Loaded %d nodes, %d instances\n"
219
             (Container.size nl)
220
             (Container.size il)
221

    
222
  when (length csf > 0 && not oneline && verbose > 1) $
223
       printf "Note: Stripping common suffix of '%s' from names\n" csf
224

    
225
  let (bad_nodes, bad_instances) = Cluster.computeBadItems nl il
226
  unless (oneline || verbose == 0) $ printf
227
             "Initial check done: %d bad nodes, %d bad instances.\n"
228
             (length bad_nodes) (length bad_instances)
229

    
230
  when (length bad_nodes > 0) $
231
         putStrLn "Cluster is not N+1 happy, continuing but no guarantee \
232
                  \that the cluster will end N+1 happy."
233

    
234
  when (optShowInsts opts) $ do
235
         putStrLn ""
236
         putStrLn "Initial instance map:"
237
         putStrLn $ Cluster.printInsts nl il
238

    
239
  when (isJust shownodes) $
240
       do
241
         putStrLn "Initial cluster status:"
242
         putStrLn $ Cluster.printNodes nl (fromJust shownodes)
243

    
244
  let ini_cv = Cluster.compCV nl
245
      ini_tbl = Cluster.Table nl il ini_cv []
246
      min_cv = optMinScore opts
247

    
248
  when (ini_cv < min_cv) $ do
249
         (if oneline then
250
              putStrLn $ formatOneline ini_cv 0 ini_cv
251
          else printf "Cluster is already well balanced (initial score %.6g,\n\
252
                      \minimum score %.6g).\nNothing to do, exiting\n"
253
                      ini_cv min_cv)
254
         exitWith ExitSuccess
255

    
256
  unless oneline (if verbose > 2 then
257
                      printf "Initial coefficients: overall %.8f, %s\n"
258
                      ini_cv (Cluster.printStats nl)
259
                  else
260
                      printf "Initial score: %.8f\n" ini_cv)
261

    
262
  unless oneline $ putStrLn "Trying to minimize the CV..."
263
  let imlen = Container.maxNameLen il
264
      nmlen = Container.maxNameLen nl
265

    
266
  (fin_tbl, cmd_strs) <- iterateDepth ini_tbl (optMaxLength opts)
267
                         (optDiskMoves opts)
268
                         nmlen imlen [] oneline min_cv
269
  let (Cluster.Table fin_nl fin_il fin_cv fin_plc) = fin_tbl
270
      ord_plc = reverse fin_plc
271
      sol_msg = if null fin_plc
272
                then printf "No solution found\n"
273
                else if verbose > 2
274
                     then printf "Final coefficients:   overall %.8f, %s\n"
275
                          fin_cv (Cluster.printStats fin_nl)
276
                     else printf "Cluster score improved from %.8f to %.8f\n"
277
                          ini_cv fin_cv
278
                              ::String
279

    
280
  unless oneline $ putStr sol_msg
281

    
282
  unless (oneline || verbose == 0) $
283
         printf "Solution length=%d\n" (length ord_plc)
284

    
285
  let cmd_jobs = Cluster.splitJobs cmd_strs
286
      cmd_data = Cluster.formatCmds cmd_jobs
287

    
288
  when (isJust $ optShowCmds opts) $
289
       do
290
         let out_path = fromJust $ optShowCmds opts
291
         putStrLn ""
292
         (if out_path == "-" then
293
              printf "Commands to run to reach the above solution:\n%s"
294
                     (unlines . map ("  " ++) .
295
                      filter (/= "  check") .
296
                      lines $ cmd_data)
297
          else do
298
            writeFile out_path (shTemplate ++ cmd_data)
299
            printf "The commands have been written to file '%s'\n" out_path)
300

    
301
  when (optExecJobs opts && not (null ord_plc))
302
           (case optLuxi opts of
303
              Nothing -> do
304
                hPutStrLn stderr "Execution of commands possible only on LUXI"
305
                exitWith $ ExitFailure 1
306
              Just master -> execJobSet master csf fin_nl il cmd_jobs)
307

    
308
  when (optShowInsts opts) $ do
309
         putStrLn ""
310
         putStrLn "Final instance map:"
311
         putStr $ Cluster.printInsts fin_nl fin_il
312

    
313
  when (isJust shownodes) $
314
       do
315
         let ini_cs = Cluster.totalResources nl
316
             fin_cs = Cluster.totalResources fin_nl
317
         putStrLn ""
318
         putStrLn "Final cluster status:"
319
         putStrLn $ Cluster.printNodes fin_nl (fromJust shownodes)
320
         when (verbose > 3) $
321
              do
322
                printf "Original: mem=%d disk=%d\n"
323
                       (Cluster.csFmem ini_cs) (Cluster.csFdsk ini_cs)
324
                printf "Final:    mem=%d disk=%d\n"
325
                       (Cluster.csFmem fin_cs) (Cluster.csFdsk fin_cs)
326
  when oneline $
327
         putStrLn $ formatOneline ini_cv (length ord_plc) fin_cv