Statistics
| Branch: | Tag: | Revision:

root / src / Ganeti / HTools / Program / Harep.hs @ 4ba5f148

History | View | Annotate | Download (14.3 kB)

1
{-| Auto-repair tool for Ganeti.
2

    
3
-}
4

    
5
{-
6

    
7
Copyright (C) 2013 Google Inc.
8

    
9
This program is free software; you can redistribute it and/or modify
10
it under the terms of the GNU General Public License as published by
11
the Free Software Foundation; either version 2 of the License, or
12
(at your option) any later version.
13

    
14
This program is distributed in the hope that it will be useful, but
15
WITHOUT ANY WARRANTY; without even the implied warranty of
16
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
General Public License for more details.
18

    
19
You should have received a copy of the GNU General Public License
20
along with this program; if not, write to the Free Software
21
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22
02110-1301, USA.
23

    
24
-}
25

    
26
module Ganeti.HTools.Program.Harep
27
  ( main
28
  , arguments
29
  , options) where
30

    
31
import Control.Exception (bracket)
32
import Control.Monad
33
import Data.Function
34
import Data.List
35
import Data.Maybe
36
import Data.Ord
37
import System.Time
38

    
39
import Ganeti.BasicTypes
40
import Ganeti.Common
41
import Ganeti.Errors
42
import Ganeti.Jobs
43
import Ganeti.OpCodes
44
import Ganeti.OpParams
45
import Ganeti.Types
46
import Ganeti.Utils
47
import qualified Ganeti.Constants as C
48
import qualified Ganeti.Luxi as L
49
import qualified Ganeti.Path as Path
50

    
51
import Ganeti.HTools.CLI
52
import Ganeti.HTools.Loader
53
import Ganeti.HTools.ExtLoader
54
import Ganeti.HTools.Types
55
import qualified Ganeti.HTools.Container as Container
56
import qualified Ganeti.HTools.Instance as Instance
57
import qualified Ganeti.HTools.Node as Node
58

    
59
-- | Options list and functions.
60
options :: IO [OptType]
61
options = do
62
  luxi <- oLuxiSocket
63
  return
64
    [ luxi
65
    ]
66

    
67
arguments :: [ArgCompletion]
68
arguments = []
69

    
70
data InstanceData = InstanceData { arInstance :: Instance.Instance
71
                                 , arState :: AutoRepairStatus
72
                                 , tagsToRemove :: [String]
73
                                 }
74
                    deriving (Eq, Show)
75

    
76
-- | Parse a tag into an 'AutoRepairData' record.
77
--
78
-- @Nothing@ is returned if the tag is not an auto-repair tag, or if it's
79
-- malformed.
80
parseInitTag :: String -> Maybe AutoRepairData
81
parseInitTag tag =
82
  let parsePending = do
83
        subtag <- chompPrefix C.autoRepairTagPending tag
84
        case sepSplit ':' subtag of
85
          [rtype, uuid, ts, jobs] -> makeArData rtype uuid ts jobs
86
          _                       -> fail ("Invalid tag: " ++ show tag)
87

    
88
      parseResult = do
89
        subtag <- chompPrefix C.autoRepairTagResult tag
90
        case sepSplit ':' subtag of
91
          [rtype, uuid, ts, result, jobs] -> do
92
            arData <- makeArData rtype uuid ts jobs
93
            result' <- autoRepairResultFromRaw result
94
            return arData { arResult = Just result' }
95
          _                               -> fail ("Invalid tag: " ++ show tag)
96

    
97
      makeArData rtype uuid ts jobs = do
98
        rtype' <- autoRepairTypeFromRaw rtype
99
        ts' <- tryRead "auto-repair time" ts
100
        jobs' <- mapM makeJobIdS $ sepSplit '+' jobs
101
        return AutoRepairData { arType = rtype'
102
                              , arUuid = uuid
103
                              , arTime = TOD ts' 0
104
                              , arJobs = jobs'
105
                              , arResult = Nothing
106
                              , arTag = tag
107
                              }
108
  in
109
   parsePending `mplus` parseResult
110

    
111
-- | Return the 'AutoRepairData' element of an 'AutoRepairStatus' type.
112
getArData :: AutoRepairStatus -> Maybe AutoRepairData
113
getArData status =
114
  case status of
115
    ArHealthy (Just d) -> Just d
116
    ArFailedRepair  d  -> Just d
117
    ArPendingRepair d  -> Just d
118
    ArNeedsRepair   d  -> Just d
119
    _                  -> Nothing
120

    
121
-- | Return a short name for each auto-repair status.
122
--
123
-- This is a more concise representation of the status, because the default
124
-- "Show" formatting includes all the accompanying auto-repair data.
125
arStateName :: AutoRepairStatus -> String
126
arStateName status =
127
  case status of
128
    ArHealthy _       -> "Healthy"
129
    ArFailedRepair _  -> "Failure"
130
    ArPendingRepair _ -> "Pending repair"
131
    ArNeedsRepair _   -> "Needs repair"
132

    
133
-- | Return a new list of tags to remove that includes @arTag@ if present.
134
delCurTag :: InstanceData -> [String]
135
delCurTag instData =
136
  let arData = getArData $ arState instData
137
      rmTags = tagsToRemove instData
138
  in
139
   case arData of
140
     Just d  -> arTag d : rmTags
141
     Nothing -> rmTags
142

    
143
-- | Set the initial auto-repair state of an instance from its auto-repair tags.
144
--
145
-- The rules when there are multiple tags is:
146
--
147
--   * the earliest failure result always wins
148
--
149
--   * two or more pending repairs results in a fatal error
150
--
151
--   * a pending result from id X and a success result from id Y result in error
152
--     if Y is newer than X
153
--
154
--   * if there are no pending repairs, the newest success result wins,
155
--     otherwise the pending result is used.
156
setInitialState :: Instance.Instance -> Result InstanceData
157
setInitialState inst =
158
  let arData = mapMaybe parseInitTag $ Instance.allTags inst
159
      -- Group all the AutoRepairData records by id (i.e. by repair task), and
160
      -- present them from oldest to newest.
161
      arData' = sortBy (comparing arUuid) arData
162
      arGroups = groupBy ((==) `on` arUuid) arData'
163
      arGroups' = sortBy (comparing $ minimum . map arTime) arGroups
164
  in
165
   foldM arStatusCmp (InstanceData inst (ArHealthy Nothing) []) arGroups'
166

    
167
-- | Update the initial status of an instance with new repair task tags.
168
--
169
-- This function gets called once per repair group in an instance's tag, and it
170
-- determines whether to set the status of the instance according to this new
171
-- group, or to keep the existing state. See the documentation for
172
-- 'setInitialState' for the rules to be followed when determining this.
173
arStatusCmp :: InstanceData -> [AutoRepairData] -> Result InstanceData
174
arStatusCmp instData arData =
175
  let curSt = arState instData
176
      arData' = sortBy (comparing keyfn) arData
177
      keyfn d = (arResult d, arTime d)
178
      newData = last arData'
179
      newSt = case arResult newData of
180
                Just ArSuccess -> ArHealthy $ Just newData
181
                Just ArEnoperm -> ArHealthy $ Just newData
182
                Just ArFailure -> ArFailedRepair newData
183
                Nothing        -> ArPendingRepair newData
184
  in
185
   case curSt of
186
     ArFailedRepair _ -> Ok instData  -- Always keep the earliest failure.
187
     ArHealthy _      -> Ok instData { arState = newSt
188
                                     , tagsToRemove = delCurTag instData
189
                                     }
190
     ArPendingRepair d -> Bad (
191
       "An unfinished repair was found in instance " ++
192
       Instance.name (arInstance instData) ++ ": found tag " ++
193
       show (arTag newData) ++ ", but older pending tag " ++
194
       show (arTag d) ++ "exists.")
195

    
196
     ArNeedsRepair _ -> Bad
197
       "programming error: ArNeedsRepair found as an initial state"
198

    
199
-- | Query jobs of a pending repair, returning the new instance data.
200
processPending :: L.Client -> InstanceData -> IO InstanceData
201
processPending client instData =
202
  case arState instData of
203
    (ArPendingRepair arData) -> do
204
      sts <- L.queryJobsStatus client $ arJobs arData
205
      time <- getClockTime
206
      case sts of
207
        Bad e -> exitErr $ "could not check job status: " ++ formatError e
208
        Ok sts' ->
209
          if any (<= JOB_STATUS_RUNNING) sts' then
210
            return instData -- (no change)
211
          else do
212
            let iname = Instance.name $ arInstance instData
213
                srcSt = arStateName $ arState instData
214
                destSt = arStateName arState'
215
            putStrLn ("Moving " ++ iname ++ " from " ++ show srcSt ++ " to " ++
216
                      show destSt)
217
            commitChange client instData'
218
          where
219
            instData' =
220
              instData { arState = arState'
221
                       , tagsToRemove = delCurTag instData
222
                       }
223
            arState' =
224
              if all (== JOB_STATUS_SUCCESS) sts' then
225
                ArHealthy $ Just (updateTag $ arData { arResult = Just ArSuccess
226
                                                     , arTime = time })
227
              else
228
                ArFailedRepair (updateTag $ arData { arResult = Just ArFailure
229
                                                   , arTime = time })
230

    
231
    _ -> return instData
232

    
233
-- | Update the tag of an 'AutoRepairData' record to match all the other fields.
234
updateTag :: AutoRepairData -> AutoRepairData
235
updateTag arData =
236
  let ini = [autoRepairTypeToRaw $ arType arData,
237
             arUuid arData,
238
             clockTimeToString $ arTime arData]
239
      end = [intercalate "+" . map (show . fromJobId) $ arJobs arData]
240
      (pfx, middle) =
241
         case arResult arData of
242
          Nothing -> (C.autoRepairTagPending, [])
243
          Just rs -> (C.autoRepairTagResult, [autoRepairResultToRaw rs])
244
  in
245
   arData { arTag = pfx ++ intercalate ":" (ini ++ middle ++ end) }
246

    
247
-- | Apply and remove tags from an instance as indicated by 'InstanceData'.
248
--
249
-- If the /arState/ of the /InstanceData/ record has an associated
250
-- 'AutoRepairData', add its tag to the instance object. Additionally, if
251
-- /tagsToRemove/ is not empty, remove those tags from the instance object. The
252
-- returned /InstanceData/ object always has an empty /tagsToRemove/.
253
commitChange :: L.Client -> InstanceData -> IO InstanceData
254
commitChange client instData = do
255
  let iname = Instance.name $ arInstance instData
256
      arData = getArData $ arState instData
257
      rmTags = tagsToRemove instData
258
      execJobsWaitOk' opcodes = do
259
        res <- execJobsWaitOk [map wrapOpCode opcodes] client
260
        case res of
261
          Ok _ -> return ()
262
          Bad e -> exitErr e
263

    
264
  when (isJust arData) $ do
265
    let tag = arTag $ fromJust arData
266
    putStrLn (">>> Adding the following tag to " ++ iname ++ ":\n" ++ show tag)
267
    execJobsWaitOk' [OpTagsSet (TagInstance iname) [tag]]
268

    
269
  unless (null rmTags) $ do
270
    putStr (">>> Removing the following tags from " ++ iname ++ ":\n" ++
271
            unlines (map show rmTags))
272
    execJobsWaitOk' [OpTagsDel (TagInstance iname) rmTags]
273

    
274
  return instData { tagsToRemove = [] }
275

    
276
-- | Detect brokeness with an instance and suggest repair type and jobs to run.
277
detectBroken :: Node.List -> Instance.Instance
278
             -> Maybe (AutoRepairType, [OpCode])
279
detectBroken nl inst =
280
  let disk = Instance.diskTemplate inst
281
      iname = Instance.name inst
282
      offPri = Node.offline $ Container.find (Instance.pNode inst) nl
283
      offSec = Node.offline $ Container.find (Instance.sNode inst) nl
284
  in
285
   case disk of
286
     DTDrbd8
287
       | offPri && offSec ->
288
         Just (
289
           ArReinstall,
290
           [ OpInstanceRecreateDisks { opInstanceName = iname
291
                                     , opRecreateDisksInfo = RecreateDisksAll
292
                                     , opNodes = []
293
                                       -- FIXME: there should be a better way to
294
                                       -- specify opcode paramteres than abusing
295
                                       -- mkNonEmpty in this way (using the fact
296
                                       -- that Maybe is used both for optional
297
                                       -- fields, and to express failure).
298
                                     , opIallocator = mkNonEmpty "hail"
299
                                     }
300
           , OpInstanceReinstall { opInstanceName = iname
301
                                 , opOsType = Nothing
302
                                 , opTempOsParams = Nothing
303
                                 , opForceVariant = False
304
                                 }
305
           ])
306
       | offPri ->
307
         Just (
308
           ArFailover,
309
           [ OpInstanceFailover { opInstanceName = iname
310
                                  -- FIXME: ditto, see above.
311
                                , opShutdownTimeout = fromJust $ mkNonNegative
312
                                                      C.defaultShutdownTimeout
313
                                , opIgnoreConsistency = False
314
                                , opTargetNode = Nothing
315
                                , opIgnoreIpolicy = False
316
                                , opIallocator = Nothing
317
                                }
318
           ])
319
       | offSec ->
320
         Just (
321
           ArFixStorage,
322
           [ OpInstanceReplaceDisks { opInstanceName = iname
323
                                    , opReplaceDisksMode = ReplaceNewSecondary
324
                                    , opReplaceDisksList = []
325
                                    , opRemoteNode = Nothing
326
                                      -- FIXME: ditto, see above.
327
                                    , opIallocator = mkNonEmpty "hail"
328
                                    , opEarlyRelease = False
329
                                    , opIgnoreIpolicy = False
330
                                    }
331
            ])
332
       | otherwise -> Nothing
333

    
334
     DTPlain
335
       | offPri ->
336
         Just (
337
           ArReinstall,
338
           [ OpInstanceRecreateDisks { opInstanceName = iname
339
                                     , opRecreateDisksInfo = RecreateDisksAll
340
                                     , opNodes = []
341
                                       -- FIXME: ditto, see above.
342
                                     , opIallocator = mkNonEmpty "hail"
343
                                     }
344
           , OpInstanceReinstall { opInstanceName = iname
345
                                 , opOsType = Nothing
346
                                 , opTempOsParams = Nothing
347
                                 , opForceVariant = False
348
                                 }
349
           ])
350
       | otherwise -> Nothing
351

    
352
     _ -> Nothing  -- Other cases are unimplemented for now: DTDiskless,
353
                   -- DTFile, DTSharedFile, DTBlock, DTRbd, DTExt.
354

    
355
-- | Main function.
356
main :: Options -> [String] -> IO ()
357
main opts args = do
358
  unless (null args) $
359
    exitErr "this program doesn't take any arguments."
360

    
361
  luxiDef <- Path.defaultLuxiSocket
362
  let master = fromMaybe luxiDef $ optLuxi opts
363
      opts' = opts { optLuxi = Just master }
364

    
365
  (ClusterData _ nl il _ _) <- loadExternalData opts'
366

    
367
  let iniDataRes = mapM setInitialState $ Container.elems il
368
  iniData <- exitIfBad "when parsing auto-repair tags" iniDataRes
369

    
370
  -- First step: check all pending repairs, see if they are completed.
371
  iniData' <- bracket (L.getClient master) L.closeClient $
372
              forM iniData . processPending
373

    
374
  -- Second step: detect any problems.
375
  let _unused_repairs = map (detectBroken nl . arInstance) iniData'
376

    
377
  return ()