Statistics
| Branch: | Tag: | Revision:

root / src / Ganeti / HTools / Program / Harep.hs @ a5e58418

History | View | Annotate | Download (10.5 kB)

1
{-| Auto-repair tool for Ganeti.
2

    
3
-}
4

    
5
{-
6

    
7
Copyright (C) 2013 Google Inc.
8

    
9
This program is free software; you can redistribute it and/or modify
10
it under the terms of the GNU General Public License as published by
11
the Free Software Foundation; either version 2 of the License, or
12
(at your option) any later version.
13

    
14
This program is distributed in the hope that it will be useful, but
15
WITHOUT ANY WARRANTY; without even the implied warranty of
16
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
General Public License for more details.
18

    
19
You should have received a copy of the GNU General Public License
20
along with this program; if not, write to the Free Software
21
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22
02110-1301, USA.
23

    
24
-}
25

    
26
module Ganeti.HTools.Program.Harep
27
  ( main
28
  , arguments
29
  , options) where
30

    
31
import Control.Exception (bracket)
32
import Control.Monad
33
import Data.Function
34
import Data.List
35
import Data.Maybe
36
import Data.Ord
37
import System.Time
38

    
39
import Ganeti.BasicTypes
40
import Ganeti.Common
41
import Ganeti.Errors
42
import Ganeti.Jobs
43
import Ganeti.OpCodes
44
import Ganeti.Types
45
import Ganeti.Utils
46
import qualified Ganeti.Constants as C
47
import qualified Ganeti.Luxi as L
48
import qualified Ganeti.Path as Path
49

    
50
import Ganeti.HTools.CLI
51
import Ganeti.HTools.Loader
52
import Ganeti.HTools.ExtLoader
53
import Ganeti.HTools.Types
54
import qualified Ganeti.HTools.Container as Container
55
import qualified Ganeti.HTools.Instance as Instance
56

    
57
-- | Options list and functions.
58
options :: IO [OptType]
59
options = do
60
  luxi <- oLuxiSocket
61
  return
62
    [ luxi
63
    ]
64

    
65
arguments :: [ArgCompletion]
66
arguments = []
67

    
68
data InstanceData = InstanceData { arInstance :: Instance.Instance
69
                                 , arState :: AutoRepairStatus
70
                                 , tagsToRemove :: [String]
71
                                 }
72
                    deriving (Eq, Show)
73

    
74
-- | Parse a tag into an 'AutoRepairData' record.
75
--
76
-- @Nothing@ is returned if the tag is not an auto-repair tag, or if it's
77
-- malformed.
78
parseInitTag :: String -> Maybe AutoRepairData
79
parseInitTag tag =
80
  let parsePending = do
81
        subtag <- chompPrefix C.autoRepairTagPending tag
82
        case sepSplit ':' subtag of
83
          [rtype, uuid, ts, jobs] -> makeArData rtype uuid ts jobs
84
          _                       -> fail ("Invalid tag: " ++ show tag)
85

    
86
      parseResult = do
87
        subtag <- chompPrefix C.autoRepairTagResult tag
88
        case sepSplit ':' subtag of
89
          [rtype, uuid, ts, result, jobs] -> do
90
            arData <- makeArData rtype uuid ts jobs
91
            result' <- autoRepairResultFromRaw result
92
            return arData { arResult = Just result' }
93
          _                               -> fail ("Invalid tag: " ++ show tag)
94

    
95
      makeArData rtype uuid ts jobs = do
96
        rtype' <- autoRepairTypeFromRaw rtype
97
        ts' <- tryRead "auto-repair time" ts
98
        jobs' <- mapM makeJobIdS $ sepSplit '+' jobs
99
        return AutoRepairData { arType = rtype'
100
                              , arUuid = uuid
101
                              , arTime = TOD ts' 0
102
                              , arJobs = jobs'
103
                              , arResult = Nothing
104
                              , arTag = tag
105
                              }
106
  in
107
   parsePending `mplus` parseResult
108

    
109
-- | Return the 'AutoRepairData' element of an 'AutoRepairStatus' type.
110
getArData :: AutoRepairStatus -> Maybe AutoRepairData
111
getArData status =
112
  case status of
113
    ArHealthy (Just d) -> Just d
114
    ArFailedRepair  d  -> Just d
115
    ArPendingRepair d  -> Just d
116
    ArNeedsRepair   d  -> Just d
117
    _                  -> Nothing
118

    
119
-- | Return a short name for each auto-repair status.
120
--
121
-- This is a more concise representation of the status, because the default
122
-- "Show" formatting includes all the accompanying auto-repair data.
123
arStateName :: AutoRepairStatus -> String
124
arStateName status =
125
  case status of
126
    ArHealthy _       -> "Healthy"
127
    ArFailedRepair _  -> "Failure"
128
    ArPendingRepair _ -> "Pending repair"
129
    ArNeedsRepair _   -> "Needs repair"
130

    
131
-- | Return a new list of tags to remove that includes @arTag@ if present.
132
delCurTag :: InstanceData -> [String]
133
delCurTag instData =
134
  let arData = getArData $ arState instData
135
      rmTags = tagsToRemove instData
136
  in
137
   case arData of
138
     Just d  -> arTag d : rmTags
139
     Nothing -> rmTags
140

    
141
-- | Set the initial auto-repair state of an instance from its auto-repair tags.
142
--
143
-- The rules when there are multiple tags is:
144
--
145
--   * the earliest failure result always wins
146
--
147
--   * two or more pending repairs results in a fatal error
148
--
149
--   * a pending result from id X and a success result from id Y result in error
150
--     if Y is newer than X
151
--
152
--   * if there are no pending repairs, the newest success result wins,
153
--     otherwise the pending result is used.
154
setInitialState :: Instance.Instance -> Result InstanceData
155
setInitialState inst =
156
  let arData = mapMaybe parseInitTag $ Instance.allTags inst
157
      -- Group all the AutoRepairData records by id (i.e. by repair task), and
158
      -- present them from oldest to newest.
159
      arData' = sortBy (comparing arUuid) arData
160
      arGroups = groupBy ((==) `on` arUuid) arData'
161
      arGroups' = sortBy (comparing $ minimum . map arTime) arGroups
162
  in
163
   foldM arStatusCmp (InstanceData inst (ArHealthy Nothing) []) arGroups'
164

    
165
-- | Update the initial status of an instance with new repair task tags.
166
--
167
-- This function gets called once per repair group in an instance's tag, and it
168
-- determines whether to set the status of the instance according to this new
169
-- group, or to keep the existing state. See the documentation for
170
-- 'setInitialState' for the rules to be followed when determining this.
171
arStatusCmp :: InstanceData -> [AutoRepairData] -> Result InstanceData
172
arStatusCmp instData arData =
173
  let curSt = arState instData
174
      arData' = sortBy (comparing keyfn) arData
175
      keyfn d = (arResult d, arTime d)
176
      newData = last arData'
177
      newSt = case arResult newData of
178
                Just ArSuccess -> ArHealthy $ Just newData
179
                Just ArEnoperm -> ArHealthy $ Just newData
180
                Just ArFailure -> ArFailedRepair newData
181
                Nothing        -> ArPendingRepair newData
182
  in
183
   case curSt of
184
     ArFailedRepair _ -> Ok instData  -- Always keep the earliest failure.
185
     ArHealthy _      -> Ok instData { arState = newSt
186
                                     , tagsToRemove = delCurTag instData
187
                                     }
188
     ArPendingRepair d -> Bad (
189
       "An unfinished repair was found in instance " ++
190
       Instance.name (arInstance instData) ++ ": found tag " ++
191
       show (arTag newData) ++ ", but older pending tag " ++
192
       show (arTag d) ++ "exists.")
193

    
194
     ArNeedsRepair _ -> Bad
195
       "programming error: ArNeedsRepair found as an initial state"
196

    
197
-- | Query jobs of a pending repair, returning the new instance data.
198
processPending :: L.Client -> InstanceData -> IO InstanceData
199
processPending client instData =
200
  case arState instData of
201
    (ArPendingRepair arData) -> do
202
      sts <- L.queryJobsStatus client $ arJobs arData
203
      time <- getClockTime
204
      case sts of
205
        Bad e -> exitErr $ "could not check job status: " ++ formatError e
206
        Ok sts' ->
207
          if any (<= JOB_STATUS_RUNNING) sts' then
208
            return instData -- (no change)
209
          else do
210
            let iname = Instance.name $ arInstance instData
211
                srcSt = arStateName $ arState instData
212
                destSt = arStateName arState'
213
            putStrLn ("Moving " ++ iname ++ " from " ++ show srcSt ++ " to " ++
214
                      show destSt)
215
            commitChange client instData'
216
          where
217
            instData' =
218
              instData { arState = arState'
219
                       , tagsToRemove = delCurTag instData
220
                       }
221
            arState' =
222
              if all (== JOB_STATUS_SUCCESS) sts' then
223
                ArHealthy $ Just (updateTag $ arData { arResult = Just ArSuccess
224
                                                     , arTime = time })
225
              else
226
                ArFailedRepair (updateTag $ arData { arResult = Just ArFailure
227
                                                   , arTime = time })
228

    
229
    _ -> return instData
230

    
231
-- | Update the tag of an 'AutoRepairData' record to match all the other fields.
232
updateTag :: AutoRepairData -> AutoRepairData
233
updateTag arData =
234
  let ini = [autoRepairTypeToRaw $ arType arData,
235
             arUuid arData,
236
             clockTimeToString $ arTime arData]
237
      end = [intercalate "+" . map (show . fromJobId) $ arJobs arData]
238
      (pfx, middle) =
239
         case arResult arData of
240
          Nothing -> (C.autoRepairTagPending, [])
241
          Just rs -> (C.autoRepairTagResult, [autoRepairResultToRaw rs])
242
  in
243
   arData { arTag = pfx ++ intercalate ":" (ini ++ middle ++ end) }
244

    
245
-- | Apply and remove tags from an instance as indicated by 'InstanceData'.
246
--
247
-- If the /arState/ of the /InstanceData/ record has an associated
248
-- 'AutoRepairData', add its tag to the instance object. Additionally, if
249
-- /tagsToRemove/ is not empty, remove those tags from the instance object. The
250
-- returned /InstanceData/ object always has an empty /tagsToRemove/.
251
commitChange :: L.Client -> InstanceData -> IO InstanceData
252
commitChange client instData = do
253
  let iname = Instance.name $ arInstance instData
254
      arData = getArData $ arState instData
255
      rmTags = tagsToRemove instData
256
      execJobsWaitOk' opcodes = do
257
        res <- execJobsWaitOk [map wrapOpCode opcodes] client
258
        case res of
259
          Ok _ -> return ()
260
          Bad e -> exitErr e
261

    
262
  when (isJust arData) $ do
263
    let tag = arTag $ fromJust arData
264
    putStrLn (">>> Adding the following tag to " ++ iname ++ ":\n" ++ show tag)
265
    execJobsWaitOk' [OpTagsSet (TagInstance iname) [tag]]
266

    
267
  unless (null rmTags) $ do
268
    putStr (">>> Removing the following tags from " ++ iname ++ ":\n" ++
269
            unlines (map show rmTags))
270
    execJobsWaitOk' [OpTagsDel (TagInstance iname) rmTags]
271

    
272
  return instData { tagsToRemove = [] }
273

    
274
-- | Main function.
275
main :: Options -> [String] -> IO ()
276
main opts args = do
277
  unless (null args) $
278
    exitErr "this program doesn't take any arguments."
279

    
280
  luxiDef <- Path.defaultLuxiSocket
281
  let master = fromMaybe luxiDef $ optLuxi opts
282
      opts' = opts { optLuxi = Just master }
283

    
284
  (ClusterData _ _ il _ _) <- loadExternalData opts'
285

    
286
  let iniDataRes = mapM setInitialState $ Container.elems il
287
  iniData <- exitIfBad "when parsing auto-repair tags" iniDataRes
288

    
289
  -- First step: check all pending repairs, see if they are completed.
290
  _unused_iniData' <- bracket (L.getClient master) L.closeClient $
291
                      forM iniData . processPending
292

    
293
  return ()