Statistics
| Branch: | Tag: | Revision:

root / src / Ganeti / HTools / Program / Harep.hs @ d605e261

History | View | Annotate | Download (19.5 kB)

1 41238d27 Dato Simó
{-# LANGUAGE TupleSections #-}
2 41238d27 Dato Simó
3 b6d9bec8 Dato Simó
{-| Auto-repair tool for Ganeti.
4 b6d9bec8 Dato Simó
5 b6d9bec8 Dato Simó
-}
6 b6d9bec8 Dato Simó
7 b6d9bec8 Dato Simó
{-
8 b6d9bec8 Dato Simó
9 b6d9bec8 Dato Simó
Copyright (C) 2013 Google Inc.
10 b6d9bec8 Dato Simó
11 b6d9bec8 Dato Simó
This program is free software; you can redistribute it and/or modify
12 b6d9bec8 Dato Simó
it under the terms of the GNU General Public License as published by
13 b6d9bec8 Dato Simó
the Free Software Foundation; either version 2 of the License, or
14 b6d9bec8 Dato Simó
(at your option) any later version.
15 b6d9bec8 Dato Simó
16 b6d9bec8 Dato Simó
This program is distributed in the hope that it will be useful, but
17 b6d9bec8 Dato Simó
WITHOUT ANY WARRANTY; without even the implied warranty of
18 b6d9bec8 Dato Simó
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19 b6d9bec8 Dato Simó
General Public License for more details.
20 b6d9bec8 Dato Simó
21 b6d9bec8 Dato Simó
You should have received a copy of the GNU General Public License
22 b6d9bec8 Dato Simó
along with this program; if not, write to the Free Software
23 b6d9bec8 Dato Simó
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
24 b6d9bec8 Dato Simó
02110-1301, USA.
25 b6d9bec8 Dato Simó
26 b6d9bec8 Dato Simó
-}
27 b6d9bec8 Dato Simó
28 b6d9bec8 Dato Simó
module Ganeti.HTools.Program.Harep
29 b6d9bec8 Dato Simó
  ( main
30 b6d9bec8 Dato Simó
  , arguments
31 b6d9bec8 Dato Simó
  , options) where
32 b6d9bec8 Dato Simó
33 a5e58418 Dato Simó
import Control.Exception (bracket)
34 b6d9bec8 Dato Simó
import Control.Monad
35 3416e3e7 Dato Simó
import Data.Function
36 3416e3e7 Dato Simó
import Data.List
37 3416e3e7 Dato Simó
import Data.Maybe
38 3416e3e7 Dato Simó
import Data.Ord
39 3416e3e7 Dato Simó
import System.Time
40 41238d27 Dato Simó
import qualified Data.Map as Map
41 b6d9bec8 Dato Simó
42 3416e3e7 Dato Simó
import Ganeti.BasicTypes
43 b6d9bec8 Dato Simó
import Ganeti.Common
44 a5e58418 Dato Simó
import Ganeti.Errors
45 a5e58418 Dato Simó
import Ganeti.Jobs
46 a5e58418 Dato Simó
import Ganeti.OpCodes
47 4ba5f148 Dato Simó
import Ganeti.OpParams
48 3416e3e7 Dato Simó
import Ganeti.Types
49 b6d9bec8 Dato Simó
import Ganeti.Utils
50 3416e3e7 Dato Simó
import qualified Ganeti.Constants as C
51 a5e58418 Dato Simó
import qualified Ganeti.Luxi as L
52 3416e3e7 Dato Simó
import qualified Ganeti.Path as Path
53 b6d9bec8 Dato Simó
54 b6d9bec8 Dato Simó
import Ganeti.HTools.CLI
55 3416e3e7 Dato Simó
import Ganeti.HTools.Loader
56 3416e3e7 Dato Simó
import Ganeti.HTools.ExtLoader
57 3416e3e7 Dato Simó
import Ganeti.HTools.Types
58 3416e3e7 Dato Simó
import qualified Ganeti.HTools.Container as Container
59 3416e3e7 Dato Simó
import qualified Ganeti.HTools.Instance as Instance
60 4ba5f148 Dato Simó
import qualified Ganeti.HTools.Node as Node
61 b6d9bec8 Dato Simó
62 b6d9bec8 Dato Simó
-- | Options list and functions.
63 b6d9bec8 Dato Simó
options :: IO [OptType]
64 b6d9bec8 Dato Simó
options = do
65 b6d9bec8 Dato Simó
  luxi <- oLuxiSocket
66 b6d9bec8 Dato Simó
  return
67 b6d9bec8 Dato Simó
    [ luxi
68 8af72964 Dato Simó
    , oJobDelay
69 b6d9bec8 Dato Simó
    ]
70 b6d9bec8 Dato Simó
71 b6d9bec8 Dato Simó
arguments :: [ArgCompletion]
72 b6d9bec8 Dato Simó
arguments = []
73 b6d9bec8 Dato Simó
74 3416e3e7 Dato Simó
data InstanceData = InstanceData { arInstance :: Instance.Instance
75 3416e3e7 Dato Simó
                                 , arState :: AutoRepairStatus
76 3416e3e7 Dato Simó
                                 , tagsToRemove :: [String]
77 3416e3e7 Dato Simó
                                 }
78 3416e3e7 Dato Simó
                    deriving (Eq, Show)
79 3416e3e7 Dato Simó
80 3416e3e7 Dato Simó
-- | Parse a tag into an 'AutoRepairData' record.
81 3416e3e7 Dato Simó
--
82 3416e3e7 Dato Simó
-- @Nothing@ is returned if the tag is not an auto-repair tag, or if it's
83 3416e3e7 Dato Simó
-- malformed.
84 3416e3e7 Dato Simó
parseInitTag :: String -> Maybe AutoRepairData
85 3416e3e7 Dato Simó
parseInitTag tag =
86 3416e3e7 Dato Simó
  let parsePending = do
87 3416e3e7 Dato Simó
        subtag <- chompPrefix C.autoRepairTagPending tag
88 3416e3e7 Dato Simó
        case sepSplit ':' subtag of
89 3416e3e7 Dato Simó
          [rtype, uuid, ts, jobs] -> makeArData rtype uuid ts jobs
90 3416e3e7 Dato Simó
          _                       -> fail ("Invalid tag: " ++ show tag)
91 3416e3e7 Dato Simó
92 3416e3e7 Dato Simó
      parseResult = do
93 3416e3e7 Dato Simó
        subtag <- chompPrefix C.autoRepairTagResult tag
94 3416e3e7 Dato Simó
        case sepSplit ':' subtag of
95 3416e3e7 Dato Simó
          [rtype, uuid, ts, result, jobs] -> do
96 3416e3e7 Dato Simó
            arData <- makeArData rtype uuid ts jobs
97 3416e3e7 Dato Simó
            result' <- autoRepairResultFromRaw result
98 3416e3e7 Dato Simó
            return arData { arResult = Just result' }
99 3416e3e7 Dato Simó
          _                               -> fail ("Invalid tag: " ++ show tag)
100 3416e3e7 Dato Simó
101 3416e3e7 Dato Simó
      makeArData rtype uuid ts jobs = do
102 3416e3e7 Dato Simó
        rtype' <- autoRepairTypeFromRaw rtype
103 3416e3e7 Dato Simó
        ts' <- tryRead "auto-repair time" ts
104 3416e3e7 Dato Simó
        jobs' <- mapM makeJobIdS $ sepSplit '+' jobs
105 3416e3e7 Dato Simó
        return AutoRepairData { arType = rtype'
106 3416e3e7 Dato Simó
                              , arUuid = uuid
107 3416e3e7 Dato Simó
                              , arTime = TOD ts' 0
108 3416e3e7 Dato Simó
                              , arJobs = jobs'
109 3416e3e7 Dato Simó
                              , arResult = Nothing
110 3416e3e7 Dato Simó
                              , arTag = tag
111 3416e3e7 Dato Simó
                              }
112 3416e3e7 Dato Simó
  in
113 3416e3e7 Dato Simó
   parsePending `mplus` parseResult
114 3416e3e7 Dato Simó
115 3416e3e7 Dato Simó
-- | Return the 'AutoRepairData' element of an 'AutoRepairStatus' type.
116 3416e3e7 Dato Simó
getArData :: AutoRepairStatus -> Maybe AutoRepairData
117 3416e3e7 Dato Simó
getArData status =
118 3416e3e7 Dato Simó
  case status of
119 3416e3e7 Dato Simó
    ArHealthy (Just d) -> Just d
120 3416e3e7 Dato Simó
    ArFailedRepair  d  -> Just d
121 3416e3e7 Dato Simó
    ArPendingRepair d  -> Just d
122 3416e3e7 Dato Simó
    ArNeedsRepair   d  -> Just d
123 3416e3e7 Dato Simó
    _                  -> Nothing
124 3416e3e7 Dato Simó
125 a5e58418 Dato Simó
-- | Return a short name for each auto-repair status.
126 a5e58418 Dato Simó
--
127 a5e58418 Dato Simó
-- This is a more concise representation of the status, because the default
128 a5e58418 Dato Simó
-- "Show" formatting includes all the accompanying auto-repair data.
129 a5e58418 Dato Simó
arStateName :: AutoRepairStatus -> String
130 a5e58418 Dato Simó
arStateName status =
131 a5e58418 Dato Simó
  case status of
132 a5e58418 Dato Simó
    ArHealthy _       -> "Healthy"
133 a5e58418 Dato Simó
    ArFailedRepair _  -> "Failure"
134 a5e58418 Dato Simó
    ArPendingRepair _ -> "Pending repair"
135 a5e58418 Dato Simó
    ArNeedsRepair _   -> "Needs repair"
136 a5e58418 Dato Simó
137 3416e3e7 Dato Simó
-- | Return a new list of tags to remove that includes @arTag@ if present.
138 3416e3e7 Dato Simó
delCurTag :: InstanceData -> [String]
139 3416e3e7 Dato Simó
delCurTag instData =
140 3416e3e7 Dato Simó
  let arData = getArData $ arState instData
141 3416e3e7 Dato Simó
      rmTags = tagsToRemove instData
142 3416e3e7 Dato Simó
  in
143 3416e3e7 Dato Simó
   case arData of
144 3416e3e7 Dato Simó
     Just d  -> arTag d : rmTags
145 3416e3e7 Dato Simó
     Nothing -> rmTags
146 3416e3e7 Dato Simó
147 3416e3e7 Dato Simó
-- | Set the initial auto-repair state of an instance from its auto-repair tags.
148 3416e3e7 Dato Simó
--
149 3416e3e7 Dato Simó
-- The rules when there are multiple tags is:
150 3416e3e7 Dato Simó
--
151 3416e3e7 Dato Simó
--   * the earliest failure result always wins
152 3416e3e7 Dato Simó
--
153 3416e3e7 Dato Simó
--   * two or more pending repairs results in a fatal error
154 3416e3e7 Dato Simó
--
155 3416e3e7 Dato Simó
--   * a pending result from id X and a success result from id Y result in error
156 3416e3e7 Dato Simó
--     if Y is newer than X
157 3416e3e7 Dato Simó
--
158 3416e3e7 Dato Simó
--   * if there are no pending repairs, the newest success result wins,
159 3416e3e7 Dato Simó
--     otherwise the pending result is used.
160 3416e3e7 Dato Simó
setInitialState :: Instance.Instance -> Result InstanceData
161 3416e3e7 Dato Simó
setInitialState inst =
162 3416e3e7 Dato Simó
  let arData = mapMaybe parseInitTag $ Instance.allTags inst
163 3416e3e7 Dato Simó
      -- Group all the AutoRepairData records by id (i.e. by repair task), and
164 3416e3e7 Dato Simó
      -- present them from oldest to newest.
165 3416e3e7 Dato Simó
      arData' = sortBy (comparing arUuid) arData
166 3416e3e7 Dato Simó
      arGroups = groupBy ((==) `on` arUuid) arData'
167 3416e3e7 Dato Simó
      arGroups' = sortBy (comparing $ minimum . map arTime) arGroups
168 3416e3e7 Dato Simó
  in
169 3416e3e7 Dato Simó
   foldM arStatusCmp (InstanceData inst (ArHealthy Nothing) []) arGroups'
170 3416e3e7 Dato Simó
171 3416e3e7 Dato Simó
-- | Update the initial status of an instance with new repair task tags.
172 3416e3e7 Dato Simó
--
173 3416e3e7 Dato Simó
-- This function gets called once per repair group in an instance's tag, and it
174 3416e3e7 Dato Simó
-- determines whether to set the status of the instance according to this new
175 3416e3e7 Dato Simó
-- group, or to keep the existing state. See the documentation for
176 3416e3e7 Dato Simó
-- 'setInitialState' for the rules to be followed when determining this.
177 3416e3e7 Dato Simó
arStatusCmp :: InstanceData -> [AutoRepairData] -> Result InstanceData
178 3416e3e7 Dato Simó
arStatusCmp instData arData =
179 3416e3e7 Dato Simó
  let curSt = arState instData
180 3416e3e7 Dato Simó
      arData' = sortBy (comparing keyfn) arData
181 3416e3e7 Dato Simó
      keyfn d = (arResult d, arTime d)
182 3416e3e7 Dato Simó
      newData = last arData'
183 3416e3e7 Dato Simó
      newSt = case arResult newData of
184 3416e3e7 Dato Simó
                Just ArSuccess -> ArHealthy $ Just newData
185 3416e3e7 Dato Simó
                Just ArEnoperm -> ArHealthy $ Just newData
186 3416e3e7 Dato Simó
                Just ArFailure -> ArFailedRepair newData
187 3416e3e7 Dato Simó
                Nothing        -> ArPendingRepair newData
188 3416e3e7 Dato Simó
  in
189 3416e3e7 Dato Simó
   case curSt of
190 3416e3e7 Dato Simó
     ArFailedRepair _ -> Ok instData  -- Always keep the earliest failure.
191 3416e3e7 Dato Simó
     ArHealthy _      -> Ok instData { arState = newSt
192 3416e3e7 Dato Simó
                                     , tagsToRemove = delCurTag instData
193 3416e3e7 Dato Simó
                                     }
194 3416e3e7 Dato Simó
     ArPendingRepair d -> Bad (
195 3416e3e7 Dato Simó
       "An unfinished repair was found in instance " ++
196 3416e3e7 Dato Simó
       Instance.name (arInstance instData) ++ ": found tag " ++
197 3416e3e7 Dato Simó
       show (arTag newData) ++ ", but older pending tag " ++
198 3416e3e7 Dato Simó
       show (arTag d) ++ "exists.")
199 3416e3e7 Dato Simó
200 3416e3e7 Dato Simó
     ArNeedsRepair _ -> Bad
201 3416e3e7 Dato Simó
       "programming error: ArNeedsRepair found as an initial state"
202 3416e3e7 Dato Simó
203 a5e58418 Dato Simó
-- | Query jobs of a pending repair, returning the new instance data.
204 a5e58418 Dato Simó
processPending :: L.Client -> InstanceData -> IO InstanceData
205 a5e58418 Dato Simó
processPending client instData =
206 a5e58418 Dato Simó
  case arState instData of
207 a5e58418 Dato Simó
    (ArPendingRepair arData) -> do
208 a5e58418 Dato Simó
      sts <- L.queryJobsStatus client $ arJobs arData
209 a5e58418 Dato Simó
      time <- getClockTime
210 a5e58418 Dato Simó
      case sts of
211 a5e58418 Dato Simó
        Bad e -> exitErr $ "could not check job status: " ++ formatError e
212 a5e58418 Dato Simó
        Ok sts' ->
213 a5e58418 Dato Simó
          if any (<= JOB_STATUS_RUNNING) sts' then
214 a5e58418 Dato Simó
            return instData -- (no change)
215 a5e58418 Dato Simó
          else do
216 a5e58418 Dato Simó
            let iname = Instance.name $ arInstance instData
217 a5e58418 Dato Simó
                srcSt = arStateName $ arState instData
218 a5e58418 Dato Simó
                destSt = arStateName arState'
219 a5e58418 Dato Simó
            putStrLn ("Moving " ++ iname ++ " from " ++ show srcSt ++ " to " ++
220 a5e58418 Dato Simó
                      show destSt)
221 a5e58418 Dato Simó
            commitChange client instData'
222 a5e58418 Dato Simó
          where
223 a5e58418 Dato Simó
            instData' =
224 a5e58418 Dato Simó
              instData { arState = arState'
225 a5e58418 Dato Simó
                       , tagsToRemove = delCurTag instData
226 a5e58418 Dato Simó
                       }
227 a5e58418 Dato Simó
            arState' =
228 a5e58418 Dato Simó
              if all (== JOB_STATUS_SUCCESS) sts' then
229 a5e58418 Dato Simó
                ArHealthy $ Just (updateTag $ arData { arResult = Just ArSuccess
230 a5e58418 Dato Simó
                                                     , arTime = time })
231 a5e58418 Dato Simó
              else
232 a5e58418 Dato Simó
                ArFailedRepair (updateTag $ arData { arResult = Just ArFailure
233 a5e58418 Dato Simó
                                                   , arTime = time })
234 a5e58418 Dato Simó
235 a5e58418 Dato Simó
    _ -> return instData
236 a5e58418 Dato Simó
237 a5e58418 Dato Simó
-- | Update the tag of an 'AutoRepairData' record to match all the other fields.
238 a5e58418 Dato Simó
updateTag :: AutoRepairData -> AutoRepairData
239 a5e58418 Dato Simó
updateTag arData =
240 a5e58418 Dato Simó
  let ini = [autoRepairTypeToRaw $ arType arData,
241 a5e58418 Dato Simó
             arUuid arData,
242 a5e58418 Dato Simó
             clockTimeToString $ arTime arData]
243 a5e58418 Dato Simó
      end = [intercalate "+" . map (show . fromJobId) $ arJobs arData]
244 a5e58418 Dato Simó
      (pfx, middle) =
245 a5e58418 Dato Simó
         case arResult arData of
246 a5e58418 Dato Simó
          Nothing -> (C.autoRepairTagPending, [])
247 a5e58418 Dato Simó
          Just rs -> (C.autoRepairTagResult, [autoRepairResultToRaw rs])
248 a5e58418 Dato Simó
  in
249 a5e58418 Dato Simó
   arData { arTag = pfx ++ intercalate ":" (ini ++ middle ++ end) }
250 a5e58418 Dato Simó
251 a5e58418 Dato Simó
-- | Apply and remove tags from an instance as indicated by 'InstanceData'.
252 a5e58418 Dato Simó
--
253 a5e58418 Dato Simó
-- If the /arState/ of the /InstanceData/ record has an associated
254 a5e58418 Dato Simó
-- 'AutoRepairData', add its tag to the instance object. Additionally, if
255 a5e58418 Dato Simó
-- /tagsToRemove/ is not empty, remove those tags from the instance object. The
256 a5e58418 Dato Simó
-- returned /InstanceData/ object always has an empty /tagsToRemove/.
257 a5e58418 Dato Simó
commitChange :: L.Client -> InstanceData -> IO InstanceData
258 a5e58418 Dato Simó
commitChange client instData = do
259 a5e58418 Dato Simó
  let iname = Instance.name $ arInstance instData
260 a5e58418 Dato Simó
      arData = getArData $ arState instData
261 a5e58418 Dato Simó
      rmTags = tagsToRemove instData
262 a5e58418 Dato Simó
      execJobsWaitOk' opcodes = do
263 a5e58418 Dato Simó
        res <- execJobsWaitOk [map wrapOpCode opcodes] client
264 a5e58418 Dato Simó
        case res of
265 a5e58418 Dato Simó
          Ok _ -> return ()
266 a5e58418 Dato Simó
          Bad e -> exitErr e
267 a5e58418 Dato Simó
268 a5e58418 Dato Simó
  when (isJust arData) $ do
269 a5e58418 Dato Simó
    let tag = arTag $ fromJust arData
270 a5e58418 Dato Simó
    putStrLn (">>> Adding the following tag to " ++ iname ++ ":\n" ++ show tag)
271 5cbf7832 Jose A. Lopes
    execJobsWaitOk' [OpTagsSet TagKindInstance [tag] (Just iname)]
272 a5e58418 Dato Simó
273 a5e58418 Dato Simó
  unless (null rmTags) $ do
274 a5e58418 Dato Simó
    putStr (">>> Removing the following tags from " ++ iname ++ ":\n" ++
275 a5e58418 Dato Simó
            unlines (map show rmTags))
276 5cbf7832 Jose A. Lopes
    execJobsWaitOk' [OpTagsDel TagKindInstance rmTags (Just iname)]
277 a5e58418 Dato Simó
278 a5e58418 Dato Simó
  return instData { tagsToRemove = [] }
279 a5e58418 Dato Simó
280 546a1dcf Dato Simó
-- | Detect brokenness with an instance and suggest repair type and jobs to run.
281 4ba5f148 Dato Simó
detectBroken :: Node.List -> Instance.Instance
282 4ba5f148 Dato Simó
             -> Maybe (AutoRepairType, [OpCode])
283 4ba5f148 Dato Simó
detectBroken nl inst =
284 4ba5f148 Dato Simó
  let disk = Instance.diskTemplate inst
285 4ba5f148 Dato Simó
      iname = Instance.name inst
286 4ba5f148 Dato Simó
      offPri = Node.offline $ Container.find (Instance.pNode inst) nl
287 4ba5f148 Dato Simó
      offSec = Node.offline $ Container.find (Instance.sNode inst) nl
288 4ba5f148 Dato Simó
  in
289 4ba5f148 Dato Simó
   case disk of
290 4ba5f148 Dato Simó
     DTDrbd8
291 4ba5f148 Dato Simó
       | offPri && offSec ->
292 4ba5f148 Dato Simó
         Just (
293 4ba5f148 Dato Simó
           ArReinstall,
294 4ba5f148 Dato Simó
           [ OpInstanceRecreateDisks { opInstanceName = iname
295 da4a52a3 Thomas Thrainer
                                     , opInstanceUuid = Nothing
296 4ba5f148 Dato Simó
                                     , opRecreateDisksInfo = RecreateDisksAll
297 4ba5f148 Dato Simó
                                     , opNodes = []
298 4ba5f148 Dato Simó
                                       -- FIXME: there should be a better way to
299 546a1dcf Dato Simó
                                       -- specify opcode parameters than abusing
300 4ba5f148 Dato Simó
                                       -- mkNonEmpty in this way (using the fact
301 4ba5f148 Dato Simó
                                       -- that Maybe is used both for optional
302 4ba5f148 Dato Simó
                                       -- fields, and to express failure).
303 1c3231aa Thomas Thrainer
                                     , opNodeUuids = Nothing
304 4ba5f148 Dato Simó
                                     , opIallocator = mkNonEmpty "hail"
305 4ba5f148 Dato Simó
                                     }
306 4ba5f148 Dato Simó
           , OpInstanceReinstall { opInstanceName = iname
307 da4a52a3 Thomas Thrainer
                                 , opInstanceUuid = Nothing
308 4ba5f148 Dato Simó
                                 , opOsType = Nothing
309 4ba5f148 Dato Simó
                                 , opTempOsParams = Nothing
310 4ba5f148 Dato Simó
                                 , opForceVariant = False
311 4ba5f148 Dato Simó
                                 }
312 4ba5f148 Dato Simó
           ])
313 4ba5f148 Dato Simó
       | offPri ->
314 4ba5f148 Dato Simó
         Just (
315 4ba5f148 Dato Simó
           ArFailover,
316 4ba5f148 Dato Simó
           [ OpInstanceFailover { opInstanceName = iname
317 da4a52a3 Thomas Thrainer
                                , opInstanceUuid = Nothing
318 4ba5f148 Dato Simó
                                  -- FIXME: ditto, see above.
319 4ba5f148 Dato Simó
                                , opShutdownTimeout = fromJust $ mkNonNegative
320 4ba5f148 Dato Simó
                                                      C.defaultShutdownTimeout
321 4ba5f148 Dato Simó
                                , opIgnoreConsistency = False
322 4ba5f148 Dato Simó
                                , opTargetNode = Nothing
323 1c3231aa Thomas Thrainer
                                , opTargetNodeUuid = Nothing
324 4ba5f148 Dato Simó
                                , opIgnoreIpolicy = False
325 4ba5f148 Dato Simó
                                , opIallocator = Nothing
326 aa7a5c90 Michele Tartara
                                , opMigrationCleanup = False
327 4ba5f148 Dato Simó
                                }
328 4ba5f148 Dato Simó
           ])
329 4ba5f148 Dato Simó
       | offSec ->
330 4ba5f148 Dato Simó
         Just (
331 4ba5f148 Dato Simó
           ArFixStorage,
332 4ba5f148 Dato Simó
           [ OpInstanceReplaceDisks { opInstanceName = iname
333 da4a52a3 Thomas Thrainer
                                    , opInstanceUuid = Nothing
334 4ba5f148 Dato Simó
                                    , opReplaceDisksMode = ReplaceNewSecondary
335 4ba5f148 Dato Simó
                                    , opReplaceDisksList = []
336 4ba5f148 Dato Simó
                                    , opRemoteNode = Nothing
337 4ba5f148 Dato Simó
                                      -- FIXME: ditto, see above.
338 1c3231aa Thomas Thrainer
                                    , opRemoteNodeUuid = Nothing
339 4ba5f148 Dato Simó
                                    , opIallocator = mkNonEmpty "hail"
340 4ba5f148 Dato Simó
                                    , opEarlyRelease = False
341 4ba5f148 Dato Simó
                                    , opIgnoreIpolicy = False
342 4ba5f148 Dato Simó
                                    }
343 4ba5f148 Dato Simó
            ])
344 4ba5f148 Dato Simó
       | otherwise -> Nothing
345 4ba5f148 Dato Simó
346 4ba5f148 Dato Simó
     DTPlain
347 4ba5f148 Dato Simó
       | offPri ->
348 4ba5f148 Dato Simó
         Just (
349 4ba5f148 Dato Simó
           ArReinstall,
350 4ba5f148 Dato Simó
           [ OpInstanceRecreateDisks { opInstanceName = iname
351 da4a52a3 Thomas Thrainer
                                     , opInstanceUuid = Nothing
352 4ba5f148 Dato Simó
                                     , opRecreateDisksInfo = RecreateDisksAll
353 4ba5f148 Dato Simó
                                     , opNodes = []
354 4ba5f148 Dato Simó
                                       -- FIXME: ditto, see above.
355 1c3231aa Thomas Thrainer
                                     , opNodeUuids = Nothing
356 4ba5f148 Dato Simó
                                     , opIallocator = mkNonEmpty "hail"
357 4ba5f148 Dato Simó
                                     }
358 4ba5f148 Dato Simó
           , OpInstanceReinstall { opInstanceName = iname
359 da4a52a3 Thomas Thrainer
                                 , opInstanceUuid = Nothing
360 4ba5f148 Dato Simó
                                 , opOsType = Nothing
361 4ba5f148 Dato Simó
                                 , opTempOsParams = Nothing
362 4ba5f148 Dato Simó
                                 , opForceVariant = False
363 4ba5f148 Dato Simó
                                 }
364 4ba5f148 Dato Simó
           ])
365 4ba5f148 Dato Simó
       | otherwise -> Nothing
366 4ba5f148 Dato Simó
367 4ba5f148 Dato Simó
     _ -> Nothing  -- Other cases are unimplemented for now: DTDiskless,
368 4ba5f148 Dato Simó
                   -- DTFile, DTSharedFile, DTBlock, DTRbd, DTExt.
369 4ba5f148 Dato Simó
370 8072af6c Dato Simó
-- | Perform the suggested repair on an instance if its policy allows it.
371 8af72964 Dato Simó
doRepair :: L.Client     -- ^ The Luxi client
372 8af72964 Dato Simó
         -> Double       -- ^ Delay to insert before the first repair opcode
373 8af72964 Dato Simó
         -> InstanceData -- ^ The instance data
374 8af72964 Dato Simó
         -> (AutoRepairType, [OpCode]) -- ^ The repair job to perform
375 8af72964 Dato Simó
         -> IO InstanceData -- ^ The updated instance data
376 8af72964 Dato Simó
doRepair client delay instData (rtype, opcodes) =
377 8072af6c Dato Simó
  let inst = arInstance instData
378 8072af6c Dato Simó
      ipol = Instance.arPolicy inst
379 8072af6c Dato Simó
      iname = Instance.name inst
380 8072af6c Dato Simó
  in
381 8072af6c Dato Simó
  case ipol of
382 8072af6c Dato Simó
    ArEnabled maxtype ->
383 8072af6c Dato Simó
      if rtype > maxtype then do
384 8072af6c Dato Simó
        uuid <- newUUID
385 8072af6c Dato Simó
        time <- getClockTime
386 8072af6c Dato Simó
387 8072af6c Dato Simó
        let arState' = ArNeedsRepair (
388 8072af6c Dato Simó
              updateTag $ AutoRepairData rtype uuid time [] (Just ArEnoperm) "")
389 8072af6c Dato Simó
            instData' = instData { arState = arState'
390 8072af6c Dato Simó
                                 , tagsToRemove = delCurTag instData
391 8072af6c Dato Simó
                                 }
392 8072af6c Dato Simó
393 8072af6c Dato Simó
        putStrLn ("Not performing a repair of type " ++ show rtype ++ " on " ++
394 8072af6c Dato Simó
          iname ++ " because only repairs up to " ++ show maxtype ++
395 8072af6c Dato Simó
          " are allowed")
396 8072af6c Dato Simó
        commitChange client instData'  -- Adds "enoperm" result label.
397 8072af6c Dato Simó
      else do
398 8072af6c Dato Simó
        putStrLn ("Executing " ++ show rtype ++ " repair on " ++ iname)
399 8072af6c Dato Simó
400 8af72964 Dato Simó
        -- After submitting the job, we must write an autorepair:pending tag,
401 8af72964 Dato Simó
        -- that includes the repair job IDs so that they can be checked later.
402 8af72964 Dato Simó
        -- One problem we run into is that the repair job immediately grabs
403 8af72964 Dato Simó
        -- locks for the affected instance, and the subsequent TAGS_SET job is
404 546a1dcf Dato Simó
        -- blocked, introducing an unnecessary delay for the end-user. One
405 8af72964 Dato Simó
        -- alternative would be not to wait for the completion of the TAGS_SET
406 8af72964 Dato Simó
        -- job, contrary to what commitChange normally does; but we insist on
407 8af72964 Dato Simó
        -- waiting for the tag to be set so as to abort in case of failure,
408 8af72964 Dato Simó
        -- because the cluster is left in an invalid state in that case.
409 8af72964 Dato Simó
        --
410 8af72964 Dato Simó
        -- The proper solution (in 2.9+) would be not to use tags for storing
411 8af72964 Dato Simó
        -- autorepair data, or make the TAGS_SET opcode not grab an instance's
412 8af72964 Dato Simó
        -- locks (if that's deemed safe). In the meantime, we introduce an
413 8af72964 Dato Simó
        -- artificial delay in the repair job (via a TestDelay opcode) so that
414 8af72964 Dato Simó
        -- once we have the job ID, the TAGS_SET job can complete before the
415 8af72964 Dato Simó
        -- repair job actually grabs the locks. (Please note that this is not
416 8af72964 Dato Simó
        -- about synchronization, but merely about speeding up the execution of
417 8af72964 Dato Simó
        -- the harep tool. If this TestDelay opcode is removed, the program is
418 8af72964 Dato Simó
        -- still correct.)
419 8af72964 Dato Simó
        let opcodes' =
420 8af72964 Dato Simó
              if delay > 0 then
421 8af72964 Dato Simó
                OpTestDelay { opDelayDuration = delay
422 8af72964 Dato Simó
                            , opDelayOnMaster = True
423 8af72964 Dato Simó
                            , opDelayOnNodes = []
424 1c3231aa Thomas Thrainer
                            , opDelayOnNodeUuids = Nothing
425 8af72964 Dato Simó
                            , opDelayRepeat = fromJust $ mkNonNegative 0
426 8af72964 Dato Simó
                            } : opcodes
427 8af72964 Dato Simó
              else
428 8af72964 Dato Simó
                opcodes
429 8af72964 Dato Simó
430 8072af6c Dato Simó
        uuid <- newUUID
431 8072af6c Dato Simó
        time <- getClockTime
432 8af72964 Dato Simó
        jids <- submitJobs [map wrapOpCode opcodes'] client
433 8072af6c Dato Simó
434 8072af6c Dato Simó
        case jids of
435 8072af6c Dato Simó
          Bad e    -> exitErr e
436 8072af6c Dato Simó
          Ok jids' ->
437 8072af6c Dato Simó
            let arState' = ArPendingRepair (
438 8072af6c Dato Simó
                  updateTag $ AutoRepairData rtype uuid time jids' Nothing "")
439 8072af6c Dato Simó
                instData' = instData { arState = arState'
440 8072af6c Dato Simó
                                     , tagsToRemove = delCurTag instData
441 8072af6c Dato Simó
                                     }
442 8072af6c Dato Simó
            in
443 8072af6c Dato Simó
             commitChange client instData'  -- Adds "pending" label.
444 8072af6c Dato Simó
445 8072af6c Dato Simó
    otherSt -> do
446 8072af6c Dato Simó
      putStrLn ("Not repairing " ++ iname ++ " because it's in state " ++
447 8072af6c Dato Simó
                show otherSt)
448 8072af6c Dato Simó
      return instData
449 8072af6c Dato Simó
450 b6d9bec8 Dato Simó
-- | Main function.
451 b6d9bec8 Dato Simó
main :: Options -> [String] -> IO ()
452 3416e3e7 Dato Simó
main opts args = do
453 b6d9bec8 Dato Simó
  unless (null args) $
454 b6d9bec8 Dato Simó
    exitErr "this program doesn't take any arguments."
455 b6d9bec8 Dato Simó
456 9d049fb4 Klaus Aehlig
  luxiDef <- Path.defaultMasterSocket
457 3416e3e7 Dato Simó
  let master = fromMaybe luxiDef $ optLuxi opts
458 3416e3e7 Dato Simó
      opts' = opts { optLuxi = Just master }
459 3416e3e7 Dato Simó
460 4ba5f148 Dato Simó
  (ClusterData _ nl il _ _) <- loadExternalData opts'
461 3416e3e7 Dato Simó
462 3416e3e7 Dato Simó
  let iniDataRes = mapM setInitialState $ Container.elems il
463 a5e58418 Dato Simó
  iniData <- exitIfBad "when parsing auto-repair tags" iniDataRes
464 a5e58418 Dato Simó
465 a5e58418 Dato Simó
  -- First step: check all pending repairs, see if they are completed.
466 d605e261 Petr Pudlak
  iniData' <- bracket (L.getLuxiClient master) L.closeClient $
467 4ba5f148 Dato Simó
              forM iniData . processPending
468 4ba5f148 Dato Simó
469 4ba5f148 Dato Simó
  -- Second step: detect any problems.
470 8072af6c Dato Simó
  let repairs = map (detectBroken nl . arInstance) iniData'
471 8072af6c Dato Simó
472 8072af6c Dato Simó
  -- Third step: create repair jobs for broken instances that are in ArHealthy.
473 8072af6c Dato Simó
  let maybeRepair c (i, r) = maybe (return i) (repairHealthy c i) r
474 8af72964 Dato Simó
      jobDelay = optJobDelay opts
475 8072af6c Dato Simó
      repairHealthy c i = case arState i of
476 8af72964 Dato Simó
                            ArHealthy _ -> doRepair c jobDelay i
477 8072af6c Dato Simó
                            _           -> const (return i)
478 8072af6c Dato Simó
479 d605e261 Petr Pudlak
  repairDone <- bracket (L.getLuxiClient master) L.closeClient $
480 41238d27 Dato Simó
                forM (zip iniData' repairs) . maybeRepair
481 41238d27 Dato Simó
482 41238d27 Dato Simó
  -- Print some stats and exit.
483 41238d27 Dato Simó
  let states = map ((, 1 :: Int) . arStateName . arState) repairDone
484 41238d27 Dato Simó
      counts = Map.fromListWith (+) states
485 3416e3e7 Dato Simó
486 41238d27 Dato Simó
  putStrLn "---------------------"
487 41238d27 Dato Simó
  putStrLn "Instance status count"
488 41238d27 Dato Simó
  putStrLn "---------------------"
489 41238d27 Dato Simó
  putStr . unlines . Map.elems $
490 41238d27 Dato Simó
    Map.mapWithKey (\k v -> k ++ ": " ++ show v) counts