Statistics
| Branch: | Tag: | Revision:

root / src / Ganeti / HTools / Program / Harep.hs @ 89363f98

History | View | Annotate | Download (18.8 kB)

1 41238d27 Dato Simó
{-# LANGUAGE TupleSections #-}
2 41238d27 Dato Simó
3 b6d9bec8 Dato Simó
{-| Auto-repair tool for Ganeti.
4 b6d9bec8 Dato Simó
5 b6d9bec8 Dato Simó
-}
6 b6d9bec8 Dato Simó
7 b6d9bec8 Dato Simó
{-
8 b6d9bec8 Dato Simó
9 b6d9bec8 Dato Simó
Copyright (C) 2013 Google Inc.
10 b6d9bec8 Dato Simó
11 b6d9bec8 Dato Simó
This program is free software; you can redistribute it and/or modify
12 b6d9bec8 Dato Simó
it under the terms of the GNU General Public License as published by
13 b6d9bec8 Dato Simó
the Free Software Foundation; either version 2 of the License, or
14 b6d9bec8 Dato Simó
(at your option) any later version.
15 b6d9bec8 Dato Simó
16 b6d9bec8 Dato Simó
This program is distributed in the hope that it will be useful, but
17 b6d9bec8 Dato Simó
WITHOUT ANY WARRANTY; without even the implied warranty of
18 b6d9bec8 Dato Simó
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19 b6d9bec8 Dato Simó
General Public License for more details.
20 b6d9bec8 Dato Simó
21 b6d9bec8 Dato Simó
You should have received a copy of the GNU General Public License
22 b6d9bec8 Dato Simó
along with this program; if not, write to the Free Software
23 b6d9bec8 Dato Simó
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
24 b6d9bec8 Dato Simó
02110-1301, USA.
25 b6d9bec8 Dato Simó
26 b6d9bec8 Dato Simó
-}
27 b6d9bec8 Dato Simó
28 b6d9bec8 Dato Simó
module Ganeti.HTools.Program.Harep
29 b6d9bec8 Dato Simó
  ( main
30 b6d9bec8 Dato Simó
  , arguments
31 b6d9bec8 Dato Simó
  , options) where
32 b6d9bec8 Dato Simó
33 a5e58418 Dato Simó
import Control.Exception (bracket)
34 b6d9bec8 Dato Simó
import Control.Monad
35 3416e3e7 Dato Simó
import Data.Function
36 3416e3e7 Dato Simó
import Data.List
37 3416e3e7 Dato Simó
import Data.Maybe
38 3416e3e7 Dato Simó
import Data.Ord
39 3416e3e7 Dato Simó
import System.Time
40 41238d27 Dato Simó
import qualified Data.Map as Map
41 b6d9bec8 Dato Simó
42 3416e3e7 Dato Simó
import Ganeti.BasicTypes
43 b6d9bec8 Dato Simó
import Ganeti.Common
44 a5e58418 Dato Simó
import Ganeti.Errors
45 a5e58418 Dato Simó
import Ganeti.Jobs
46 a5e58418 Dato Simó
import Ganeti.OpCodes
47 4ba5f148 Dato Simó
import Ganeti.OpParams
48 3416e3e7 Dato Simó
import Ganeti.Types
49 b6d9bec8 Dato Simó
import Ganeti.Utils
50 3416e3e7 Dato Simó
import qualified Ganeti.Constants as C
51 a5e58418 Dato Simó
import qualified Ganeti.Luxi as L
52 3416e3e7 Dato Simó
import qualified Ganeti.Path as Path
53 b6d9bec8 Dato Simó
54 b6d9bec8 Dato Simó
import Ganeti.HTools.CLI
55 3416e3e7 Dato Simó
import Ganeti.HTools.Loader
56 3416e3e7 Dato Simó
import Ganeti.HTools.ExtLoader
57 3416e3e7 Dato Simó
import Ganeti.HTools.Types
58 3416e3e7 Dato Simó
import qualified Ganeti.HTools.Container as Container
59 3416e3e7 Dato Simó
import qualified Ganeti.HTools.Instance as Instance
60 4ba5f148 Dato Simó
import qualified Ganeti.HTools.Node as Node
61 b6d9bec8 Dato Simó
62 b6d9bec8 Dato Simó
-- | Options list and functions.
63 b6d9bec8 Dato Simó
options :: IO [OptType]
64 b6d9bec8 Dato Simó
options = do
65 b6d9bec8 Dato Simó
  luxi <- oLuxiSocket
66 b6d9bec8 Dato Simó
  return
67 b6d9bec8 Dato Simó
    [ luxi
68 8af72964 Dato Simó
    , oJobDelay
69 b6d9bec8 Dato Simó
    ]
70 b6d9bec8 Dato Simó
71 b6d9bec8 Dato Simó
arguments :: [ArgCompletion]
72 b6d9bec8 Dato Simó
arguments = []
73 b6d9bec8 Dato Simó
74 3416e3e7 Dato Simó
data InstanceData = InstanceData { arInstance :: Instance.Instance
75 3416e3e7 Dato Simó
                                 , arState :: AutoRepairStatus
76 3416e3e7 Dato Simó
                                 , tagsToRemove :: [String]
77 3416e3e7 Dato Simó
                                 }
78 3416e3e7 Dato Simó
                    deriving (Eq, Show)
79 3416e3e7 Dato Simó
80 3416e3e7 Dato Simó
-- | Parse a tag into an 'AutoRepairData' record.
81 3416e3e7 Dato Simó
--
82 3416e3e7 Dato Simó
-- @Nothing@ is returned if the tag is not an auto-repair tag, or if it's
83 3416e3e7 Dato Simó
-- malformed.
84 3416e3e7 Dato Simó
parseInitTag :: String -> Maybe AutoRepairData
85 3416e3e7 Dato Simó
parseInitTag tag =
86 3416e3e7 Dato Simó
  let parsePending = do
87 3416e3e7 Dato Simó
        subtag <- chompPrefix C.autoRepairTagPending tag
88 3416e3e7 Dato Simó
        case sepSplit ':' subtag of
89 3416e3e7 Dato Simó
          [rtype, uuid, ts, jobs] -> makeArData rtype uuid ts jobs
90 3416e3e7 Dato Simó
          _                       -> fail ("Invalid tag: " ++ show tag)
91 3416e3e7 Dato Simó
92 3416e3e7 Dato Simó
      parseResult = do
93 3416e3e7 Dato Simó
        subtag <- chompPrefix C.autoRepairTagResult tag
94 3416e3e7 Dato Simó
        case sepSplit ':' subtag of
95 3416e3e7 Dato Simó
          [rtype, uuid, ts, result, jobs] -> do
96 3416e3e7 Dato Simó
            arData <- makeArData rtype uuid ts jobs
97 3416e3e7 Dato Simó
            result' <- autoRepairResultFromRaw result
98 3416e3e7 Dato Simó
            return arData { arResult = Just result' }
99 3416e3e7 Dato Simó
          _                               -> fail ("Invalid tag: " ++ show tag)
100 3416e3e7 Dato Simó
101 3416e3e7 Dato Simó
      makeArData rtype uuid ts jobs = do
102 3416e3e7 Dato Simó
        rtype' <- autoRepairTypeFromRaw rtype
103 3416e3e7 Dato Simó
        ts' <- tryRead "auto-repair time" ts
104 3416e3e7 Dato Simó
        jobs' <- mapM makeJobIdS $ sepSplit '+' jobs
105 3416e3e7 Dato Simó
        return AutoRepairData { arType = rtype'
106 3416e3e7 Dato Simó
                              , arUuid = uuid
107 3416e3e7 Dato Simó
                              , arTime = TOD ts' 0
108 3416e3e7 Dato Simó
                              , arJobs = jobs'
109 3416e3e7 Dato Simó
                              , arResult = Nothing
110 3416e3e7 Dato Simó
                              , arTag = tag
111 3416e3e7 Dato Simó
                              }
112 3416e3e7 Dato Simó
  in
113 3416e3e7 Dato Simó
   parsePending `mplus` parseResult
114 3416e3e7 Dato Simó
115 3416e3e7 Dato Simó
-- | Return the 'AutoRepairData' element of an 'AutoRepairStatus' type.
116 3416e3e7 Dato Simó
getArData :: AutoRepairStatus -> Maybe AutoRepairData
117 3416e3e7 Dato Simó
getArData status =
118 3416e3e7 Dato Simó
  case status of
119 3416e3e7 Dato Simó
    ArHealthy (Just d) -> Just d
120 3416e3e7 Dato Simó
    ArFailedRepair  d  -> Just d
121 3416e3e7 Dato Simó
    ArPendingRepair d  -> Just d
122 3416e3e7 Dato Simó
    ArNeedsRepair   d  -> Just d
123 3416e3e7 Dato Simó
    _                  -> Nothing
124 3416e3e7 Dato Simó
125 a5e58418 Dato Simó
-- | Return a short name for each auto-repair status.
126 a5e58418 Dato Simó
--
127 a5e58418 Dato Simó
-- This is a more concise representation of the status, because the default
128 a5e58418 Dato Simó
-- "Show" formatting includes all the accompanying auto-repair data.
129 a5e58418 Dato Simó
arStateName :: AutoRepairStatus -> String
130 a5e58418 Dato Simó
arStateName status =
131 a5e58418 Dato Simó
  case status of
132 a5e58418 Dato Simó
    ArHealthy _       -> "Healthy"
133 a5e58418 Dato Simó
    ArFailedRepair _  -> "Failure"
134 a5e58418 Dato Simó
    ArPendingRepair _ -> "Pending repair"
135 a5e58418 Dato Simó
    ArNeedsRepair _   -> "Needs repair"
136 a5e58418 Dato Simó
137 3416e3e7 Dato Simó
-- | Return a new list of tags to remove that includes @arTag@ if present.
138 3416e3e7 Dato Simó
delCurTag :: InstanceData -> [String]
139 3416e3e7 Dato Simó
delCurTag instData =
140 3416e3e7 Dato Simó
  let arData = getArData $ arState instData
141 3416e3e7 Dato Simó
      rmTags = tagsToRemove instData
142 3416e3e7 Dato Simó
  in
143 3416e3e7 Dato Simó
   case arData of
144 3416e3e7 Dato Simó
     Just d  -> arTag d : rmTags
145 3416e3e7 Dato Simó
     Nothing -> rmTags
146 3416e3e7 Dato Simó
147 3416e3e7 Dato Simó
-- | Set the initial auto-repair state of an instance from its auto-repair tags.
148 3416e3e7 Dato Simó
--
149 3416e3e7 Dato Simó
-- The rules when there are multiple tags is:
150 3416e3e7 Dato Simó
--
151 3416e3e7 Dato Simó
--   * the earliest failure result always wins
152 3416e3e7 Dato Simó
--
153 3416e3e7 Dato Simó
--   * two or more pending repairs results in a fatal error
154 3416e3e7 Dato Simó
--
155 3416e3e7 Dato Simó
--   * a pending result from id X and a success result from id Y result in error
156 3416e3e7 Dato Simó
--     if Y is newer than X
157 3416e3e7 Dato Simó
--
158 3416e3e7 Dato Simó
--   * if there are no pending repairs, the newest success result wins,
159 3416e3e7 Dato Simó
--     otherwise the pending result is used.
160 3416e3e7 Dato Simó
setInitialState :: Instance.Instance -> Result InstanceData
161 3416e3e7 Dato Simó
setInitialState inst =
162 3416e3e7 Dato Simó
  let arData = mapMaybe parseInitTag $ Instance.allTags inst
163 3416e3e7 Dato Simó
      -- Group all the AutoRepairData records by id (i.e. by repair task), and
164 3416e3e7 Dato Simó
      -- present them from oldest to newest.
165 3416e3e7 Dato Simó
      arData' = sortBy (comparing arUuid) arData
166 3416e3e7 Dato Simó
      arGroups = groupBy ((==) `on` arUuid) arData'
167 3416e3e7 Dato Simó
      arGroups' = sortBy (comparing $ minimum . map arTime) arGroups
168 3416e3e7 Dato Simó
  in
169 3416e3e7 Dato Simó
   foldM arStatusCmp (InstanceData inst (ArHealthy Nothing) []) arGroups'
170 3416e3e7 Dato Simó
171 3416e3e7 Dato Simó
-- | Update the initial status of an instance with new repair task tags.
172 3416e3e7 Dato Simó
--
173 3416e3e7 Dato Simó
-- This function gets called once per repair group in an instance's tag, and it
174 3416e3e7 Dato Simó
-- determines whether to set the status of the instance according to this new
175 3416e3e7 Dato Simó
-- group, or to keep the existing state. See the documentation for
176 3416e3e7 Dato Simó
-- 'setInitialState' for the rules to be followed when determining this.
177 3416e3e7 Dato Simó
arStatusCmp :: InstanceData -> [AutoRepairData] -> Result InstanceData
178 3416e3e7 Dato Simó
arStatusCmp instData arData =
179 3416e3e7 Dato Simó
  let curSt = arState instData
180 3416e3e7 Dato Simó
      arData' = sortBy (comparing keyfn) arData
181 3416e3e7 Dato Simó
      keyfn d = (arResult d, arTime d)
182 3416e3e7 Dato Simó
      newData = last arData'
183 3416e3e7 Dato Simó
      newSt = case arResult newData of
184 3416e3e7 Dato Simó
                Just ArSuccess -> ArHealthy $ Just newData
185 3416e3e7 Dato Simó
                Just ArEnoperm -> ArHealthy $ Just newData
186 3416e3e7 Dato Simó
                Just ArFailure -> ArFailedRepair newData
187 3416e3e7 Dato Simó
                Nothing        -> ArPendingRepair newData
188 3416e3e7 Dato Simó
  in
189 3416e3e7 Dato Simó
   case curSt of
190 3416e3e7 Dato Simó
     ArFailedRepair _ -> Ok instData  -- Always keep the earliest failure.
191 3416e3e7 Dato Simó
     ArHealthy _      -> Ok instData { arState = newSt
192 3416e3e7 Dato Simó
                                     , tagsToRemove = delCurTag instData
193 3416e3e7 Dato Simó
                                     }
194 3416e3e7 Dato Simó
     ArPendingRepair d -> Bad (
195 3416e3e7 Dato Simó
       "An unfinished repair was found in instance " ++
196 3416e3e7 Dato Simó
       Instance.name (arInstance instData) ++ ": found tag " ++
197 3416e3e7 Dato Simó
       show (arTag newData) ++ ", but older pending tag " ++
198 3416e3e7 Dato Simó
       show (arTag d) ++ "exists.")
199 3416e3e7 Dato Simó
200 3416e3e7 Dato Simó
     ArNeedsRepair _ -> Bad
201 3416e3e7 Dato Simó
       "programming error: ArNeedsRepair found as an initial state"
202 3416e3e7 Dato Simó
203 a5e58418 Dato Simó
-- | Query jobs of a pending repair, returning the new instance data.
204 a5e58418 Dato Simó
processPending :: L.Client -> InstanceData -> IO InstanceData
205 a5e58418 Dato Simó
processPending client instData =
206 a5e58418 Dato Simó
  case arState instData of
207 a5e58418 Dato Simó
    (ArPendingRepair arData) -> do
208 a5e58418 Dato Simó
      sts <- L.queryJobsStatus client $ arJobs arData
209 a5e58418 Dato Simó
      time <- getClockTime
210 a5e58418 Dato Simó
      case sts of
211 a5e58418 Dato Simó
        Bad e -> exitErr $ "could not check job status: " ++ formatError e
212 a5e58418 Dato Simó
        Ok sts' ->
213 a5e58418 Dato Simó
          if any (<= JOB_STATUS_RUNNING) sts' then
214 a5e58418 Dato Simó
            return instData -- (no change)
215 a5e58418 Dato Simó
          else do
216 a5e58418 Dato Simó
            let iname = Instance.name $ arInstance instData
217 a5e58418 Dato Simó
                srcSt = arStateName $ arState instData
218 a5e58418 Dato Simó
                destSt = arStateName arState'
219 a5e58418 Dato Simó
            putStrLn ("Moving " ++ iname ++ " from " ++ show srcSt ++ " to " ++
220 a5e58418 Dato Simó
                      show destSt)
221 a5e58418 Dato Simó
            commitChange client instData'
222 a5e58418 Dato Simó
          where
223 a5e58418 Dato Simó
            instData' =
224 a5e58418 Dato Simó
              instData { arState = arState'
225 a5e58418 Dato Simó
                       , tagsToRemove = delCurTag instData
226 a5e58418 Dato Simó
                       }
227 a5e58418 Dato Simó
            arState' =
228 a5e58418 Dato Simó
              if all (== JOB_STATUS_SUCCESS) sts' then
229 a5e58418 Dato Simó
                ArHealthy $ Just (updateTag $ arData { arResult = Just ArSuccess
230 a5e58418 Dato Simó
                                                     , arTime = time })
231 a5e58418 Dato Simó
              else
232 a5e58418 Dato Simó
                ArFailedRepair (updateTag $ arData { arResult = Just ArFailure
233 a5e58418 Dato Simó
                                                   , arTime = time })
234 a5e58418 Dato Simó
235 a5e58418 Dato Simó
    _ -> return instData
236 a5e58418 Dato Simó
237 a5e58418 Dato Simó
-- | Update the tag of an 'AutoRepairData' record to match all the other fields.
238 a5e58418 Dato Simó
updateTag :: AutoRepairData -> AutoRepairData
239 a5e58418 Dato Simó
updateTag arData =
240 a5e58418 Dato Simó
  let ini = [autoRepairTypeToRaw $ arType arData,
241 a5e58418 Dato Simó
             arUuid arData,
242 a5e58418 Dato Simó
             clockTimeToString $ arTime arData]
243 a5e58418 Dato Simó
      end = [intercalate "+" . map (show . fromJobId) $ arJobs arData]
244 a5e58418 Dato Simó
      (pfx, middle) =
245 a5e58418 Dato Simó
         case arResult arData of
246 a5e58418 Dato Simó
          Nothing -> (C.autoRepairTagPending, [])
247 a5e58418 Dato Simó
          Just rs -> (C.autoRepairTagResult, [autoRepairResultToRaw rs])
248 a5e58418 Dato Simó
  in
249 a5e58418 Dato Simó
   arData { arTag = pfx ++ intercalate ":" (ini ++ middle ++ end) }
250 a5e58418 Dato Simó
251 a5e58418 Dato Simó
-- | Apply and remove tags from an instance as indicated by 'InstanceData'.
252 a5e58418 Dato Simó
--
253 a5e58418 Dato Simó
-- If the /arState/ of the /InstanceData/ record has an associated
254 a5e58418 Dato Simó
-- 'AutoRepairData', add its tag to the instance object. Additionally, if
255 a5e58418 Dato Simó
-- /tagsToRemove/ is not empty, remove those tags from the instance object. The
256 a5e58418 Dato Simó
-- returned /InstanceData/ object always has an empty /tagsToRemove/.
257 a5e58418 Dato Simó
commitChange :: L.Client -> InstanceData -> IO InstanceData
258 a5e58418 Dato Simó
commitChange client instData = do
259 a5e58418 Dato Simó
  let iname = Instance.name $ arInstance instData
260 a5e58418 Dato Simó
      arData = getArData $ arState instData
261 a5e58418 Dato Simó
      rmTags = tagsToRemove instData
262 a5e58418 Dato Simó
      execJobsWaitOk' opcodes = do
263 a5e58418 Dato Simó
        res <- execJobsWaitOk [map wrapOpCode opcodes] client
264 a5e58418 Dato Simó
        case res of
265 a5e58418 Dato Simó
          Ok _ -> return ()
266 a5e58418 Dato Simó
          Bad e -> exitErr e
267 a5e58418 Dato Simó
268 a5e58418 Dato Simó
  when (isJust arData) $ do
269 a5e58418 Dato Simó
    let tag = arTag $ fromJust arData
270 a5e58418 Dato Simó
    putStrLn (">>> Adding the following tag to " ++ iname ++ ":\n" ++ show tag)
271 a5e58418 Dato Simó
    execJobsWaitOk' [OpTagsSet (TagInstance iname) [tag]]
272 a5e58418 Dato Simó
273 a5e58418 Dato Simó
  unless (null rmTags) $ do
274 a5e58418 Dato Simó
    putStr (">>> Removing the following tags from " ++ iname ++ ":\n" ++
275 a5e58418 Dato Simó
            unlines (map show rmTags))
276 a5e58418 Dato Simó
    execJobsWaitOk' [OpTagsDel (TagInstance iname) rmTags]
277 a5e58418 Dato Simó
278 a5e58418 Dato Simó
  return instData { tagsToRemove = [] }
279 a5e58418 Dato Simó
280 546a1dcf Dato Simó
-- | Detect brokenness with an instance and suggest repair type and jobs to run.
281 4ba5f148 Dato Simó
detectBroken :: Node.List -> Instance.Instance
282 4ba5f148 Dato Simó
             -> Maybe (AutoRepairType, [OpCode])
283 4ba5f148 Dato Simó
detectBroken nl inst =
284 4ba5f148 Dato Simó
  let disk = Instance.diskTemplate inst
285 4ba5f148 Dato Simó
      iname = Instance.name inst
286 4ba5f148 Dato Simó
      offPri = Node.offline $ Container.find (Instance.pNode inst) nl
287 4ba5f148 Dato Simó
      offSec = Node.offline $ Container.find (Instance.sNode inst) nl
288 4ba5f148 Dato Simó
  in
289 4ba5f148 Dato Simó
   case disk of
290 4ba5f148 Dato Simó
     DTDrbd8
291 4ba5f148 Dato Simó
       | offPri && offSec ->
292 4ba5f148 Dato Simó
         Just (
293 4ba5f148 Dato Simó
           ArReinstall,
294 4ba5f148 Dato Simó
           [ OpInstanceRecreateDisks { opInstanceName = iname
295 4ba5f148 Dato Simó
                                     , opRecreateDisksInfo = RecreateDisksAll
296 4ba5f148 Dato Simó
                                     , opNodes = []
297 4ba5f148 Dato Simó
                                       -- FIXME: there should be a better way to
298 546a1dcf Dato Simó
                                       -- specify opcode parameters than abusing
299 4ba5f148 Dato Simó
                                       -- mkNonEmpty in this way (using the fact
300 4ba5f148 Dato Simó
                                       -- that Maybe is used both for optional
301 4ba5f148 Dato Simó
                                       -- fields, and to express failure).
302 4ba5f148 Dato Simó
                                     , opIallocator = mkNonEmpty "hail"
303 4ba5f148 Dato Simó
                                     }
304 4ba5f148 Dato Simó
           , OpInstanceReinstall { opInstanceName = iname
305 4ba5f148 Dato Simó
                                 , opOsType = Nothing
306 4ba5f148 Dato Simó
                                 , opTempOsParams = Nothing
307 4ba5f148 Dato Simó
                                 , opForceVariant = False
308 4ba5f148 Dato Simó
                                 }
309 4ba5f148 Dato Simó
           ])
310 4ba5f148 Dato Simó
       | offPri ->
311 4ba5f148 Dato Simó
         Just (
312 4ba5f148 Dato Simó
           ArFailover,
313 4ba5f148 Dato Simó
           [ OpInstanceFailover { opInstanceName = iname
314 4ba5f148 Dato Simó
                                  -- FIXME: ditto, see above.
315 4ba5f148 Dato Simó
                                , opShutdownTimeout = fromJust $ mkNonNegative
316 4ba5f148 Dato Simó
                                                      C.defaultShutdownTimeout
317 4ba5f148 Dato Simó
                                , opIgnoreConsistency = False
318 4ba5f148 Dato Simó
                                , opTargetNode = Nothing
319 4ba5f148 Dato Simó
                                , opIgnoreIpolicy = False
320 4ba5f148 Dato Simó
                                , opIallocator = Nothing
321 4ba5f148 Dato Simó
                                }
322 4ba5f148 Dato Simó
           ])
323 4ba5f148 Dato Simó
       | offSec ->
324 4ba5f148 Dato Simó
         Just (
325 4ba5f148 Dato Simó
           ArFixStorage,
326 4ba5f148 Dato Simó
           [ OpInstanceReplaceDisks { opInstanceName = iname
327 4ba5f148 Dato Simó
                                    , opReplaceDisksMode = ReplaceNewSecondary
328 4ba5f148 Dato Simó
                                    , opReplaceDisksList = []
329 4ba5f148 Dato Simó
                                    , opRemoteNode = Nothing
330 4ba5f148 Dato Simó
                                      -- FIXME: ditto, see above.
331 4ba5f148 Dato Simó
                                    , opIallocator = mkNonEmpty "hail"
332 4ba5f148 Dato Simó
                                    , opEarlyRelease = False
333 4ba5f148 Dato Simó
                                    , opIgnoreIpolicy = False
334 4ba5f148 Dato Simó
                                    }
335 4ba5f148 Dato Simó
            ])
336 4ba5f148 Dato Simó
       | otherwise -> Nothing
337 4ba5f148 Dato Simó
338 4ba5f148 Dato Simó
     DTPlain
339 4ba5f148 Dato Simó
       | offPri ->
340 4ba5f148 Dato Simó
         Just (
341 4ba5f148 Dato Simó
           ArReinstall,
342 4ba5f148 Dato Simó
           [ OpInstanceRecreateDisks { opInstanceName = iname
343 4ba5f148 Dato Simó
                                     , opRecreateDisksInfo = RecreateDisksAll
344 4ba5f148 Dato Simó
                                     , opNodes = []
345 4ba5f148 Dato Simó
                                       -- FIXME: ditto, see above.
346 4ba5f148 Dato Simó
                                     , opIallocator = mkNonEmpty "hail"
347 4ba5f148 Dato Simó
                                     }
348 4ba5f148 Dato Simó
           , OpInstanceReinstall { opInstanceName = iname
349 4ba5f148 Dato Simó
                                 , opOsType = Nothing
350 4ba5f148 Dato Simó
                                 , opTempOsParams = Nothing
351 4ba5f148 Dato Simó
                                 , opForceVariant = False
352 4ba5f148 Dato Simó
                                 }
353 4ba5f148 Dato Simó
           ])
354 4ba5f148 Dato Simó
       | otherwise -> Nothing
355 4ba5f148 Dato Simó
356 4ba5f148 Dato Simó
     _ -> Nothing  -- Other cases are unimplemented for now: DTDiskless,
357 4ba5f148 Dato Simó
                   -- DTFile, DTSharedFile, DTBlock, DTRbd, DTExt.
358 4ba5f148 Dato Simó
359 8072af6c Dato Simó
-- | Perform the suggested repair on an instance if its policy allows it.
360 8af72964 Dato Simó
doRepair :: L.Client     -- ^ The Luxi client
361 8af72964 Dato Simó
         -> Double       -- ^ Delay to insert before the first repair opcode
362 8af72964 Dato Simó
         -> InstanceData -- ^ The instance data
363 8af72964 Dato Simó
         -> (AutoRepairType, [OpCode]) -- ^ The repair job to perform
364 8af72964 Dato Simó
         -> IO InstanceData -- ^ The updated instance data
365 8af72964 Dato Simó
doRepair client delay instData (rtype, opcodes) =
366 8072af6c Dato Simó
  let inst = arInstance instData
367 8072af6c Dato Simó
      ipol = Instance.arPolicy inst
368 8072af6c Dato Simó
      iname = Instance.name inst
369 8072af6c Dato Simó
  in
370 8072af6c Dato Simó
  case ipol of
371 8072af6c Dato Simó
    ArEnabled maxtype ->
372 8072af6c Dato Simó
      if rtype > maxtype then do
373 8072af6c Dato Simó
        uuid <- newUUID
374 8072af6c Dato Simó
        time <- getClockTime
375 8072af6c Dato Simó
376 8072af6c Dato Simó
        let arState' = ArNeedsRepair (
377 8072af6c Dato Simó
              updateTag $ AutoRepairData rtype uuid time [] (Just ArEnoperm) "")
378 8072af6c Dato Simó
            instData' = instData { arState = arState'
379 8072af6c Dato Simó
                                 , tagsToRemove = delCurTag instData
380 8072af6c Dato Simó
                                 }
381 8072af6c Dato Simó
382 8072af6c Dato Simó
        putStrLn ("Not performing a repair of type " ++ show rtype ++ " on " ++
383 8072af6c Dato Simó
          iname ++ " because only repairs up to " ++ show maxtype ++
384 8072af6c Dato Simó
          " are allowed")
385 8072af6c Dato Simó
        commitChange client instData'  -- Adds "enoperm" result label.
386 8072af6c Dato Simó
      else do
387 8072af6c Dato Simó
        putStrLn ("Executing " ++ show rtype ++ " repair on " ++ iname)
388 8072af6c Dato Simó
389 8af72964 Dato Simó
        -- After submitting the job, we must write an autorepair:pending tag,
390 8af72964 Dato Simó
        -- that includes the repair job IDs so that they can be checked later.
391 8af72964 Dato Simó
        -- One problem we run into is that the repair job immediately grabs
392 8af72964 Dato Simó
        -- locks for the affected instance, and the subsequent TAGS_SET job is
393 546a1dcf Dato Simó
        -- blocked, introducing an unnecessary delay for the end-user. One
394 8af72964 Dato Simó
        -- alternative would be not to wait for the completion of the TAGS_SET
395 8af72964 Dato Simó
        -- job, contrary to what commitChange normally does; but we insist on
396 8af72964 Dato Simó
        -- waiting for the tag to be set so as to abort in case of failure,
397 8af72964 Dato Simó
        -- because the cluster is left in an invalid state in that case.
398 8af72964 Dato Simó
        --
399 8af72964 Dato Simó
        -- The proper solution (in 2.9+) would be not to use tags for storing
400 8af72964 Dato Simó
        -- autorepair data, or make the TAGS_SET opcode not grab an instance's
401 8af72964 Dato Simó
        -- locks (if that's deemed safe). In the meantime, we introduce an
402 8af72964 Dato Simó
        -- artificial delay in the repair job (via a TestDelay opcode) so that
403 8af72964 Dato Simó
        -- once we have the job ID, the TAGS_SET job can complete before the
404 8af72964 Dato Simó
        -- repair job actually grabs the locks. (Please note that this is not
405 8af72964 Dato Simó
        -- about synchronization, but merely about speeding up the execution of
406 8af72964 Dato Simó
        -- the harep tool. If this TestDelay opcode is removed, the program is
407 8af72964 Dato Simó
        -- still correct.)
408 8af72964 Dato Simó
        let opcodes' =
409 8af72964 Dato Simó
              if delay > 0 then
410 8af72964 Dato Simó
                OpTestDelay { opDelayDuration = delay
411 8af72964 Dato Simó
                            , opDelayOnMaster = True
412 8af72964 Dato Simó
                            , opDelayOnNodes = []
413 8af72964 Dato Simó
                            , opDelayRepeat = fromJust $ mkNonNegative 0
414 8af72964 Dato Simó
                            } : opcodes
415 8af72964 Dato Simó
              else
416 8af72964 Dato Simó
                opcodes
417 8af72964 Dato Simó
418 8072af6c Dato Simó
        uuid <- newUUID
419 8072af6c Dato Simó
        time <- getClockTime
420 8af72964 Dato Simó
        jids <- submitJobs [map wrapOpCode opcodes'] client
421 8072af6c Dato Simó
422 8072af6c Dato Simó
        case jids of
423 8072af6c Dato Simó
          Bad e    -> exitErr e
424 8072af6c Dato Simó
          Ok jids' ->
425 8072af6c Dato Simó
            let arState' = ArPendingRepair (
426 8072af6c Dato Simó
                  updateTag $ AutoRepairData rtype uuid time jids' Nothing "")
427 8072af6c Dato Simó
                instData' = instData { arState = arState'
428 8072af6c Dato Simó
                                     , tagsToRemove = delCurTag instData
429 8072af6c Dato Simó
                                     }
430 8072af6c Dato Simó
            in
431 8072af6c Dato Simó
             commitChange client instData'  -- Adds "pending" label.
432 8072af6c Dato Simó
433 8072af6c Dato Simó
    otherSt -> do
434 8072af6c Dato Simó
      putStrLn ("Not repairing " ++ iname ++ " because it's in state " ++
435 8072af6c Dato Simó
                show otherSt)
436 8072af6c Dato Simó
      return instData
437 8072af6c Dato Simó
438 b6d9bec8 Dato Simó
-- | Main function.
439 b6d9bec8 Dato Simó
main :: Options -> [String] -> IO ()
440 3416e3e7 Dato Simó
main opts args = do
441 b6d9bec8 Dato Simó
  unless (null args) $
442 b6d9bec8 Dato Simó
    exitErr "this program doesn't take any arguments."
443 b6d9bec8 Dato Simó
444 3416e3e7 Dato Simó
  luxiDef <- Path.defaultLuxiSocket
445 3416e3e7 Dato Simó
  let master = fromMaybe luxiDef $ optLuxi opts
446 3416e3e7 Dato Simó
      opts' = opts { optLuxi = Just master }
447 3416e3e7 Dato Simó
448 4ba5f148 Dato Simó
  (ClusterData _ nl il _ _) <- loadExternalData opts'
449 3416e3e7 Dato Simó
450 3416e3e7 Dato Simó
  let iniDataRes = mapM setInitialState $ Container.elems il
451 a5e58418 Dato Simó
  iniData <- exitIfBad "when parsing auto-repair tags" iniDataRes
452 a5e58418 Dato Simó
453 a5e58418 Dato Simó
  -- First step: check all pending repairs, see if they are completed.
454 4ba5f148 Dato Simó
  iniData' <- bracket (L.getClient master) L.closeClient $
455 4ba5f148 Dato Simó
              forM iniData . processPending
456 4ba5f148 Dato Simó
457 4ba5f148 Dato Simó
  -- Second step: detect any problems.
458 8072af6c Dato Simó
  let repairs = map (detectBroken nl . arInstance) iniData'
459 8072af6c Dato Simó
460 8072af6c Dato Simó
  -- Third step: create repair jobs for broken instances that are in ArHealthy.
461 8072af6c Dato Simó
  let maybeRepair c (i, r) = maybe (return i) (repairHealthy c i) r
462 8af72964 Dato Simó
      jobDelay = optJobDelay opts
463 8072af6c Dato Simó
      repairHealthy c i = case arState i of
464 8af72964 Dato Simó
                            ArHealthy _ -> doRepair c jobDelay i
465 8072af6c Dato Simó
                            _           -> const (return i)
466 8072af6c Dato Simó
467 41238d27 Dato Simó
  repairDone <- bracket (L.getClient master) L.closeClient $
468 41238d27 Dato Simó
                forM (zip iniData' repairs) . maybeRepair
469 41238d27 Dato Simó
470 41238d27 Dato Simó
  -- Print some stats and exit.
471 41238d27 Dato Simó
  let states = map ((, 1 :: Int) . arStateName . arState) repairDone
472 41238d27 Dato Simó
      counts = Map.fromListWith (+) states
473 3416e3e7 Dato Simó
474 41238d27 Dato Simó
  putStrLn "---------------------"
475 41238d27 Dato Simó
  putStrLn "Instance status count"
476 41238d27 Dato Simó
  putStrLn "---------------------"
477 41238d27 Dato Simó
  putStr . unlines . Map.elems $
478 41238d27 Dato Simó
    Map.mapWithKey (\k v -> k ++ ": " ++ show v) counts