root / src / Ganeti / HTools / Program / Harep.hs @ 4ba5f148
History | View | Annotate | Download (14.3 kB)
1 |
{-| Auto-repair tool for Ganeti. |
---|---|
2 |
|
3 |
-} |
4 |
|
5 |
{- |
6 |
|
7 |
Copyright (C) 2013 Google Inc. |
8 |
|
9 |
This program is free software; you can redistribute it and/or modify |
10 |
it under the terms of the GNU General Public License as published by |
11 |
the Free Software Foundation; either version 2 of the License, or |
12 |
(at your option) any later version. |
13 |
|
14 |
This program is distributed in the hope that it will be useful, but |
15 |
WITHOUT ANY WARRANTY; without even the implied warranty of |
16 |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
17 |
General Public License for more details. |
18 |
|
19 |
You should have received a copy of the GNU General Public License |
20 |
along with this program; if not, write to the Free Software |
21 |
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA |
22 |
02110-1301, USA. |
23 |
|
24 |
-} |
25 |
|
26 |
module Ganeti.HTools.Program.Harep |
27 |
( main |
28 |
, arguments |
29 |
, options) where |
30 |
|
31 |
import Control.Exception (bracket) |
32 |
import Control.Monad |
33 |
import Data.Function |
34 |
import Data.List |
35 |
import Data.Maybe |
36 |
import Data.Ord |
37 |
import System.Time |
38 |
|
39 |
import Ganeti.BasicTypes |
40 |
import Ganeti.Common |
41 |
import Ganeti.Errors |
42 |
import Ganeti.Jobs |
43 |
import Ganeti.OpCodes |
44 |
import Ganeti.OpParams |
45 |
import Ganeti.Types |
46 |
import Ganeti.Utils |
47 |
import qualified Ganeti.Constants as C |
48 |
import qualified Ganeti.Luxi as L |
49 |
import qualified Ganeti.Path as Path |
50 |
|
51 |
import Ganeti.HTools.CLI |
52 |
import Ganeti.HTools.Loader |
53 |
import Ganeti.HTools.ExtLoader |
54 |
import Ganeti.HTools.Types |
55 |
import qualified Ganeti.HTools.Container as Container |
56 |
import qualified Ganeti.HTools.Instance as Instance |
57 |
import qualified Ganeti.HTools.Node as Node |
58 |
|
59 |
-- | Options list and functions. |
60 |
options :: IO [OptType] |
61 |
options = do |
62 |
luxi <- oLuxiSocket |
63 |
return |
64 |
[ luxi |
65 |
] |
66 |
|
67 |
arguments :: [ArgCompletion] |
68 |
arguments = [] |
69 |
|
70 |
data InstanceData = InstanceData { arInstance :: Instance.Instance |
71 |
, arState :: AutoRepairStatus |
72 |
, tagsToRemove :: [String] |
73 |
} |
74 |
deriving (Eq, Show) |
75 |
|
76 |
-- | Parse a tag into an 'AutoRepairData' record. |
77 |
-- |
78 |
-- @Nothing@ is returned if the tag is not an auto-repair tag, or if it's |
79 |
-- malformed. |
80 |
parseInitTag :: String -> Maybe AutoRepairData |
81 |
parseInitTag tag = |
82 |
let parsePending = do |
83 |
subtag <- chompPrefix C.autoRepairTagPending tag |
84 |
case sepSplit ':' subtag of |
85 |
[rtype, uuid, ts, jobs] -> makeArData rtype uuid ts jobs |
86 |
_ -> fail ("Invalid tag: " ++ show tag) |
87 |
|
88 |
parseResult = do |
89 |
subtag <- chompPrefix C.autoRepairTagResult tag |
90 |
case sepSplit ':' subtag of |
91 |
[rtype, uuid, ts, result, jobs] -> do |
92 |
arData <- makeArData rtype uuid ts jobs |
93 |
result' <- autoRepairResultFromRaw result |
94 |
return arData { arResult = Just result' } |
95 |
_ -> fail ("Invalid tag: " ++ show tag) |
96 |
|
97 |
makeArData rtype uuid ts jobs = do |
98 |
rtype' <- autoRepairTypeFromRaw rtype |
99 |
ts' <- tryRead "auto-repair time" ts |
100 |
jobs' <- mapM makeJobIdS $ sepSplit '+' jobs |
101 |
return AutoRepairData { arType = rtype' |
102 |
, arUuid = uuid |
103 |
, arTime = TOD ts' 0 |
104 |
, arJobs = jobs' |
105 |
, arResult = Nothing |
106 |
, arTag = tag |
107 |
} |
108 |
in |
109 |
parsePending `mplus` parseResult |
110 |
|
111 |
-- | Return the 'AutoRepairData' element of an 'AutoRepairStatus' type. |
112 |
getArData :: AutoRepairStatus -> Maybe AutoRepairData |
113 |
getArData status = |
114 |
case status of |
115 |
ArHealthy (Just d) -> Just d |
116 |
ArFailedRepair d -> Just d |
117 |
ArPendingRepair d -> Just d |
118 |
ArNeedsRepair d -> Just d |
119 |
_ -> Nothing |
120 |
|
121 |
-- | Return a short name for each auto-repair status. |
122 |
-- |
123 |
-- This is a more concise representation of the status, because the default |
124 |
-- "Show" formatting includes all the accompanying auto-repair data. |
125 |
arStateName :: AutoRepairStatus -> String |
126 |
arStateName status = |
127 |
case status of |
128 |
ArHealthy _ -> "Healthy" |
129 |
ArFailedRepair _ -> "Failure" |
130 |
ArPendingRepair _ -> "Pending repair" |
131 |
ArNeedsRepair _ -> "Needs repair" |
132 |
|
133 |
-- | Return a new list of tags to remove that includes @arTag@ if present. |
134 |
delCurTag :: InstanceData -> [String] |
135 |
delCurTag instData = |
136 |
let arData = getArData $ arState instData |
137 |
rmTags = tagsToRemove instData |
138 |
in |
139 |
case arData of |
140 |
Just d -> arTag d : rmTags |
141 |
Nothing -> rmTags |
142 |
|
143 |
-- | Set the initial auto-repair state of an instance from its auto-repair tags. |
144 |
-- |
145 |
-- The rules when there are multiple tags is: |
146 |
-- |
147 |
-- * the earliest failure result always wins |
148 |
-- |
149 |
-- * two or more pending repairs results in a fatal error |
150 |
-- |
151 |
-- * a pending result from id X and a success result from id Y result in error |
152 |
-- if Y is newer than X |
153 |
-- |
154 |
-- * if there are no pending repairs, the newest success result wins, |
155 |
-- otherwise the pending result is used. |
156 |
setInitialState :: Instance.Instance -> Result InstanceData |
157 |
setInitialState inst = |
158 |
let arData = mapMaybe parseInitTag $ Instance.allTags inst |
159 |
-- Group all the AutoRepairData records by id (i.e. by repair task), and |
160 |
-- present them from oldest to newest. |
161 |
arData' = sortBy (comparing arUuid) arData |
162 |
arGroups = groupBy ((==) `on` arUuid) arData' |
163 |
arGroups' = sortBy (comparing $ minimum . map arTime) arGroups |
164 |
in |
165 |
foldM arStatusCmp (InstanceData inst (ArHealthy Nothing) []) arGroups' |
166 |
|
167 |
-- | Update the initial status of an instance with new repair task tags. |
168 |
-- |
169 |
-- This function gets called once per repair group in an instance's tag, and it |
170 |
-- determines whether to set the status of the instance according to this new |
171 |
-- group, or to keep the existing state. See the documentation for |
172 |
-- 'setInitialState' for the rules to be followed when determining this. |
173 |
arStatusCmp :: InstanceData -> [AutoRepairData] -> Result InstanceData |
174 |
arStatusCmp instData arData = |
175 |
let curSt = arState instData |
176 |
arData' = sortBy (comparing keyfn) arData |
177 |
keyfn d = (arResult d, arTime d) |
178 |
newData = last arData' |
179 |
newSt = case arResult newData of |
180 |
Just ArSuccess -> ArHealthy $ Just newData |
181 |
Just ArEnoperm -> ArHealthy $ Just newData |
182 |
Just ArFailure -> ArFailedRepair newData |
183 |
Nothing -> ArPendingRepair newData |
184 |
in |
185 |
case curSt of |
186 |
ArFailedRepair _ -> Ok instData -- Always keep the earliest failure. |
187 |
ArHealthy _ -> Ok instData { arState = newSt |
188 |
, tagsToRemove = delCurTag instData |
189 |
} |
190 |
ArPendingRepair d -> Bad ( |
191 |
"An unfinished repair was found in instance " ++ |
192 |
Instance.name (arInstance instData) ++ ": found tag " ++ |
193 |
show (arTag newData) ++ ", but older pending tag " ++ |
194 |
show (arTag d) ++ "exists.") |
195 |
|
196 |
ArNeedsRepair _ -> Bad |
197 |
"programming error: ArNeedsRepair found as an initial state" |
198 |
|
199 |
-- | Query jobs of a pending repair, returning the new instance data. |
200 |
processPending :: L.Client -> InstanceData -> IO InstanceData |
201 |
processPending client instData = |
202 |
case arState instData of |
203 |
(ArPendingRepair arData) -> do |
204 |
sts <- L.queryJobsStatus client $ arJobs arData |
205 |
time <- getClockTime |
206 |
case sts of |
207 |
Bad e -> exitErr $ "could not check job status: " ++ formatError e |
208 |
Ok sts' -> |
209 |
if any (<= JOB_STATUS_RUNNING) sts' then |
210 |
return instData -- (no change) |
211 |
else do |
212 |
let iname = Instance.name $ arInstance instData |
213 |
srcSt = arStateName $ arState instData |
214 |
destSt = arStateName arState' |
215 |
putStrLn ("Moving " ++ iname ++ " from " ++ show srcSt ++ " to " ++ |
216 |
show destSt) |
217 |
commitChange client instData' |
218 |
where |
219 |
instData' = |
220 |
instData { arState = arState' |
221 |
, tagsToRemove = delCurTag instData |
222 |
} |
223 |
arState' = |
224 |
if all (== JOB_STATUS_SUCCESS) sts' then |
225 |
ArHealthy $ Just (updateTag $ arData { arResult = Just ArSuccess |
226 |
, arTime = time }) |
227 |
else |
228 |
ArFailedRepair (updateTag $ arData { arResult = Just ArFailure |
229 |
, arTime = time }) |
230 |
|
231 |
_ -> return instData |
232 |
|
233 |
-- | Update the tag of an 'AutoRepairData' record to match all the other fields. |
234 |
updateTag :: AutoRepairData -> AutoRepairData |
235 |
updateTag arData = |
236 |
let ini = [autoRepairTypeToRaw $ arType arData, |
237 |
arUuid arData, |
238 |
clockTimeToString $ arTime arData] |
239 |
end = [intercalate "+" . map (show . fromJobId) $ arJobs arData] |
240 |
(pfx, middle) = |
241 |
case arResult arData of |
242 |
Nothing -> (C.autoRepairTagPending, []) |
243 |
Just rs -> (C.autoRepairTagResult, [autoRepairResultToRaw rs]) |
244 |
in |
245 |
arData { arTag = pfx ++ intercalate ":" (ini ++ middle ++ end) } |
246 |
|
247 |
-- | Apply and remove tags from an instance as indicated by 'InstanceData'. |
248 |
-- |
249 |
-- If the /arState/ of the /InstanceData/ record has an associated |
250 |
-- 'AutoRepairData', add its tag to the instance object. Additionally, if |
251 |
-- /tagsToRemove/ is not empty, remove those tags from the instance object. The |
252 |
-- returned /InstanceData/ object always has an empty /tagsToRemove/. |
253 |
commitChange :: L.Client -> InstanceData -> IO InstanceData |
254 |
commitChange client instData = do |
255 |
let iname = Instance.name $ arInstance instData |
256 |
arData = getArData $ arState instData |
257 |
rmTags = tagsToRemove instData |
258 |
execJobsWaitOk' opcodes = do |
259 |
res <- execJobsWaitOk [map wrapOpCode opcodes] client |
260 |
case res of |
261 |
Ok _ -> return () |
262 |
Bad e -> exitErr e |
263 |
|
264 |
when (isJust arData) $ do |
265 |
let tag = arTag $ fromJust arData |
266 |
putStrLn (">>> Adding the following tag to " ++ iname ++ ":\n" ++ show tag) |
267 |
execJobsWaitOk' [OpTagsSet (TagInstance iname) [tag]] |
268 |
|
269 |
unless (null rmTags) $ do |
270 |
putStr (">>> Removing the following tags from " ++ iname ++ ":\n" ++ |
271 |
unlines (map show rmTags)) |
272 |
execJobsWaitOk' [OpTagsDel (TagInstance iname) rmTags] |
273 |
|
274 |
return instData { tagsToRemove = [] } |
275 |
|
276 |
-- | Detect brokeness with an instance and suggest repair type and jobs to run. |
277 |
detectBroken :: Node.List -> Instance.Instance |
278 |
-> Maybe (AutoRepairType, [OpCode]) |
279 |
detectBroken nl inst = |
280 |
let disk = Instance.diskTemplate inst |
281 |
iname = Instance.name inst |
282 |
offPri = Node.offline $ Container.find (Instance.pNode inst) nl |
283 |
offSec = Node.offline $ Container.find (Instance.sNode inst) nl |
284 |
in |
285 |
case disk of |
286 |
DTDrbd8 |
287 |
| offPri && offSec -> |
288 |
Just ( |
289 |
ArReinstall, |
290 |
[ OpInstanceRecreateDisks { opInstanceName = iname |
291 |
, opRecreateDisksInfo = RecreateDisksAll |
292 |
, opNodes = [] |
293 |
-- FIXME: there should be a better way to |
294 |
-- specify opcode paramteres than abusing |
295 |
-- mkNonEmpty in this way (using the fact |
296 |
-- that Maybe is used both for optional |
297 |
-- fields, and to express failure). |
298 |
, opIallocator = mkNonEmpty "hail" |
299 |
} |
300 |
, OpInstanceReinstall { opInstanceName = iname |
301 |
, opOsType = Nothing |
302 |
, opTempOsParams = Nothing |
303 |
, opForceVariant = False |
304 |
} |
305 |
]) |
306 |
| offPri -> |
307 |
Just ( |
308 |
ArFailover, |
309 |
[ OpInstanceFailover { opInstanceName = iname |
310 |
-- FIXME: ditto, see above. |
311 |
, opShutdownTimeout = fromJust $ mkNonNegative |
312 |
C.defaultShutdownTimeout |
313 |
, opIgnoreConsistency = False |
314 |
, opTargetNode = Nothing |
315 |
, opIgnoreIpolicy = False |
316 |
, opIallocator = Nothing |
317 |
} |
318 |
]) |
319 |
| offSec -> |
320 |
Just ( |
321 |
ArFixStorage, |
322 |
[ OpInstanceReplaceDisks { opInstanceName = iname |
323 |
, opReplaceDisksMode = ReplaceNewSecondary |
324 |
, opReplaceDisksList = [] |
325 |
, opRemoteNode = Nothing |
326 |
-- FIXME: ditto, see above. |
327 |
, opIallocator = mkNonEmpty "hail" |
328 |
, opEarlyRelease = False |
329 |
, opIgnoreIpolicy = False |
330 |
} |
331 |
]) |
332 |
| otherwise -> Nothing |
333 |
|
334 |
DTPlain |
335 |
| offPri -> |
336 |
Just ( |
337 |
ArReinstall, |
338 |
[ OpInstanceRecreateDisks { opInstanceName = iname |
339 |
, opRecreateDisksInfo = RecreateDisksAll |
340 |
, opNodes = [] |
341 |
-- FIXME: ditto, see above. |
342 |
, opIallocator = mkNonEmpty "hail" |
343 |
} |
344 |
, OpInstanceReinstall { opInstanceName = iname |
345 |
, opOsType = Nothing |
346 |
, opTempOsParams = Nothing |
347 |
, opForceVariant = False |
348 |
} |
349 |
]) |
350 |
| otherwise -> Nothing |
351 |
|
352 |
_ -> Nothing -- Other cases are unimplemented for now: DTDiskless, |
353 |
-- DTFile, DTSharedFile, DTBlock, DTRbd, DTExt. |
354 |
|
355 |
-- | Main function. |
356 |
main :: Options -> [String] -> IO () |
357 |
main opts args = do |
358 |
unless (null args) $ |
359 |
exitErr "this program doesn't take any arguments." |
360 |
|
361 |
luxiDef <- Path.defaultLuxiSocket |
362 |
let master = fromMaybe luxiDef $ optLuxi opts |
363 |
opts' = opts { optLuxi = Just master } |
364 |
|
365 |
(ClusterData _ nl il _ _) <- loadExternalData opts' |
366 |
|
367 |
let iniDataRes = mapM setInitialState $ Container.elems il |
368 |
iniData <- exitIfBad "when parsing auto-repair tags" iniDataRes |
369 |
|
370 |
-- First step: check all pending repairs, see if they are completed. |
371 |
iniData' <- bracket (L.getClient master) L.closeClient $ |
372 |
forM iniData . processPending |
373 |
|
374 |
-- Second step: detect any problems. |
375 |
let _unused_repairs = map (detectBroken nl . arInstance) iniData' |
376 |
|
377 |
return () |