root / src / Ganeti / HTools / Program / Harep.hs @ a5e58418
History | View | Annotate | Download (10.5 kB)
1 |
{-| Auto-repair tool for Ganeti. |
---|---|
2 |
|
3 |
-} |
4 |
|
5 |
{- |
6 |
|
7 |
Copyright (C) 2013 Google Inc. |
8 |
|
9 |
This program is free software; you can redistribute it and/or modify |
10 |
it under the terms of the GNU General Public License as published by |
11 |
the Free Software Foundation; either version 2 of the License, or |
12 |
(at your option) any later version. |
13 |
|
14 |
This program is distributed in the hope that it will be useful, but |
15 |
WITHOUT ANY WARRANTY; without even the implied warranty of |
16 |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
17 |
General Public License for more details. |
18 |
|
19 |
You should have received a copy of the GNU General Public License |
20 |
along with this program; if not, write to the Free Software |
21 |
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA |
22 |
02110-1301, USA. |
23 |
|
24 |
-} |
25 |
|
26 |
module Ganeti.HTools.Program.Harep |
27 |
( main |
28 |
, arguments |
29 |
, options) where |
30 |
|
31 |
import Control.Exception (bracket) |
32 |
import Control.Monad |
33 |
import Data.Function |
34 |
import Data.List |
35 |
import Data.Maybe |
36 |
import Data.Ord |
37 |
import System.Time |
38 |
|
39 |
import Ganeti.BasicTypes |
40 |
import Ganeti.Common |
41 |
import Ganeti.Errors |
42 |
import Ganeti.Jobs |
43 |
import Ganeti.OpCodes |
44 |
import Ganeti.Types |
45 |
import Ganeti.Utils |
46 |
import qualified Ganeti.Constants as C |
47 |
import qualified Ganeti.Luxi as L |
48 |
import qualified Ganeti.Path as Path |
49 |
|
50 |
import Ganeti.HTools.CLI |
51 |
import Ganeti.HTools.Loader |
52 |
import Ganeti.HTools.ExtLoader |
53 |
import Ganeti.HTools.Types |
54 |
import qualified Ganeti.HTools.Container as Container |
55 |
import qualified Ganeti.HTools.Instance as Instance |
56 |
|
57 |
-- | Options list and functions. |
58 |
options :: IO [OptType] |
59 |
options = do |
60 |
luxi <- oLuxiSocket |
61 |
return |
62 |
[ luxi |
63 |
] |
64 |
|
65 |
arguments :: [ArgCompletion] |
66 |
arguments = [] |
67 |
|
68 |
data InstanceData = InstanceData { arInstance :: Instance.Instance |
69 |
, arState :: AutoRepairStatus |
70 |
, tagsToRemove :: [String] |
71 |
} |
72 |
deriving (Eq, Show) |
73 |
|
74 |
-- | Parse a tag into an 'AutoRepairData' record. |
75 |
-- |
76 |
-- @Nothing@ is returned if the tag is not an auto-repair tag, or if it's |
77 |
-- malformed. |
78 |
parseInitTag :: String -> Maybe AutoRepairData |
79 |
parseInitTag tag = |
80 |
let parsePending = do |
81 |
subtag <- chompPrefix C.autoRepairTagPending tag |
82 |
case sepSplit ':' subtag of |
83 |
[rtype, uuid, ts, jobs] -> makeArData rtype uuid ts jobs |
84 |
_ -> fail ("Invalid tag: " ++ show tag) |
85 |
|
86 |
parseResult = do |
87 |
subtag <- chompPrefix C.autoRepairTagResult tag |
88 |
case sepSplit ':' subtag of |
89 |
[rtype, uuid, ts, result, jobs] -> do |
90 |
arData <- makeArData rtype uuid ts jobs |
91 |
result' <- autoRepairResultFromRaw result |
92 |
return arData { arResult = Just result' } |
93 |
_ -> fail ("Invalid tag: " ++ show tag) |
94 |
|
95 |
makeArData rtype uuid ts jobs = do |
96 |
rtype' <- autoRepairTypeFromRaw rtype |
97 |
ts' <- tryRead "auto-repair time" ts |
98 |
jobs' <- mapM makeJobIdS $ sepSplit '+' jobs |
99 |
return AutoRepairData { arType = rtype' |
100 |
, arUuid = uuid |
101 |
, arTime = TOD ts' 0 |
102 |
, arJobs = jobs' |
103 |
, arResult = Nothing |
104 |
, arTag = tag |
105 |
} |
106 |
in |
107 |
parsePending `mplus` parseResult |
108 |
|
109 |
-- | Return the 'AutoRepairData' element of an 'AutoRepairStatus' type. |
110 |
getArData :: AutoRepairStatus -> Maybe AutoRepairData |
111 |
getArData status = |
112 |
case status of |
113 |
ArHealthy (Just d) -> Just d |
114 |
ArFailedRepair d -> Just d |
115 |
ArPendingRepair d -> Just d |
116 |
ArNeedsRepair d -> Just d |
117 |
_ -> Nothing |
118 |
|
119 |
-- | Return a short name for each auto-repair status. |
120 |
-- |
121 |
-- This is a more concise representation of the status, because the default |
122 |
-- "Show" formatting includes all the accompanying auto-repair data. |
123 |
arStateName :: AutoRepairStatus -> String |
124 |
arStateName status = |
125 |
case status of |
126 |
ArHealthy _ -> "Healthy" |
127 |
ArFailedRepair _ -> "Failure" |
128 |
ArPendingRepair _ -> "Pending repair" |
129 |
ArNeedsRepair _ -> "Needs repair" |
130 |
|
131 |
-- | Return a new list of tags to remove that includes @arTag@ if present. |
132 |
delCurTag :: InstanceData -> [String] |
133 |
delCurTag instData = |
134 |
let arData = getArData $ arState instData |
135 |
rmTags = tagsToRemove instData |
136 |
in |
137 |
case arData of |
138 |
Just d -> arTag d : rmTags |
139 |
Nothing -> rmTags |
140 |
|
141 |
-- | Set the initial auto-repair state of an instance from its auto-repair tags. |
142 |
-- |
143 |
-- The rules when there are multiple tags is: |
144 |
-- |
145 |
-- * the earliest failure result always wins |
146 |
-- |
147 |
-- * two or more pending repairs results in a fatal error |
148 |
-- |
149 |
-- * a pending result from id X and a success result from id Y result in error |
150 |
-- if Y is newer than X |
151 |
-- |
152 |
-- * if there are no pending repairs, the newest success result wins, |
153 |
-- otherwise the pending result is used. |
154 |
setInitialState :: Instance.Instance -> Result InstanceData |
155 |
setInitialState inst = |
156 |
let arData = mapMaybe parseInitTag $ Instance.allTags inst |
157 |
-- Group all the AutoRepairData records by id (i.e. by repair task), and |
158 |
-- present them from oldest to newest. |
159 |
arData' = sortBy (comparing arUuid) arData |
160 |
arGroups = groupBy ((==) `on` arUuid) arData' |
161 |
arGroups' = sortBy (comparing $ minimum . map arTime) arGroups |
162 |
in |
163 |
foldM arStatusCmp (InstanceData inst (ArHealthy Nothing) []) arGroups' |
164 |
|
165 |
-- | Update the initial status of an instance with new repair task tags. |
166 |
-- |
167 |
-- This function gets called once per repair group in an instance's tag, and it |
168 |
-- determines whether to set the status of the instance according to this new |
169 |
-- group, or to keep the existing state. See the documentation for |
170 |
-- 'setInitialState' for the rules to be followed when determining this. |
171 |
arStatusCmp :: InstanceData -> [AutoRepairData] -> Result InstanceData |
172 |
arStatusCmp instData arData = |
173 |
let curSt = arState instData |
174 |
arData' = sortBy (comparing keyfn) arData |
175 |
keyfn d = (arResult d, arTime d) |
176 |
newData = last arData' |
177 |
newSt = case arResult newData of |
178 |
Just ArSuccess -> ArHealthy $ Just newData |
179 |
Just ArEnoperm -> ArHealthy $ Just newData |
180 |
Just ArFailure -> ArFailedRepair newData |
181 |
Nothing -> ArPendingRepair newData |
182 |
in |
183 |
case curSt of |
184 |
ArFailedRepair _ -> Ok instData -- Always keep the earliest failure. |
185 |
ArHealthy _ -> Ok instData { arState = newSt |
186 |
, tagsToRemove = delCurTag instData |
187 |
} |
188 |
ArPendingRepair d -> Bad ( |
189 |
"An unfinished repair was found in instance " ++ |
190 |
Instance.name (arInstance instData) ++ ": found tag " ++ |
191 |
show (arTag newData) ++ ", but older pending tag " ++ |
192 |
show (arTag d) ++ "exists.") |
193 |
|
194 |
ArNeedsRepair _ -> Bad |
195 |
"programming error: ArNeedsRepair found as an initial state" |
196 |
|
197 |
-- | Query jobs of a pending repair, returning the new instance data. |
198 |
processPending :: L.Client -> InstanceData -> IO InstanceData |
199 |
processPending client instData = |
200 |
case arState instData of |
201 |
(ArPendingRepair arData) -> do |
202 |
sts <- L.queryJobsStatus client $ arJobs arData |
203 |
time <- getClockTime |
204 |
case sts of |
205 |
Bad e -> exitErr $ "could not check job status: " ++ formatError e |
206 |
Ok sts' -> |
207 |
if any (<= JOB_STATUS_RUNNING) sts' then |
208 |
return instData -- (no change) |
209 |
else do |
210 |
let iname = Instance.name $ arInstance instData |
211 |
srcSt = arStateName $ arState instData |
212 |
destSt = arStateName arState' |
213 |
putStrLn ("Moving " ++ iname ++ " from " ++ show srcSt ++ " to " ++ |
214 |
show destSt) |
215 |
commitChange client instData' |
216 |
where |
217 |
instData' = |
218 |
instData { arState = arState' |
219 |
, tagsToRemove = delCurTag instData |
220 |
} |
221 |
arState' = |
222 |
if all (== JOB_STATUS_SUCCESS) sts' then |
223 |
ArHealthy $ Just (updateTag $ arData { arResult = Just ArSuccess |
224 |
, arTime = time }) |
225 |
else |
226 |
ArFailedRepair (updateTag $ arData { arResult = Just ArFailure |
227 |
, arTime = time }) |
228 |
|
229 |
_ -> return instData |
230 |
|
231 |
-- | Update the tag of an 'AutoRepairData' record to match all the other fields. |
232 |
updateTag :: AutoRepairData -> AutoRepairData |
233 |
updateTag arData = |
234 |
let ini = [autoRepairTypeToRaw $ arType arData, |
235 |
arUuid arData, |
236 |
clockTimeToString $ arTime arData] |
237 |
end = [intercalate "+" . map (show . fromJobId) $ arJobs arData] |
238 |
(pfx, middle) = |
239 |
case arResult arData of |
240 |
Nothing -> (C.autoRepairTagPending, []) |
241 |
Just rs -> (C.autoRepairTagResult, [autoRepairResultToRaw rs]) |
242 |
in |
243 |
arData { arTag = pfx ++ intercalate ":" (ini ++ middle ++ end) } |
244 |
|
245 |
-- | Apply and remove tags from an instance as indicated by 'InstanceData'. |
246 |
-- |
247 |
-- If the /arState/ of the /InstanceData/ record has an associated |
248 |
-- 'AutoRepairData', add its tag to the instance object. Additionally, if |
249 |
-- /tagsToRemove/ is not empty, remove those tags from the instance object. The |
250 |
-- returned /InstanceData/ object always has an empty /tagsToRemove/. |
251 |
commitChange :: L.Client -> InstanceData -> IO InstanceData |
252 |
commitChange client instData = do |
253 |
let iname = Instance.name $ arInstance instData |
254 |
arData = getArData $ arState instData |
255 |
rmTags = tagsToRemove instData |
256 |
execJobsWaitOk' opcodes = do |
257 |
res <- execJobsWaitOk [map wrapOpCode opcodes] client |
258 |
case res of |
259 |
Ok _ -> return () |
260 |
Bad e -> exitErr e |
261 |
|
262 |
when (isJust arData) $ do |
263 |
let tag = arTag $ fromJust arData |
264 |
putStrLn (">>> Adding the following tag to " ++ iname ++ ":\n" ++ show tag) |
265 |
execJobsWaitOk' [OpTagsSet (TagInstance iname) [tag]] |
266 |
|
267 |
unless (null rmTags) $ do |
268 |
putStr (">>> Removing the following tags from " ++ iname ++ ":\n" ++ |
269 |
unlines (map show rmTags)) |
270 |
execJobsWaitOk' [OpTagsDel (TagInstance iname) rmTags] |
271 |
|
272 |
return instData { tagsToRemove = [] } |
273 |
|
274 |
-- | Main function. |
275 |
main :: Options -> [String] -> IO () |
276 |
main opts args = do |
277 |
unless (null args) $ |
278 |
exitErr "this program doesn't take any arguments." |
279 |
|
280 |
luxiDef <- Path.defaultLuxiSocket |
281 |
let master = fromMaybe luxiDef $ optLuxi opts |
282 |
opts' = opts { optLuxi = Just master } |
283 |
|
284 |
(ClusterData _ _ il _ _) <- loadExternalData opts' |
285 |
|
286 |
let iniDataRes = mapM setInitialState $ Container.elems il |
287 |
iniData <- exitIfBad "when parsing auto-repair tags" iniDataRes |
288 |
|
289 |
-- First step: check all pending repairs, see if they are completed. |
290 |
_unused_iniData' <- bracket (L.getClient master) L.closeClient $ |
291 |
forM iniData . processPending |
292 |
|
293 |
return () |