root / src / Ganeti / HTools / Program / Harep.hs @ d605e261
History | View | Annotate | Download (19.5 kB)
1 | 41238d27 | Dato Simó | {-# LANGUAGE TupleSections #-} |
---|---|---|---|
2 | 41238d27 | Dato Simó | |
3 | b6d9bec8 | Dato Simó | {-| Auto-repair tool for Ganeti. |
4 | b6d9bec8 | Dato Simó | |
5 | b6d9bec8 | Dato Simó | -} |
6 | b6d9bec8 | Dato Simó | |
7 | b6d9bec8 | Dato Simó | {- |
8 | b6d9bec8 | Dato Simó | |
9 | b6d9bec8 | Dato Simó | Copyright (C) 2013 Google Inc. |
10 | b6d9bec8 | Dato Simó | |
11 | b6d9bec8 | Dato Simó | This program is free software; you can redistribute it and/or modify |
12 | b6d9bec8 | Dato Simó | it under the terms of the GNU General Public License as published by |
13 | b6d9bec8 | Dato Simó | the Free Software Foundation; either version 2 of the License, or |
14 | b6d9bec8 | Dato Simó | (at your option) any later version. |
15 | b6d9bec8 | Dato Simó | |
16 | b6d9bec8 | Dato Simó | This program is distributed in the hope that it will be useful, but |
17 | b6d9bec8 | Dato Simó | WITHOUT ANY WARRANTY; without even the implied warranty of |
18 | b6d9bec8 | Dato Simó | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
19 | b6d9bec8 | Dato Simó | General Public License for more details. |
20 | b6d9bec8 | Dato Simó | |
21 | b6d9bec8 | Dato Simó | You should have received a copy of the GNU General Public License |
22 | b6d9bec8 | Dato Simó | along with this program; if not, write to the Free Software |
23 | b6d9bec8 | Dato Simó | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA |
24 | b6d9bec8 | Dato Simó | 02110-1301, USA. |
25 | b6d9bec8 | Dato Simó | |
26 | b6d9bec8 | Dato Simó | -} |
27 | b6d9bec8 | Dato Simó | |
28 | b6d9bec8 | Dato Simó | module Ganeti.HTools.Program.Harep |
29 | b6d9bec8 | Dato Simó | ( main |
30 | b6d9bec8 | Dato Simó | , arguments |
31 | b6d9bec8 | Dato Simó | , options) where |
32 | b6d9bec8 | Dato Simó | |
33 | a5e58418 | Dato Simó | import Control.Exception (bracket) |
34 | b6d9bec8 | Dato Simó | import Control.Monad |
35 | 3416e3e7 | Dato Simó | import Data.Function |
36 | 3416e3e7 | Dato Simó | import Data.List |
37 | 3416e3e7 | Dato Simó | import Data.Maybe |
38 | 3416e3e7 | Dato Simó | import Data.Ord |
39 | 3416e3e7 | Dato Simó | import System.Time |
40 | 41238d27 | Dato Simó | import qualified Data.Map as Map |
41 | b6d9bec8 | Dato Simó | |
42 | 3416e3e7 | Dato Simó | import Ganeti.BasicTypes |
43 | b6d9bec8 | Dato Simó | import Ganeti.Common |
44 | a5e58418 | Dato Simó | import Ganeti.Errors |
45 | a5e58418 | Dato Simó | import Ganeti.Jobs |
46 | a5e58418 | Dato Simó | import Ganeti.OpCodes |
47 | 4ba5f148 | Dato Simó | import Ganeti.OpParams |
48 | 3416e3e7 | Dato Simó | import Ganeti.Types |
49 | b6d9bec8 | Dato Simó | import Ganeti.Utils |
50 | 3416e3e7 | Dato Simó | import qualified Ganeti.Constants as C |
51 | a5e58418 | Dato Simó | import qualified Ganeti.Luxi as L |
52 | 3416e3e7 | Dato Simó | import qualified Ganeti.Path as Path |
53 | b6d9bec8 | Dato Simó | |
54 | b6d9bec8 | Dato Simó | import Ganeti.HTools.CLI |
55 | 3416e3e7 | Dato Simó | import Ganeti.HTools.Loader |
56 | 3416e3e7 | Dato Simó | import Ganeti.HTools.ExtLoader |
57 | 3416e3e7 | Dato Simó | import Ganeti.HTools.Types |
58 | 3416e3e7 | Dato Simó | import qualified Ganeti.HTools.Container as Container |
59 | 3416e3e7 | Dato Simó | import qualified Ganeti.HTools.Instance as Instance |
60 | 4ba5f148 | Dato Simó | import qualified Ganeti.HTools.Node as Node |
61 | b6d9bec8 | Dato Simó | |
62 | b6d9bec8 | Dato Simó | -- | Options list and functions. |
63 | b6d9bec8 | Dato Simó | options :: IO [OptType] |
64 | b6d9bec8 | Dato Simó | options = do |
65 | b6d9bec8 | Dato Simó | luxi <- oLuxiSocket |
66 | b6d9bec8 | Dato Simó | return |
67 | b6d9bec8 | Dato Simó | [ luxi |
68 | 8af72964 | Dato Simó | , oJobDelay |
69 | b6d9bec8 | Dato Simó | ] |
70 | b6d9bec8 | Dato Simó | |
71 | b6d9bec8 | Dato Simó | arguments :: [ArgCompletion] |
72 | b6d9bec8 | Dato Simó | arguments = [] |
73 | b6d9bec8 | Dato Simó | |
74 | 3416e3e7 | Dato Simó | data InstanceData = InstanceData { arInstance :: Instance.Instance |
75 | 3416e3e7 | Dato Simó | , arState :: AutoRepairStatus |
76 | 3416e3e7 | Dato Simó | , tagsToRemove :: [String] |
77 | 3416e3e7 | Dato Simó | } |
78 | 3416e3e7 | Dato Simó | deriving (Eq, Show) |
79 | 3416e3e7 | Dato Simó | |
80 | 3416e3e7 | Dato Simó | -- | Parse a tag into an 'AutoRepairData' record. |
81 | 3416e3e7 | Dato Simó | -- |
82 | 3416e3e7 | Dato Simó | -- @Nothing@ is returned if the tag is not an auto-repair tag, or if it's |
83 | 3416e3e7 | Dato Simó | -- malformed. |
84 | 3416e3e7 | Dato Simó | parseInitTag :: String -> Maybe AutoRepairData |
85 | 3416e3e7 | Dato Simó | parseInitTag tag = |
86 | 3416e3e7 | Dato Simó | let parsePending = do |
87 | 3416e3e7 | Dato Simó | subtag <- chompPrefix C.autoRepairTagPending tag |
88 | 3416e3e7 | Dato Simó | case sepSplit ':' subtag of |
89 | 3416e3e7 | Dato Simó | [rtype, uuid, ts, jobs] -> makeArData rtype uuid ts jobs |
90 | 3416e3e7 | Dato Simó | _ -> fail ("Invalid tag: " ++ show tag) |
91 | 3416e3e7 | Dato Simó | |
92 | 3416e3e7 | Dato Simó | parseResult = do |
93 | 3416e3e7 | Dato Simó | subtag <- chompPrefix C.autoRepairTagResult tag |
94 | 3416e3e7 | Dato Simó | case sepSplit ':' subtag of |
95 | 3416e3e7 | Dato Simó | [rtype, uuid, ts, result, jobs] -> do |
96 | 3416e3e7 | Dato Simó | arData <- makeArData rtype uuid ts jobs |
97 | 3416e3e7 | Dato Simó | result' <- autoRepairResultFromRaw result |
98 | 3416e3e7 | Dato Simó | return arData { arResult = Just result' } |
99 | 3416e3e7 | Dato Simó | _ -> fail ("Invalid tag: " ++ show tag) |
100 | 3416e3e7 | Dato Simó | |
101 | 3416e3e7 | Dato Simó | makeArData rtype uuid ts jobs = do |
102 | 3416e3e7 | Dato Simó | rtype' <- autoRepairTypeFromRaw rtype |
103 | 3416e3e7 | Dato Simó | ts' <- tryRead "auto-repair time" ts |
104 | 3416e3e7 | Dato Simó | jobs' <- mapM makeJobIdS $ sepSplit '+' jobs |
105 | 3416e3e7 | Dato Simó | return AutoRepairData { arType = rtype' |
106 | 3416e3e7 | Dato Simó | , arUuid = uuid |
107 | 3416e3e7 | Dato Simó | , arTime = TOD ts' 0 |
108 | 3416e3e7 | Dato Simó | , arJobs = jobs' |
109 | 3416e3e7 | Dato Simó | , arResult = Nothing |
110 | 3416e3e7 | Dato Simó | , arTag = tag |
111 | 3416e3e7 | Dato Simó | } |
112 | 3416e3e7 | Dato Simó | in |
113 | 3416e3e7 | Dato Simó | parsePending `mplus` parseResult |
114 | 3416e3e7 | Dato Simó | |
115 | 3416e3e7 | Dato Simó | -- | Return the 'AutoRepairData' element of an 'AutoRepairStatus' type. |
116 | 3416e3e7 | Dato Simó | getArData :: AutoRepairStatus -> Maybe AutoRepairData |
117 | 3416e3e7 | Dato Simó | getArData status = |
118 | 3416e3e7 | Dato Simó | case status of |
119 | 3416e3e7 | Dato Simó | ArHealthy (Just d) -> Just d |
120 | 3416e3e7 | Dato Simó | ArFailedRepair d -> Just d |
121 | 3416e3e7 | Dato Simó | ArPendingRepair d -> Just d |
122 | 3416e3e7 | Dato Simó | ArNeedsRepair d -> Just d |
123 | 3416e3e7 | Dato Simó | _ -> Nothing |
124 | 3416e3e7 | Dato Simó | |
125 | a5e58418 | Dato Simó | -- | Return a short name for each auto-repair status. |
126 | a5e58418 | Dato Simó | -- |
127 | a5e58418 | Dato Simó | -- This is a more concise representation of the status, because the default |
128 | a5e58418 | Dato Simó | -- "Show" formatting includes all the accompanying auto-repair data. |
129 | a5e58418 | Dato Simó | arStateName :: AutoRepairStatus -> String |
130 | a5e58418 | Dato Simó | arStateName status = |
131 | a5e58418 | Dato Simó | case status of |
132 | a5e58418 | Dato Simó | ArHealthy _ -> "Healthy" |
133 | a5e58418 | Dato Simó | ArFailedRepair _ -> "Failure" |
134 | a5e58418 | Dato Simó | ArPendingRepair _ -> "Pending repair" |
135 | a5e58418 | Dato Simó | ArNeedsRepair _ -> "Needs repair" |
136 | a5e58418 | Dato Simó | |
137 | 3416e3e7 | Dato Simó | -- | Return a new list of tags to remove that includes @arTag@ if present. |
138 | 3416e3e7 | Dato Simó | delCurTag :: InstanceData -> [String] |
139 | 3416e3e7 | Dato Simó | delCurTag instData = |
140 | 3416e3e7 | Dato Simó | let arData = getArData $ arState instData |
141 | 3416e3e7 | Dato Simó | rmTags = tagsToRemove instData |
142 | 3416e3e7 | Dato Simó | in |
143 | 3416e3e7 | Dato Simó | case arData of |
144 | 3416e3e7 | Dato Simó | Just d -> arTag d : rmTags |
145 | 3416e3e7 | Dato Simó | Nothing -> rmTags |
146 | 3416e3e7 | Dato Simó | |
147 | 3416e3e7 | Dato Simó | -- | Set the initial auto-repair state of an instance from its auto-repair tags. |
148 | 3416e3e7 | Dato Simó | -- |
149 | 3416e3e7 | Dato Simó | -- The rules when there are multiple tags is: |
150 | 3416e3e7 | Dato Simó | -- |
151 | 3416e3e7 | Dato Simó | -- * the earliest failure result always wins |
152 | 3416e3e7 | Dato Simó | -- |
153 | 3416e3e7 | Dato Simó | -- * two or more pending repairs results in a fatal error |
154 | 3416e3e7 | Dato Simó | -- |
155 | 3416e3e7 | Dato Simó | -- * a pending result from id X and a success result from id Y result in error |
156 | 3416e3e7 | Dato Simó | -- if Y is newer than X |
157 | 3416e3e7 | Dato Simó | -- |
158 | 3416e3e7 | Dato Simó | -- * if there are no pending repairs, the newest success result wins, |
159 | 3416e3e7 | Dato Simó | -- otherwise the pending result is used. |
160 | 3416e3e7 | Dato Simó | setInitialState :: Instance.Instance -> Result InstanceData |
161 | 3416e3e7 | Dato Simó | setInitialState inst = |
162 | 3416e3e7 | Dato Simó | let arData = mapMaybe parseInitTag $ Instance.allTags inst |
163 | 3416e3e7 | Dato Simó | -- Group all the AutoRepairData records by id (i.e. by repair task), and |
164 | 3416e3e7 | Dato Simó | -- present them from oldest to newest. |
165 | 3416e3e7 | Dato Simó | arData' = sortBy (comparing arUuid) arData |
166 | 3416e3e7 | Dato Simó | arGroups = groupBy ((==) `on` arUuid) arData' |
167 | 3416e3e7 | Dato Simó | arGroups' = sortBy (comparing $ minimum . map arTime) arGroups |
168 | 3416e3e7 | Dato Simó | in |
169 | 3416e3e7 | Dato Simó | foldM arStatusCmp (InstanceData inst (ArHealthy Nothing) []) arGroups' |
170 | 3416e3e7 | Dato Simó | |
171 | 3416e3e7 | Dato Simó | -- | Update the initial status of an instance with new repair task tags. |
172 | 3416e3e7 | Dato Simó | -- |
173 | 3416e3e7 | Dato Simó | -- This function gets called once per repair group in an instance's tag, and it |
174 | 3416e3e7 | Dato Simó | -- determines whether to set the status of the instance according to this new |
175 | 3416e3e7 | Dato Simó | -- group, or to keep the existing state. See the documentation for |
176 | 3416e3e7 | Dato Simó | -- 'setInitialState' for the rules to be followed when determining this. |
177 | 3416e3e7 | Dato Simó | arStatusCmp :: InstanceData -> [AutoRepairData] -> Result InstanceData |
178 | 3416e3e7 | Dato Simó | arStatusCmp instData arData = |
179 | 3416e3e7 | Dato Simó | let curSt = arState instData |
180 | 3416e3e7 | Dato Simó | arData' = sortBy (comparing keyfn) arData |
181 | 3416e3e7 | Dato Simó | keyfn d = (arResult d, arTime d) |
182 | 3416e3e7 | Dato Simó | newData = last arData' |
183 | 3416e3e7 | Dato Simó | newSt = case arResult newData of |
184 | 3416e3e7 | Dato Simó | Just ArSuccess -> ArHealthy $ Just newData |
185 | 3416e3e7 | Dato Simó | Just ArEnoperm -> ArHealthy $ Just newData |
186 | 3416e3e7 | Dato Simó | Just ArFailure -> ArFailedRepair newData |
187 | 3416e3e7 | Dato Simó | Nothing -> ArPendingRepair newData |
188 | 3416e3e7 | Dato Simó | in |
189 | 3416e3e7 | Dato Simó | case curSt of |
190 | 3416e3e7 | Dato Simó | ArFailedRepair _ -> Ok instData -- Always keep the earliest failure. |
191 | 3416e3e7 | Dato Simó | ArHealthy _ -> Ok instData { arState = newSt |
192 | 3416e3e7 | Dato Simó | , tagsToRemove = delCurTag instData |
193 | 3416e3e7 | Dato Simó | } |
194 | 3416e3e7 | Dato Simó | ArPendingRepair d -> Bad ( |
195 | 3416e3e7 | Dato Simó | "An unfinished repair was found in instance " ++ |
196 | 3416e3e7 | Dato Simó | Instance.name (arInstance instData) ++ ": found tag " ++ |
197 | 3416e3e7 | Dato Simó | show (arTag newData) ++ ", but older pending tag " ++ |
198 | 3416e3e7 | Dato Simó | show (arTag d) ++ "exists.") |
199 | 3416e3e7 | Dato Simó | |
200 | 3416e3e7 | Dato Simó | ArNeedsRepair _ -> Bad |
201 | 3416e3e7 | Dato Simó | "programming error: ArNeedsRepair found as an initial state" |
202 | 3416e3e7 | Dato Simó | |
203 | a5e58418 | Dato Simó | -- | Query jobs of a pending repair, returning the new instance data. |
204 | a5e58418 | Dato Simó | processPending :: L.Client -> InstanceData -> IO InstanceData |
205 | a5e58418 | Dato Simó | processPending client instData = |
206 | a5e58418 | Dato Simó | case arState instData of |
207 | a5e58418 | Dato Simó | (ArPendingRepair arData) -> do |
208 | a5e58418 | Dato Simó | sts <- L.queryJobsStatus client $ arJobs arData |
209 | a5e58418 | Dato Simó | time <- getClockTime |
210 | a5e58418 | Dato Simó | case sts of |
211 | a5e58418 | Dato Simó | Bad e -> exitErr $ "could not check job status: " ++ formatError e |
212 | a5e58418 | Dato Simó | Ok sts' -> |
213 | a5e58418 | Dato Simó | if any (<= JOB_STATUS_RUNNING) sts' then |
214 | a5e58418 | Dato Simó | return instData -- (no change) |
215 | a5e58418 | Dato Simó | else do |
216 | a5e58418 | Dato Simó | let iname = Instance.name $ arInstance instData |
217 | a5e58418 | Dato Simó | srcSt = arStateName $ arState instData |
218 | a5e58418 | Dato Simó | destSt = arStateName arState' |
219 | a5e58418 | Dato Simó | putStrLn ("Moving " ++ iname ++ " from " ++ show srcSt ++ " to " ++ |
220 | a5e58418 | Dato Simó | show destSt) |
221 | a5e58418 | Dato Simó | commitChange client instData' |
222 | a5e58418 | Dato Simó | where |
223 | a5e58418 | Dato Simó | instData' = |
224 | a5e58418 | Dato Simó | instData { arState = arState' |
225 | a5e58418 | Dato Simó | , tagsToRemove = delCurTag instData |
226 | a5e58418 | Dato Simó | } |
227 | a5e58418 | Dato Simó | arState' = |
228 | a5e58418 | Dato Simó | if all (== JOB_STATUS_SUCCESS) sts' then |
229 | a5e58418 | Dato Simó | ArHealthy $ Just (updateTag $ arData { arResult = Just ArSuccess |
230 | a5e58418 | Dato Simó | , arTime = time }) |
231 | a5e58418 | Dato Simó | else |
232 | a5e58418 | Dato Simó | ArFailedRepair (updateTag $ arData { arResult = Just ArFailure |
233 | a5e58418 | Dato Simó | , arTime = time }) |
234 | a5e58418 | Dato Simó | |
235 | a5e58418 | Dato Simó | _ -> return instData |
236 | a5e58418 | Dato Simó | |
237 | a5e58418 | Dato Simó | -- | Update the tag of an 'AutoRepairData' record to match all the other fields. |
238 | a5e58418 | Dato Simó | updateTag :: AutoRepairData -> AutoRepairData |
239 | a5e58418 | Dato Simó | updateTag arData = |
240 | a5e58418 | Dato Simó | let ini = [autoRepairTypeToRaw $ arType arData, |
241 | a5e58418 | Dato Simó | arUuid arData, |
242 | a5e58418 | Dato Simó | clockTimeToString $ arTime arData] |
243 | a5e58418 | Dato Simó | end = [intercalate "+" . map (show . fromJobId) $ arJobs arData] |
244 | a5e58418 | Dato Simó | (pfx, middle) = |
245 | a5e58418 | Dato Simó | case arResult arData of |
246 | a5e58418 | Dato Simó | Nothing -> (C.autoRepairTagPending, []) |
247 | a5e58418 | Dato Simó | Just rs -> (C.autoRepairTagResult, [autoRepairResultToRaw rs]) |
248 | a5e58418 | Dato Simó | in |
249 | a5e58418 | Dato Simó | arData { arTag = pfx ++ intercalate ":" (ini ++ middle ++ end) } |
250 | a5e58418 | Dato Simó | |
251 | a5e58418 | Dato Simó | -- | Apply and remove tags from an instance as indicated by 'InstanceData'. |
252 | a5e58418 | Dato Simó | -- |
253 | a5e58418 | Dato Simó | -- If the /arState/ of the /InstanceData/ record has an associated |
254 | a5e58418 | Dato Simó | -- 'AutoRepairData', add its tag to the instance object. Additionally, if |
255 | a5e58418 | Dato Simó | -- /tagsToRemove/ is not empty, remove those tags from the instance object. The |
256 | a5e58418 | Dato Simó | -- returned /InstanceData/ object always has an empty /tagsToRemove/. |
257 | a5e58418 | Dato Simó | commitChange :: L.Client -> InstanceData -> IO InstanceData |
258 | a5e58418 | Dato Simó | commitChange client instData = do |
259 | a5e58418 | Dato Simó | let iname = Instance.name $ arInstance instData |
260 | a5e58418 | Dato Simó | arData = getArData $ arState instData |
261 | a5e58418 | Dato Simó | rmTags = tagsToRemove instData |
262 | a5e58418 | Dato Simó | execJobsWaitOk' opcodes = do |
263 | a5e58418 | Dato Simó | res <- execJobsWaitOk [map wrapOpCode opcodes] client |
264 | a5e58418 | Dato Simó | case res of |
265 | a5e58418 | Dato Simó | Ok _ -> return () |
266 | a5e58418 | Dato Simó | Bad e -> exitErr e |
267 | a5e58418 | Dato Simó | |
268 | a5e58418 | Dato Simó | when (isJust arData) $ do |
269 | a5e58418 | Dato Simó | let tag = arTag $ fromJust arData |
270 | a5e58418 | Dato Simó | putStrLn (">>> Adding the following tag to " ++ iname ++ ":\n" ++ show tag) |
271 | 5cbf7832 | Jose A. Lopes | execJobsWaitOk' [OpTagsSet TagKindInstance [tag] (Just iname)] |
272 | a5e58418 | Dato Simó | |
273 | a5e58418 | Dato Simó | unless (null rmTags) $ do |
274 | a5e58418 | Dato Simó | putStr (">>> Removing the following tags from " ++ iname ++ ":\n" ++ |
275 | a5e58418 | Dato Simó | unlines (map show rmTags)) |
276 | 5cbf7832 | Jose A. Lopes | execJobsWaitOk' [OpTagsDel TagKindInstance rmTags (Just iname)] |
277 | a5e58418 | Dato Simó | |
278 | a5e58418 | Dato Simó | return instData { tagsToRemove = [] } |
279 | a5e58418 | Dato Simó | |
280 | 546a1dcf | Dato Simó | -- | Detect brokenness with an instance and suggest repair type and jobs to run. |
281 | 4ba5f148 | Dato Simó | detectBroken :: Node.List -> Instance.Instance |
282 | 4ba5f148 | Dato Simó | -> Maybe (AutoRepairType, [OpCode]) |
283 | 4ba5f148 | Dato Simó | detectBroken nl inst = |
284 | 4ba5f148 | Dato Simó | let disk = Instance.diskTemplate inst |
285 | 4ba5f148 | Dato Simó | iname = Instance.name inst |
286 | 4ba5f148 | Dato Simó | offPri = Node.offline $ Container.find (Instance.pNode inst) nl |
287 | 4ba5f148 | Dato Simó | offSec = Node.offline $ Container.find (Instance.sNode inst) nl |
288 | 4ba5f148 | Dato Simó | in |
289 | 4ba5f148 | Dato Simó | case disk of |
290 | 4ba5f148 | Dato Simó | DTDrbd8 |
291 | 4ba5f148 | Dato Simó | | offPri && offSec -> |
292 | 4ba5f148 | Dato Simó | Just ( |
293 | 4ba5f148 | Dato Simó | ArReinstall, |
294 | 4ba5f148 | Dato Simó | [ OpInstanceRecreateDisks { opInstanceName = iname |
295 | da4a52a3 | Thomas Thrainer | , opInstanceUuid = Nothing |
296 | 4ba5f148 | Dato Simó | , opRecreateDisksInfo = RecreateDisksAll |
297 | 4ba5f148 | Dato Simó | , opNodes = [] |
298 | 4ba5f148 | Dato Simó | -- FIXME: there should be a better way to |
299 | 546a1dcf | Dato Simó | -- specify opcode parameters than abusing |
300 | 4ba5f148 | Dato Simó | -- mkNonEmpty in this way (using the fact |
301 | 4ba5f148 | Dato Simó | -- that Maybe is used both for optional |
302 | 4ba5f148 | Dato Simó | -- fields, and to express failure). |
303 | 1c3231aa | Thomas Thrainer | , opNodeUuids = Nothing |
304 | 4ba5f148 | Dato Simó | , opIallocator = mkNonEmpty "hail" |
305 | 4ba5f148 | Dato Simó | } |
306 | 4ba5f148 | Dato Simó | , OpInstanceReinstall { opInstanceName = iname |
307 | da4a52a3 | Thomas Thrainer | , opInstanceUuid = Nothing |
308 | 4ba5f148 | Dato Simó | , opOsType = Nothing |
309 | 4ba5f148 | Dato Simó | , opTempOsParams = Nothing |
310 | 4ba5f148 | Dato Simó | , opForceVariant = False |
311 | 4ba5f148 | Dato Simó | } |
312 | 4ba5f148 | Dato Simó | ]) |
313 | 4ba5f148 | Dato Simó | | offPri -> |
314 | 4ba5f148 | Dato Simó | Just ( |
315 | 4ba5f148 | Dato Simó | ArFailover, |
316 | 4ba5f148 | Dato Simó | [ OpInstanceFailover { opInstanceName = iname |
317 | da4a52a3 | Thomas Thrainer | , opInstanceUuid = Nothing |
318 | 4ba5f148 | Dato Simó | -- FIXME: ditto, see above. |
319 | 4ba5f148 | Dato Simó | , opShutdownTimeout = fromJust $ mkNonNegative |
320 | 4ba5f148 | Dato Simó | C.defaultShutdownTimeout |
321 | 4ba5f148 | Dato Simó | , opIgnoreConsistency = False |
322 | 4ba5f148 | Dato Simó | , opTargetNode = Nothing |
323 | 1c3231aa | Thomas Thrainer | , opTargetNodeUuid = Nothing |
324 | 4ba5f148 | Dato Simó | , opIgnoreIpolicy = False |
325 | 4ba5f148 | Dato Simó | , opIallocator = Nothing |
326 | aa7a5c90 | Michele Tartara | , opMigrationCleanup = False |
327 | 4ba5f148 | Dato Simó | } |
328 | 4ba5f148 | Dato Simó | ]) |
329 | 4ba5f148 | Dato Simó | | offSec -> |
330 | 4ba5f148 | Dato Simó | Just ( |
331 | 4ba5f148 | Dato Simó | ArFixStorage, |
332 | 4ba5f148 | Dato Simó | [ OpInstanceReplaceDisks { opInstanceName = iname |
333 | da4a52a3 | Thomas Thrainer | , opInstanceUuid = Nothing |
334 | 4ba5f148 | Dato Simó | , opReplaceDisksMode = ReplaceNewSecondary |
335 | 4ba5f148 | Dato Simó | , opReplaceDisksList = [] |
336 | 4ba5f148 | Dato Simó | , opRemoteNode = Nothing |
337 | 4ba5f148 | Dato Simó | -- FIXME: ditto, see above. |
338 | 1c3231aa | Thomas Thrainer | , opRemoteNodeUuid = Nothing |
339 | 4ba5f148 | Dato Simó | , opIallocator = mkNonEmpty "hail" |
340 | 4ba5f148 | Dato Simó | , opEarlyRelease = False |
341 | 4ba5f148 | Dato Simó | , opIgnoreIpolicy = False |
342 | 4ba5f148 | Dato Simó | } |
343 | 4ba5f148 | Dato Simó | ]) |
344 | 4ba5f148 | Dato Simó | | otherwise -> Nothing |
345 | 4ba5f148 | Dato Simó | |
346 | 4ba5f148 | Dato Simó | DTPlain |
347 | 4ba5f148 | Dato Simó | | offPri -> |
348 | 4ba5f148 | Dato Simó | Just ( |
349 | 4ba5f148 | Dato Simó | ArReinstall, |
350 | 4ba5f148 | Dato Simó | [ OpInstanceRecreateDisks { opInstanceName = iname |
351 | da4a52a3 | Thomas Thrainer | , opInstanceUuid = Nothing |
352 | 4ba5f148 | Dato Simó | , opRecreateDisksInfo = RecreateDisksAll |
353 | 4ba5f148 | Dato Simó | , opNodes = [] |
354 | 4ba5f148 | Dato Simó | -- FIXME: ditto, see above. |
355 | 1c3231aa | Thomas Thrainer | , opNodeUuids = Nothing |
356 | 4ba5f148 | Dato Simó | , opIallocator = mkNonEmpty "hail" |
357 | 4ba5f148 | Dato Simó | } |
358 | 4ba5f148 | Dato Simó | , OpInstanceReinstall { opInstanceName = iname |
359 | da4a52a3 | Thomas Thrainer | , opInstanceUuid = Nothing |
360 | 4ba5f148 | Dato Simó | , opOsType = Nothing |
361 | 4ba5f148 | Dato Simó | , opTempOsParams = Nothing |
362 | 4ba5f148 | Dato Simó | , opForceVariant = False |
363 | 4ba5f148 | Dato Simó | } |
364 | 4ba5f148 | Dato Simó | ]) |
365 | 4ba5f148 | Dato Simó | | otherwise -> Nothing |
366 | 4ba5f148 | Dato Simó | |
367 | 4ba5f148 | Dato Simó | _ -> Nothing -- Other cases are unimplemented for now: DTDiskless, |
368 | 4ba5f148 | Dato Simó | -- DTFile, DTSharedFile, DTBlock, DTRbd, DTExt. |
369 | 4ba5f148 | Dato Simó | |
370 | 8072af6c | Dato Simó | -- | Perform the suggested repair on an instance if its policy allows it. |
371 | 8af72964 | Dato Simó | doRepair :: L.Client -- ^ The Luxi client |
372 | 8af72964 | Dato Simó | -> Double -- ^ Delay to insert before the first repair opcode |
373 | 8af72964 | Dato Simó | -> InstanceData -- ^ The instance data |
374 | 8af72964 | Dato Simó | -> (AutoRepairType, [OpCode]) -- ^ The repair job to perform |
375 | 8af72964 | Dato Simó | -> IO InstanceData -- ^ The updated instance data |
376 | 8af72964 | Dato Simó | doRepair client delay instData (rtype, opcodes) = |
377 | 8072af6c | Dato Simó | let inst = arInstance instData |
378 | 8072af6c | Dato Simó | ipol = Instance.arPolicy inst |
379 | 8072af6c | Dato Simó | iname = Instance.name inst |
380 | 8072af6c | Dato Simó | in |
381 | 8072af6c | Dato Simó | case ipol of |
382 | 8072af6c | Dato Simó | ArEnabled maxtype -> |
383 | 8072af6c | Dato Simó | if rtype > maxtype then do |
384 | 8072af6c | Dato Simó | uuid <- newUUID |
385 | 8072af6c | Dato Simó | time <- getClockTime |
386 | 8072af6c | Dato Simó | |
387 | 8072af6c | Dato Simó | let arState' = ArNeedsRepair ( |
388 | 8072af6c | Dato Simó | updateTag $ AutoRepairData rtype uuid time [] (Just ArEnoperm) "") |
389 | 8072af6c | Dato Simó | instData' = instData { arState = arState' |
390 | 8072af6c | Dato Simó | , tagsToRemove = delCurTag instData |
391 | 8072af6c | Dato Simó | } |
392 | 8072af6c | Dato Simó | |
393 | 8072af6c | Dato Simó | putStrLn ("Not performing a repair of type " ++ show rtype ++ " on " ++ |
394 | 8072af6c | Dato Simó | iname ++ " because only repairs up to " ++ show maxtype ++ |
395 | 8072af6c | Dato Simó | " are allowed") |
396 | 8072af6c | Dato Simó | commitChange client instData' -- Adds "enoperm" result label. |
397 | 8072af6c | Dato Simó | else do |
398 | 8072af6c | Dato Simó | putStrLn ("Executing " ++ show rtype ++ " repair on " ++ iname) |
399 | 8072af6c | Dato Simó | |
400 | 8af72964 | Dato Simó | -- After submitting the job, we must write an autorepair:pending tag, |
401 | 8af72964 | Dato Simó | -- that includes the repair job IDs so that they can be checked later. |
402 | 8af72964 | Dato Simó | -- One problem we run into is that the repair job immediately grabs |
403 | 8af72964 | Dato Simó | -- locks for the affected instance, and the subsequent TAGS_SET job is |
404 | 546a1dcf | Dato Simó | -- blocked, introducing an unnecessary delay for the end-user. One |
405 | 8af72964 | Dato Simó | -- alternative would be not to wait for the completion of the TAGS_SET |
406 | 8af72964 | Dato Simó | -- job, contrary to what commitChange normally does; but we insist on |
407 | 8af72964 | Dato Simó | -- waiting for the tag to be set so as to abort in case of failure, |
408 | 8af72964 | Dato Simó | -- because the cluster is left in an invalid state in that case. |
409 | 8af72964 | Dato Simó | -- |
410 | 8af72964 | Dato Simó | -- The proper solution (in 2.9+) would be not to use tags for storing |
411 | 8af72964 | Dato Simó | -- autorepair data, or make the TAGS_SET opcode not grab an instance's |
412 | 8af72964 | Dato Simó | -- locks (if that's deemed safe). In the meantime, we introduce an |
413 | 8af72964 | Dato Simó | -- artificial delay in the repair job (via a TestDelay opcode) so that |
414 | 8af72964 | Dato Simó | -- once we have the job ID, the TAGS_SET job can complete before the |
415 | 8af72964 | Dato Simó | -- repair job actually grabs the locks. (Please note that this is not |
416 | 8af72964 | Dato Simó | -- about synchronization, but merely about speeding up the execution of |
417 | 8af72964 | Dato Simó | -- the harep tool. If this TestDelay opcode is removed, the program is |
418 | 8af72964 | Dato Simó | -- still correct.) |
419 | 8af72964 | Dato Simó | let opcodes' = |
420 | 8af72964 | Dato Simó | if delay > 0 then |
421 | 8af72964 | Dato Simó | OpTestDelay { opDelayDuration = delay |
422 | 8af72964 | Dato Simó | , opDelayOnMaster = True |
423 | 8af72964 | Dato Simó | , opDelayOnNodes = [] |
424 | 1c3231aa | Thomas Thrainer | , opDelayOnNodeUuids = Nothing |
425 | 8af72964 | Dato Simó | , opDelayRepeat = fromJust $ mkNonNegative 0 |
426 | 8af72964 | Dato Simó | } : opcodes |
427 | 8af72964 | Dato Simó | else |
428 | 8af72964 | Dato Simó | opcodes |
429 | 8af72964 | Dato Simó | |
430 | 8072af6c | Dato Simó | uuid <- newUUID |
431 | 8072af6c | Dato Simó | time <- getClockTime |
432 | 8af72964 | Dato Simó | jids <- submitJobs [map wrapOpCode opcodes'] client |
433 | 8072af6c | Dato Simó | |
434 | 8072af6c | Dato Simó | case jids of |
435 | 8072af6c | Dato Simó | Bad e -> exitErr e |
436 | 8072af6c | Dato Simó | Ok jids' -> |
437 | 8072af6c | Dato Simó | let arState' = ArPendingRepair ( |
438 | 8072af6c | Dato Simó | updateTag $ AutoRepairData rtype uuid time jids' Nothing "") |
439 | 8072af6c | Dato Simó | instData' = instData { arState = arState' |
440 | 8072af6c | Dato Simó | , tagsToRemove = delCurTag instData |
441 | 8072af6c | Dato Simó | } |
442 | 8072af6c | Dato Simó | in |
443 | 8072af6c | Dato Simó | commitChange client instData' -- Adds "pending" label. |
444 | 8072af6c | Dato Simó | |
445 | 8072af6c | Dato Simó | otherSt -> do |
446 | 8072af6c | Dato Simó | putStrLn ("Not repairing " ++ iname ++ " because it's in state " ++ |
447 | 8072af6c | Dato Simó | show otherSt) |
448 | 8072af6c | Dato Simó | return instData |
449 | 8072af6c | Dato Simó | |
450 | b6d9bec8 | Dato Simó | -- | Main function. |
451 | b6d9bec8 | Dato Simó | main :: Options -> [String] -> IO () |
452 | 3416e3e7 | Dato Simó | main opts args = do |
453 | b6d9bec8 | Dato Simó | unless (null args) $ |
454 | b6d9bec8 | Dato Simó | exitErr "this program doesn't take any arguments." |
455 | b6d9bec8 | Dato Simó | |
456 | 9d049fb4 | Klaus Aehlig | luxiDef <- Path.defaultMasterSocket |
457 | 3416e3e7 | Dato Simó | let master = fromMaybe luxiDef $ optLuxi opts |
458 | 3416e3e7 | Dato Simó | opts' = opts { optLuxi = Just master } |
459 | 3416e3e7 | Dato Simó | |
460 | 4ba5f148 | Dato Simó | (ClusterData _ nl il _ _) <- loadExternalData opts' |
461 | 3416e3e7 | Dato Simó | |
462 | 3416e3e7 | Dato Simó | let iniDataRes = mapM setInitialState $ Container.elems il |
463 | a5e58418 | Dato Simó | iniData <- exitIfBad "when parsing auto-repair tags" iniDataRes |
464 | a5e58418 | Dato Simó | |
465 | a5e58418 | Dato Simó | -- First step: check all pending repairs, see if they are completed. |
466 | d605e261 | Petr Pudlak | iniData' <- bracket (L.getLuxiClient master) L.closeClient $ |
467 | 4ba5f148 | Dato Simó | forM iniData . processPending |
468 | 4ba5f148 | Dato Simó | |
469 | 4ba5f148 | Dato Simó | -- Second step: detect any problems. |
470 | 8072af6c | Dato Simó | let repairs = map (detectBroken nl . arInstance) iniData' |
471 | 8072af6c | Dato Simó | |
472 | 8072af6c | Dato Simó | -- Third step: create repair jobs for broken instances that are in ArHealthy. |
473 | 8072af6c | Dato Simó | let maybeRepair c (i, r) = maybe (return i) (repairHealthy c i) r |
474 | 8af72964 | Dato Simó | jobDelay = optJobDelay opts |
475 | 8072af6c | Dato Simó | repairHealthy c i = case arState i of |
476 | 8af72964 | Dato Simó | ArHealthy _ -> doRepair c jobDelay i |
477 | 8072af6c | Dato Simó | _ -> const (return i) |
478 | 8072af6c | Dato Simó | |
479 | d605e261 | Petr Pudlak | repairDone <- bracket (L.getLuxiClient master) L.closeClient $ |
480 | 41238d27 | Dato Simó | forM (zip iniData' repairs) . maybeRepair |
481 | 41238d27 | Dato Simó | |
482 | 41238d27 | Dato Simó | -- Print some stats and exit. |
483 | 41238d27 | Dato Simó | let states = map ((, 1 :: Int) . arStateName . arState) repairDone |
484 | 41238d27 | Dato Simó | counts = Map.fromListWith (+) states |
485 | 3416e3e7 | Dato Simó | |
486 | 41238d27 | Dato Simó | putStrLn "---------------------" |
487 | 41238d27 | Dato Simó | putStrLn "Instance status count" |
488 | 41238d27 | Dato Simó | putStrLn "---------------------" |
489 | 41238d27 | Dato Simó | putStr . unlines . Map.elems $ |
490 | 41238d27 | Dato Simó | Map.mapWithKey (\k v -> k ++ ": " ++ show v) counts |