Revision 8af72964 src/Ganeti/HTools/Program/Harep.hs
--- a/src/Ganeti/HTools/Program/Harep.hs
+++ b/src/Ganeti/HTools/Program/Harep.hs
@@ -62,6 +62,7 @@
   luxi <- oLuxiSocket
   return
     [ luxi
+    , oJobDelay
     ]
 
 arguments :: [ArgCompletion]
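The new oJobDelay entry wires a job-delay option (a Double, in seconds) into harep's option list; the value resurfaces further down as optJobDelay. Ganeti defines its real options in Ganeti/HTools/CLI.hs, but purely as an illustration of the mechanics (the Opts record and parseOpts below are simplified stand-ins, not Ganeti's API), a Double-valued flag can be parsed with System.Console.GetOpt like this:

    import System.Console.GetOpt

    -- Hypothetical, pared-down options record; harep's real Options
    -- type carries many more fields than this.
    data Opts = Opts { optJobDelay :: Double } deriving Show

    defaultOpts :: Opts
    defaultOpts = Opts { optJobDelay = 10 }

    -- A Double-valued flag in the style of a job-delay option.  Note
    -- that 'read' throws on malformed input, acceptable for a sketch.
    options :: [OptDescr (Opts -> Opts)]
    options =
      [ Option [] ["job-delay"]
          (ReqArg (\s o -> o { optJobDelay = read s }) "SECONDS")
          "time to wait before executing the repair opcodes"
      ]

    parseOpts :: [String] -> Either String Opts
    parseOpts argv =
      case getOpt Permute options argv of
        (fs, _, [])  -> Right (foldl (flip id) defaultOpts fs)
        (_, _, errs) -> Left (concat errs)

With this sketch, parseOpts ["--job-delay", "30"] yields Right (Opts {optJobDelay = 30.0}).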
@@ -353,9 +354,12 @@
 -- DTFile, DTSharedFile, DTBlock, DTRbd, DTExt.
 
 -- | Perform the suggested repair on an instance if its policy allows it.
-doRepair :: L.Client -> InstanceData -> (AutoRepairType, [OpCode])
-         -> IO InstanceData
-doRepair client instData (rtype, opcodes) =
+doRepair :: L.Client     -- ^ The Luxi client
+         -> Double       -- ^ Delay to insert before the first repair opcode
+         -> InstanceData -- ^ The instance data
+         -> (AutoRepairType, [OpCode]) -- ^ The repair job to perform
+         -> IO InstanceData            -- ^ The updated instance data
+doRepair client delay instData (rtype, opcodes) =
   let inst = arInstance instData
       ipol = Instance.arPolicy inst
       iname = Instance.name inst
@@ -379,9 +383,38 @@
       else do
         putStrLn ("Executing " ++ show rtype ++ " repair on " ++ iname)
 
+        -- After submitting the job, we must write an autorepair:pending tag,
+        -- that includes the repair job IDs so that they can be checked later.
+        -- One problem we run into is that the repair job immediately grabs
+        -- locks for the affected instance, and the subsequent TAGS_SET job is
+        -- blocked, introducing an unnecessary delay for the end-user. One
+        -- alternative would be not to wait for the completion of the TAGS_SET
+        -- job, contrary to what commitChange normally does; but we insist on
+        -- waiting for the tag to be set so as to abort in case of failure,
+        -- because the cluster is left in an invalid state in that case.
+        --
+        -- The proper solution (in 2.9+) would be not to use tags for storing
+        -- autorepair data, or make the TAGS_SET opcode not grab an instance's
+        -- locks (if that's deemed safe). In the meantime, we introduce an
+        -- artificial delay in the repair job (via a TestDelay opcode) so that
+        -- once we have the job ID, the TAGS_SET job can complete before the
+        -- repair job actually grabs the locks. (Please note that this is not
+        -- about synchronization, but merely about speeding up the execution of
+        -- the harep tool. If this TestDelay opcode is removed, the program is
+        -- still correct.)
+        let opcodes' =
+              if delay > 0 then
+                OpTestDelay { opDelayDuration = delay
+                            , opDelayOnMaster = True
+                            , opDelayOnNodes = []
+                            , opDelayRepeat = fromJust $ mkNonNegative 0
+                            } : opcodes
+              else
+                opcodes
+
         uuid <- newUUID
         time <- getClockTime
-        jids <- submitJobs [map wrapOpCode opcodes] client
+        jids <- submitJobs [map wrapOpCode opcodes'] client
 
         case jids of
           Bad e -> exitErr e
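The guarded prepend above is the crux of the change: a TestDelay opcode is put in front of the job only when a positive delay was requested, giving the subsequent TAGS_SET job time to complete before the repair grabs the instance locks. A self-contained sketch of the same shape (the Op type below is a stand-in for illustration; Ganeti's real OpCode is much richer):

    -- Stand-in opcode type, only for illustration.
    data Op = OpDelay Double   -- plays the role of OpTestDelay
            | OpRepair String  -- placeholder for the actual repair opcodes
            deriving Show

    -- Conditionally prepend a delay opcode, mirroring the opcodes'
    -- construction in doRepair: no delay requested, no extra opcode.
    withLeadingDelay :: Double -> [Op] -> [Op]
    withLeadingDelay delay ops
      | delay > 0 = OpDelay delay : ops
      | otherwise = ops

Here withLeadingDelay 10 [OpRepair "inst1"] evaluates to [OpDelay 10.0,OpRepair "inst1"], while a zero delay leaves the job untouched, which is why the program remains correct even if the delay is dropped.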
@@ -423,8 +456,9 @@
 
   -- Third step: create repair jobs for broken instances that are in ArHealthy.
   let maybeRepair c (i, r) = maybe (return i) (repairHealthy c i) r
+      jobDelay = optJobDelay opts
       repairHealthy c i = case arState i of
-                            ArHealthy _ -> doRepair c i
+                            ArHealthy _ -> doRepair c jobDelay i
                             _ -> const (return i)
 
   _unused_repairDone <- bracket (L.getClient master) L.closeClient $
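One subtlety in this last hunk: repairHealthy c i evaluates to a function in both branches, since doRepair c jobDelay i is still waiting for its (AutoRepairType, [OpCode]) argument and const (return i) simply ignores it; maybe then applies that function to the payload of r. A minimal sketch of the same shape, with simplified stand-in types (nothing below is Ganeti's real code):

    type Inst   = String
    type Repair = (String, [String])  -- stands in for (AutoRepairType, [OpCode])

    -- Stand-in for doRepair with the client argument already applied.
    fixIt :: Double -> Inst -> Repair -> IO Inst
    fixIt delay i (rtype, _) = do
      putStrLn ("executing " ++ rtype ++ " repair on " ++ i
                ++ " (delay " ++ show delay ++ "s)")
      return i

    -- Both branches have type Repair -> IO Inst, which is what lets
    -- 'maybe' feed them the optional repair description.
    step :: Double -> (Inst, Maybe Repair) -> IO Inst
    step delay (i, r) = maybe (return i) (fixIt delay i) r

With this sketch, step 10 ("inst1", Nothing) is a no-op, while a Just repair runs fixIt with the configured delay.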