   luxi <- oLuxiSocket
   return
     [ luxi
+    , oJobDelay
     ]
 
 arguments :: [ArgCompletion]
...
 -- DTFile, DTSharedFile, DTBlock, DTRbd, DTExt.
 
 -- | Perform the suggested repair on an instance if its policy allows it.
-doRepair :: L.Client -> InstanceData -> (AutoRepairType, [OpCode])
-         -> IO InstanceData
-doRepair client instData (rtype, opcodes) =
+doRepair :: L.Client     -- ^ The Luxi client
+         -> Double       -- ^ Delay to insert before the first repair opcode
+         -> InstanceData  -- ^ The instance data
+         -> (AutoRepairType, [OpCode])  -- ^ The repair job to perform
+         -> IO InstanceData  -- ^ The updated instance data
+doRepair client delay instData (rtype, opcodes) =
   let inst = arInstance instData
       ipol = Instance.arPolicy inst
       iname = Instance.name inst
...
     else do
       putStrLn ("Executing " ++ show rtype ++ " repair on " ++ iname)
 
+      -- After submitting the job, we must write an autorepair:pending tag,
+      -- that includes the repair job IDs so that they can be checked later.
+      -- One problem we run into is that the repair job immediately grabs
+      -- locks for the affected instance, and the subsequent TAGS_SET job is
+      -- blocked, introducing an unnecessary delay for the end-user. One
+      -- alternative would be not to wait for the completion of the TAGS_SET
+      -- job, contrary to what commitChange normally does; but we insist on
+      -- waiting for the tag to be set so as to abort in case of failure,
+      -- because the cluster is left in an invalid state in that case.
+      --
+      -- The proper solution (in 2.9+) would be not to use tags for storing
+      -- autorepair data, or make the TAGS_SET opcode not grab an instance's
+      -- locks (if that's deemed safe). In the meantime, we introduce an
+      -- artificial delay in the repair job (via a TestDelay opcode) so that
+      -- once we have the job ID, the TAGS_SET job can complete before the
+      -- repair job actually grabs the locks. (Please note that this is not
+      -- about synchronization, but merely about speeding up the execution of
+      -- the harep tool. If this TestDelay opcode is removed, the program is
+      -- still correct.)
+      let opcodes' =
+            if delay > 0 then
+              OpTestDelay { opDelayDuration = delay
+                          , opDelayOnMaster = True
+                          , opDelayOnNodes = []
+                          , opDelayRepeat = fromJust $ mkNonNegative 0
+                          } : opcodes
+            else
+              opcodes
+
       uuid <- newUUID
       time <- getClockTime
-      jids <- submitJobs [map wrapOpCode opcodes] client
+      jids <- submitJobs [map wrapOpCode opcodes'] client
 
       case jids of
         Bad e -> exitErr e
...
 
   -- Third step: create repair jobs for broken instances that are in ArHealthy.
   let maybeRepair c (i, r) = maybe (return i) (repairHealthy c i) r
+      jobDelay = optJobDelay opts
       repairHealthy c i = case arState i of
-                            ArHealthy _ -> doRepair c i
+                            ArHealthy _ -> doRepair c jobDelay i
                             _           -> const (return i)
 
   _unused_repairDone <- bracket (L.getClient master) L.closeClient $
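
Side note (not part of the patch): the ordering trick described in the comment block amounts to prepending a single OpTestDelay opcode to the repair job when a positive delay was requested. Below is a minimal sketch of that decision factored out as a pure helper, assuming the usual Ganeti module layout (Ganeti.OpCodes for OpCode, Ganeti.Types for mkNonNegative); the helper name prependDelay is hypothetical, and the OpTestDelay field set is copied verbatim from the patch.

  import Data.Maybe (fromJust)

  -- Module paths are assumptions; harep itself pulls these names in through
  -- its own imports.
  import Ganeti.OpCodes (OpCode(..))
  import Ganeti.Types (mkNonNegative)

  -- Prepend an artificial delay in front of a repair job's opcodes when a
  -- positive delay was requested; with a non-positive delay the job is
  -- returned unchanged.
  prependDelay :: Double -> [OpCode] -> [OpCode]
  prependDelay delay opcodes
    | delay > 0 = OpTestDelay { opDelayDuration = delay
                              , opDelayOnMaster = True
                              , opDelayOnNodes  = []
                              , opDelayRepeat   = fromJust $ mkNonNegative 0
                              } : opcodes
    | otherwise = opcodes

After this patch, doRepair performs exactly this step before calling submitJobs, which is why the TAGS_SET job submitted by commitChange can complete before the repair job starts acquiring the instance's locks.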