Revision 8af72964 src/Ganeti/HTools/Program/Harep.hs

--- a/src/Ganeti/HTools/Program/Harep.hs
+++ b/src/Ganeti/HTools/Program/Harep.hs
@@ -62,6 +62,7 @@
   luxi <- oLuxiSocket
   return
     [ luxi
+    , oJobDelay
     ]
 
 arguments :: [ArgCompletion]
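
The new oJobDelay entry wires a job-delay option into harep's option list; its value is read back as optJobDelay opts in the last hunk and threaded into doRepair. As orientation only (not Ganeti's actual option plumbing), an option of this shape could be declared with plain GetOpt roughly as follows; every name below except optJobDelay is illustrative:

import System.Console.GetOpt

-- Illustrative stand-in for Ganeti's option machinery: parse a
-- "--job-delay SECONDS" flag (flag name assumed) into a Double field.
data Opts = Opts { optJobDelay :: Double }

jobDelayOpt :: OptDescr (Opts -> Opts)
jobDelayOpt =
  Option [] ["job-delay"]
         (ReqArg (\s o -> o { optJobDelay = read s }) "SECONDS")
         "delay (in seconds) inserted before the first repair opcode"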
@@ -353,9 +354,12 @@
                    -- DTFile, DTSharedFile, DTBlock, DTRbd, DTExt.
 
 -- | Perform the suggested repair on an instance if its policy allows it.
-doRepair :: L.Client -> InstanceData -> (AutoRepairType, [OpCode])
-         -> IO InstanceData
-doRepair client instData (rtype, opcodes) =
+doRepair :: L.Client     -- ^ The Luxi client
+         -> Double       -- ^ Delay to insert before the first repair opcode
+         -> InstanceData -- ^ The instance data
+         -> (AutoRepairType, [OpCode]) -- ^ The repair job to perform
+         -> IO InstanceData -- ^ The updated instance data
+doRepair client delay instData (rtype, opcodes) =
   let inst = arInstance instData
       ipol = Instance.arPolicy inst
       iname = Instance.name inst
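
With the widened signature, callers pass the delay (in seconds) as the second argument. A minimal hypothetical invocation, assuming a connected Luxi client and an already-detected repair from the surrounding harep code:

-- Hypothetical call site: sleep 10 seconds before the first repair opcode.
instData' <- doRepair client 10.0 instData (rtype, opcodes)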
@@ -379,9 +383,38 @@
       else do
         putStrLn ("Executing " ++ show rtype ++ " repair on " ++ iname)
 
+        -- After submitting the job, we must write an autorepair:pending tag
+        -- that includes the repair job IDs so that they can be checked later.
+        -- One problem we run into is that the repair job immediately grabs
+        -- locks for the affected instance, and the subsequent TAGS_SET job is
+        -- blocked, introducing an unnecessary delay for the end-user. One
+        -- alternative would be not to wait for the completion of the TAGS_SET
+        -- job, contrary to what commitChange normally does; but we insist on
+        -- waiting for the tag to be set so as to abort in case of failure,
+        -- because the cluster is left in an invalid state in that case.
+        --
+        -- The proper solution (in 2.9+) would be not to use tags for storing
+        -- autorepair data, or make the TAGS_SET opcode not grab an instance's
+        -- locks (if that's deemed safe). In the meantime, we introduce an
+        -- artificial delay in the repair job (via a TestDelay opcode) so that
+        -- once we have the job ID, the TAGS_SET job can complete before the
+        -- repair job actually grabs the locks. (Please note that this is not
+        -- about synchronization, but merely about speeding up the execution of
+        -- the harep tool. If this TestDelay opcode is removed, the program is
+        -- still correct.)
+        let opcodes' =
+              if delay > 0 then
+                OpTestDelay { opDelayDuration = delay
+                            , opDelayOnMaster = True
+                            , opDelayOnNodes = []
+                            , opDelayRepeat = fromJust $ mkNonNegative 0
+                            } : opcodes
+              else
+                opcodes
+
         uuid <- newUUID
         time <- getClockTime
-        jids <- submitJobs [map wrapOpCode opcodes] client
+        jids <- submitJobs [map wrapOpCode opcodes'] client
 
         case jids of
           Bad e    -> exitErr e
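
The conditional prepend above can also be read as a small pure helper that restates the diff's logic, which keeps the delay handling testable in isolation. This is only a sketch: prependDelay is a hypothetical name, and the two Ganeti imports are assumed to match what Harep.hs already uses.

import Data.Maybe (fromJust)
import Ganeti.OpCodes (OpCode (..))  -- assumed module path
import Ganeti.Types (mkNonNegative)  -- assumed module path

-- Hypothetical helper: prefix the repair opcodes with a master-side TestDelay
-- so a later TAGS_SET job can finish before the repair grabs instance locks.
prependDelay :: Double -> [OpCode] -> [OpCode]
prependDelay delay opcodes
  | delay > 0 = OpTestDelay { opDelayDuration = delay
                            , opDelayOnMaster = True
                            , opDelayOnNodes  = []
                            , opDelayRepeat   = fromJust $ mkNonNegative 0
                            } : opcodes
  | otherwise = opcodes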
@@ -423,8 +456,9 @@
 
   -- Third step: create repair jobs for broken instances that are in ArHealthy.
   let maybeRepair c (i, r) = maybe (return i) (repairHealthy c i) r
+      jobDelay = optJobDelay opts
       repairHealthy c i = case arState i of
-                            ArHealthy _ -> doRepair c i
+                            ArHealthy _ -> doRepair c jobDelay i
                             _           -> const (return i)
 
   _unused_repairDone <- bracket (L.getClient master) L.closeClient $
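
Note that doRepair c jobDelay i is partially applied here: both branches of repairHealthy yield a function that still expects the (AutoRepairType, [OpCode]) pair, which maybeRepair supplies from its Maybe value. Spelled out as a type annotation (a sketch; names as in the diff):

-- The repair pair is applied later, by 'maybeRepair', from the Maybe result of
-- the detection step.
repairHealthy :: L.Client -> InstanceData
              -> (AutoRepairType, [OpCode]) -> IO InstanceData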
