root / src / Ganeti / WConfd / DeathDetection.hs @ f3010b75
History | View | Annotate | Download (3 kB)
1 |
{-| Utility function for detecting the death of a job holding resources |
---|---|
2 |
|
3 |
To clean up resources owned by jobs that die for some reason, we need |
4 |
to detect whether a job is still alive. As we have no control over PID |
5 |
reuse, our approach is that each requester for a resource has to provide |
6 |
a file on which it holds an exclusive lock. The kernel will make sure the |
7 |
lock is removed if the process dies. We can probe for such a lock by |
8 |
requesting a shared lock on the file. |
9 |
|
10 |
-} |
11 |
|
12 |
{- |
13 |
|
14 |
Copyright (C) 2014 Google Inc. |
15 |
|
16 |
This program is free software; you can redistribute it and/or modify |
17 |
it under the terms of the GNU General Public License as published by |
18 |
the Free Software Foundation; either version 2 of the License, or |
19 |
(at your option) any later version. |
20 |
|
21 |
This program is distributed in the hope that it will be useful, but |
22 |
WITHOUT ANY WARRANTY; without even the implied warranty of |
23 |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
24 |
General Public License for more details. |
25 |
|
26 |
You should have received a copy of the GNU General Public License |
27 |
along with this program; if not, write to the Free Software |
28 |
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA |
29 |
02110-1301, USA. |
30 |
|
31 |
-} |
32 |
|
33 |
module Ganeti.WConfd.DeathDetection |
34 |
( isDead |
35 |
, cleanupLocksTask |
36 |
) where |
37 |
|
38 |
import Control.Concurrent (threadDelay) |
39 |
import Control.Exception (bracket, try) |
40 |
import Control.Monad |
41 |
import System.Directory |
42 |
import System.IO |
43 |
import System.Posix.IO |
44 |
|
45 |
import Ganeti.BasicTypes |
46 |
import qualified Ganeti.Constants as C |
47 |
import qualified Ganeti.Locking.Allocation as L |
48 |
import Ganeti.Logging.Lifted (logDebug, logInfo) |
49 |
import Ganeti.WConfd.Monad |
50 |
|
51 |
-- | Detect whether the process identified by the given path does not
-- exist any more.
--
-- The path names a file on which the process is expected to hold a lock.
-- If the file is absent no probing is done and the result is 'True'.
-- Otherwise the file is opened read-only and a read (shared) lock with
-- start offset 0 and length 0 is requested; the descriptor is always
-- closed again via 'bracket'. The answer is 'True' exactly when the
-- wrapped action ends in an 'isOk' result, i.e. only with positive
-- knowledge that the shared lock could be obtained.
--
-- NOTE(review): "never fails" relies on the 'liftIO' of the result
-- monad turning I/O errors into a failed 'Result' -- confirm in
-- "Ganeti.BasicTypes".
isDead :: FilePath -> IO Bool
isDead fpath = do
  outcome <- runResultT (liftIO probe)
  return $ isOk (outcome :: Result ())
  where
    -- Try to take a shared lock on the file, if there is one.
    probe = do
      present <- doesFileExist fpath
      when present $
        bracket (openFd fpath ReadOnly Nothing defaultFileFlags) closeFd $
          \fd -> setLock fd (ReadLock, AbsoluteSeek, 0, 0)
62 |
|
63 |
-- | Pause between two clean-up rounds, in microseconds (the unit
-- expected by 'threadDelay'). Derived from the configured constant,
-- which is presumably given in seconds -- confirm in "Ganeti.Constants".
cleanupInterval :: Int
cleanupInterval = microsecondsPerUnit * C.wconfdDeathdetectionIntervall
  where
    microsecondsPerUnit = 1000000
66 |
|
67 |
-- | Thread periodically cleaning up locks of lock owners that died.
--
-- Each round reads the current lock owners, probes every owner's file
-- with 'isDead' and, for owners found dead, frees their locks and tries
-- to remove the file. The pause between rounds is performed outside of
-- the 'runResultT' block, so a round that aborts early still waits
-- 'cleanupInterval' before the next attempt instead of retrying
-- immediately in a tight loop.
cleanupLocksTask :: WConfdMonadInt ()
cleanupLocksTask = forever $ do
  -- The outcome of a round is deliberately discarded (as it always was);
  -- the next round simply tries again.
  _ <- runResultT cleanupRound
  liftIO $ threadDelay cleanupInterval
  where
    -- One pass over all current lock owners.
    cleanupRound :: WConfdMonad ()
    cleanupRound = do
      logDebug "Death detection timer fired"
      owners <- liftM L.lockOwners readLockAllocation
      logDebug $ "Current lock owners: " ++ show owners
      mapM_ cleanupIfDead owners

    -- Release the locks of a single owner if its file shows it is gone.
    cleanupIfDead owner@(_, fpath) = do
      died <- liftIO (isDead fpath)
      when died $ do
        logInfo $ show owner ++ " died, releasing locks"
        modifyLockAllocation_ (`L.freeLocks` owner)
        -- Removing the file is best effort; an 'IOError' is ignored.
        _ <- liftIO . try $ removeFile fpath
               :: WConfdMonad (Either IOError ())
        return ()