Statistics
| Branch: | Tag: | Revision:

root / src / Ganeti / WConfd / DeathDetection.hs @ f3010b75

History | View | Annotate | Download (3 kB)

1
{-| Utility function for detecting the death of a job holding resources
2

    
3
To clean up resources owned by jobs that die for some reason, we need
4
to detect whether a job is still alive. As we have no control over PID
5
reuse, our approach is that each requester for a resource has to provide
6
a file on which it holds an exclusive lock. The kernel will make sure the
7
lock is removed if the process dies. We can probe for such a lock by
8
requesting a shared lock on the file.
9

    
10
-}
11

    
12
{-
13

    
14
Copyright (C) 2014 Google Inc.
15

    
16
This program is free software; you can redistribute it and/or modify
17
it under the terms of the GNU General Public License as published by
18
the Free Software Foundation; either version 2 of the License, or
19
(at your option) any later version.
20

    
21
This program is distributed in the hope that it will be useful, but
22
WITHOUT ANY WARRANTY; without even the implied warranty of
23
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
24
General Public License for more details.
25

    
26
You should have received a copy of the GNU General Public License
27
along with this program; if not, write to the Free Software
28
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
29
02110-1301, USA.
30

    
31
-}
32

    
33
module Ganeti.WConfd.DeathDetection
34
  ( isDead
35
  , cleanupLocksTask
36
  ) where
37

    
38
import Control.Concurrent (threadDelay)
39
import Control.Exception (bracket, try)
40
import Control.Monad
41
import System.Directory
42
import System.IO
43
import System.Posix.IO
44

    
45
import Ganeti.BasicTypes
46
import qualified Ganeti.Constants as C
47
import qualified Ganeti.Locking.Allocation as L
48
import Ganeti.Logging.Lifted (logDebug, logInfo)
49
import Ganeti.WConfd.Monad
50

    
51
-- | Detect whether the process identified by the given lock file path
-- no longer exists.
--
-- The probe tries to take a shared (read) lock on the whole file; the
-- file descriptor is closed again afterwards in any case. The result is
-- 'True' if the probe completed without error, which includes the case
-- where the file is not present at all (then no lock is attempted).
-- Any I/O failure during the probe (for example, the lock cannot be
-- obtained) is turned into 'False' rather than being propagated, so
-- this function never fails.
isDead :: FilePath -> IO Bool
isDead fpath = do
  outcome <- runResultT (liftIO probe)
  return $ isOk (outcome :: Result ())
  where
    -- Try to obtain a shared lock on the file, if it exists.
    probe = do
      exists <- doesFileExist fpath
      when exists $
        bracket (openFd fpath ReadOnly Nothing defaultFileFlags) closeFd
                (\fd -> setLock fd (ReadLock, AbsoluteSeek, 0, 0))
62

    
63
-- | Pause between two clean-up rounds, in microseconds (the unit
-- expected by 'threadDelay'). Derived from the configured constant
-- 'C.wconfdDeathdetectionIntervall' by scaling it with one million.
cleanupInterval :: Int
cleanupInterval = microsecondsPerUnit * C.wconfdDeathdetectionIntervall
  where
    microsecondsPerUnit = 1000000
66

    
67
-- | Thread periodically cleaning up locks of lock owners that died.
--
-- Each round reads the current lock owners, probes every owner's lock
-- file with 'isDead' and, for each owner found dead, frees its locks
-- and makes a best-effort attempt to remove the lock file.
--
-- The pause between rounds is performed outside of the 'ResultT'
-- computation, so that a round which fails part-way is still followed
-- by the usual delay instead of being restarted immediately in a tight
-- loop.
cleanupLocksTask :: WConfdMonadInt ()
cleanupLocksTask = forever $ do
  -- The outcome of a round is deliberately discarded; a failed round is
  -- simply attempted again after the next delay.
  _ <- runResultT $ do
    logDebug "Death detection timer fired"
    owners <- liftM L.lockOwners readLockAllocation
    logDebug $ "Current lock owners: " ++ show owners
    let cleanupIfDead owner@(_, fpath) = do
          died <- liftIO (isDead fpath)
          when died $ do
            logInfo $ show owner ++ " died, releasing locks"
            modifyLockAllocation_ (`L.freeLocks` owner)
            -- Removing the lock file is best effort: an 'IOError' raised
            -- here is captured by 'try' and ignored on purpose.
            _ <- liftIO . try $ removeFile fpath
                 :: WConfdMonad (Either IOError ())
            return ()
    mapM_ cleanupIfDead owners
  liftIO $ threadDelay cleanupInterval