Statistics
| Branch: | Tag: | Revision:

root / src / Ganeti / WConfd / DeathDetection.hs @ 39c1e700

History | View | Annotate | Download (3.1 kB)

1
{-| Utility function for detecting the death of a job holding resources
2

    
3
To clean up resources owned by jobs that die for some reason, we need
4
to detect whether a job is still alive. As we have no control over PID
5
reuse, our approach is that each requester for a resource has to provide
6
a file on which it holds an exclusive lock. The kernel will make sure the
7
lock is removed if the process dies. We can probe for such a lock by
8
requesting a shared lock on the file.
9

    
10
-}
11

    
12
{-
13

    
14
Copyright (C) 2014 Google Inc.
15

    
16
This program is free software; you can redistribute it and/or modify
17
it under the terms of the GNU General Public License as published by
18
the Free Software Foundation; either version 2 of the License, or
19
(at your option) any later version.
20

    
21
This program is distributed in the hope that it will be useful, but
22
WITHOUT ANY WARRANTY; without even the implied warranty of
23
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
24
General Public License for more details.
25

    
26
You should have received a copy of the GNU General Public License
27
along with this program; if not, write to the Free Software
28
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
29
02110-1301, USA.
30

    
31
-}
32

    
33
module Ganeti.WConfd.DeathDetection
34
  ( isDead
35
  , cleanupLocksTask
36
  ) where
37

    
38
import Control.Concurrent (threadDelay)
39
import Control.Exception (bracket, try)
40
import Control.Monad
41
import System.Directory
42
import System.IO
43
import System.Posix.IO
44

    
45
import Ganeti.BasicTypes
46
import qualified Ganeti.Constants as C
47
import qualified Ganeti.Locking.Allocation as L
48
import qualified Ganeti.Locking.Waiting as LW
49
import Ganeti.Locking.Locks (ClientId(..))
50
import Ganeti.Logging.Lifted (logDebug, logInfo)
51
import Ganeti.WConfd.Monad
52

    
53
-- | Detect whether the process identified by the given lock file path
-- no longer exists.
--
-- The probe tries to take a shared (read) lock on the file; this can
-- only succeed if nobody holds an exclusive lock on it any more, in
-- which case the result is True. If the file is not present at all,
-- no probing is attempted and the result is likewise True.
--
-- This function is meant never to fail: the probe is run through
-- @runResultT . liftIO@, and any failure reported that way (e.g. the
-- lock still being held) yields False instead of an error.
isDead :: FilePath -> IO Bool
isDead fpath = liftM succeeded . runResultT $ liftIO probe
  where
    -- Fix the otherwise ambiguous error type of the result.
    succeeded :: Result () -> Bool
    succeeded = isOk

    -- Only try to lock files that are actually there.
    probe :: IO ()
    probe = do
      exists <- doesFileExist fpath
      when exists $ bracket openLockFile closeFd tryReadLock

    openLockFile = openFd fpath ReadOnly Nothing defaultFileFlags

    -- Request a shared lock covering the whole file.
    tryReadLock fd = setLock fd (ReadLock, AbsoluteSeek, 0, 0)
64

    
65
-- | Time between two runs of the clean-up task, in microseconds.
-- The configured constant is multiplied by one million to obtain the
-- value handed to 'threadDelay'.
cleanupInterval :: Int
cleanupInterval = microsecondsPerUnit * C.wconfdDeathdetectionIntervall
  where
    microsecondsPerUnit = 1000000
68

    
69
-- | Thread periodically cleaning up locks of lock owners that died.
--
-- Each round lists the current lock owners, probes every owner's lock
-- file with 'isDead' and, for owners found dead, releases their
-- resources and removes the lock file (best effort). After every round
-- the thread sleeps for 'cleanupInterval' microseconds.
--
-- The sleep is performed outside of the 'runResultT' block on purpose:
-- if a round fails part-way, the delay must still happen, otherwise
-- 'forever' would restart the round immediately and spin without pause
-- for as long as the failure persists.
cleanupLocksTask :: WConfdMonadInt ()
cleanupLocksTask = forever $ do
  -- The outcome of a round is discarded, exactly as before; a failed
  -- round is simply retried after the regular interval.
  _ <- runResultT cleanupRound
  liftIO $ threadDelay cleanupInterval
  where
    -- One pass over all current lock owners.
    cleanupRound :: WConfdMonad ()
    cleanupRound = do
      logDebug "Death detection timer fired"
      owners <- liftM L.lockOwners readLockAllocation
      logDebug $ "Current lock owners: " ++ show owners
      mapM_ cleanupIfDead owners

    -- Release everything held by an owner whose lock file shows it died.
    cleanupIfDead :: ClientId -> WConfdMonad ()
    cleanupIfDead owner = do
      let fpath = ciLockFile owner
      died <- liftIO (isDead fpath)
      when died $ do
        logInfo $ show owner ++ " died, releasing locks"
        modifyLockWaiting_ (LW.releaseResources owner)
        -- Removing the stale lock file is best effort; an IOError here
        -- is caught and ignored so the remaining owners are processed.
        _ <- liftIO . try $ removeFile fpath
             :: WConfdMonad (Either IOError ())
        return ()