4 # Copyright (C) 2006, 2007, 2010, 2011 Google Inc.
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 """Utility functions for processes.
35 from cStringIO import StringIO
37 from ganeti import errors
38 from ganeti import constants
39 from ganeti import compat
41 from ganeti.utils import retry as utils_retry
42 from ganeti.utils import wrapper as utils_wrapper
43 from ganeti.utils import text as utils_text
44 from ganeti.utils import io as utils_io
45 from ganeti.utils import algo as utils_algo
48 #: when set to True, L{RunCmd} is disabled
53 _TIMEOUT_KILL) = range(3)
57 """Disables the use of fork(2).
60 global _no_fork # pylint: disable-msg=W0603
65 class RunResult(object):
66 """Holds the result of running external programs.
69 @ivar exit_code: the exit code of the program, or None (if the program
71 @type signal: int or None
72 @ivar signal: the signal that caused the program to finish, or None
73 (if the program wasn't terminated by a signal)
75 @ivar stdout: the standard output of the program
77 @ivar stderr: the standard error of the program
79 @ivar failed: True in case the program was
80 terminated by a signal or exited with a non-zero exit code
81 @ivar fail_reason: a string detailing the termination reason
84 __slots__ = ["exit_code", "signal", "stdout", "stderr",
85 "failed", "fail_reason", "cmd"]
88 def __init__(self, exit_code, signal_, stdout, stderr, cmd, timeout_action,
91 self.exit_code = exit_code
95 self.failed = (signal_ is not None or exit_code != 0)
98 if self.signal is not None:
99 fail_msgs.append("terminated by signal %s" % self.signal)
100 elif self.exit_code is not None:
101 fail_msgs.append("exited with exit code %s" % self.exit_code)
103 fail_msgs.append("unable to determine termination reason")
105 if timeout_action == _TIMEOUT_TERM:
106 fail_msgs.append("terminated after timeout of %.2f seconds" % timeout)
107 elif timeout_action == _TIMEOUT_KILL:
108 fail_msgs.append(("force termination after timeout of %.2f seconds"
109 " and linger for another %.2f seconds") %
110 (timeout, constants.CHILD_LINGER_TIMEOUT))
112 if fail_msgs and self.failed:
113 self.fail_reason = utils_text.CommaJoin(fail_msgs)
116 logging.debug("Command '%s' failed (%s); output: %s",
117 self.cmd, self.fail_reason, self.output)
119 def _GetOutput(self):
120 """Returns the combined stdout and stderr for easier usage.
123 return self.stdout + self.stderr
125 output = property(_GetOutput, None, None, "Return full output")
128 def _BuildCmdEnvironment(env, reset):
129 """Builds the environment for an external program.
135 cmd_env = os.environ.copy()
136 cmd_env["LC_ALL"] = "C"
144 def RunCmd(cmd, env=None, output=None, cwd="/", reset_env=False,
145 interactive=False, timeout=None, noclose_fds=None,
147 """Execute a (shell) command.
149 The command should not read from its standard input, as it will be
152 @type cmd: string or list
153 @param cmd: Command to run
155 @param env: Additional environment variables
157 @param output: if desired, the output of the command can be
158 saved in a file instead of the RunResult instance; this
159 parameter denotes the file name (if not None)
161 @param cwd: if specified, will be used as the working
162 directory for the command; the default will be /
163 @type reset_env: boolean
164 @param reset_env: whether to reset or keep the default os environment
165 @type interactive: boolean
166 @param interactive: whether we pipe stdin, stdout and stderr
167 (default behaviour) or run the command interactively
169 @param timeout: If not None, timeout in seconds until child process gets
171 @type noclose_fds: list
172 @param noclose_fds: list of additional (fd >=3) file descriptors to leave
173 open for the child process
174 @param _postfork_fn: Callback run after fork but before timeout (unittest)
176 @return: RunResult instance
177 @raise errors.ProgrammerError: if we call this when forks are disabled
181 raise errors.ProgrammerError("utils.RunCmd() called with fork() disabled")
183 if output and interactive:
184 raise errors.ProgrammerError("Parameters 'output' and 'interactive' can"
185 " not be provided at the same time")
187 if isinstance(cmd, basestring):
191 cmd = [str(val) for val in cmd]
192 strcmd = utils_text.ShellQuoteArgs(cmd)
196 logging.debug("RunCmd %s, output file '%s'", strcmd, output)
198 logging.debug("RunCmd %s", strcmd)
200 cmd_env = _BuildCmdEnvironment(env, reset_env)
204 out, err, status, timeout_action = _RunCmdPipe(cmd, cmd_env, shell, cwd,
205 interactive, timeout,
207 _postfork_fn=_postfork_fn)
209 assert _postfork_fn is None, \
210 "_postfork_fn not supported if output provided"
211 timeout_action = _TIMEOUT_NONE
212 status = _RunCmdFile(cmd, cmd_env, shell, output, cwd, noclose_fds)
215 if err.errno == errno.ENOENT:
216 raise errors.OpExecError("Can't execute '%s': not found (%s)" %
228 return RunResult(exitcode, signal_, out, err, strcmd, timeout_action, timeout)
231 def SetupDaemonEnv(cwd="/", umask=077):
232 """Setup a daemon's environment.
234 This should be called between the first and second fork, due to
237 @param cwd: the directory to which to chdir
238 @param umask: the umask to setup
246 def SetupDaemonFDs(output_file, output_fd):
247 """Sets up a daemon's file descriptors.
249 @param output_file: if not None, the file to which to redirect
251 @param output_fd: if not None, the file descriptor for stdout/stderr
254 # check that at most one is defined
255 assert [output_file, output_fd].count(None) >= 1
257 # Open /dev/null (read-only, only for stdin)
258 devnull_fd = os.open(os.devnull, os.O_RDONLY)
262 if output_fd is not None:
264 elif output_file is not None:
267 output_fd = os.open(output_file,
268 os.O_WRONLY | os.O_CREAT | os.O_APPEND, 0600)
269 except EnvironmentError, err:
270 raise Exception("Opening output file failed: %s" % err)
272 output_fd = os.open(os.devnull, os.O_WRONLY)
274 # Redirect standard I/O
275 os.dup2(devnull_fd, 0)
276 os.dup2(output_fd, 1)
277 os.dup2(output_fd, 2)
280 utils_wrapper.CloseFdNoError(devnull_fd)
282 if output_close and output_fd > 2:
283 utils_wrapper.CloseFdNoError(output_fd)
286 def StartDaemon(cmd, env=None, cwd="/", output=None, output_fd=None,
288 """Start a daemon process after forking twice.
290 @type cmd: string or list
291 @param cmd: Command to run
293 @param env: Additional environment variables
295 @param cwd: Working directory for the program
297 @param output: Path to file in which to save the output
299 @param output_fd: File descriptor for output
300 @type pidfile: string
301 @param pidfile: Process ID file
303 @return: Daemon process ID
304 @raise errors.ProgrammerError: if we call this when forks are disabled
308 raise errors.ProgrammerError("utils.StartDaemon() called with fork()"
311 if output and not (bool(output) ^ (output_fd is not None)):
312 raise errors.ProgrammerError("Only one of 'output' and 'output_fd' can be"
315 if isinstance(cmd, basestring):
316 cmd = ["/bin/sh", "-c", cmd]
318 strcmd = utils_text.ShellQuoteArgs(cmd)
321 logging.debug("StartDaemon %s, output file '%s'", strcmd, output)
323 logging.debug("StartDaemon %s", strcmd)
325 cmd_env = _BuildCmdEnvironment(env, False)
327 # Create pipe for sending PID back
328 (pidpipe_read, pidpipe_write) = os.pipe()
331 # Create pipe for sending error messages
332 (errpipe_read, errpipe_write) = os.pipe()
339 # Child process, won't return
340 _StartDaemonChild(errpipe_read, errpipe_write,
341 pidpipe_read, pidpipe_write,
343 output, output_fd, pidfile)
345 # Well, maybe child process failed
346 os._exit(1) # pylint: disable-msg=W0212
348 utils_wrapper.CloseFdNoError(errpipe_write)
350 # Wait for daemon to be started (or an error message to
351 # arrive) and read up to 100 KB as an error message
352 errormsg = utils_wrapper.RetryOnSignal(os.read, errpipe_read,
355 utils_wrapper.CloseFdNoError(errpipe_read)
357 utils_wrapper.CloseFdNoError(pidpipe_write)
359 # Read up to 128 bytes for PID
360 pidtext = utils_wrapper.RetryOnSignal(os.read, pidpipe_read, 128)
362 utils_wrapper.CloseFdNoError(pidpipe_read)
364 # Try to avoid zombies by waiting for child process
371 raise errors.OpExecError("Error when starting daemon process: %r" %
376 except (ValueError, TypeError), err:
377 raise errors.OpExecError("Error while trying to parse PID %r: %s" %
381 def _StartDaemonChild(errpipe_read, errpipe_write,
382 pidpipe_read, pidpipe_write,
384 output, fd_output, pidfile):
385 """Child process for starting daemon.
389 # Close parent's side
390 utils_wrapper.CloseFdNoError(errpipe_read)
391 utils_wrapper.CloseFdNoError(pidpipe_read)
393 # First child process
396 # And fork for the second time
399 # Exit first child process
400 os._exit(0) # pylint: disable-msg=W0212
402 # Make sure pipe is closed on execv* (and thereby notifies
404 utils_wrapper.SetCloseOnExecFlag(errpipe_write, True)
406 # List of file descriptors to be left open
407 noclose_fds = [errpipe_write]
411 fd_pidfile = utils_io.WritePidFile(pidfile)
413 # Keeping the file open to hold the lock
414 noclose_fds.append(fd_pidfile)
416 utils_wrapper.SetCloseOnExecFlag(fd_pidfile, False)
420 SetupDaemonFDs(output, fd_output)
422 # Send daemon PID to parent
423 utils_wrapper.RetryOnSignal(os.write, pidpipe_write, str(os.getpid()))
425 # Close all file descriptors except stdio and error message pipe
426 CloseFDs(noclose_fds=noclose_fds)
428 # Change working directory
432 os.execvp(args[0], args)
434 os.execvpe(args[0], args, env)
435 except: # pylint: disable-msg=W0702
437 # Report errors to original process
438 WriteErrorToFD(errpipe_write, str(sys.exc_info()[1]))
439 except: # pylint: disable-msg=W0702
440 # Ignore errors in error handling
443 os._exit(1) # pylint: disable-msg=W0212
446 def WriteErrorToFD(fd, err):
447 """Possibly write an error message to a fd.
449 @type fd: None or int (file descriptor)
450 @param fd: if not None, the error will be written to this fd
451 @param err: string, the error message
458 err = "<unknown error>"
460 utils_wrapper.RetryOnSignal(os.write, fd, err)
def _CheckIfAlive(child):
  """Raises L{utils_retry.RetryAgain} if child is still alive.

  Intended as the check function for L{utils_retry.Retry}: returning
  normally means the child has terminated and retrying can stop.

  @param child: Child process object providing a C{poll()} method
    (presumably the C{subprocess.Popen} instance created by the caller)
  @raise utils_retry.RetryAgain: If child is still alive

  """
  # poll() yields None for as long as no exit status is available
  if child.poll() is None:
    raise utils_retry.RetryAgain()
473 def _WaitForProcess(child, timeout):
474 """Waits for the child to terminate or until we reach timeout.
478 utils_retry.Retry(_CheckIfAlive, (1.0, 1.2, 5.0), max(0, timeout),
480 except utils_retry.RetryTimeout:
484 def _RunCmdPipe(cmd, env, via_shell, cwd, interactive, timeout, noclose_fds,
485 _linger_timeout=constants.CHILD_LINGER_TIMEOUT,
487 """Run a command and return its output.
489 @type cmd: string or list
490 @param cmd: Command to run
492 @param env: The environment to use
493 @type via_shell: bool
494 @param via_shell: if we should run via the shell
496 @param cwd: the working directory for the program
497 @type interactive: boolean
498 @param interactive: Run command interactive (without piping)
500 @param timeout: Timeout after which the program gets terminated
501 @type noclose_fds: list
502 @param noclose_fds: list of additional (fd >=3) file descriptors to leave
503 open for the child process
504 @param _postfork_fn: Function run after fork but before timeout (unittest)
506 @return: (out, err, status)
509 poller = select.poll()
511 stderr = subprocess.PIPE
512 stdout = subprocess.PIPE
513 stdin = subprocess.PIPE
516 stderr = stdout = stdin = None
519 preexec_fn = lambda: CloseFDs(noclose_fds)
525 child = subprocess.Popen(cmd, shell=via_shell,
529 close_fds=close_fds, env=env,
531 preexec_fn=preexec_fn)
534 _postfork_fn(child.pid)
539 linger_timeout = None
544 poll_timeout = utils_algo.RunningTimeout(timeout, True).Remaining
546 msg_timeout = ("Command %s (%d) run into execution timeout, terminating" %
548 msg_linger = ("Command %s (%d) run into linger timeout, killing" %
551 timeout_action = _TIMEOUT_NONE
555 poller.register(child.stdout, select.POLLIN)
556 poller.register(child.stderr, select.POLLIN)
558 child.stdout.fileno(): (out, child.stdout),
559 child.stderr.fileno(): (err, child.stderr),
562 utils_wrapper.SetNonblockFlag(fd, True)
566 pt = poll_timeout() * 1000
568 if linger_timeout is None:
569 logging.warning(msg_timeout)
570 if child.poll() is None:
571 timeout_action = _TIMEOUT_TERM
572 utils_wrapper.IgnoreProcessNotFound(os.kill, child.pid,
575 utils_algo.RunningTimeout(_linger_timeout, True).Remaining
576 pt = linger_timeout() * 1000
582 pollresult = utils_wrapper.RetryOnSignal(poller.poll, pt)
584 for fd, event in pollresult:
585 if event & select.POLLIN or event & select.POLLPRI:
586 data = fdmap[fd][1].read()
587 # no data from read signifies EOF (the same as POLLHUP)
589 poller.unregister(fd)
592 fdmap[fd][0].write(data)
593 if (event & select.POLLNVAL or event & select.POLLHUP or
594 event & select.POLLERR):
595 poller.unregister(fd)
598 if timeout is not None:
599 assert callable(poll_timeout)
601 # We have no I/O left but it might still run
602 if child.poll() is None:
603 _WaitForProcess(child, poll_timeout())
605 # Terminate if still alive after timeout
606 if child.poll() is None:
607 if linger_timeout is None:
608 logging.warning(msg_timeout)
609 timeout_action = _TIMEOUT_TERM
610 utils_wrapper.IgnoreProcessNotFound(os.kill, child.pid, signal.SIGTERM)
613 lt = linger_timeout()
614 _WaitForProcess(child, lt)
616 # Okay, still alive after timeout and linger timeout? Kill it!
617 if child.poll() is None:
618 timeout_action = _TIMEOUT_KILL
619 logging.warning(msg_linger)
620 utils_wrapper.IgnoreProcessNotFound(os.kill, child.pid, signal.SIGKILL)
625 status = child.wait()
626 return out, err, status, timeout_action
629 def _RunCmdFile(cmd, env, via_shell, output, cwd, noclose_fds):
630 """Run a command and save its output to a file.
632 @type cmd: string or list
633 @param cmd: Command to run
635 @param env: The environment to use
636 @type via_shell: bool
637 @param via_shell: if we should run via the shell
639 @param output: the filename in which to save the output
641 @param cwd: the working directory for the program
642 @type noclose_fds: list
643 @param noclose_fds: list of additional (fd >=3) file descriptors to leave
644 open for the child process
646 @return: the exit status
649 fh = open(output, "a")
652 preexec_fn = lambda: CloseFDs(noclose_fds + [fh.fileno()])
659 child = subprocess.Popen(cmd, shell=via_shell,
660 stderr=subprocess.STDOUT,
662 stdin=subprocess.PIPE,
663 close_fds=close_fds, env=env,
665 preexec_fn=preexec_fn)
668 status = child.wait()
674 def RunParts(dir_name, env=None, reset_env=False):
675 """Run scripts or programs in a directory
677 @type dir_name: string
678 @param dir_name: absolute path to a directory
680 @param env: The environment to use
681 @type reset_env: boolean
682 @param reset_env: whether to reset or keep the default os environment
683 @rtype: list of tuples
684 @return: list of (name, (one of RUNDIR_STATUS), RunResult)
690 dir_contents = utils_io.ListVisibleFiles(dir_name)
692 logging.warning("RunParts: skipping %s (cannot list: %s)", dir_name, err)
695 for relname in sorted(dir_contents):
696 fname = utils_io.PathJoin(dir_name, relname)
697 if not (os.path.isfile(fname) and os.access(fname, os.X_OK) and
698 constants.EXT_PLUGIN_MASK.match(relname) is not None):
699 rr.append((relname, constants.RUNPARTS_SKIP, None))
702 result = RunCmd([fname], env=env, reset_env=reset_env)
703 except Exception, err: # pylint: disable-msg=W0703
704 rr.append((relname, constants.RUNPARTS_ERR, str(err)))
706 rr.append((relname, constants.RUNPARTS_RUN, result))
def _GetProcStatusPath(pid):
  """Returns the path for a PID's proc status file.

  @type pid: int
  @param pid: Process ID
  @rtype: string
  @return: Path of the form C{/proc/$pid/status}

  """
  return "/proc/%d/status" % pid
722 def IsProcessAlive(pid):
723 """Check if a given pid exists on the system.
725 @note: zombie status is not handled, so zombie processes
726 will be returned as alive
728 @param pid: the process ID to check
730 @return: True if the process exists
737 except EnvironmentError, err:
738 if err.errno in (errno.ENOENT, errno.ENOTDIR):
740 elif err.errno == errno.EINVAL:
741 raise utils_retry.RetryAgain(err)
744 assert isinstance(pid, int), "pid must be an integer"
748 # /proc in a multiprocessor environment can have strange behaviors.
749 # Retry the os.stat a few times until we get a good result.
751 return utils_retry.Retry(_TryStat, (0.01, 1.5, 0.1), 0.5,
752 args=[_GetProcStatusPath(pid)])
753 except utils_retry.RetryTimeout, err:
757 def _ParseSigsetT(sigset):
758 """Parse a rendered sigset_t value.
760 This is the opposite of the Linux kernel's fs/proc/array.c:render_sigset_t
764 @param sigset: Rendered signal set from /proc/$pid/status
766 @return: Set of all enabled signal numbers
772 for ch in reversed(sigset):
775 # The following could be done in a loop, but it's easier to read and
776 # understand in the unrolled form
778 result.add(signum + 1)
780 result.add(signum + 2)
782 result.add(signum + 3)
784 result.add(signum + 4)
def _GetProcStatusField(pstatus, field):
  """Retrieves a field from the contents of a proc status file.

  @type pstatus: string
  @param pstatus: Contents of /proc/$pid/status
  @type field: string
  @param field: Name of field whose value should be returned
  @rtype: string or None
  @return: Value of the first line whose name matches C{field} exactly,
    with surrounding whitespace stripped; None if no such line exists

  """
  for line in pstatus.splitlines():
    # Split at the first colon only, so values containing ":" stay intact
    parts = line.split(":", 1)

    # Skip lines without a colon and lines belonging to other fields
    if len(parts) < 2 or parts[0] != field:
      continue

    return parts[1].strip()

  # Field not present in the given status contents
  return None
812 def IsProcessHandlingSignal(pid, signum, status_path=None):
813 """Checks whether a process is handling a signal.
816 @param pid: Process ID
818 @param signum: Signal number
822 if status_path is None:
823 status_path = _GetProcStatusPath(pid)
826 proc_status = utils_io.ReadFile(status_path)
827 except EnvironmentError, err:
828 # In at least one case, reading /proc/$pid/status failed with ESRCH.
829 if err.errno in (errno.ENOENT, errno.ENOTDIR, errno.EINVAL, errno.ESRCH):
833 sigcgt = _GetProcStatusField(proc_status, "SigCgt")
835 raise RuntimeError("%s is missing 'SigCgt' field" % status_path)
837 # Now check whether signal is handled
838 return signum in _ParseSigsetT(sigcgt)
841 def Daemonize(logfile):
842 """Daemonize the current process.
844 This detaches the current process from the controlling terminal and
845 runs it in the background as a daemon.
848 @param logfile: the logfile to which we should redirect stdout/stderr
849 @rtype: tuple; (int, callable)
850 @return: File descriptor of pipe(2) which must be closed to notify parent
851 process and a callable to reopen log files
854 # pylint: disable-msg=W0212
855 # yes, we really want os._exit
857 # TODO: do another attempt to merge Daemonize and StartDaemon, or at
858 # least abstract the pipe functionality between them
860 # Create pipe for sending error messages
861 (rpipe, wpipe) = os.pipe()
865 if (pid == 0): # The first child.
869 pid = os.fork() # Fork a second child.
870 if (pid == 0): # The second child.
871 utils_wrapper.CloseFdNoError(rpipe)
873 # exit() or _exit()? See below.
874 os._exit(0) # Exit parent (the first child) of the second child.
876 utils_wrapper.CloseFdNoError(wpipe)
877 # Wait for daemon to be started (or an error message to
878 # arrive) and read up to 100 KB as an error message
879 errormsg = utils_wrapper.RetryOnSignal(os.read, rpipe, 100 * 1024)
881 sys.stderr.write("Error when starting daemon process: %r\n" % errormsg)
885 os._exit(rcode) # Exit parent of the first child.
887 reopen_fn = compat.partial(SetupDaemonFDs, logfile, None)
889 # Open logs for the first time
892 return (wpipe, reopen_fn)
895 def KillProcess(pid, signal_=signal.SIGTERM, timeout=30,
897 """Kill a process given by its pid.
900 @param pid: The PID to terminate.
902 @param signal_: The signal to send, by default SIGTERM
904 @param timeout: The timeout after which, if the process is still alive,
905 a SIGKILL will be sent. If not positive, no such checking
907 @type waitpid: boolean
908 @param waitpid: If true, we should waitpid on this process after
909 sending signals, since it's our own child and otherwise it
910 would remain as zombie
913 def _helper(pid, signal_, wait):
914 """Simple helper to encapsulate the kill/waitpid sequence"""
915 if utils_wrapper.IgnoreProcessNotFound(os.kill, pid, signal_) and wait:
917 os.waitpid(pid, os.WNOHANG)
922 # kill with pid=0 == suicide
923 raise errors.ProgrammerError("Invalid pid given '%s'" % pid)
925 if not IsProcessAlive(pid):
928 _helper(pid, signal_, waitpid)
934 if not IsProcessAlive(pid):
938 (result_pid, _) = os.waitpid(pid, os.WNOHANG)
940 raise utils_retry.RetryAgain()
945 raise utils_retry.RetryAgain()
948 # Wait up to $timeout seconds
949 utils_retry.Retry(_CheckProcess, (0.01, 1.5, 0.1), timeout)
950 except utils_retry.RetryTimeout:
953 if IsProcessAlive(pid):
954 # Kill process if it's still alive
955 _helper(pid, signal.SIGKILL, waitpid)
958 def RunInSeparateProcess(fn, *args):
959 """Runs a function in a separate process.
961 Note: Only boolean return values are supported.
964 @param fn: Function to be called
966 @return: Function's result
973 # In case the function uses temporary files
974 utils_wrapper.ResetTempfileModule()
977 result = int(bool(fn(*args)))
978 assert result in (0, 1)
979 except: # pylint: disable-msg=W0702
980 logging.exception("Error while calling function in separate process")
981 # 0 and 1 are reserved for the return value
984 os._exit(result) # pylint: disable-msg=W0212
988 # Avoid zombies and check exit code
989 (_, status) = os.waitpid(pid, 0)
991 if os.WIFSIGNALED(status):
993 signum = os.WTERMSIG(status)
995 exitcode = os.WEXITSTATUS(status)
998 if not (exitcode in (0, 1) and signum is None):
999 raise errors.GenericError("Child program failed (code=%s, signal=%s)" %
1002 return bool(exitcode)
1005 def CloseFDs(noclose_fds=None):
1006 """Close file descriptors.
1008 This closes all file descriptors above 2 (i.e. except
1011 @type noclose_fds: list or None
1012 @param noclose_fds: if given, it denotes a list of file descriptor
1013 that should not be closed
1016 # Default maximum for the number of available file descriptors.
1017 if 'SC_OPEN_MAX' in os.sysconf_names:
1019 MAXFD = os.sysconf('SC_OPEN_MAX')
1027 maxfd = resource.getrlimit(resource.RLIMIT_NOFILE)[1]
1028 if (maxfd == resource.RLIM_INFINITY):
1031 # Iterate through and close all file descriptors (except the standard ones)
1032 for fd in range(3, maxfd):
1033 if noclose_fds and fd in noclose_fds:
1035 utils_wrapper.CloseFdNoError(fd)