4 # Copyright (C) 2006, 2007, 2010, 2011 Google Inc.
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 """Utility functions for processes.
35 from cStringIO import StringIO
37 from ganeti import errors
38 from ganeti import constants
39 from ganeti import compat
41 from ganeti.utils import retry as utils_retry
42 from ganeti.utils import wrapper as utils_wrapper
43 from ganeti.utils import text as utils_text
44 from ganeti.utils import io as utils_io
45 from ganeti.utils import algo as utils_algo
48 #: when set to True, L{RunCmd} is disabled
53 _TIMEOUT_KILL) = range(3)
57 """Disables the use of fork(2).
60 global _no_fork # pylint: disable=W0603
65 class RunResult(object):
66 """Holds the result of running external programs.
69 @ivar exit_code: the exit code of the program, or None (if the program
71 @type signal: int or None
72 @ivar signal: the signal that caused the program to finish, or None
73 (if the program wasn't terminated by a signal)
75 @ivar stdout: the standard output of the program
77 @ivar stderr: the standard error of the program
79 @ivar failed: True in case the program was
80 terminated by a signal or exited with a non-zero exit code
81 @ivar fail_reason: a string detailing the termination reason
84 __slots__ = ["exit_code", "signal", "stdout", "stderr",
85 "failed", "fail_reason", "cmd"]
87 def __init__(self, exit_code, signal_, stdout, stderr, cmd, timeout_action,
90 self.exit_code = exit_code
94 self.failed = (signal_ is not None or exit_code != 0)
97 if self.signal is not None:
98 fail_msgs.append("terminated by signal %s" % self.signal)
99 elif self.exit_code is not None:
100 fail_msgs.append("exited with exit code %s" % self.exit_code)
102 fail_msgs.append("unable to determine termination reason")
104 if timeout_action == _TIMEOUT_TERM:
105 fail_msgs.append("terminated after timeout of %.2f seconds" % timeout)
106 elif timeout_action == _TIMEOUT_KILL:
107 fail_msgs.append(("force termination after timeout of %.2f seconds"
108 " and linger for another %.2f seconds") %
109 (timeout, constants.CHILD_LINGER_TIMEOUT))
111 if fail_msgs and self.failed:
112 self.fail_reason = utils_text.CommaJoin(fail_msgs)
115 logging.debug("Command '%s' failed (%s); output: %s",
116 self.cmd, self.fail_reason, self.output)
118 def _GetOutput(self):
119 """Returns the combined stdout and stderr for easier usage.
122 return self.stdout + self.stderr
124 output = property(_GetOutput, None, None, "Return full output")
127 def _BuildCmdEnvironment(env, reset):
128 """Builds the environment for an external program.
134 cmd_env = os.environ.copy()
135 cmd_env["LC_ALL"] = "C"
143 def RunCmd(cmd, env=None, output=None, cwd="/", reset_env=False,
144 interactive=False, timeout=None, noclose_fds=None,
146 """Execute a (shell) command.
148 The command should not read from its standard input, as it will be
151 @type cmd: string or list
152 @param cmd: Command to run
154 @param env: Additional environment variables
156 @param output: if desired, the output of the command can be
157 saved in a file instead of the RunResult instance; this
158 parameter denotes the file name (if not None)
160 @param cwd: if specified, will be used as the working
161 directory for the command; the default will be /
162 @type reset_env: boolean
163 @param reset_env: whether to reset or keep the default os environment
164 @type interactive: boolean
165 @param interactive: whether we pipe stdin, stdout and stderr
166 (default behaviour) or run the command interactive
168 @param timeout: If not None, timeout in seconds until child process gets
170 @type noclose_fds: list
171 @param noclose_fds: list of additional (fd >=3) file descriptors to leave
172 open for the child process
173 @param _postfork_fn: Callback run after fork but before timeout (unittest)
175 @return: RunResult instance
176 @raise errors.ProgrammerError: if we call this when forks are disabled
180 raise errors.ProgrammerError("utils.RunCmd() called with fork() disabled")
182 if output and interactive:
183 raise errors.ProgrammerError("Parameters 'output' and 'interactive' can"
184 " not be provided at the same time")
186 if isinstance(cmd, basestring):
190 cmd = [str(val) for val in cmd]
191 strcmd = utils_text.ShellQuoteArgs(cmd)
195 logging.debug("RunCmd %s, output file '%s'", strcmd, output)
197 logging.debug("RunCmd %s", strcmd)
199 cmd_env = _BuildCmdEnvironment(env, reset_env)
203 out, err, status, timeout_action = _RunCmdPipe(cmd, cmd_env, shell, cwd,
204 interactive, timeout,
206 _postfork_fn=_postfork_fn)
208 assert _postfork_fn is None, \
209 "_postfork_fn not supported if output provided"
210 timeout_action = _TIMEOUT_NONE
211 status = _RunCmdFile(cmd, cmd_env, shell, output, cwd, noclose_fds)
214 if err.errno == errno.ENOENT:
215 raise errors.OpExecError("Can't execute '%s': not found (%s)" %
227 return RunResult(exitcode, signal_, out, err, strcmd, timeout_action, timeout)
230 def SetupDaemonEnv(cwd="/", umask=077):
231 """Setup a daemon's environment.
233 This should be called between the first and second fork, due to
236 @param cwd: the directory to which to chdir
237 @param umask: the umask to setup
245 def SetupDaemonFDs(output_file, output_fd):
246 """Sets up a daemon's file descriptors.
248 @param output_file: if not None, the file to which to redirect
250 @param output_fd: if not None, the file descriptor for stdout/stderr
253 # check that at most one is defined
254 assert [output_file, output_fd].count(None) >= 1
256 # Open /dev/null (read-only, only for stdin)
257 devnull_fd = os.open(os.devnull, os.O_RDONLY)
261 if output_fd is not None:
263 elif output_file is not None:
266 output_fd = os.open(output_file,
267 os.O_WRONLY | os.O_CREAT | os.O_APPEND, 0600)
268 except EnvironmentError, err:
269 raise Exception("Opening output file failed: %s" % err)
271 output_fd = os.open(os.devnull, os.O_WRONLY)
273 # Redirect standard I/O
274 os.dup2(devnull_fd, 0)
275 os.dup2(output_fd, 1)
276 os.dup2(output_fd, 2)
279 utils_wrapper.CloseFdNoError(devnull_fd)
281 if output_close and output_fd > 2:
282 utils_wrapper.CloseFdNoError(output_fd)
285 def StartDaemon(cmd, env=None, cwd="/", output=None, output_fd=None,
287 """Start a daemon process after forking twice.
289 @type cmd: string or list
290 @param cmd: Command to run
292 @param env: Additional environment variables
294 @param cwd: Working directory for the program
296 @param output: Path to file in which to save the output
298 @param output_fd: File descriptor for output
299 @type pidfile: string
300 @param pidfile: Process ID file
302 @return: Daemon process ID
303 @raise errors.ProgrammerError: if we call this when forks are disabled
307 raise errors.ProgrammerError("utils.StartDaemon() called with fork()"
310 if output and not (bool(output) ^ (output_fd is not None)):
311 raise errors.ProgrammerError("Only one of 'output' and 'output_fd' can be"
314 if isinstance(cmd, basestring):
315 cmd = ["/bin/sh", "-c", cmd]
317 strcmd = utils_text.ShellQuoteArgs(cmd)
320 logging.debug("StartDaemon %s, output file '%s'", strcmd, output)
322 logging.debug("StartDaemon %s", strcmd)
324 cmd_env = _BuildCmdEnvironment(env, False)
326 # Create pipe for sending PID back
327 (pidpipe_read, pidpipe_write) = os.pipe()
330 # Create pipe for sending error messages
331 (errpipe_read, errpipe_write) = os.pipe()
338 # Child process, won't return
339 _StartDaemonChild(errpipe_read, errpipe_write,
340 pidpipe_read, pidpipe_write,
342 output, output_fd, pidfile)
344 # Well, maybe child process failed
345 os._exit(1) # pylint: disable=W0212
347 utils_wrapper.CloseFdNoError(errpipe_write)
349 # Wait for daemon to be started (or an error message to
350 # arrive) and read up to 100 KB as an error message
351 errormsg = utils_wrapper.RetryOnSignal(os.read, errpipe_read,
354 utils_wrapper.CloseFdNoError(errpipe_read)
356 utils_wrapper.CloseFdNoError(pidpipe_write)
358 # Read up to 128 bytes for PID
359 pidtext = utils_wrapper.RetryOnSignal(os.read, pidpipe_read, 128)
361 utils_wrapper.CloseFdNoError(pidpipe_read)
363 # Try to avoid zombies by waiting for child process
370 raise errors.OpExecError("Error when starting daemon process: %r" %
375 except (ValueError, TypeError), err:
376 raise errors.OpExecError("Error while trying to parse PID %r: %s" %
380 def _StartDaemonChild(errpipe_read, errpipe_write,
381 pidpipe_read, pidpipe_write,
383 output, fd_output, pidfile):
384 """Child process for starting daemon.
388 # Close parent's side
389 utils_wrapper.CloseFdNoError(errpipe_read)
390 utils_wrapper.CloseFdNoError(pidpipe_read)
392 # First child process
395 # And fork for the second time
398 # Exit first child process
399 os._exit(0) # pylint: disable=W0212
401 # Make sure pipe is closed on execv* (and thereby notifies
403 utils_wrapper.SetCloseOnExecFlag(errpipe_write, True)
405 # List of file descriptors to be left open
406 noclose_fds = [errpipe_write]
410 fd_pidfile = utils_io.WritePidFile(pidfile)
412 # Keeping the file open to hold the lock
413 noclose_fds.append(fd_pidfile)
415 utils_wrapper.SetCloseOnExecFlag(fd_pidfile, False)
419 SetupDaemonFDs(output, fd_output)
421 # Send daemon PID to parent
422 utils_wrapper.RetryOnSignal(os.write, pidpipe_write, str(os.getpid()))
424 # Close all file descriptors except stdio and error message pipe
425 CloseFDs(noclose_fds=noclose_fds)
427 # Change working directory
431 os.execvp(args[0], args)
433 os.execvpe(args[0], args, env)
434 except: # pylint: disable=W0702
436 # Report errors to original process
437 WriteErrorToFD(errpipe_write, str(sys.exc_info()[1]))
438 except: # pylint: disable=W0702
439 # Ignore errors in error handling
442 os._exit(1) # pylint: disable=W0212
445 def WriteErrorToFD(fd, err):
446 """Possibly write an error message to a fd.
448 @type fd: None or int (file descriptor)
449 @param fd: if not None, the error will be written to this fd
450 @param err: string, the error message
457 err = "<unknown error>"
459 utils_wrapper.RetryOnSignal(os.write, fd, err)
def _CheckIfAlive(child):
    """Raises L{utils_retry.RetryAgain} if child is still alive.

    Returns normally (with no value) once C{child.poll()} reports anything
    other than None, which makes the function usable as a polling callback.

    @param child: Object providing a C{poll()} method which returns None
        for as long as the process has not terminated
    @raise utils_retry.RetryAgain: If child is still alive

    """
    if child.poll() is None:
        raise utils_retry.RetryAgain()
472 def _WaitForProcess(child, timeout):
473 """Waits for the child to terminate or until we reach timeout.
477 utils_retry.Retry(_CheckIfAlive, (1.0, 1.2, 5.0), max(0, timeout),
479 except utils_retry.RetryTimeout:
483 def _RunCmdPipe(cmd, env, via_shell, cwd, interactive, timeout, noclose_fds,
484 _linger_timeout=constants.CHILD_LINGER_TIMEOUT,
486 """Run a command and return its output.
488 @type cmd: string or list
489 @param cmd: Command to run
491 @param env: The environment to use
492 @type via_shell: bool
493 @param via_shell: if we should run via the shell
495 @param cwd: the working directory for the program
496 @type interactive: boolean
497 @param interactive: Run command interactive (without piping)
499 @param timeout: Timeout after which the program gets terminated
500 @type noclose_fds: list
501 @param noclose_fds: list of additional (fd >=3) file descriptors to leave
502 open for the child process
503 @param _postfork_fn: Function run after fork but before timeout (unittest)
505 @return: (out, err, status, timeout_action)
508 poller = select.poll()
510 stderr = subprocess.PIPE
511 stdout = subprocess.PIPE
512 stdin = subprocess.PIPE
515 stderr = stdout = stdin = None
518 preexec_fn = lambda: CloseFDs(noclose_fds)
524 child = subprocess.Popen(cmd, shell=via_shell,
528 close_fds=close_fds, env=env,
530 preexec_fn=preexec_fn)
533 _postfork_fn(child.pid)
538 linger_timeout = None
543 poll_timeout = utils_algo.RunningTimeout(timeout, True).Remaining
545 msg_timeout = ("Command %s (%d) run into execution timeout, terminating" %
547 msg_linger = ("Command %s (%d) run into linger timeout, killing" %
550 timeout_action = _TIMEOUT_NONE
554 poller.register(child.stdout, select.POLLIN)
555 poller.register(child.stderr, select.POLLIN)
557 child.stdout.fileno(): (out, child.stdout),
558 child.stderr.fileno(): (err, child.stderr),
561 utils_wrapper.SetNonblockFlag(fd, True)
565 pt = poll_timeout() * 1000
567 if linger_timeout is None:
568 logging.warning(msg_timeout)
569 if child.poll() is None:
570 timeout_action = _TIMEOUT_TERM
571 utils_wrapper.IgnoreProcessNotFound(os.kill, child.pid,
574 utils_algo.RunningTimeout(_linger_timeout, True).Remaining
575 pt = linger_timeout() * 1000
581 pollresult = utils_wrapper.RetryOnSignal(poller.poll, pt)
583 for fd, event in pollresult:
584 if event & select.POLLIN or event & select.POLLPRI:
585 data = fdmap[fd][1].read()
586 # no data from read signifies EOF (the same as POLLHUP)
588 poller.unregister(fd)
591 fdmap[fd][0].write(data)
592 if (event & select.POLLNVAL or event & select.POLLHUP or
593 event & select.POLLERR):
594 poller.unregister(fd)
597 if timeout is not None:
598 assert callable(poll_timeout)
600 # We have no I/O left but it might still run
601 if child.poll() is None:
602 _WaitForProcess(child, poll_timeout())
604 # Terminate if still alive after timeout
605 if child.poll() is None:
606 if linger_timeout is None:
607 logging.warning(msg_timeout)
608 timeout_action = _TIMEOUT_TERM
609 utils_wrapper.IgnoreProcessNotFound(os.kill, child.pid, signal.SIGTERM)
612 lt = linger_timeout()
613 _WaitForProcess(child, lt)
615 # Okay, still alive after timeout and linger timeout? Kill it!
616 if child.poll() is None:
617 timeout_action = _TIMEOUT_KILL
618 logging.warning(msg_linger)
619 utils_wrapper.IgnoreProcessNotFound(os.kill, child.pid, signal.SIGKILL)
624 status = child.wait()
625 return out, err, status, timeout_action
628 def _RunCmdFile(cmd, env, via_shell, output, cwd, noclose_fds):
629 """Run a command and save its output to a file.
631 @type cmd: string or list
632 @param cmd: Command to run
634 @param env: The environment to use
635 @type via_shell: bool
636 @param via_shell: if we should run via the shell
638 @param output: the filename in which to save the output
640 @param cwd: the working directory for the program
641 @type noclose_fds: list
642 @param noclose_fds: list of additional (fd >=3) file descriptors to leave
643 open for the child process
645 @return: the exit status
648 fh = open(output, "a")
651 preexec_fn = lambda: CloseFDs(noclose_fds + [fh.fileno()])
658 child = subprocess.Popen(cmd, shell=via_shell,
659 stderr=subprocess.STDOUT,
661 stdin=subprocess.PIPE,
662 close_fds=close_fds, env=env,
664 preexec_fn=preexec_fn)
667 status = child.wait()
673 def RunParts(dir_name, env=None, reset_env=False):
674 """Run Scripts or programs in a directory
676 @type dir_name: string
677 @param dir_name: absolute path to a directory
679 @param env: The environment to use
680 @type reset_env: boolean
681 @param reset_env: whether to reset or keep the default os environment
682 @rtype: list of tuples
683 @return: list of (name, (one of RUNDIR_STATUS), RunResult)
689 dir_contents = utils_io.ListVisibleFiles(dir_name)
691 logging.warning("RunParts: skipping %s (cannot list: %s)", dir_name, err)
694 for relname in sorted(dir_contents):
695 fname = utils_io.PathJoin(dir_name, relname)
696 if not (os.path.isfile(fname) and os.access(fname, os.X_OK) and
697 constants.EXT_PLUGIN_MASK.match(relname) is not None):
698 rr.append((relname, constants.RUNPARTS_SKIP, None))
701 result = RunCmd([fname], env=env, reset_env=reset_env)
702 except Exception, err: # pylint: disable=W0703
703 rr.append((relname, constants.RUNPARTS_ERR, str(err)))
705 rr.append((relname, constants.RUNPARTS_RUN, result))
def _GetProcStatusPath(pid):
    """Returns the path for a PID's proc status file.

    @type pid: int
    @param pid: Process ID
    @rtype: string
    @return: Path of the form C{/proc/$pid/status}

    """
    return "/proc/%d/status" % pid
721 def IsProcessAlive(pid):
722 """Check if a given pid exists on the system.
724 @note: zombie status is not handled, so zombie processes
725 will be returned as alive
727 @param pid: the process ID to check
729 @return: True if the process exists
736 except EnvironmentError, err:
737 if err.errno in (errno.ENOENT, errno.ENOTDIR):
739 elif err.errno == errno.EINVAL:
740 raise utils_retry.RetryAgain(err)
743 assert isinstance(pid, int), "pid must be an integer"
747 # /proc in a multiprocessor environment can have strange behaviors.
748 # Retry the os.stat a few times until we get a good result.
750 return utils_retry.Retry(_TryStat, (0.01, 1.5, 0.1), 0.5,
751 args=[_GetProcStatusPath(pid)])
752 except utils_retry.RetryTimeout, err:
756 def _ParseSigsetT(sigset):
757 """Parse a rendered sigset_t value.
759 This is the opposite of the Linux kernel's fs/proc/array.c:render_sigset_t
763 @param sigset: Rendered signal set from /proc/$pid/status
765 @return: Set of all enabled signal numbers
771 for ch in reversed(sigset):
774 # The following could be done in a loop, but it's easier to read and
775 # understand in the unrolled form
777 result.add(signum + 1)
779 result.add(signum + 2)
781 result.add(signum + 3)
783 result.add(signum + 4)
def _GetProcStatusField(pstatus, field):
    """Retrieves a field from the contents of a proc status file.

    Every line is split at its first colon; the text in front of the colon
    is compared verbatim with C{field} and the text behind it is returned
    with surrounding whitespace removed.

    @type pstatus: string
    @param pstatus: Contents of /proc/$pid/status
    @type field: string
    @param field: Name of field whose value should be returned
    @rtype: string or None
    @return: Value of the first line carrying the requested field, None if
        no line matches

    """
    for line in pstatus.splitlines():
        parts = line.split(":", 1)

        # Lines without a colon and lines for other fields are of no interest
        if len(parts) < 2 or parts[0] != field:
            continue

        return parts[1].strip()

    return None
811 def IsProcessHandlingSignal(pid, signum, status_path=None):
812 """Checks whether a process is handling a signal.
815 @param pid: Process ID
817 @param signum: Signal number
821 if status_path is None:
822 status_path = _GetProcStatusPath(pid)
825 proc_status = utils_io.ReadFile(status_path)
826 except EnvironmentError, err:
827 # In at least one case, reading /proc/$pid/status failed with ESRCH.
828 if err.errno in (errno.ENOENT, errno.ENOTDIR, errno.EINVAL, errno.ESRCH):
832 sigcgt = _GetProcStatusField(proc_status, "SigCgt")
834 raise RuntimeError("%s is missing 'SigCgt' field" % status_path)
836 # Now check whether signal is handled
837 return signum in _ParseSigsetT(sigcgt)
840 def Daemonize(logfile):
841 """Daemonize the current process.
843 This detaches the current process from the controlling terminal and
844 runs it in the background as a daemon.
847 @param logfile: the logfile to which we should redirect stdout/stderr
848 @rtype: tuple; (int, callable)
849 @return: File descriptor of pipe(2) which must be closed to notify parent
850 process and a callable to reopen log files
853 # pylint: disable=W0212
854 # yes, we really want os._exit
856 # TODO: do another attempt to merge Daemonize and StartDaemon, or at
857 # least abstract the pipe functionality between them
859 # Create pipe for sending error messages
860 (rpipe, wpipe) = os.pipe()
864 if (pid == 0): # The first child.
868 pid = os.fork() # Fork a second child.
869 if (pid == 0): # The second child.
870 utils_wrapper.CloseFdNoError(rpipe)
872 # exit() or _exit()? See below.
873 os._exit(0) # Exit parent (the first child) of the second child.
875 utils_wrapper.CloseFdNoError(wpipe)
876 # Wait for daemon to be started (or an error message to
877 # arrive) and read up to 100 KB as an error message
878 errormsg = utils_wrapper.RetryOnSignal(os.read, rpipe, 100 * 1024)
880 sys.stderr.write("Error when starting daemon process: %r\n" % errormsg)
884 os._exit(rcode) # Exit parent of the first child.
886 reopen_fn = compat.partial(SetupDaemonFDs, logfile, None)
888 # Open logs for the first time
891 return (wpipe, reopen_fn)
894 def KillProcess(pid, signal_=signal.SIGTERM, timeout=30,
896 """Kill a process given by its pid.
899 @param pid: The PID to terminate.
901 @param signal_: The signal to send, by default SIGTERM
903 @param timeout: The timeout after which, if the process is still alive,
904 a SIGKILL will be sent. If not positive, no such checking
906 @type waitpid: boolean
907 @param waitpid: If true, we should waitpid on this process after
908 sending signals, since it's our own child and otherwise it
909 would remain as zombie
912 def _helper(pid, signal_, wait):
913 """Simple helper to encapsulate the kill/waitpid sequence"""
914 if utils_wrapper.IgnoreProcessNotFound(os.kill, pid, signal_) and wait:
916 os.waitpid(pid, os.WNOHANG)
921 # kill with pid=0 == suicide
922 raise errors.ProgrammerError("Invalid pid given '%s'" % pid)
924 if not IsProcessAlive(pid):
927 _helper(pid, signal_, waitpid)
933 if not IsProcessAlive(pid):
937 (result_pid, _) = os.waitpid(pid, os.WNOHANG)
939 raise utils_retry.RetryAgain()
944 raise utils_retry.RetryAgain()
947 # Wait up to $timeout seconds
948 utils_retry.Retry(_CheckProcess, (0.01, 1.5, 0.1), timeout)
949 except utils_retry.RetryTimeout:
952 if IsProcessAlive(pid):
953 # Kill process if it's still alive
954 _helper(pid, signal.SIGKILL, waitpid)
957 def RunInSeparateProcess(fn, *args):
958 """Runs a function in a separate process.
960 Note: Only boolean return values are supported.
963 @param fn: Function to be called
965 @return: Function's result
972 # In case the function uses temporary files
973 utils_wrapper.ResetTempfileModule()
976 result = int(bool(fn(*args)))
977 assert result in (0, 1)
978 except: # pylint: disable=W0702
979 logging.exception("Error while calling function in separate process")
980 # 0 and 1 are reserved for the return value
983 os._exit(result) # pylint: disable=W0212
987 # Avoid zombies and check exit code
988 (_, status) = os.waitpid(pid, 0)
990 if os.WIFSIGNALED(status):
992 signum = os.WTERMSIG(status)
994 exitcode = os.WEXITSTATUS(status)
997 if not (exitcode in (0, 1) and signum is None):
998 raise errors.GenericError("Child program failed (code=%s, signal=%s)" %
1001 return bool(exitcode)
1004 def CloseFDs(noclose_fds=None):
1005 """Close file descriptors.
1007 This closes all file descriptors above 2 (i.e. except
1010 @type noclose_fds: list or None
1011 @param noclose_fds: if given, it denotes a list of file descriptors
1012 that should not be closed
1015 # Default maximum for the number of available file descriptors.
1016 if 'SC_OPEN_MAX' in os.sysconf_names:
1018 MAXFD = os.sysconf('SC_OPEN_MAX')
1026 maxfd = resource.getrlimit(resource.RLIMIT_NOFILE)[1]
1027 if (maxfd == resource.RLIM_INFINITY):
1030 # Iterate through and close all file descriptors (except the standard ones)
1031 for fd in range(3, maxfd):
1032 if noclose_fds and fd in noclose_fds:
1034 utils_wrapper.CloseFdNoError(fd)