X-Git-Url: https://code.grnet.gr/git/ganeti-local/blobdiff_plain/7e3dbb94450bc3f5ace499f4f74a22fb5ae93890..3956cee12cc47c8c5c63411a2a0b3532841f920c:/lib/utils.py diff --git a/lib/utils.py b/lib/utils.py index 73d1bbb..f263637 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -36,14 +36,25 @@ import shutil import errno import pwd import itertools +import select +import fcntl +import resource +import logging +import signal + +from cStringIO import StringIO -from ganeti import logger from ganeti import errors +from ganeti import constants _locksheld = [] _re_shell_unquoted = re.compile('^[-.,=:/_+@A-Za-z0-9]+$') +debug = False +no_fork = False + + class RunResult(object): """Simple class for holding the result of running external programs. @@ -63,13 +74,13 @@ class RunResult(object): "failed", "fail_reason", "cmd"] - def __init__(self, exit_code, signal, stdout, stderr, cmd): + def __init__(self, exit_code, signal_, stdout, stderr, cmd): self.cmd = cmd self.exit_code = exit_code - self.signal = signal + self.signal = signal_ self.stdout = stdout self.stderr = stderr - self.failed = (signal is not None or exit_code != 0) + self.failed = (signal_ is not None or exit_code != 0) if self.signal is not None: self.fail_reason = "terminated by signal %s" % self.signal @@ -78,6 +89,10 @@ class RunResult(object): else: self.fail_reason = "unable to determine termination reason" + if self.failed: + logging.debug("Command '%s' failed (%s); output: %s", + self.cmd, self.fail_reason, self.output) + def _GetOutput(self): """Returns the combined stdout and stderr for easier usage. @@ -87,113 +102,6 @@ class RunResult(object): output = property(_GetOutput, None, None, "Return full output") -def _GetLockFile(subsystem): - """Compute the file name for a given lock name.""" - return "/var/lock/ganeti_lock_%s" % subsystem - - -def Lock(name, max_retries=None, debug=False): - """Lock a given subsystem. - - In case the lock is already held by an alive process, the function - will sleep indefintely and poll with a one second interval. - - When the optional integer argument 'max_retries' is passed with a - non-zero value, the function will sleep only for this number of - times, and then it will will raise a LockError if the lock can't be - acquired. Passing in a negative number will cause only one try to - get the lock. Passing a positive number will make the function retry - for approximately that number of seconds. - - """ - lockfile = _GetLockFile(name) - - if name in _locksheld: - raise errors.LockError('Lock "%s" already held!' % (name,)) - - errcount = 0 - - retries = 0 - while True: - try: - fd = os.open(lockfile, os.O_CREAT | os.O_EXCL | os.O_RDWR | os.O_SYNC) - break - except OSError, creat_err: - if creat_err.errno != errno.EEXIST: - raise errors.LockError("Can't create the lock file. Error '%s'." % - str(creat_err)) - - try: - pf = open(lockfile, 'r') - except IOError, open_err: - errcount += 1 - if errcount >= 5: - raise errors.LockError("Lock file exists but cannot be opened." - " Error: '%s'." % str(open_err)) - time.sleep(1) - continue - - try: - pid = int(pf.read()) - except ValueError: - raise errors.LockError("Invalid pid string in %s" % - (lockfile,)) - - if not IsProcessAlive(pid): - raise errors.LockError("Stale lockfile %s for pid %d?" % - (lockfile, pid)) - - if max_retries and max_retries <= retries: - raise errors.LockError("Can't acquire lock during the specified" - " time, aborting.") - if retries == 5 and (debug or sys.stdin.isatty()): - logger.ToStderr("Waiting for '%s' lock from pid %d..." % (name, pid)) - - time.sleep(1) - retries += 1 - continue - - os.write(fd, '%d\n' % (os.getpid(),)) - os.close(fd) - - _locksheld.append(name) - - -def Unlock(name): - """Unlock a given subsystem. - - """ - lockfile = _GetLockFile(name) - - try: - fd = os.open(lockfile, os.O_RDONLY) - except OSError: - raise errors.LockError('Lock "%s" not held.' % (name,)) - - f = os.fdopen(fd, 'r') - pid_str = f.read() - - try: - pid = int(pid_str) - except ValueError: - raise errors.LockError('Unable to determine PID of locking process.') - - if pid != os.getpid(): - raise errors.LockError('Lock not held by me (%d != %d)' % - (os.getpid(), pid,)) - - os.unlink(lockfile) - _locksheld.remove(name) - - -def LockCleanup(): - """Remove all locks. - - """ - for lock in _locksheld: - Unlock(lock) - - def RunCmd(cmd): """Execute a (shell) command. @@ -206,6 +114,9 @@ def RunCmd(cmd): Returns: `RunResult` instance """ + if no_fork: + raise errors.ProgrammerError("utils.RunCmd() called with fork() disabled") + if isinstance(cmd, list): cmd = [str(val) for val in cmd] strcmd = " ".join(cmd) @@ -213,8 +124,10 @@ def RunCmd(cmd): else: strcmd = cmd shell = True + logging.debug("RunCmd '%s'", strcmd) env = os.environ.copy() env["LC_ALL"] = "C" + poller = select.poll() child = subprocess.Popen(cmd, shell=shell, stderr=subprocess.PIPE, stdout=subprocess.PIPE, @@ -222,42 +135,45 @@ def RunCmd(cmd): close_fds=True, env=env) child.stdin.close() - out = child.stdout.read() - err = child.stderr.read() + poller.register(child.stdout, select.POLLIN) + poller.register(child.stderr, select.POLLIN) + out = StringIO() + err = StringIO() + fdmap = { + child.stdout.fileno(): (out, child.stdout), + child.stderr.fileno(): (err, child.stderr), + } + for fd in fdmap: + status = fcntl.fcntl(fd, fcntl.F_GETFL) + fcntl.fcntl(fd, fcntl.F_SETFL, status | os.O_NONBLOCK) + + while fdmap: + for fd, event in poller.poll(): + if event & select.POLLIN or event & select.POLLPRI: + data = fdmap[fd][1].read() + # no data from read signifies EOF (the same as POLLHUP) + if not data: + poller.unregister(fd) + del fdmap[fd] + continue + fdmap[fd][0].write(data) + if (event & select.POLLNVAL or event & select.POLLHUP or + event & select.POLLERR): + poller.unregister(fd) + del fdmap[fd] + + out = out.getvalue() + err = err.getvalue() status = child.wait() if status >= 0: exitcode = status - signal = None + signal_ = None else: exitcode = None - signal = -status - - return RunResult(exitcode, signal, out, err, strcmd) - - -def RunCmdUnlocked(cmd): - """Execute a shell command without the 'cmd' lock. - - This variant of `RunCmd()` drops the 'cmd' lock before running the - command and re-aquires it afterwards, thus it can be used to call - other ganeti commands. + signal_ = -status - The argument and return values are the same as for the `RunCmd()` - function. - - Args: - cmd - command to run. (str) - - Returns: - `RunResult` - - """ - Unlock('cmd') - ret = RunCmd(cmd) - Lock('cmd') - - return ret + return RunResult(exitcode, signal_, out, err, strcmd) def RemoveFile(filename): @@ -344,8 +260,7 @@ def CheckDict(target, template, logname=None): target[k] = template[k] if missing and logname: - logger.Debug('%s missing keys %s' % - (logname, ', '.join(missing))) + logging.warning('%s missing keys %s', logname, ', '.join(missing)) def IsProcessAlive(pid): @@ -353,9 +268,13 @@ def IsProcessAlive(pid): Returns: true or false, depending on if the pid exists or not - Remarks: zombie processes treated as not alive + Remarks: zombie processes treated as not alive, and giving a pid <= + 0 makes the function to return False. """ + if pid <= 0: + return False + try: f = open("/proc/%d/status" % pid) except IOError, err: @@ -375,6 +294,32 @@ def IsProcessAlive(pid): return alive +def ReadPidFile(pidfile): + """Read the pid from a file. + + @param pidfile: Path to a file containing the pid to be checked + @type pidfile: string (filename) + @return: The process id, if the file exista and contains a valid PID, + otherwise 0 + @rtype: int + + """ + try: + pf = open(pidfile, 'r') + except EnvironmentError, err: + if err.errno != errno.ENOENT: + logging.exception("Can't read pid file?!") + return 0 + + try: + pid = int(pf.read()) + except ValueError, err: + logging.info("Can't parse pid file contents", exc_info=True) + return 0 + + return pid + + def MatchNameComponent(key, name_list): """Try to match a name against a list. @@ -472,7 +417,7 @@ def ListVolumeGroups(): name, size = line.split() size = int(float(size)) except (IndexError, ValueError), err: - logger.Error("Invalid output from vgs (%s): %s" % (err, line)) + logging.error("Invalid output from vgs (%s): %s", err, line) continue retval[name] = size @@ -524,59 +469,6 @@ def NiceSort(name_list): return [tup[1] for tup in to_sort] -def CheckDaemonAlive(pid_file, process_string): - """Check wether the specified daemon is alive. - - Args: - - pid_file: file to read the daemon pid from, the file is - expected to contain only a single line containing - only the PID - - process_string: a substring that we expect to find in - the command line of the daemon process - - Returns: - - True if the daemon is judged to be alive (that is: - - the PID file exists, is readable and contains a number - - a process of the specified PID is running - - that process contains the specified string in its - command line - - the process is not in state Z (zombie)) - - False otherwise - - """ - try: - pid_file = file(pid_file, 'r') - try: - pid = int(pid_file.readline()) - finally: - pid_file.close() - - cmdline_file_path = "/proc/%s/cmdline" % (pid) - cmdline_file = open(cmdline_file_path, 'r') - try: - cmdline = cmdline_file.readline() - finally: - cmdline_file.close() - - if not process_string in cmdline: - return False - - stat_file_path = "/proc/%s/stat" % (pid) - stat_file = open(stat_file_path, 'r') - try: - process_state = stat_file.readline().split()[2] - finally: - stat_file.close() - - if process_state == 'Z': - return False - - except (IndexError, IOError, ValueError): - return False - - return True - - def TryConvert(fn, val): """Try to convert a value ignoring errors. @@ -756,6 +648,9 @@ def SetEtcHostsEntry(file_name, ip, hostname, aliases): """Sets the name of an IP address and hostname in /etc/hosts. """ + # Ensure aliases are unique + aliases = UniqueSequence([hostname] + aliases)[1:] + fd, tmpname = tempfile.mkstemp(dir=os.path.dirname(file_name)) try: out = os.fdopen(fd, 'w') @@ -786,6 +681,14 @@ def SetEtcHostsEntry(file_name, ip, hostname, aliases): raise +def AddHostToEtcHosts(hostname): + """Wrapper around SetEtcHostsEntry. + + """ + hi = HostInfo(name=hostname) + SetEtcHostsEntry(constants.ETC_HOSTS, hi.ip, hi.name, [hi.ShortName()]) + + def RemoveEtcHostsEntry(file_name, hostname): """Removes a hostname from /etc/hosts. @@ -822,6 +725,15 @@ def RemoveEtcHostsEntry(file_name, hostname): raise +def RemoveHostFromEtcHosts(hostname): + """Wrapper around RemoveEtcHostsEntry. + + """ + hi = HostInfo(name=hostname) + RemoveEtcHostsEntry(constants.ETC_HOSTS, hi.name) + RemoveEtcHostsEntry(constants.ETC_HOSTS, hi.ShortName()) + + def CreateBackup(file_name): """Creates a backup of a file. @@ -866,25 +778,29 @@ def ShellQuoteArgs(args): return ' '.join([ShellQuote(i) for i in args]) - -def TcpPing(source, target, port, timeout=10, live_port_needed=False): +def TcpPing(target, port, timeout=10, live_port_needed=False, source=None): """Simple ping implementation using TCP connect(2). - Try to do a TCP connect(2) from the specified source IP to the specified - target IP and the specified target port. If live_port_needed is set to true, - requires the remote end to accept the connection. The timeout is specified - in seconds and defaults to 10 seconds + Try to do a TCP connect(2) from an optional source IP to the + specified target IP and the specified target port. If the optional + parameter live_port_needed is set to true, requires the remote end + to accept the connection. The timeout is specified in seconds and + defaults to 10 seconds. If the source optional argument is not + passed, the source address selection is left to the kernel, + otherwise we try to connect using the passed address (failures to + bind other than EADDRNOTAVAIL will be ignored). """ sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sucess = False - try: - sock.bind((source, 0)) - except socket.error, (errcode, errstring): - if errcode == errno.EADDRNOTAVAIL: - success = False + if source is not None: + try: + sock.bind((source, 0)) + except socket.error, (errcode, errstring): + if errcode == errno.EADDRNOTAVAIL: + success = False sock.settimeout(timeout) @@ -904,7 +820,9 @@ def ListVisibleFiles(path): """Returns a list of all visible files in a directory. """ - return [i for i in os.listdir(path) if not i.startswith(".")] + files = [i for i in os.listdir(path) if not i.startswith(".")] + files.sort() + return files def GetHomeDir(user, default=None): @@ -941,7 +859,9 @@ def NewUUID(): def WriteFile(file_name, fn=None, data=None, mode=None, uid=-1, gid=-1, - atime=None, mtime=None): + atime=None, mtime=None, close=True, + dry_run=False, backup=False, + prewrite=None, postwrite=None): """(Over)write a file atomically. The file_name and either fn (a function taking one argument, the @@ -955,6 +875,22 @@ def WriteFile(file_name, fn=None, data=None, exception, an existing target file should be unmodified and the temporary file should be removed. + Args: + file_name: New filename + fn: Content writing function, called with file descriptor as parameter + data: Content as string + mode: File mode + uid: Owner + gid: Group + atime: Access time + mtime: Modification time + close: Whether to close file after writing it + prewrite: Function object called before writing content + postwrite: Function object called after writing content + + Returns: + None if "close" parameter evaluates to True, otherwise file descriptor. + """ if not os.path.isabs(file_name): raise errors.ProgrammerError("Path passed to WriteFile is not" @@ -967,6 +903,8 @@ def WriteFile(file_name, fn=None, data=None, raise errors.ProgrammerError("Both atime and mtime must be either" " set or None") + if backup and not dry_run and os.path.isfile(file_name): + CreateBackup(file_name) dir_name, base_name = os.path.split(file_name) fd, new_name = tempfile.mkstemp('.new', base_name, dir_name) @@ -977,18 +915,50 @@ def WriteFile(file_name, fn=None, data=None, os.chown(new_name, uid, gid) if mode: os.chmod(new_name, mode) + if callable(prewrite): + prewrite(fd) if data is not None: os.write(fd, data) else: fn(fd) + if callable(postwrite): + postwrite(fd) os.fsync(fd) if atime is not None and mtime is not None: os.utime(new_name, (atime, mtime)) - os.rename(new_name, file_name) + if not dry_run: + os.rename(new_name, file_name) finally: - os.close(fd) + if close: + os.close(fd) + result = None + else: + result = fd RemoveFile(new_name) + return result + + +def FirstFree(seq, base=0): + """Returns the first non-existing integer from seq. + + The seq argument should be a sorted list of positive integers. The + first time the index of an element is smaller than the element + value, the index will be returned. + + The base argument is used to start at a different offset, + i.e. [3, 4, 6] with offset=3 will return 5. + + Example: [0, 1, 3] will return 2. + + """ + for idx, elem in enumerate(seq): + assert elem >= base, "Passed element is higher than base offset" + if elem > idx + base: + # idx is not used + return idx + base + return None + def all(seq, pred=bool): "Returns True if pred(x) is True for every element in the iterable" @@ -1002,3 +972,345 @@ def any(seq, pred=bool): for elem in itertools.ifilter(pred, seq): return True return False + + +def UniqueSequence(seq): + """Returns a list with unique elements. + + Element order is preserved. + """ + seen = set() + return [i for i in seq if i not in seen and not seen.add(i)] + + +def IsValidMac(mac): + """Predicate to check if a MAC address is valid. + + Checks wether the supplied MAC address is formally correct, only + accepts colon separated format. + """ + mac_check = re.compile("^([0-9a-f]{2}(:|$)){6}$") + return mac_check.match(mac) is not None + + +def TestDelay(duration): + """Sleep for a fixed amount of time. + + """ + if duration < 0: + return False + time.sleep(duration) + return True + + +def Daemonize(logfile, noclose_fds=None): + """Daemonize the current process. + + This detaches the current process from the controlling terminal and + runs it in the background as a daemon. + + """ + UMASK = 077 + WORKDIR = "/" + # Default maximum for the number of available file descriptors. + if 'SC_OPEN_MAX' in os.sysconf_names: + try: + MAXFD = os.sysconf('SC_OPEN_MAX') + if MAXFD < 0: + MAXFD = 1024 + except OSError: + MAXFD = 1024 + else: + MAXFD = 1024 + + # this might fail + pid = os.fork() + if (pid == 0): # The first child. + os.setsid() + # this might fail + pid = os.fork() # Fork a second child. + if (pid == 0): # The second child. + os.chdir(WORKDIR) + os.umask(UMASK) + else: + # exit() or _exit()? See below. + os._exit(0) # Exit parent (the first child) of the second child. + else: + os._exit(0) # Exit parent of the first child. + maxfd = resource.getrlimit(resource.RLIMIT_NOFILE)[1] + if (maxfd == resource.RLIM_INFINITY): + maxfd = MAXFD + + # Iterate through and close all file descriptors. + for fd in range(0, maxfd): + if noclose_fds and fd in noclose_fds: + continue + try: + os.close(fd) + except OSError: # ERROR, fd wasn't open to begin with (ignored) + pass + os.open(logfile, os.O_RDWR|os.O_CREAT|os.O_APPEND, 0600) + # Duplicate standard input to standard output and standard error. + os.dup2(0, 1) # standard output (1) + os.dup2(0, 2) # standard error (2) + return 0 + + +def DaemonPidFileName(name): + """Compute a ganeti pid file absolute path, given the daemon name. + + """ + return os.path.join(constants.RUN_GANETI_DIR, "%s.pid" % name) + + +def WritePidFile(name): + """Write the current process pidfile. + + The file will be written to constants.RUN_GANETI_DIR/name.pid + + """ + pid = os.getpid() + pidfilename = DaemonPidFileName(name) + if IsProcessAlive(ReadPidFile(pidfilename)): + raise errors.GenericError("%s contains a live process" % pidfilename) + + WriteFile(pidfilename, data="%d\n" % pid) + + +def RemovePidFile(name): + """Remove the current process pidfile. + + Any errors are ignored. + + """ + pid = os.getpid() + pidfilename = DaemonPidFileName(name) + # TODO: we could check here that the file contains our pid + try: + RemoveFile(pidfilename) + except: + pass + + +def KillProcess(pid, signal_=signal.SIGTERM, timeout=30): + """Kill a process given by its pid. + + @type pid: int + @param pid: The PID to terminate. + @type signal_: int + @param signal_: The signal to send, by default SIGTERM + @type timeout: int + @param timeout: The timeout after which, if the process is still alive, + a SIGKILL will be sent. If not positive, no such checking + will be done + + """ + if pid <= 0: + # kill with pid=0 == suicide + raise errors.ProgrammerError("Invalid pid given '%s'" % pid) + + if not IsProcessAlive(pid): + return + os.kill(pid, signal_) + if timeout <= 0: + return + end = time.time() + timeout + while time.time() < end and IsProcessAlive(pid): + time.sleep(0.1) + if IsProcessAlive(pid): + os.kill(pid, signal.SIGKILL) + + +def FindFile(name, search_path, test=os.path.exists): + """Look for a filesystem object in a given path. + + This is an abstract method to search for filesystem object (files, + dirs) under a given search path. + + Args: + - name: the name to look for + - search_path: list of directory names + - test: the test which the full path must satisfy + (defaults to os.path.exists) + + Returns: + - full path to the item if found + - None otherwise + + """ + for dir_name in search_path: + item_name = os.path.sep.join([dir_name, name]) + if test(item_name): + return item_name + return None + + +def CheckVolumeGroupSize(vglist, vgname, minsize): + """Checks if the volume group list is valid. + + A non-None return value means there's an error, and the return value + is the error message. + + """ + vgsize = vglist.get(vgname, None) + if vgsize is None: + return "volume group '%s' missing" % vgname + elif vgsize < minsize: + return ("volume group '%s' too small (%s MiB required, %d MiB found)" % + (vgname, minsize, vgsize)) + return None + + +def LockedMethod(fn): + """Synchronized object access decorator. + + This decorator is intended to protect access to an object using the + object's own lock which is hardcoded to '_lock'. + + """ + def wrapper(self, *args, **kwargs): + assert hasattr(self, '_lock') + lock = self._lock + lock.acquire() + try: + result = fn(self, *args, **kwargs) + finally: + lock.release() + return result + return wrapper + + +def LockFile(fd): + """Locks a file using POSIX locks. + + """ + try: + fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB) + except IOError, err: + if err.errno == errno.EAGAIN: + raise errors.LockError("File already locked") + raise + + +class FileLock(object): + """Utility class for file locks. + + """ + def __init__(self, filename): + self.filename = filename + self.fd = open(self.filename, "w") + + def __del__(self): + self.Close() + + def Close(self): + if self.fd: + self.fd.close() + self.fd = None + + def _flock(self, flag, blocking, errmsg): + assert self.fd, "Lock was closed" + + if not blocking: + flag |= fcntl.LOCK_NB + + try: + fcntl.flock(self.fd, flag) + except IOError, err: + logging.exception("fcntl.flock failed") + if err.errno in (errno.EAGAIN, ): + raise errors.LockError(errmsg) + raise + + def Exclusive(self, blocking=False): + """Locks the file in exclusive mode. + + """ + self._flock(fcntl.LOCK_EX, blocking, + "Failed to lock %s in exclusive mode" % self.filename) + + def Shared(self, blocking=False): + """Locks the file in shared mode. + + """ + self._flock(fcntl.LOCK_SH, blocking, + "Failed to lock %s in shared mode" % self.filename) + + def Unlock(self, blocking=True): + """Unlocks the file. + + According to "man flock", unlocking can also be a nonblocking operation: + "To make a non-blocking request, include LOCK_NB with any of the above + operations" + + """ + self._flock(fcntl.LOCK_UN, blocking, + "Failed to unlock %s" % self.filename) + + +class SignalHandler(object): + """Generic signal handler class. + + It automatically restores the original handler when deconstructed or when + Reset() is called. You can either pass your own handler function in or query + the "called" attribute to detect whether the signal was sent. + + """ + def __init__(self, signum): + """Constructs a new SignalHandler instance. + + @param signum: Single signal number or set of signal numbers + + """ + if isinstance(signum, (int, long)): + self.signum = set([signum]) + else: + self.signum = set(signum) + + self.called = False + + self._previous = {} + try: + for signum in self.signum: + # Setup handler + prev_handler = signal.signal(signum, self._HandleSignal) + try: + self._previous[signum] = prev_handler + except: + # Restore previous handler + signal.signal(signum, prev_handler) + raise + except: + # Reset all handlers + self.Reset() + # Here we have a race condition: a handler may have already been called, + # but there's not much we can do about it at this point. + raise + + def __del__(self): + self.Reset() + + def Reset(self): + """Restore previous handler. + + """ + for signum, prev_handler in self._previous.items(): + signal.signal(signum, prev_handler) + # If successful, remove from dict + del self._previous[signum] + + def Clear(self): + """Unsets "called" flag. + + This function can be used in case a signal may arrive several times. + + """ + self.called = False + + def _HandleSignal(self, signum, frame): + """Actual signal handling function. + + """ + # This is not nice and not absolutely atomic, but it appears to be the only + # solution in Python -- there are no atomic types. + self.called = True