4 # Copyright (C) 2006, 2007, 2010 Google Inc.
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 """Run an executable on a list of hosts.
23 Script to run an executable on a list of hosts (in batches) via ssh
24 with password auth as root. If the provided log dir does not yet
25 exist, it will try to create it.
28 - the main process spawns up to batch_size children, which:
29 - connects to the remote host via ssh as root
30 - uploads the executable with a random name to /tmp via sftp
32 - via ssh: chdirs into the upload directory and runs the script
34 - writes status messages and all output to one logfile per host
35 - the main process then gathers the status of the children and
36 reports the success/failure ratio
37 - entire script can be aborted with Ctrl-C
39 Security considerations:
40 - the root password for the remote hosts is stored in memory for the
42 - the executable to be run on the remote host is handled the following way:
43 - try to create a random directory with permissions 700 on the
44 remote host, abort further processing on this host if this fails
45 - upload the executable to a random filename in that directory
46 - set executable permissions to 500
48 - delete the executable and the directory on the remote host
52 # pylint: disable-msg=C0103
53 # C0103: Invalid name ganeti-listrunner
70 REMOTE_PATH_BASE = "/tmp/listrunner"
73 def LogDirUseable(logdir):
74 """Ensure log file directory is available and usable."""
75 testfile = "%s/test-%s-%s.deleteme" % (logdir, random.random(),
80 if err.errno != errno.EEXIST:
83 logtest = open(testfile, "aw")
84 logtest.writelines("log file writeability test\n")
88 except (OSError, IOError):
92 def ShowHelp(executable):
93 """Print short usage information."""
94 print ("usage: %s -l logdir [-c|-x] value [-b batch_size]"
95 " [-f hostfile|-h hosts] [-u username]"
96 " [-p password_file]" % executable)
97 print """ -l logdir to write logfiles to
98 -x executable to run on remote host(s)
99 -c shell command to run on remote host(s)
100 -f hostlist file (one host per line)
101 -a optional auxiliary file to upload (can be given multiple times)
102 -b batch-size, how many hosts to process in parallel [15]
103 -h comma-separated list of hosts or single hostname
104 -u username used to connect [root]
105 -p password used to authenticate"""
108 def GetTimeStamp(timestamp=None):
109 """Return ISO8601 timestamp.
111 Returns ISO8601 timestamp, optionally expects a time.localtime() tuple
112 in timestamp, but will use the current time if this argument is not
115 if timestamp is None:
116 timestamp = time.localtime()
118 isotime = time.strftime("%Y-%m-%dT%H:%M:%S", timestamp)
122 def PingByTcp(target, port, timeout=10, live_port_needed=False, source=None):
123 """Simple ping implementation using TCP connect(2).
125 Try to do a TCP connect(2) from an optional source IP to the
126 specified target IP and the specified target port. If the optional
127 parameter live_port_needed is set to true, requires the remote end
128 to accept the connection. The timeout is specified in seconds and
129 defaults to 10 seconds. If the source optional argument is not
130 passed, the source address selection is left to the kernel,
131 otherwise we try to connect using the passed address (failures to
132 bind other than EADDRNOTAVAIL will be ignored).
135 sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
139 if source is not None:
141 sock.bind((source, 0))
142 except socket.error, (errcode):
143 if errcode == errno.EADDRNOTAVAIL:
146 sock.settimeout(timeout)
149 sock.connect((target, port))
152 except socket.timeout:
154 except socket.error, (errcode):
155 success = (not live_port_needed) and (errcode == errno.ECONNREFUSED)
160 def GetHosts(hostsfile):
161 """Return list of hosts from hostfile.
163 Reads the hostslist file and returns a list of hosts.
164 Expects the hostslist file to contain one hostname per line.
168 datafile = open(hostsfile, "r")
170 print "Failed to open hosts file %s: %s" % (hostsfile, msg)
173 hosts = datafile.readlines()
179 def WriteLog(message, logfile):
180 """Writes message, terminated by newline, to logfile."""
182 logfile = open(logfile, "aw")
184 print "failed to open log file %s: %s" % (logfile, msg)
185 print "log message was: %s" % message
186 sys.exit(1) # no being able to log is critical
188 timestamp = GetTimeStamp()
189 logfile.writelines("%s %s\n" % (timestamp, message))
192 print "failed to write to logfile %s: %s" % (logfile, msg)
193 print "log message was: %s" % message
194 sys.exit(1) # no being able to log is critical
198 """Tries to get a list of ssh keys from an agent."""
200 agent = paramiko.Agent()
201 return list(agent.get_keys())
202 except paramiko.SSHException:
206 def SetupSshConnection(host, username, password, keys, logfile):
207 """Setup the ssh connection used for all later steps.
209 This function sets up the ssh connection that will be used both
210 for upload and remote command execution.
212 On success, it will return paramiko.Transport object with an
213 already logged in session. On failure, False will be returned.
216 # check if target is willing to talk to us at all
217 if not PingByTcp(host, 22, live_port_needed=True):
218 WriteLog("ERROR: FAILURE_NOT_REACHABLE", logfile)
219 print " - ERROR: host not reachable on 22/tcp"
222 all_kwargs = [{"pkey": k} for k in keys]
223 all_desc = ["key %d" % d for d in range(len(keys))]
224 if password is not None:
225 all_kwargs.append({"password": password})
226 all_desc.append("password")
228 # deal with logging out of paramiko.transport
231 for desc, kwargs in zip(all_desc, all_kwargs):
233 transport = paramiko.Transport((host, 22))
235 # only try to setup the logging handler once
237 handler = logging.StreamHandler()
238 handler.setLevel(logging.ERROR)
239 log = logging.getLogger(transport.get_log_channel())
240 log.addHandler(handler)
242 transport.connect(username=username, **kwargs) # pylint: disable-msg=W0142
243 WriteLog("ssh connection established using %s" % desc, logfile)
244 # strange ... when establishing the session and then immediately
245 # setting up the channels for sftp & shell from that, it sometimes
246 # fails, but waiting 1 second after session setup makes it always work
248 # FIXME: apparently necessary to give sshd some time
250 except (socket.gaierror, socket.error, paramiko.SSHException):
253 methods = ", ".join(all_desc)
254 WriteLog("ERROR: FAILURE_CONNECTION_SETUP (tried %s) " % methods, logfile)
255 WriteLog("aborted", logfile)
256 print " - ERROR: connection setup failed (tried %s)" % methods
261 def UploadFiles(connection, executable, filelist, logfile):
262 """Uploads the specified files via sftp.
264 Uploads the specified files to a random, freshly created directory with
265 a temporary name under /tmp. All uploaded files are chmod 0400 after upload
266 with the exception of the executable, which is chmod 0500.
268 Upon success, returns the absolute path to the remote upload directory,
269 but will return False upon failure.
271 remote_dir = "%s.%s-%s" % (REMOTE_PATH_BASE,
272 random.random(), random.random())
275 sftp = paramiko.SFTPClient.from_transport(connection)
276 sftp.mkdir(remote_dir, mode=0700)
277 for item in filelist:
278 remote_file = "%s/%s" % (remote_dir, item.split("/").pop())
279 WriteLog("uploading %s to remote %s" % (item, remote_file), logfile)
280 sftp.put(item, remote_file)
281 if item == executable:
282 sftp.chmod(remote_file, 0500)
284 sftp.chmod(remote_file, 0400)
287 WriteLog("ERROR: FAILURE_UPLOAD: %s" % err, logfile)
293 def CleanupRemoteDir(connection, upload_dir, filelist, logfile):
294 """Cleans out and removes the remote work directory."""
296 sftp = paramiko.SFTPClient.from_transport(connection)
297 for item in filelist:
298 fullpath = "%s/%s" % (upload_dir, item.split("/").pop())
299 WriteLog("removing remote %s" % fullpath, logfile)
300 sftp.remove(fullpath)
301 sftp.rmdir(upload_dir)
304 WriteLog("ERROR: FAILURE_CLEANUP: %s" % err, logfile)
310 def RunRemoteCommand(connection, command, logfile):
311 """Execute the command via ssh on the remote host."""
312 session = connection.open_session()
313 session.setblocking(0)
315 # the following dance is needed because paramiko changed APIs:
316 # from returning True/False for success to always returning None
317 # and throwing an exception in case of problems.
318 # And I want to support both the old and the new API.
319 result = True # being optimistic here, I know
322 if session.exec_command("%s 2>&1" % command) is False:
324 except paramiko.SSHException, message:
328 WriteLog("ERROR: FAILURE_COMMAND_EXECUTION: %s" % message, logfile)
331 ### Read when data is available
333 while select.select([session], [], []):
334 data = session.recv(1024)
338 select.select([], [], [], .1)
340 WriteLog("SUCCESS: command output follows", logfile)
341 for line in output.split("\n"):
342 WriteLog("output = %s" %line, logfile)
343 WriteLog("command execution completed", logfile)
349 def HostWorker(logdir, username, password, keys, hostname,
350 executable, command, filelist):
353 This function does not return - it's the main code of the children,
354 which exit at the end of this function. The exit code 0 or 1 will be
355 interpreted by the parent.
357 @param logdir: the directory where the logfiles must be created
358 @param username: SSH username
359 @param password: SSH password
360 @param keys: SSH keys
361 @param hostname: the hostname to connect to
362 @param executable: the executable to upload, if not None
363 @param command: the command to run
364 @param filelist: auxiliary files to upload
367 # in the child/worker process
368 logfile = "%s/%s.log" % (logdir, hostname)
369 print "%s - starting" % hostname
370 result = 0 # optimism, I know
372 connection = SetupSshConnection(hostname, username,
373 password, keys, logfile)
374 if connection is not False:
375 if executable is not None:
376 print " %s: uploading files" % hostname
377 upload_dir = UploadFiles(connection, executable,
379 command = "cd %s && ./%s" % (upload_dir,
380 executable.split("/").pop())
381 print " %s: executing remote command" % hostname
382 cmd_result = RunRemoteCommand(connection, command, logfile)
383 if cmd_result is True:
384 print " %s: remote command execution successful" % hostname
386 print (" %s: remote command execution failed,"
387 " check log for details" % hostname)
389 if executable is not None:
390 print " %s: cleaning up remote work dir" % hostname
391 cln_result = CleanupRemoteDir(connection, upload_dir,
393 if cln_result is False:
394 print (" %s: remote work dir cleanup failed, check"
395 " log for details" % hostname)
399 print " %s: connection setup failed, skipping" % hostname
401 except KeyboardInterrupt:
402 print " %s: received KeyboardInterrupt, aborting" % hostname
403 WriteLog("ERROR: ABORT_KEYBOARD_INTERRUPT", logfile)
405 except Exception, err:
407 trace = traceback.format_exc()
408 msg = "ERROR: UNHANDLED_EXECPTION_ERROR: %s\nTrace: %s" % (err, trace)
409 WriteLog(msg, logfile)
410 print " %s: %s" % (hostname, msg)
411 # and exit with exit code 0 or 1, so the parent can compute statistics
415 def LaunchWorker(child_pids, logdir, username, password, keys, hostname,
416 executable, command, filelist):
417 """Launch the per-host worker.
419 Arguments are the same as for HostWorker, except for child_pids,
420 which is a dictionary holding the pid-to-hostname mapping.
423 hostname = hostname.rstrip("\n")
426 # the controller just records the pids
427 child_pids[pid] = hostname
429 HostWorker(logdir, username, password, keys, hostname,
430 executable, command, filelist)
436 optlist, _ = getopt.getopt(sys.argv[1:], "l:x:h:f:a:c:b:u:p:A")
437 except getopt.GetoptError, err:
439 ShowHelp(sys.argv[0])
442 logdir = executable = hostfile = hostlist = command = None
448 for option in optlist:
449 if option[0] == "-l":
451 if option[0] == "-x":
452 executable = option[1]
453 if option[0] == "-f":
455 if option[0] == "-h":
457 if option[0] == "-a":
458 auxfiles.append(option[1])
459 if option[0] == "-c":
461 if option[0] == "-b":
462 batch_size = int(option[1])
463 if option[0] == "-u":
465 if option[0] == "-p":
467 if option[0] == "-A":
470 if not (logdir and (executable or command) and (hostfile or hostlist)):
471 print "error: missing required commandline argument(s)"
472 ShowHelp(sys.argv[0])
475 if executable and command:
476 print "error: can run either a command or an executable, not both"
477 ShowHelp(sys.argv[0])
480 if hostlist and hostfile:
481 print "error: specify either -f or -h arguments, not both"
482 ShowHelp(sys.argv[0])
485 ### Unbuffered sys.stdout
486 sys.stdout = os.fdopen(1, "w", 0)
488 if LogDirUseable(logdir) is False:
489 print "ERROR: cannot create logfiles in dir %s, aborting" % logdir
494 keys = GetAgentKeys()
498 pwvalue = fh.readline().strip()
501 print "error: can not read in from password file %s: %s" % (password, e)
505 password = getpass.getpass("%s's password for all nodes: " % username)
508 hosts = GetHosts(hostfile)
511 hostlist = hostlist.rstrip(",") # commandline robustness
512 hosts = hostlist.split(",")
516 successes = failures = 0
518 filelist = auxfiles[:]
519 filelist.append(executable)
522 batch = hosts[:batch_size]
523 hosts = hosts[batch_size:]
525 for hostname in batch:
526 LaunchWorker(child_pids, logdir, username, password, keys, hostname,
527 executable, command, filelist)
530 pid, status = os.wait()
531 hostname = child_pids.pop(pid, "<unknown host>")
532 print " %s: done (in parent)" % hostname
533 if os.WIFEXITED(status) and os.WEXITSTATUS(status) == 0:
538 LaunchWorker(child_pids, logdir, username, password, keys,
539 hosts.pop(0), executable, command, filelist)
542 print "All done, %s successful and %s failed hosts" % (successes, failures)
547 if __name__ == "__main__":
550 except KeyboardInterrupt:
551 print "Received KeyboardInterrupt, aborting"