root / tools / ganeti-listrunner @ 99a11adc
History | View | Annotate | Download (17 kB)
1 |
#!/usr/bin/python |
---|---|
2 |
# |
3 |
|
4 |
# Copyright (C) 2006, 2007, 2010, 2011 Google Inc. |
5 |
# |
6 |
# This program is free software; you can redistribute it and/or modify |
7 |
# it under the terms of the GNU General Public License as published by |
8 |
# the Free Software Foundation; either version 2 of the License, or |
9 |
# (at your option) any later version. |
10 |
# |
11 |
# This program is distributed in the hope that it will be useful, but |
12 |
# WITHOUT ANY WARRANTY; without even the implied warranty of |
13 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 |
# General Public License for more details. |
15 |
# |
16 |
# You should have received a copy of the GNU General Public License |
17 |
# along with this program; if not, write to the Free Software |
18 |
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA |
19 |
# 02110-1301, USA. |
20 |
|
21 |
"""Run an executable on a list of hosts. |
22 |
|
23 |
Script to serially run an executable on a list of hosts via ssh |
24 |
with password auth as root. If the provided log dir does not yet |
25 |
exist, it will try to create it. |
26 |
|
27 |
Implementation: |
28 |
- the main process spawns up to batch_size children, which: |
29 |
- connects to the remote host via ssh as root |
30 |
- uploads the executable with a random name to /tmp via sftp |
31 |
- chmod 500s it |
32 |
- via ssh: chdirs into the upload directory and runs the script |
33 |
- deletes it |
34 |
- writes status messages and all output to one logfile per host |
35 |
- the main process gathers then the status of the children and |
36 |
reports the success/failure ratio |
37 |
- entire script can be aborted with Ctrl-C |
38 |
|
39 |
Security considerations: |
40 |
- the root password for the remote hosts is stored in memory for the |
41 |
runtime of the script |
42 |
- the executable to be run on the remote host is handled the following way: |
43 |
- try to create a random directory with permissions 700 on the |
44 |
remote host, abort furter processing on this host if this failes |
45 |
- upload the executable with to a random filename in that directory |
46 |
- set executable permissions to 500 |
47 |
- run the executable |
48 |
- delete the execuable and the directory on the remote host |
49 |
|
50 |
""" |
51 |
|
52 |
# pylint: disable-msg=C0103 |
53 |
# C0103: Invalid name ganeti-listrunner |
54 |
|
55 |
import errno |
56 |
import getopt |
57 |
import getpass |
58 |
import logging |
59 |
import os |
60 |
import random |
61 |
import select |
62 |
import socket |
63 |
import sys |
64 |
import time |
65 |
import traceback |
66 |
|
67 |
import paramiko |
68 |
|
69 |
|
70 |
REMOTE_PATH_BASE = "/tmp/listrunner" |
71 |
|
72 |
|
73 |
def LogDirUseable(logdir): |
74 |
"""Ensure log file directory is available and usable.""" |
75 |
testfile = "%s/test-%s-%s.deleteme" % (logdir, random.random(), |
76 |
random.random()) |
77 |
try: |
78 |
os.mkdir(logdir) |
79 |
except OSError, err: |
80 |
if err.errno != errno.EEXIST: |
81 |
raise |
82 |
try: |
83 |
logtest = open(testfile, "aw") |
84 |
logtest.writelines("log file writeability test\n") |
85 |
logtest.close() |
86 |
os.unlink(testfile) |
87 |
return True |
88 |
except (OSError, IOError): |
89 |
return False |
90 |
|
91 |
|
92 |
def ShowHelp(executable): |
93 |
"""Print short usage information.""" |
94 |
print ("usage: %s -l logdir [-c|-x] value [-b batch_size]" |
95 |
" [-f hostfile|-h hosts] [-u username]" |
96 |
" [-p password_file]" % executable) |
97 |
print """ -l logdir to write logfiles to |
98 |
-x executable to run on remote host(s) |
99 |
-c shell command to run on remote host(s) |
100 |
-f hostlist file (one host per line) |
101 |
-a optional auxiliary file to upload (can be given multiple times) |
102 |
-b batch-size, how many hosts to process in parallel [15] |
103 |
-h comma-separated list of hosts or single hostname |
104 |
-u username used to connect [root] |
105 |
-p password used to authenticate""" |
106 |
|
107 |
|
108 |
def GetTimeStamp(timestamp=None): |
109 |
"""Return ISO8601 timestamp. |
110 |
|
111 |
Returns ISO8601 timestamp, optionally expects a time.localtime() tuple |
112 |
in timestamp, but will use the current time if this argument is not |
113 |
supplied. |
114 |
""" |
115 |
if timestamp is None: |
116 |
timestamp = time.localtime() |
117 |
|
118 |
isotime = time.strftime("%Y-%m-%dT%H:%M:%S", timestamp) |
119 |
return isotime |
120 |
|
121 |
|
122 |
def PingByTcp(target, port, timeout=10, live_port_needed=False, source=None): |
123 |
"""Simple ping implementation using TCP connect(2). |
124 |
|
125 |
Try to do a TCP connect(2) from an optional source IP to the |
126 |
specified target IP and the specified target port. If the optional |
127 |
parameter live_port_needed is set to true, requires the remote end |
128 |
to accept the connection. The timeout is specified in seconds and |
129 |
defaults to 10 seconds. If the source optional argument is not |
130 |
passed, the source address selection is left to the kernel, |
131 |
otherwise we try to connect using the passed address (failures to |
132 |
bind other than EADDRNOTAVAIL will be ignored). |
133 |
|
134 |
""" |
135 |
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) |
136 |
|
137 |
success = False |
138 |
|
139 |
if source is not None: |
140 |
try: |
141 |
sock.bind((source, 0)) |
142 |
except socket.error, (errcode): |
143 |
if errcode == errno.EADDRNOTAVAIL: |
144 |
success = False |
145 |
|
146 |
sock.settimeout(timeout) |
147 |
|
148 |
try: |
149 |
sock.connect((target, port)) |
150 |
sock.close() |
151 |
success = True |
152 |
except socket.timeout: |
153 |
success = False |
154 |
except socket.error, (errcode): |
155 |
success = (not live_port_needed) and (errcode == errno.ECONNREFUSED) |
156 |
|
157 |
return success |
158 |
|
159 |
|
160 |
def GetHosts(hostsfile): |
161 |
"""Return list of hosts from hostfile. |
162 |
|
163 |
Reads the hostslist file and returns a list of hosts. |
164 |
Expects the hostslist file to contain one hostname per line. |
165 |
|
166 |
""" |
167 |
try: |
168 |
datafile = open(hostsfile, "r") |
169 |
except IOError, msg: |
170 |
print "Failed to open hosts file %s: %s" % (hostsfile, msg) |
171 |
sys.exit(2) |
172 |
|
173 |
hosts = datafile.readlines() |
174 |
datafile.close() |
175 |
|
176 |
return hosts |
177 |
|
178 |
|
179 |
def WriteLog(message, logfile): |
180 |
"""Writes message, terminated by newline, to logfile.""" |
181 |
try: |
182 |
logfile = open(logfile, "aw") |
183 |
except IOError, msg: |
184 |
print "failed to open log file %s: %s" % (logfile, msg) |
185 |
print "log message was: %s" % message |
186 |
sys.exit(1) # no being able to log is critical |
187 |
try: |
188 |
timestamp = GetTimeStamp() |
189 |
logfile.writelines("%s %s\n" % (timestamp, message)) |
190 |
logfile.close() |
191 |
except IOError, msg: |
192 |
print "failed to write to logfile %s: %s" % (logfile, msg) |
193 |
print "log message was: %s" % message |
194 |
sys.exit(1) # no being able to log is critical |
195 |
|
196 |
|
197 |
def GetAgentKeys(): |
198 |
"""Tries to get a list of ssh keys from an agent.""" |
199 |
try: |
200 |
agent = paramiko.Agent() |
201 |
return list(agent.get_keys()) |
202 |
except paramiko.SSHException: |
203 |
return [] |
204 |
|
205 |
|
206 |
def SetupSshConnection(host, username, password, use_agent, logfile): |
207 |
"""Setup the ssh connection used for all later steps. |
208 |
|
209 |
This function sets up the ssh connection that will be used both |
210 |
for upload and remote command execution. |
211 |
|
212 |
On success, it will return paramiko.Transport object with an |
213 |
already logged in session. On failure, False will be returned. |
214 |
|
215 |
""" |
216 |
# check if target is willing to talk to us at all |
217 |
if not PingByTcp(host, 22, live_port_needed=True): |
218 |
WriteLog("ERROR: FAILURE_NOT_REACHABLE", logfile) |
219 |
print " - ERROR: host not reachable on 22/tcp" |
220 |
return False |
221 |
|
222 |
if use_agent: |
223 |
keys = GetAgentKeys() |
224 |
else: |
225 |
keys = [] |
226 |
all_kwargs = [{"pkey": k} for k in keys] |
227 |
all_desc = ["key %d" % d for d in range(len(keys))] |
228 |
if password is not None: |
229 |
all_kwargs.append({"password": password}) |
230 |
all_desc.append("password") |
231 |
|
232 |
# deal with logging out of paramiko.transport |
233 |
handler = None |
234 |
|
235 |
for desc, kwargs in zip(all_desc, all_kwargs): |
236 |
try: |
237 |
transport = paramiko.Transport((host, 22)) |
238 |
|
239 |
# only try to setup the logging handler once |
240 |
if not handler: |
241 |
handler = logging.StreamHandler() |
242 |
handler.setLevel(logging.ERROR) |
243 |
log = logging.getLogger(transport.get_log_channel()) |
244 |
log.addHandler(handler) |
245 |
|
246 |
transport.connect(username=username, **kwargs) # pylint: disable-msg=W0142 |
247 |
WriteLog("ssh connection established using %s" % desc, logfile) |
248 |
# strange ... when establishing the session and the immediately |
249 |
# setting up the channels for sftp & shell from that, it sometimes |
250 |
# fails, but waiting 1 second after session setup makes it always work |
251 |
# time.sleep(1) |
252 |
# FIXME apparently needfull to give sshd some time |
253 |
return transport |
254 |
except (socket.gaierror, socket.error, paramiko.SSHException): |
255 |
continue |
256 |
|
257 |
methods = ", ".join(all_desc) |
258 |
WriteLog("ERROR: FAILURE_CONNECTION_SETUP (tried %s) " % methods, logfile) |
259 |
WriteLog("aborted", logfile) |
260 |
print " - ERROR: connection setup failed (tried %s)" % methods |
261 |
|
262 |
return False |
263 |
|
264 |
|
265 |
def UploadFiles(connection, executable, filelist, logfile): |
266 |
"""Uploads the specified files via sftp. |
267 |
|
268 |
Uploads the specified files to a random, freshly created directory with |
269 |
a temporary name under /tmp. All uploaded files are chmod 0400 after upload |
270 |
with the exception of executable, with is chmod 500. |
271 |
|
272 |
Upon success, returns the absolute path to the remote upload directory, |
273 |
but will return False upon failure. |
274 |
""" |
275 |
remote_dir = "%s.%s-%s" % (REMOTE_PATH_BASE, |
276 |
random.random(), random.random()) |
277 |
|
278 |
try: |
279 |
sftp = paramiko.SFTPClient.from_transport(connection) |
280 |
sftp.mkdir(remote_dir, mode=0700) |
281 |
for item in filelist: |
282 |
remote_file = "%s/%s" % (remote_dir, item.split("/").pop()) |
283 |
WriteLog("uploading %s to remote %s" % (item, remote_file), logfile) |
284 |
sftp.put(item, remote_file) |
285 |
if item == executable: |
286 |
sftp.chmod(remote_file, 0500) |
287 |
else: |
288 |
sftp.chmod(remote_file, 0400) |
289 |
sftp.close() |
290 |
except IOError, err: |
291 |
WriteLog("ERROR: FAILURE_UPLOAD: %s" % err, logfile) |
292 |
return False |
293 |
|
294 |
return remote_dir |
295 |
|
296 |
|
297 |
def CleanupRemoteDir(connection, upload_dir, filelist, logfile): |
298 |
"""Cleanes out and removes the remote work directory.""" |
299 |
try: |
300 |
sftp = paramiko.SFTPClient.from_transport(connection) |
301 |
for item in filelist: |
302 |
fullpath = "%s/%s" % (upload_dir, item.split("/").pop()) |
303 |
WriteLog("removing remote %s" % fullpath, logfile) |
304 |
sftp.remove(fullpath) |
305 |
sftp.rmdir(upload_dir) |
306 |
sftp.close() |
307 |
except IOError, err: |
308 |
WriteLog("ERROR: FAILURE_CLEANUP: %s" % err, logfile) |
309 |
return False |
310 |
|
311 |
return True |
312 |
|
313 |
|
314 |
def RunRemoteCommand(connection, command, logfile): |
315 |
"""Execute the command via ssh on the remote host.""" |
316 |
session = connection.open_session() |
317 |
session.setblocking(0) |
318 |
|
319 |
# the following dance is needed because paramiko changed APIs: |
320 |
# from returning True/False for success to always returning None |
321 |
# and throwing an exception in case of problems. |
322 |
# And I want to support both the old and the new API. |
323 |
result = True # being optimistic here, I know |
324 |
message = None |
325 |
try: |
326 |
if session.exec_command("%s 2>&1" % command) is False: |
327 |
result = False |
328 |
except paramiko.SSHException, message: |
329 |
result = False |
330 |
|
331 |
if not result: |
332 |
WriteLog("ERROR: FAILURE_COMMAND_EXECUTION: %s" % message, logfile) |
333 |
return False |
334 |
|
335 |
### Read when data is available |
336 |
output = "" |
337 |
while select.select([session], [], []): |
338 |
data = session.recv(1024) |
339 |
if not data: |
340 |
break |
341 |
output += data |
342 |
select.select([], [], [], .1) |
343 |
|
344 |
WriteLog("SUCCESS: command output follows", logfile) |
345 |
for line in output.split("\n"): |
346 |
WriteLog("output = %s" %line, logfile) |
347 |
WriteLog("command execution completed", logfile) |
348 |
session.close() |
349 |
|
350 |
return True |
351 |
|
352 |
|
353 |
def HostWorker(logdir, username, password, use_agent, hostname, |
354 |
executable, command, filelist): |
355 |
"""Per-host worker. |
356 |
|
357 |
This function does not return - it's the main code of the childs, |
358 |
which exit at the end of this function. The exit code 0 or 1 will be |
359 |
interpreted by the parent. |
360 |
|
361 |
@param logdir: the directory where the logfiles must be created |
362 |
@param username: SSH username |
363 |
@param password: SSH password |
364 |
@param use_agent: whether we should instead use an agent |
365 |
@param hostname: the hostname to connect to |
366 |
@param executable: the executable to upload, if not None |
367 |
@param command: the command to run |
368 |
@param filelist: auxiliary files to upload |
369 |
|
370 |
""" |
371 |
# in the child/worker process |
372 |
logfile = "%s/%s.log" % (logdir, hostname) |
373 |
print "%s - starting" % hostname |
374 |
result = 0 # optimism, I know |
375 |
try: |
376 |
connection = SetupSshConnection(hostname, username, |
377 |
password, use_agent, logfile) |
378 |
if connection is not False: |
379 |
if executable is not None: |
380 |
print " %s: uploading files" % hostname |
381 |
upload_dir = UploadFiles(connection, executable, |
382 |
filelist, logfile) |
383 |
command = "cd %s && ./%s" % (upload_dir, |
384 |
executable.split("/").pop()) |
385 |
print " %s: executing remote command" % hostname |
386 |
cmd_result = RunRemoteCommand(connection, command, logfile) |
387 |
if cmd_result is True: |
388 |
print " %s: remote command execution successful" % hostname |
389 |
else: |
390 |
print (" %s: remote command execution failed," |
391 |
" check log for details" % hostname) |
392 |
result = 1 |
393 |
if executable is not None: |
394 |
print " %s: cleaning up remote work dir" % hostname |
395 |
cln_result = CleanupRemoteDir(connection, upload_dir, |
396 |
filelist, logfile) |
397 |
if cln_result is False: |
398 |
print (" %s: remote work dir cleanup failed, check" |
399 |
" log for details" % hostname) |
400 |
result = 1 |
401 |
connection.close() |
402 |
else: |
403 |
print " %s: connection setup failed, skipping" % hostname |
404 |
result = 1 |
405 |
except KeyboardInterrupt: |
406 |
print " %s: received KeyboardInterrupt, aborting" % hostname |
407 |
WriteLog("ERROR: ABORT_KEYBOARD_INTERRUPT", logfile) |
408 |
result = 1 |
409 |
except Exception, err: |
410 |
result = 1 |
411 |
trace = traceback.format_exc() |
412 |
msg = "ERROR: UNHANDLED_EXECPTION_ERROR: %s\nTrace: %s" % (err, trace) |
413 |
WriteLog(msg, logfile) |
414 |
print " %s: %s" % (hostname, msg) |
415 |
# and exit with exit code 0 or 1, so the parent can compute statistics |
416 |
sys.exit(result) |
417 |
|
418 |
|
419 |
def LaunchWorker(child_pids, logdir, username, password, use_agent, hostname, |
420 |
executable, command, filelist): |
421 |
"""Launch the per-host worker. |
422 |
|
423 |
Arguments are the same as for HostWorker, except for child_pids, |
424 |
which is a dictionary holding the pid-to-hostname mapping. |
425 |
|
426 |
""" |
427 |
hostname = hostname.rstrip("\n") |
428 |
pid = os.fork() |
429 |
if pid > 0: |
430 |
# controller just record the pids |
431 |
child_pids[pid] = hostname |
432 |
else: |
433 |
HostWorker(logdir, username, password, use_agent, hostname, |
434 |
executable, command, filelist) |
435 |
|
436 |
|
437 |
def main(): |
438 |
"""main.""" |
439 |
try: |
440 |
optlist, _ = getopt.getopt(sys.argv[1:], "l:x:h:f:a:c:b:u:p:A") |
441 |
except getopt.GetoptError, err: |
442 |
print str(err) |
443 |
ShowHelp(sys.argv[0]) |
444 |
sys.exit(2) |
445 |
|
446 |
logdir = executable = hostfile = hostlist = command = None |
447 |
use_agent = False |
448 |
auxfiles = [] |
449 |
username = "root" |
450 |
password = None |
451 |
batch_size = 15 |
452 |
for option in optlist: |
453 |
if option[0] == "-l": |
454 |
logdir = option[1] |
455 |
if option[0] == "-x": |
456 |
executable = option[1] |
457 |
if option[0] == "-f": |
458 |
hostfile = option[1] |
459 |
if option[0] == "-h": |
460 |
hostlist = option[1] |
461 |
if option[0] == "-a": |
462 |
auxfiles.append(option[1]) |
463 |
if option[0] == "-c": |
464 |
command = option[1] |
465 |
if option[0] == "-b": |
466 |
batch_size = int(option[1]) |
467 |
if option[0] == "-u": |
468 |
username = option[1] |
469 |
if option[0] == "-p": |
470 |
password = option[1] |
471 |
if option[0] == "-A": |
472 |
use_agent = True |
473 |
|
474 |
if not (logdir and (executable or command) and (hostfile or hostlist)): |
475 |
print "error: missing required commandline argument(s)" |
476 |
ShowHelp(sys.argv[0]) |
477 |
sys.exit(3) |
478 |
|
479 |
if executable and command: |
480 |
print "error: can run either a command or an executable, not both" |
481 |
ShowHelp(sys.argv[0]) |
482 |
sys.exit(3) |
483 |
|
484 |
if hostlist and hostfile: |
485 |
print "error: specify either -f or -h arguments, not both" |
486 |
ShowHelp(sys.argv[0]) |
487 |
sys.exit(3) |
488 |
|
489 |
### Unbuffered sys.stdout |
490 |
sys.stdout = os.fdopen(1, "w", 0) |
491 |
|
492 |
if LogDirUseable(logdir) is False: |
493 |
print "ERROR: cannot create logfiles in dir %s, aborting" % logdir |
494 |
sys.exit(1) |
495 |
|
496 |
if use_agent: |
497 |
pass |
498 |
elif password: |
499 |
try: |
500 |
fh = file(password) |
501 |
pwvalue = fh.readline().strip() |
502 |
fh.close() |
503 |
except IOError, e: |
504 |
print "error: can not read in from password file %s: %s" % (password, e) |
505 |
sys.exit(1) |
506 |
password = pwvalue |
507 |
else: |
508 |
password = getpass.getpass("%s's password for all nodes: " % username) |
509 |
|
510 |
if hostfile: |
511 |
hosts = GetHosts(hostfile) |
512 |
else: |
513 |
if "," in hostlist: |
514 |
hostlist = hostlist.rstrip(",") # commandline robustness |
515 |
hosts = hostlist.split(",") |
516 |
else: |
517 |
hosts = [hostlist] |
518 |
|
519 |
successes = failures = 0 |
520 |
|
521 |
filelist = auxfiles[:] |
522 |
filelist.append(executable) |
523 |
|
524 |
# initial batch |
525 |
batch = hosts[:batch_size] |
526 |
hosts = hosts[batch_size:] |
527 |
child_pids = {} |
528 |
for hostname in batch: |
529 |
LaunchWorker(child_pids, logdir, username, password, use_agent, hostname, |
530 |
executable, command, filelist) |
531 |
|
532 |
while child_pids: |
533 |
pid, status = os.wait() |
534 |
hostname = child_pids.pop(pid, "<unknown host>") |
535 |
print " %s: done (in parent)" % hostname |
536 |
if os.WIFEXITED(status) and os.WEXITSTATUS(status) == 0: |
537 |
successes += 1 |
538 |
else: |
539 |
failures += 1 |
540 |
if hosts: |
541 |
LaunchWorker(child_pids, logdir, username, password, use_agent, |
542 |
hosts.pop(0), executable, command, filelist) |
543 |
|
544 |
|
545 |
print "All done, %s successful and %s failed hosts" % (successes, failures) |
546 |
|
547 |
sys.exit(0) |
548 |
|
549 |
|
550 |
if __name__ == "__main__": |
551 |
try: |
552 |
main() |
553 |
except KeyboardInterrupt: |
554 |
print "Received KeyboardInterrupt, aborting" |
555 |
sys.exit(1) |