Statistics
| Branch: | Tag: | Revision:

root / tools / ganeti-listrunner @ 99a11adc

History | View | Annotate | Download (17 kB)

1
#!/usr/bin/python
2
#
3

    
4
# Copyright (C) 2006, 2007, 2010, 2011 Google Inc.
5
#
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 2 of the License, or
9
# (at your option) any later version.
10
#
11
# This program is distributed in the hope that it will be useful, but
12
# WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
# General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with this program; if not, write to the Free Software
18
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
# 02110-1301, USA.
20

    
21
"""Run an executable on a list of hosts.
22

    
23
Script to serially run an executable on a list of hosts via ssh
24
with password auth as root. If the provided log dir does not yet
25
exist, it will try to create it.
26

    
27
Implementation:
28
 - the main process spawns up to batch_size children, which:
29
 - connects to the remote host via ssh as root
30
 - uploads the executable with a random name to /tmp via sftp
31
 - chmod 500s it
32
 - via ssh: chdirs into the upload directory and runs the script
33
 - deletes it
34
 - writes status messages and all output to one logfile per host
35
 - the main process gathers then the status of the children and
36
   reports the success/failure ratio
37
 - entire script can be aborted with Ctrl-C
38

    
39
Security considerations:
40
 - the root password for the remote hosts is stored in memory for the
41
   runtime of the script
42
 - the executable to be run on the remote host is handled the following way:
43
   - try to create a random directory with permissions 700 on the
44
     remote host, abort furter processing on this host if this failes
45
   - upload the executable with to a random filename in that directory
46
   - set executable permissions to 500
47
   - run the executable
48
   - delete the execuable and the directory on the remote host
49

    
50
"""
51

    
52
# pylint: disable-msg=C0103
53
# C0103: Invalid name ganeti-listrunner
54

    
55
import errno
56
import getopt
57
import getpass
58
import logging
59
import os
60
import random
61
import select
62
import socket
63
import sys
64
import time
65
import traceback
66

    
67
import paramiko
68

    
69

    
70
REMOTE_PATH_BASE = "/tmp/listrunner"
71

    
72

    
73
def LogDirUseable(logdir):
74
  """Ensure log file directory is available and usable."""
75
  testfile = "%s/test-%s-%s.deleteme" % (logdir, random.random(),
76
                                         random.random())
77
  try:
78
    os.mkdir(logdir)
79
  except OSError, err:
80
    if err.errno != errno.EEXIST:
81
      raise
82
  try:
83
    logtest = open(testfile, "aw")
84
    logtest.writelines("log file writeability test\n")
85
    logtest.close()
86
    os.unlink(testfile)
87
    return True
88
  except (OSError, IOError):
89
    return False
90

    
91

    
92
def ShowHelp(executable):
93
  """Print short usage information."""
94
  print ("usage: %s -l logdir [-c|-x] value [-b batch_size]"
95
         " [-f hostfile|-h hosts] [-u username]"
96
         " [-p password_file]" % executable)
97
  print """        -l logdir to write logfiles to
98
        -x executable to run on remote host(s)
99
        -c shell command to run on remote host(s)
100
        -f hostlist file (one host per line)
101
        -a optional auxiliary file to upload (can be given multiple times)
102
        -b batch-size, how many hosts to process in parallel [15]
103
        -h comma-separated list of hosts or single hostname
104
        -u username used to connect [root]
105
        -p password used to authenticate"""
106

    
107

    
108
def GetTimeStamp(timestamp=None):
109
  """Return ISO8601 timestamp.
110

    
111
  Returns ISO8601 timestamp, optionally expects a time.localtime() tuple
112
  in timestamp, but will use the current time if this argument is not
113
  supplied.
114
  """
115
  if timestamp is None:
116
    timestamp = time.localtime()
117

    
118
  isotime = time.strftime("%Y-%m-%dT%H:%M:%S", timestamp)
119
  return isotime
120

    
121

    
122
def PingByTcp(target, port, timeout=10, live_port_needed=False, source=None):
123
  """Simple ping implementation using TCP connect(2).
124

    
125
  Try to do a TCP connect(2) from an optional source IP to the
126
  specified target IP and the specified target port. If the optional
127
  parameter live_port_needed is set to true, requires the remote end
128
  to accept the connection. The timeout is specified in seconds and
129
  defaults to 10 seconds. If the source optional argument is not
130
  passed, the source address selection is left to the kernel,
131
  otherwise we try to connect using the passed address (failures to
132
  bind other than EADDRNOTAVAIL will be ignored).
133

    
134
  """
135
  sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
136

    
137
  success = False
138

    
139
  if source is not None:
140
    try:
141
      sock.bind((source, 0))
142
    except socket.error, (errcode):
143
      if errcode == errno.EADDRNOTAVAIL:
144
        success = False
145

    
146
  sock.settimeout(timeout)
147

    
148
  try:
149
    sock.connect((target, port))
150
    sock.close()
151
    success = True
152
  except socket.timeout:
153
    success = False
154
  except socket.error, (errcode):
155
    success = (not live_port_needed) and (errcode == errno.ECONNREFUSED)
156

    
157
  return success
158

    
159

    
160
def GetHosts(hostsfile):
161
  """Return list of hosts from hostfile.
162

    
163
  Reads the hostslist file and returns a list of hosts.
164
  Expects the hostslist file to contain one hostname per line.
165

    
166
  """
167
  try:
168
    datafile = open(hostsfile, "r")
169
  except IOError, msg:
170
    print "Failed to open hosts file %s: %s" % (hostsfile, msg)
171
    sys.exit(2)
172

    
173
  hosts = datafile.readlines()
174
  datafile.close()
175

    
176
  return hosts
177

    
178

    
179
def WriteLog(message, logfile):
180
  """Writes message, terminated by newline, to logfile."""
181
  try:
182
    logfile = open(logfile, "aw")
183
  except IOError, msg:
184
    print "failed to open log file %s: %s" % (logfile, msg)
185
    print "log message was: %s" % message
186
    sys.exit(1)  # no being able to log is critical
187
  try:
188
    timestamp = GetTimeStamp()
189
    logfile.writelines("%s %s\n" % (timestamp, message))
190
    logfile.close()
191
  except IOError, msg:
192
    print "failed to write to logfile %s: %s" % (logfile, msg)
193
    print "log message was: %s" % message
194
    sys.exit(1)  # no being able to log is critical
195

    
196

    
197
def GetAgentKeys():
198
  """Tries to get a list of ssh keys from an agent."""
199
  try:
200
    agent = paramiko.Agent()
201
    return list(agent.get_keys())
202
  except paramiko.SSHException:
203
    return []
204

    
205

    
206
def SetupSshConnection(host, username, password, use_agent, logfile):
207
  """Setup the ssh connection used for all later steps.
208

    
209
  This function sets up the ssh connection that will be used both
210
  for upload and remote command execution.
211

    
212
  On success, it will return paramiko.Transport object with an
213
  already logged in session. On failure, False will be returned.
214

    
215
  """
216
  # check if target is willing to talk to us at all
217
  if not PingByTcp(host, 22, live_port_needed=True):
218
    WriteLog("ERROR: FAILURE_NOT_REACHABLE", logfile)
219
    print "  - ERROR: host not reachable on 22/tcp"
220
    return False
221

    
222
  if use_agent:
223
    keys = GetAgentKeys()
224
  else:
225
    keys = []
226
  all_kwargs = [{"pkey": k} for k in keys]
227
  all_desc = ["key %d" % d for d in range(len(keys))]
228
  if password is not None:
229
    all_kwargs.append({"password": password})
230
    all_desc.append("password")
231

    
232
  # deal with logging out of paramiko.transport
233
  handler = None
234

    
235
  for desc, kwargs in zip(all_desc, all_kwargs):
236
    try:
237
      transport = paramiko.Transport((host, 22))
238

    
239
      # only try to setup the logging handler once
240
      if not handler:
241
        handler = logging.StreamHandler()
242
        handler.setLevel(logging.ERROR)
243
        log = logging.getLogger(transport.get_log_channel())
244
        log.addHandler(handler)
245

    
246
      transport.connect(username=username, **kwargs) # pylint: disable-msg=W0142
247
      WriteLog("ssh connection established using %s" % desc, logfile)
248
      # strange ... when establishing the session and the immediately
249
      # setting up the channels for sftp & shell from that, it sometimes
250
      # fails, but waiting 1 second after session setup makes it always work
251
      # time.sleep(1)
252
      # FIXME apparently needfull to give sshd some time
253
      return transport
254
    except (socket.gaierror, socket.error, paramiko.SSHException):
255
      continue
256

    
257
  methods = ", ".join(all_desc)
258
  WriteLog("ERROR: FAILURE_CONNECTION_SETUP (tried %s) " % methods, logfile)
259
  WriteLog("aborted", logfile)
260
  print "  - ERROR: connection setup failed (tried %s)" % methods
261

    
262
  return False
263

    
264

    
265
def UploadFiles(connection, executable, filelist, logfile):
266
  """Uploads the specified files via sftp.
267

    
268
  Uploads the specified files to a random, freshly created directory with
269
  a temporary name under /tmp. All uploaded files are chmod 0400 after upload
270
  with the exception of executable, with is chmod 500.
271

    
272
  Upon success, returns the absolute path to the remote upload directory,
273
  but will return False upon failure.
274
  """
275
  remote_dir = "%s.%s-%s" % (REMOTE_PATH_BASE,
276
                             random.random(), random.random())
277

    
278
  try:
279
    sftp = paramiko.SFTPClient.from_transport(connection)
280
    sftp.mkdir(remote_dir, mode=0700)
281
    for item in filelist:
282
      remote_file = "%s/%s" % (remote_dir, item.split("/").pop())
283
      WriteLog("uploading %s to remote %s" % (item, remote_file), logfile)
284
      sftp.put(item, remote_file)
285
      if item == executable:
286
        sftp.chmod(remote_file, 0500)
287
      else:
288
        sftp.chmod(remote_file, 0400)
289
    sftp.close()
290
  except IOError, err:
291
    WriteLog("ERROR: FAILURE_UPLOAD: %s" % err, logfile)
292
    return False
293

    
294
  return remote_dir
295

    
296

    
297
def CleanupRemoteDir(connection, upload_dir, filelist, logfile):
298
  """Cleanes out and removes the remote work directory."""
299
  try:
300
    sftp = paramiko.SFTPClient.from_transport(connection)
301
    for item in filelist:
302
      fullpath = "%s/%s" % (upload_dir, item.split("/").pop())
303
      WriteLog("removing remote %s" % fullpath, logfile)
304
      sftp.remove(fullpath)
305
    sftp.rmdir(upload_dir)
306
    sftp.close()
307
  except IOError, err:
308
    WriteLog("ERROR: FAILURE_CLEANUP: %s" % err, logfile)
309
    return False
310

    
311
  return True
312

    
313

    
314
def RunRemoteCommand(connection, command, logfile):
315
  """Execute the command via ssh on the remote host."""
316
  session = connection.open_session()
317
  session.setblocking(0)
318

    
319
  # the following dance is needed because paramiko changed APIs:
320
  # from returning True/False for success to always returning None
321
  # and throwing an exception in case of problems.
322
  # And I want to support both the old and the new API.
323
  result = True  # being optimistic here, I know
324
  message = None
325
  try:
326
    if session.exec_command("%s 2>&1" % command) is False:
327
      result = False
328
  except paramiko.SSHException, message:
329
    result = False
330

    
331
  if not result:
332
    WriteLog("ERROR: FAILURE_COMMAND_EXECUTION: %s" % message, logfile)
333
    return False
334

    
335
   ### Read when data is available
336
  output = ""
337
  while select.select([session], [], []):
338
    data = session.recv(1024)
339
    if not data:
340
      break
341
    output += data
342
    select.select([], [], [], .1)
343

    
344
  WriteLog("SUCCESS: command output follows", logfile)
345
  for line in output.split("\n"):
346
    WriteLog("output = %s" %line, logfile)
347
  WriteLog("command execution completed", logfile)
348
  session.close()
349

    
350
  return True
351

    
352

    
353
def HostWorker(logdir, username, password, use_agent, hostname,
354
               executable, command, filelist):
355
  """Per-host worker.
356

    
357
  This function does not return - it's the main code of the childs,
358
  which exit at the end of this function. The exit code 0 or 1 will be
359
  interpreted by the parent.
360

    
361
  @param logdir: the directory where the logfiles must be created
362
  @param username: SSH username
363
  @param password: SSH password
364
  @param use_agent: whether we should instead use an agent
365
  @param hostname: the hostname to connect to
366
  @param executable: the executable to upload, if not None
367
  @param command: the command to run
368
  @param filelist: auxiliary files to upload
369

    
370
  """
371
  # in the child/worker process
372
  logfile = "%s/%s.log" % (logdir, hostname)
373
  print "%s - starting" % hostname
374
  result = 0  # optimism, I know
375
  try:
376
    connection = SetupSshConnection(hostname, username,
377
                                    password, use_agent, logfile)
378
    if connection is not False:
379
      if executable is not None:
380
        print "  %s: uploading files" % hostname
381
        upload_dir = UploadFiles(connection, executable,
382
                                 filelist, logfile)
383
        command = "cd %s && ./%s" % (upload_dir,
384
                                     executable.split("/").pop())
385
      print "  %s: executing remote command" % hostname
386
      cmd_result = RunRemoteCommand(connection, command, logfile)
387
      if cmd_result is True:
388
        print "  %s: remote command execution successful" % hostname
389
      else:
390
        print ("  %s: remote command execution failed,"
391
               " check log for details" % hostname)
392
        result = 1
393
      if executable is not None:
394
        print "  %s: cleaning up remote work dir" % hostname
395
        cln_result = CleanupRemoteDir(connection, upload_dir,
396
                                      filelist, logfile)
397
        if cln_result is False:
398
          print ("  %s: remote work dir cleanup failed, check"
399
                 " log for details" % hostname)
400
          result = 1
401
      connection.close()
402
    else:
403
      print "  %s: connection setup failed, skipping" % hostname
404
      result = 1
405
  except KeyboardInterrupt:
406
    print "  %s: received KeyboardInterrupt, aborting" % hostname
407
    WriteLog("ERROR: ABORT_KEYBOARD_INTERRUPT", logfile)
408
    result = 1
409
  except Exception, err:
410
    result = 1
411
    trace = traceback.format_exc()
412
    msg = "ERROR: UNHANDLED_EXECPTION_ERROR: %s\nTrace: %s" % (err, trace)
413
    WriteLog(msg, logfile)
414
    print "  %s: %s" % (hostname, msg)
415
  # and exit with exit code 0 or 1, so the parent can compute statistics
416
  sys.exit(result)
417

    
418

    
419
def LaunchWorker(child_pids, logdir, username, password, use_agent, hostname,
420
                 executable, command, filelist):
421
  """Launch the per-host worker.
422

    
423
  Arguments are the same as for HostWorker, except for child_pids,
424
  which is a dictionary holding the pid-to-hostname mapping.
425

    
426
  """
427
  hostname = hostname.rstrip("\n")
428
  pid = os.fork()
429
  if pid > 0:
430
    # controller just record the pids
431
    child_pids[pid] = hostname
432
  else:
433
    HostWorker(logdir, username, password, use_agent, hostname,
434
               executable, command, filelist)
435

    
436

    
437
def main():
438
  """main."""
439
  try:
440
    optlist, _ = getopt.getopt(sys.argv[1:], "l:x:h:f:a:c:b:u:p:A")
441
  except getopt.GetoptError, err:
442
    print str(err)
443
    ShowHelp(sys.argv[0])
444
    sys.exit(2)
445

    
446
  logdir = executable = hostfile = hostlist = command = None
447
  use_agent = False
448
  auxfiles = []
449
  username = "root"
450
  password = None
451
  batch_size = 15
452
  for option in optlist:
453
    if option[0] == "-l":
454
      logdir = option[1]
455
    if option[0] == "-x":
456
      executable = option[1]
457
    if option[0] == "-f":
458
      hostfile = option[1]
459
    if option[0] == "-h":
460
      hostlist = option[1]
461
    if option[0] == "-a":
462
      auxfiles.append(option[1])
463
    if option[0] == "-c":
464
      command = option[1]
465
    if option[0] == "-b":
466
      batch_size = int(option[1])
467
    if option[0] == "-u":
468
      username = option[1]
469
    if option[0] == "-p":
470
      password = option[1]
471
    if option[0] == "-A":
472
      use_agent = True
473

    
474
  if not (logdir and (executable or command) and (hostfile or hostlist)):
475
    print "error: missing required commandline argument(s)"
476
    ShowHelp(sys.argv[0])
477
    sys.exit(3)
478

    
479
  if executable and command:
480
    print "error: can run either a command or an executable, not both"
481
    ShowHelp(sys.argv[0])
482
    sys.exit(3)
483

    
484
  if hostlist and hostfile:
485
    print "error: specify either -f or -h arguments, not both"
486
    ShowHelp(sys.argv[0])
487
    sys.exit(3)
488

    
489
  ### Unbuffered sys.stdout
490
  sys.stdout = os.fdopen(1, "w", 0)
491

    
492
  if LogDirUseable(logdir) is False:
493
    print "ERROR: cannot create logfiles in dir %s, aborting" % logdir
494
    sys.exit(1)
495

    
496
  if use_agent:
497
    pass
498
  elif password:
499
    try:
500
      fh = file(password)
501
      pwvalue = fh.readline().strip()
502
      fh.close()
503
    except IOError, e:
504
      print "error: can not read in from password file %s: %s" % (password, e)
505
      sys.exit(1)
506
    password = pwvalue
507
  else:
508
    password = getpass.getpass("%s's password for all nodes: " % username)
509

    
510
  if hostfile:
511
    hosts = GetHosts(hostfile)
512
  else:
513
    if "," in hostlist:
514
      hostlist = hostlist.rstrip(",")  # commandline robustness
515
      hosts = hostlist.split(",")
516
    else:
517
      hosts = [hostlist]
518

    
519
  successes = failures = 0
520

    
521
  filelist = auxfiles[:]
522
  filelist.append(executable)
523

    
524
  # initial batch
525
  batch = hosts[:batch_size]
526
  hosts = hosts[batch_size:]
527
  child_pids = {}
528
  for hostname in batch:
529
    LaunchWorker(child_pids, logdir, username, password, use_agent, hostname,
530
                 executable, command, filelist)
531

    
532
  while child_pids:
533
    pid, status = os.wait()
534
    hostname = child_pids.pop(pid, "<unknown host>")
535
    print "  %s: done (in parent)" % hostname
536
    if os.WIFEXITED(status) and os.WEXITSTATUS(status) == 0:
537
      successes += 1
538
    else:
539
      failures += 1
540
    if hosts:
541
      LaunchWorker(child_pids, logdir, username, password, use_agent,
542
                   hosts.pop(0), executable, command, filelist)
543

    
544
  print
545
  print "All done, %s successful and %s failed hosts" % (successes, failures)
546

    
547
  sys.exit(0)
548

    
549

    
550
if __name__ == "__main__":
551
  try:
552
    main()
553
  except KeyboardInterrupt:
554
    print "Received KeyboardInterrupt, aborting"
555
    sys.exit(1)