Revision 194a7cc0 snf-common/synnefo/util/urltools.py
b/snf-common/synnefo/util/urltools.py | ||
---|---|---|
26 | 26 |
from posixpath import normpath |
27 | 27 |
|
28 | 28 |
|
29 |
__all__ = ["ParseResult", "SplitResult", "parse", "extract", "split",
|
|
30 |
"split_netloc", "split_host", "assemble", "encode", "normalize",
|
|
29 |
__all__ = ["ParseResult", "SplitResult", "split", |
|
30 |
"split_netloc", "assemble", "normalize",
|
|
31 | 31 |
"normalize_host", "normalize_path", "normalize_query", |
32 | 32 |
"normalize_fragment", "unquote"] |
33 | 33 |
|
34 | 34 |
|
35 |
# Location the Public Suffix List is downloaded from when no local copy is
# supplied. The previous MXR address (mxr.mozilla.org) has been retired;
# publicsuffix.org is the canonical home of the same list.
PSL_URL = 'https://publicsuffix.org/list/public_suffix_list.dat'
|
36 |
|
|
37 |
def _get_public_suffix_list():
    """Get the public suffix list.

    The list is read from the file named by the ``PUBLIC_SUFFIX_LIST``
    environment variable when that variable is set to a non-empty value;
    otherwise it is downloaded from ``PSL_URL``.

    Returns a set with one entry per rule line, stripped of surrounding
    whitespace. Blank lines and lines starting with ``//`` are skipped.
    """
    local_psl = os.environ.get('PUBLIC_SUFFIX_LIST')
    if local_psl:
        # Close the file deterministically instead of leaking the handle.
        with open(local_psl) as source:
            psl_raw = source.readlines()
    else:
        # Local import: only this branch needs it, and it keeps the block
        # independent of the module's top-level import list.
        from contextlib import closing

        # closing() guarantees the HTTP response is released even when
        # reading fails part-way through.
        with closing(urllib.urlopen(PSL_URL)) as source:
            psl_raw = source.readlines()

    stripped = (line.strip() for line in psl_raw)
    return set(item for item in stripped
               if item and not item.startswith('//'))
|
51 |
|
|
52 |
PSL = _get_public_suffix_list() |
|
35 |
#PSL_URL = 'http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1'
|
|
36 |
# |
|
37 |
#def _get_public_suffix_list():
|
|
38 |
# """Get the public suffix list.
|
|
39 |
# """
|
|
40 |
# local_psl = os.environ.get('PUBLIC_SUFFIX_LIST')
|
|
41 |
# if local_psl:
|
|
42 |
# psl_raw = open(local_psl).readlines()
|
|
43 |
# else:
|
|
44 |
# psl_raw = urllib.urlopen(PSL_URL).readlines()
|
|
45 |
# psl = set()
|
|
46 |
# for line in psl_raw:
|
|
47 |
# item = line.strip()
|
|
48 |
# if item != '' and not item.startswith('//'):
|
|
49 |
# psl.add(item)
|
|
50 |
# return psl
|
|
51 |
# |
|
52 |
#PSL = _get_public_suffix_list()
|
|
53 | 53 |
|
54 | 54 |
|
55 | 55 |
SCHEMES = ['http', 'https', 'ftp', 'sftp', 'file', 'gopher', 'imap', 'mms', |
... | ... | |
114 | 114 |
return assemble(result) |
115 | 115 |
|
116 | 116 |
|
117 |
def encode(url):
    """Encode URL

    Breaks ``url`` into components with ``extract``, passes every component
    through ``_idna_encode`` and reassembles the result with ``assemble``.
    """
    encoded_parts = [_idna_encode(part) for part in extract(url)]
    return assemble(ParseResult(*encoded_parts))
|
117 |
#def encode(url):
|
|
118 |
# """Encode URL
|
|
119 |
# """
|
|
120 |
# parts = extract(url)
|
|
121 |
# encoded = ParseResult(*(_idna_encode(p) for p in parts))
|
|
122 |
# return assemble(encoded)
|
|
123 | 123 |
|
124 | 124 |
|
125 | 125 |
def assemble(parts): |
... | ... | |
222 | 222 |
return ''.join(res) |
223 | 223 |
|
224 | 224 |
|
225 |
def parse(url):
    """Parse a URL

    Returns a ``ParseResult``. The user, password, host and port fields are
    only derived when ``split`` reports a scheme; without one they are all
    empty strings and only path, query and fragment carry data.
    """
    pieces = split(url)
    if not pieces.scheme:
        username = password = subdomain = domain = tld = port = ''
    else:
        username, password, host, port = split_netloc(pieces.netloc)
        subdomain, domain, tld = split_host(host)
    return ParseResult(pieces.scheme, username, password, subdomain, domain,
                       tld, port, pieces.path, pieces.query, pieces.fragment)
|
236 |
|
|
237 |
|
|
238 |
def extract(url):
    """Extract as much information from a (relative) URL as possible

    Unlike a strict parse, a URL without a scheme still has its leading
    path segment interpreted as the network location: everything before
    the first ``/`` is treated as netloc and the remainder (with the slash
    restored) as the path.
    """
    pieces = split(url)
    if pieces.scheme:
        netloc, path = pieces.netloc, pieces.path
    else:
        # No scheme: the would-be authority ended up in the path component.
        netloc, slash, remainder = pieces.path.partition('/')
        path = '/' + remainder if slash else ''
    username, password, host, port = split_netloc(netloc)
    subdomain, domain, tld = split_host(host)
    return ParseResult(pieces.scheme, username, password, subdomain, domain,
                       tld, port, path, pieces.query, pieces.fragment)
|
225 |
#def parse(url):
|
|
226 |
# """Parse a URL
|
|
227 |
# """
|
|
228 |
# parts = split(url)
|
|
229 |
# if parts.scheme:
|
|
230 |
# (username, password, host, port) = split_netloc(parts.netloc)
|
|
231 |
# (subdomain, domain, tld) = split_host(host)
|
|
232 |
# else:
|
|
233 |
# username = password = subdomain = domain = tld = port = ''
|
|
234 |
# return ParseResult(parts.scheme, username, password, subdomain, domain, tld,
|
|
235 |
# port, parts.path, parts.query, parts.fragment)
|
|
236 |
|
|
237 |
|
|
238 |
#def extract(url):
|
|
239 |
# """Extract as much information from a (relative) URL as possible
|
|
240 |
# """
|
|
241 |
# parts = split(url)
|
|
242 |
# if parts.scheme:
|
|
243 |
# netloc = parts.netloc
|
|
244 |
# path = parts.path
|
|
245 |
# else:
|
|
246 |
# netloc = parts.path
|
|
247 |
# path = ''
|
|
248 |
# if '/' in netloc:
|
|
249 |
# tmp = netloc.split('/', 1)
|
|
250 |
# netloc = tmp[0]
|
|
251 |
# path = '/' + tmp[1]
|
|
252 |
# (username, password, host, port) = split_netloc(netloc)
|
|
253 |
# (subdomain, domain, tld) = split_host(host)
|
|
254 |
# return ParseResult(parts.scheme, username, password, subdomain, domain, tld,
|
|
255 |
# port, path, parts.query, parts.fragment)
|
|
256 | 256 |
|
257 | 257 |
|
258 | 258 |
def split(url): |
... | ... | |
341 | 341 |
return username, password, host, port |
342 | 342 |
|
343 | 343 |
|
344 |
def split_host(host):
    """Use the Public Suffix List to split host into subdomain, domain and tld

    Returns a ``(subdomain, domain, tld)`` tuple of strings.

    * A host containing ``[`` and a host made up solely of characters from
      ``IP_CHARS`` (presumably IP-address characters; defined elsewhere in
      this module) are returned whole as the domain, with empty subdomain
      and tld.
    * Otherwise the longest suffix of ``host`` covered by a ``PSL`` rule
      becomes the tld. ``!rule`` entries are exceptions (the matched name
      minus its first label is the tld) and ``*.rule`` entries are wildcards
      (the matched name plus one extra label to its left is the tld).
    * When no rule matches, the last label is taken as the tld and the
      remaining labels are kept as domain/subdomain instead of being dropped.
    """
    if '[' in host:
        return '', host, ''
    # An empty host also lands here, exactly as before.
    if all(char in IP_CHARS for char in host):
        return '', host, ''

    subdomain = ''
    parts = host.split('.')
    # Walk from the longest candidate suffix to the shortest.
    for i in range(len(parts)):
        candidate = '.'.join(parts[i:])
        if '!' + candidate in PSL:
            domain = '.'.join(parts[:i + 1])
            tld = '.'.join(parts[i + 1:])
            break
        if candidate in PSL:
            domain = '.'.join(parts[:i])
            tld = candidate
            break
        # A wildcard rule needs one more label to the left of the candidate.
        # At i == 0 there is none, and slicing with i - 1 == -1 would wrap
        # around to the end of the list, so the rule cannot apply there.
        if i > 0 and '*.' + candidate in PSL:
            domain = '.'.join(parts[:i - 1])
            tld = '.'.join(parts[i - 1:])
            break
    else:
        # No rule matched: treat the last label as the suffix and keep the
        # preceding labels rather than silently discarding them.
        domain = '.'.join(parts[:-1])
        tld = parts[-1]

    if '.' in domain:
        subdomain, domain = domain.rsplit('.', 1)
    return subdomain, domain, tld
|
344 |
#def split_host(host): |
|
345 |
# """Use the Public Suffix List to split host into subdomain, domain and tld |
|
346 |
# """ |
|
347 |
# if '[' in host: |
|
348 |
# return '', host, '' |
|
349 |
# domain = subdomain = tld = '' |
|
350 |
# for c in host: |
|
351 |
# if c not in IP_CHARS: |
|
352 |
# break |
|
353 |
# else: |
|
354 |
# return '', host, '' |
|
355 |
# parts = host.split('.') |
|
356 |
# for i in range(len(parts)): |
|
357 |
# tld = '.'.join(parts[i:]) |
|
358 |
# wildcard_tld = '*.' + tld |
|
359 |
# exception_tld = '!' + tld |
|
360 |
# if exception_tld in PSL: |
|
361 |
# domain = '.'.join(parts[:i+1]) |
|
362 |
# tld = '.'.join(parts[i+1:]) |
|
363 |
# break |
|
364 |
# if tld in PSL: |
|
365 |
# domain = '.'.join(parts[:i]) |
|
366 |
# break |
|
367 |
# if wildcard_tld in PSL: |
|
368 |
# domain = '.'.join(parts[:i-1]) |
|
369 |
# tld = '.'.join(parts[i-1:]) |
|
370 |
# break |
|
371 |
# if '.' in domain: |
|
372 |
# (subdomain, domain) = domain.rsplit('.', 1) |
|
373 |
# return subdomain, domain, tld |
Also available in: Unified diff