Revision 194a7cc0 snf-common/synnefo/util/urltools.py

b/snf-common/synnefo/util/urltools.py
26 26
from posixpath import normpath
27 27

  
28 28

  
29
__all__ = ["ParseResult", "SplitResult", "parse", "extract", "split",
30
           "split_netloc", "split_host", "assemble", "encode", "normalize",
29
__all__ = ["ParseResult", "SplitResult", "split",
30
           "split_netloc", "assemble", "normalize",
31 31
           "normalize_host", "normalize_path", "normalize_query",
32 32
           "normalize_fragment", "unquote"]
33 33

  
34 34

  
35
# Remote location of the Public Suffix List. _get_public_suffix_list()
# downloads from here when the PUBLIC_SUFFIX_LIST environment variable is
# unset or empty.
PSL_URL = 'http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1'
36

  
37
def _get_public_suffix_list():
    """Load the Public Suffix List as a set of rule strings.

    The list is read from the file named by the ``PUBLIC_SUFFIX_LIST``
    environment variable when that variable is set and non-empty; otherwise
    it is downloaded from ``PSL_URL``.

    Every line is stripped of surrounding whitespace.  Blank lines and lines
    starting with ``//`` are discarded; all other lines are kept as-is.

    Returns:
        set: the remaining lines, one entry per rule.
    """
    local_psl = os.environ.get('PUBLIC_SUFFIX_LIST')
    if local_psl:
        source = open(local_psl)
    else:
        source = urllib.urlopen(PSL_URL)
    # Release the file or connection even when reading fails; the handle
    # used to be left open for the garbage collector to clean up.
    try:
        psl_raw = source.readlines()
    finally:
        source.close()
    psl = set()
    for line in psl_raw:
        item = line.strip()
        if item and not item.startswith('//'):
            psl.add(item)
    return psl
51

  
52
# Set of suffix rules, built once when this module is imported.  Note that
# the call reads a local file or, failing that, performs a download.
PSL = _get_public_suffix_list()
35
#PSL_URL = 'http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1'
36
#
37
#def _get_public_suffix_list():
38
#    """Get the public suffix list.
39
#    """
40
#    local_psl = os.environ.get('PUBLIC_SUFFIX_LIST')
41
#    if local_psl:
42
#        psl_raw = open(local_psl).readlines()
43
#    else:
44
#        psl_raw = urllib.urlopen(PSL_URL).readlines()
45
#    psl = set()
46
#    for line in psl_raw:
47
#        item = line.strip()
48
#        if item != '' and not item.startswith('//'):
49
#            psl.add(item)
50
#    return psl
51
#
52
#PSL = _get_public_suffix_list()
53 53

  
54 54

  
55 55
SCHEMES = ['http', 'https', 'ftp', 'sftp', 'file', 'gopher', 'imap', 'mms',
......
114 114
    return assemble(result)
115 115

  
116 116

  
117
def encode(url):
    """Return *url* rebuilt after encoding each of its components.

    The URL is taken apart with ``extract``, every component is passed
    through ``_idna_encode``, and the results are put back together with
    ``assemble``.
    """
    encoded_parts = [_idna_encode(part) for part in extract(url)]
    return assemble(ParseResult(*encoded_parts))
117
#def encode(url):
118
#    """Encode URL
119
#    """
120
#    parts = extract(url)
121
#    encoded = ParseResult(*(_idna_encode(p) for p in parts))
122
#    return assemble(encoded)
123 123

  
124 124

  
125 125
def assemble(parts):
......
222 222
    return ''.join(res)
223 223

  
224 224

  
225
def parse(url):
    """Break an absolute URL into a ParseResult.

    When ``split`` reports no scheme, the credential, host and port fields
    are all returned as empty strings and only the path, query and fragment
    carry information.
    """
    pieces = split(url)
    # Defaults for a scheme-less URL; overwritten below otherwise.
    username = password = subdomain = domain = tld = port = ''
    if pieces.scheme:
        username, password, host, port = split_netloc(pieces.netloc)
        subdomain, domain, tld = split_host(host)
    return ParseResult(pieces.scheme, username, password, subdomain, domain,
                       tld, port, pieces.path, pieces.query, pieces.fragment)
236

  
237

  
238
def extract(url):
    """Break a URL, possibly lacking a scheme, into a ParseResult.

    With a scheme, the network location and path come straight from
    ``split``.  Without one, the text that ``split`` reported as the path is
    divided at its first ``/``: what precedes the slash is treated as the
    network location and the remainder, slash included, as the path.
    """
    pieces = split(url)
    if pieces.scheme:
        location, path = pieces.netloc, pieces.path
    else:
        location, slash, remainder = pieces.path.partition('/')
        # No slash at all means there is no path component.
        path = '/' + remainder if slash else ''
    username, password, host, port = split_netloc(location)
    subdomain, domain, tld = split_host(host)
    return ParseResult(pieces.scheme, username, password, subdomain, domain,
                       tld, port, path, pieces.query, pieces.fragment)
225
#def parse(url):
226
#    """Parse a URL
227
#    """
228
#    parts = split(url)
229
#    if parts.scheme:
230
#        (username, password, host, port) = split_netloc(parts.netloc)
231
#        (subdomain, domain, tld) = split_host(host)
232
#    else:
233
#        username = password = subdomain = domain = tld = port = ''
234
#    return ParseResult(parts.scheme, username, password, subdomain, domain, tld,
235
#                       port, parts.path, parts.query, parts.fragment)
236

  
237

  
238
#def extract(url):
239
#    """Extract as much information from a (relative) URL as possible
240
#    """
241
#    parts = split(url)
242
#    if parts.scheme:
243
#        netloc = parts.netloc
244
#        path = parts.path
245
#    else:
246
#        netloc = parts.path
247
#        path = ''
248
#        if '/' in netloc:
249
#            tmp = netloc.split('/', 1)
250
#            netloc = tmp[0]
251
#            path = '/' + tmp[1]
252
#    (username, password, host, port) = split_netloc(netloc)
253
#    (subdomain, domain, tld) = split_host(host)
254
#    return ParseResult(parts.scheme, username, password, subdomain, domain, tld,
255
#                       port, path, parts.query, parts.fragment)
256 256

  
257 257

  
258 258
def split(url):
......
341 341
    return username, password, host, port
342 342

  
343 343

  
344
def split_host(host):
    """Split a host name into ``(subdomain, domain, tld)`` using ``PSL``.

    ``tld`` is the public suffix of the host, ``domain`` is the single label
    directly to its left and ``subdomain`` is everything left of that.  Any
    of the three may be the empty string.

    Hosts containing ``[`` and hosts made up solely of characters found in
    ``IP_CHARS`` (the empty string included) are not split: they are
    returned whole in the ``domain`` position.

    Rule precedence follows the Public Suffix List algorithm: an exception
    rule (``!x``) wins, otherwise the matching rule with the most labels
    wins, and when nothing matches the default rule ``*`` applies, i.e. the
    last label alone is the suffix.
    """
    if '[' in host:
        return '', host, ''
    # for/else: the else branch runs only when no character broke the loop,
    # i.e. every character of the host is an IP character.
    for c in host:
        if c not in IP_CHARS:
            break
    else:
        return '', host, ''
    parts = host.split('.')
    # Default rule "*": used when no list entry matches.  Previously the
    # labels left of the last one were silently discarded in that case.
    domain = '.'.join(parts[:-1])
    tld = parts[-1]
    # Walk from the longest candidate suffix to the shortest.
    for i in range(len(parts)):
        candidate = '.'.join(parts[i:])
        if '!' + candidate in PSL:
            # Exception rule: the suffix is the candidate minus its
            # leftmost label.
            domain = '.'.join(parts[:i + 1])
            tld = '.'.join(parts[i + 1:])
            break
        # A wildcard rule covers one more label than the candidate, so it
        # must be tried before the exact rule (most labels wins).  It needs
        # a label to its left to match: at i == 0 there is none, and
        # parts[:i - 1] would wrap around to a negative index.
        if i > 0 and '*.' + candidate in PSL:
            domain = '.'.join(parts[:i - 1])
            tld = '.'.join(parts[i - 1:])
            break
        if candidate in PSL:
            domain = '.'.join(parts[:i])
            tld = candidate
            break
    subdomain = ''
    if '.' in domain:
        subdomain, domain = domain.rsplit('.', 1)
    return subdomain, domain, tld
344
#def split_host(host):
345
#    """Use the Public Suffix List to split host into subdomain, domain and tld
346
#    """
347
#    if '[' in host:
348
#        return '', host, ''
349
#    domain = subdomain = tld = ''
350
#    for c in host:
351
#        if c not in IP_CHARS:
352
#            break
353
#    else:
354
#        return '', host, ''
355
#    parts = host.split('.')
356
#    for i in range(len(parts)):
357
#        tld = '.'.join(parts[i:])
358
#        wildcard_tld = '*.' + tld
359
#        exception_tld = '!' + tld
360
#        if exception_tld in PSL:
361
#            domain = '.'.join(parts[:i+1])
362
#            tld = '.'.join(parts[i+1:])
363
#            break
364
#        if tld in PSL:
365
#            domain = '.'.join(parts[:i])
366
#            break
367
#        if wildcard_tld in PSL:
368
#            domain = '.'.join(parts[:i-1])
369
#            tld = '.'.join(parts[i-1:])
370
#            break
371
#    if '.' in domain:
372
#        (subdomain, domain) = domain.rsplit('.', 1)
373
#    return subdomain, domain, tld

Also available in: Unified diff