support for non-latin (e.g. cyrillic) URLs

This commit is contained in:
Miroslav Stampar 2011-10-23 17:02:48 +00:00
parent 1c3f4e9e54
commit 3f0517d3f3
2 changed files with 59 additions and 0 deletions

View File

@ -3004,3 +3004,57 @@ def randomizeParameterValue(value):
retVal = retVal.replace(match.group(), str(randomInt(len(match.group()))))
return retVal
def asciifyUrl(url, force_quote=False):
    """
    Attempts to make a unicode url usable with ``urllib/urllib2``.

    More specifically, it attempts to convert the unicode object ``url``,
    which is meant to represent a IRI, to an unicode object that,
    containing only ASCII characters, is a valid URI. This involves:

        * IDNA/Puny-encoding the domain name.
        * UTF8-quoting the userinfo, path, querystring and fragment parts.

    ``url`` may also be a (UTF-8 encoded) byte string, in which case a
    byte string is returned. When ``force_quote`` is set, every part is
    quoted even if it contains only ASCII characters.

    Anything that can't be handled as an absolute url (no scheme, no
    host, malformed IPv6 literal, invalid port, undecodable bytes or a
    host name rejected by the IDNA codec) is returned unchanged.

    See also RFC 3987.

    Reference: http://blog.elsdoerfer.name/2008/12/12/opening-iris-in-python/
    """

    # Work on text internally and remember whether bytes have to be
    # handed back, so that the return type follows the argument type.
    is_bytes = isinstance(url, bytes)

    try:
        text = url.decode('utf8') if is_bytes else url
        parts = urlparse.urlsplit(text)
        hostname = parts.hostname
        port = parts.port
    except ValueError:
        # Undecodable bytes (UnicodeError is a ValueError subclass),
        # malformed IPv6 literal or a port that is not a valid number.
        return url

    if not parts.scheme or not parts.netloc or not hostname:
        # apparently not an url
        return url

    try:
        if ':' in hostname:
            # IPv6 literal: ``hostname`` comes without the enclosing
            # brackets, which are mandatory inside of a netloc.
            host = '[%s]' % hostname.encode('ascii').decode('ascii')
        else:
            # idna-encode domain
            host = hostname.encode('idna').decode('ascii')
    except UnicodeError:
        # e.g. empty or too long label
        return url

    # UTF8-quote the other parts. We check each part individually if
    # it needs to be quoted - that should catch some additional user
    # errors, say for example an umlaut in the username even though
    # the path *is* already quoted.
    def quote(s, safe):
        s = s or ''
        # Triggers on non-ascii characters only - parts that are plain
        # ASCII (possibly already %-quoted) are left untouched unless
        # quoting is explicitly forced.
        if force_quote or any(ord(char) > 127 for char in s):
            return urllib.quote(s.encode('utf8'), safe=safe)
        return s

    username = quote(parts.username, safe='')
    password = quote(parts.password, safe='')
    path = quote(parts.path, safe='/')
    query = quote(parts.query, safe='&=')
    # characters allowed unescaped inside of a fragment by RFC 3986
    fragment = quote(parts.fragment, safe="/?:@!$&'()*+,;=")

    # put everything back together
    netloc = host
    if username or password:
        userinfo = username
        if password:
            userinfo += ':' + password
        netloc = userinfo + '@' + netloc
    if port is not None:
        netloc += ':' + str(port)

    retVal = urlparse.urlunsplit([parts.scheme, netloc, path, query, fragment])

    return retVal.encode('ascii') if is_bytes else retVal

View File

@ -17,6 +17,7 @@ import traceback
from extra.multipart import multipartpost
from lib.core.agent import agent
from lib.core.common import asciifyUrl
from lib.core.common import average
from lib.core.common import calculateDeltaSeconds
from lib.core.common import clearConsoleLine
@ -160,6 +161,10 @@ class Connect:
responseHeaders = None
logHeaders = ""
# support for non-latin URLs (e.g. Cyrillic) as urllib/urllib2 doesn't
# support those by default
url = asciifyUrl(url)
# fix for known issues when using url in unicode format
# (e.g. UnicodeDecodeError: "url = url + '?' + query" in redirect case)
url = unicodeencode(url)