def asciifyUrl(url, force_quote=False):
    """
    Attempts to make a unicode url usable with ``urllib/urllib2``.

    More specifically, it attempts to convert the text object ``url``,
    which is meant to represent an IRI, to a text object that, containing
    only ASCII characters in its authority, path and query, is a valid URI.
    This involves:

    * IDNA/Puny-encoding the domain name.
    * UTF8-quoting the user info, path and querystring parts.

    The fragment is passed through unchanged.

    ``url`` is the URL to convert. When ``force_quote`` is true, the user
    info, path and query are percent-quoted even if they are already pure
    ASCII (note that this re-quotes existing ``%`` escapes).

    Returns the converted URL. The value is returned unchanged whenever it
    cannot be processed safely: no scheme/host, malformed IPv6 literal,
    non-numeric port, or a host name that cannot be IDNA-encoded.

    See also RFC 3987.

    Reference: http://blog.elsdoerfer.name/2008/12/12/opening-iris-in-python/
    """

    # Function-local imports keep the block self-contained and let it run
    # on both the Python 2 and Python 3 standard library layouts.
    try:
        from urllib.parse import quote as _url_quote, urlsplit, urlunsplit
    except ImportError:
        from urllib import quote as _url_quote
        from urlparse import urlsplit, urlunsplit

    try:
        parts = urlsplit(url)
        # The port is parsed lazily and raises ValueError when it is not
        # numeric, so touch it here, inside the guard.
        port = parts.port
    except ValueError:
        # malformed authority (bad IPv6 literal, junk port): leave as is
        return url

    if not parts.scheme or not parts.netloc or not parts.hostname:
        # apparently not an url (hostname is None for e.g. "http://:80/")
        return url

    # idna-encode domain
    try:
        hostname = parts.hostname.encode('idna')
    except UnicodeError:
        # empty or over-long label, or undecodable bytes: nothing sane to do
        return url

    if not isinstance(hostname, str):
        # the codec returns bytes on Python 3; netloc is assembled as text
        hostname = hostname.decode('ascii')

    if ':' in hostname:
        # ``hostname`` strips the brackets of IPv6 literals; restore them,
        # otherwise the address and the port become indistinguishable
        hostname = '[' + hostname + ']'

    # UTF8-quote the other parts. We check each part individually if
    # it needs to be quoted - that should catch some additional user
    # errors, say for example an umlaut in the username even though
    # the path *is* already quoted.
    def quote(s, safe):
        s = s or ''

        # Triggers on non-ascii characters only, so parts that are already
        # percent-quoted are left alone unless ``force_quote`` is set.
        try:
            s.encode('ascii')
            plain = True
        except UnicodeError:
            plain = False

        if plain and not force_quote:
            return s

        # byte strings are quoted as they are; text is UTF-8 encoded first
        raw = s if isinstance(s, bytes) else s.encode('utf8')
        return _url_quote(raw, safe=safe)

    username = quote(parts.username, '')
    password = quote(parts.password, safe='')
    path = quote(parts.path, safe='/')
    query = quote(parts.query, safe='&=')

    # put everything back together
    netloc = hostname
    if username or password:
        userinfo = username
        if password:
            userinfo += ':' + password
        netloc = userinfo + '@' + netloc

    if port is not None:
        netloc += ':' + str(port)

    return urlunsplit([parts.scheme, netloc, path, query, parts.fragment])