support for non-latin (e.g. cyrillic) URLs

This commit is contained in:
Miroslav Stampar 2011-10-23 17:02:48 +00:00
parent 1c3f4e9e54
commit 3f0517d3f3
2 changed files with 59 additions and 0 deletions

View File

@ -3004,3 +3004,57 @@ def randomizeParameterValue(value):
retVal = retVal.replace(match.group(), str(randomInt(len(match.group())))) retVal = retVal.replace(match.group(), str(randomInt(len(match.group()))))
return retVal return retVal
def asciifyUrl(url, force_quote=False):
    """
    Attempts to make a unicode URL usable with ``urllib/urllib2``.

    More specifically, it attempts to convert the unicode object ``url``,
    which is meant to represent an IRI, to a unicode object that,
    containing only ASCII characters, is a valid URI. This involves:

        * IDNA/Puny-encoding the domain name.
        * UTF8-quoting the user info, path, querystring and fragment parts.

    A part is quoted only when it contains non-ASCII characters, unless
    ``force_quote`` is set, in which case every part is quoted. Note that
    quoting a part also escapes any '%' character already present in it.

    The conversion is best-effort: a value that has no scheme or no host
    name, or whose network location can not be parsed (unbalanced IPv6
    brackets, non-numeric port), is returned unchanged.

    See also RFC 3987.

    Reference: http://blog.elsdoerfer.name/2008/12/12/opening-iris-in-python/
    """

    try:
        parts = urlparse.urlsplit(url)
        # hostname/port are computed lazily and the port is validated only
        # on access, hence both are read inside the try block
        hostname, port = parts.hostname, parts.port
    except ValueError:
        # malformed network location - apparently not an url
        return url

    # hostname is None for locations like ':8080' or 'user@'
    if not parts.scheme or not parts.netloc or not hostname:
        # apparently not an url
        return url

    # idna-encode domain
    try:
        encoded = hostname.encode('idna')
    except UnicodeError:
        # empty or too long label (or undecodable byte string): keep the
        # host name as it is and still quote the remaining parts
        pass
    else:
        # the codec may hand back a different string type than it was
        # given; keep the host the same type as the rest of the url parts
        if isinstance(encoded, type(hostname)):
            hostname = encoded
        else:
            hostname = encoded.decode('ascii')

    # urlsplit() strips the brackets of an IPv6 literal - put them back,
    # otherwise the address can not be told apart from the port
    if ':' in hostname:
        hostname = '[' + hostname + ']'

    # UTF8-quote the other parts. We check each part individually if
    # it needs to be quoted - that should catch some additional user
    # errors, say for example an umlaut in the username even though
    # the path *is* already quoted.
    def quote(s, safe):
        s = s or ''

        # Triggers on non-ascii characters - another option would be:
        #     urllib.quote(s.replace('%', '')) != s.replace('%', '')
        # which would trigger on all %-characters, e.g. "&".
        try:
            s.encode('ascii')
        except UnicodeError:
            needs_quoting = True
        else:
            needs_quoting = False

        if not (needs_quoting or force_quote):
            return s

        try:
            raw = s.encode('utf8')
        except UnicodeError:
            # byte string already holding non-ascii bytes - quote it as is
            raw = s

        return urllib.quote(raw, safe=safe)

    username = quote(parts.username, safe='')
    password = quote(parts.password, safe='')
    path = quote(parts.path, safe='/')
    query = quote(parts.query, safe='&=')
    fragment = quote(parts.fragment, safe='/?&=')

    # put everything back together
    netloc = hostname

    if username or password:
        userinfo = username
        if password:
            userinfo += ':' + password
        netloc = userinfo + '@' + netloc

    # compare against None so that an explicit port 0 is not dropped
    if port is not None:
        netloc += ':' + str(port)

    return urlparse.urlunsplit([parts.scheme, netloc, path, query, fragment])

View File

@ -17,6 +17,7 @@ import traceback
from extra.multipart import multipartpost from extra.multipart import multipartpost
from lib.core.agent import agent from lib.core.agent import agent
from lib.core.common import asciifyUrl
from lib.core.common import average from lib.core.common import average
from lib.core.common import calculateDeltaSeconds from lib.core.common import calculateDeltaSeconds
from lib.core.common import clearConsoleLine from lib.core.common import clearConsoleLine
@ -160,6 +161,10 @@ class Connect:
responseHeaders = None responseHeaders = None
logHeaders = "" logHeaders = ""
# support for non-latin URLs (e.g. cyrillic), as urllib/urllib2 do not
# support those by default
url = asciifyUrl(url)
# fix for known issues when using url in unicode format # fix for known issues when using url in unicode format
# (e.g. UnicodeDecodeError: "url = url + '?' + query" in redirect case) # (e.g. UnicodeDecodeError: "url = url + '?' + query" in redirect case)
url = unicodeencode(url) url = unicodeencode(url)