From 62980d7d5afdc30f0bd71bc44b18c55a66d2e5d0 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Tue, 5 Mar 2013 17:32:10 +0100 Subject: [PATCH] Automatically decoding url encoded data in response --- lib/request/basic.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/lib/request/basic.py b/lib/request/basic.py index 6eb3dbc31..08bdab789 100644 --- a/lib/request/basic.py +++ b/lib/request/basic.py @@ -227,23 +227,27 @@ def decodePage(page, contentEncoding, contentType): kb.pageEncoding = conf.charset # can't do for all responses because we need to support binary files too - if contentType and not isinstance(page, unicode) and any(map(lambda _: _ in contentType.lower(), ("text/txt", "text/raw", "text/html", "text/xml"))): + if contentType and not isinstance(page, unicode) and "text/" in contentType.lower(): # e.g. Ãëàâà if "&#" in page: - page = re.sub('&#(\d{1,3});', lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page) + page = re.sub(r"&#(\d{1,3});", lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page) + + # e.g. %20%28%29 + if "%" in page: + page = re.sub(r"%([0-9a-fA-F]{2})", lambda _: _.group(1).decode("hex"), page) # e.g. & - page = re.sub('&([^;]+);', lambda _: chr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 256) < 256 else _.group(0), page) + page = re.sub(r"&([^;]+);", lambda _: chr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 256) < 256 else _.group(0), page) kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page)) page = getUnicode(page, kb.pageEncoding) # e.g. ’…™ if "&#" in page: - page = re.sub('&#(\d+);', lambda _: unichr(int(_.group(1))), page) + page = re.sub(r"&#(\d+);", lambda _: unichr(int(_.group(1))), page) # e.g. ζ - page = re.sub('&([^;]+);', lambda _: unichr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 0) > 255 else _.group(0), page) + page = re.sub(r"&([^;]+);", lambda _: unichr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 0) > 255 else _.group(0), page) return page