From c5df45a14f878d08e0b7c4312cf9feda99c85e85 Mon Sep 17 00:00:00 2001
From: Miroslav Stampar
Date: Tue, 11 Nov 2014 11:23:14 +0100
Subject: [PATCH] Minor bug fix (skipping HTML decoding in heuristic mode)

---
 lib/request/basic.py | 48 ++++++++++++++++++++++++--------------------
 1 file changed, 26 insertions(+), 22 deletions(-)

diff --git a/lib/request/basic.py b/lib/request/basic.py
index 2fa61b6d2..72a5b03f7 100755
--- a/lib/request/basic.py
+++ b/lib/request/basic.py
@@ -268,33 +268,37 @@ def decodePage(page, contentEncoding, contentType):
 
     # can't do for all responses because we need to support binary files too
     if contentType and not isinstance(page, unicode) and "text/" in contentType.lower():
-        # e.g. &#195;&#235;&#224;&#226;&#224;
-        if "&#" in page:
-            page = re.sub(r"&#(\d{1,3});", lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page)
+        if kb.heuristicMode:
+            kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
+            page = getUnicode(page, kb.pageEncoding)
+        else:
+            # e.g. &#195;&#235;&#224;&#226;&#224;
+            if "&#" in page:
+                page = re.sub(r"&#(\d{1,3});", lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page)
 
-        # e.g. %20%28%29
-        if "%" in page:
-            page = re.sub(r"%([0-9a-fA-F]{2})", lambda _: _.group(1).decode("hex"), page)
+            # e.g. %20%28%29
+            if "%" in page:
+                page = re.sub(r"%([0-9a-fA-F]{2})", lambda _: _.group(1).decode("hex"), page)
 
-        # e.g. &amp;
-        page = re.sub(r"&([^;]+);", lambda _: chr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 256) < 256 else _.group(0), page)
+            # e.g. &amp;
+            page = re.sub(r"&([^;]+);", lambda _: chr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 256) < 256 else _.group(0), page)
 
-        kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
-        page = getUnicode(page, kb.pageEncoding)
+            kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
+            page = getUnicode(page, kb.pageEncoding)
 
-        # e.g. &#8217;&#8230;&#8482;
-        if "&#" in page:
-            def _(match):
-                retVal = match.group(0)
-                try:
-                    retVal = unichr(int(match.group(1)))
-                except ValueError:
-                    pass
-                return retVal
-            page = re.sub(r"&#(\d+);", _, page)
+            # e.g. &#8217;&#8230;&#8482;
+            if "&#" in page:
+                def _(match):
+                    retVal = match.group(0)
+                    try:
+                        retVal = unichr(int(match.group(1)))
+                    except ValueError:
+                        pass
+                    return retVal
+                page = re.sub(r"&#(\d+);", _, page)
 
-        # e.g. &zeta;
-        page = re.sub(r"&([^;]+);", lambda _: unichr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 0) > 255 else _.group(0), page)
+            # e.g. &zeta;
+            page = re.sub(r"&([^;]+);", lambda _: unichr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 0) > 255 else _.group(0), page)
 
     return page
 