diff --git a/lib/core/convert.py b/lib/core/convert.py
index fa8b6ebd9..5c8c71e2d 100644
--- a/lib/core/convert.py
+++ b/lib/core/convert.py
@@ -144,5 +144,4 @@ def htmlunescape(value):
     if value and isinstance(value, basestring):
         codes = (('&lt;', '<'), ('&gt;', '>'), ('&quot;', '"'), ('&nbsp;', ' '), ('&amp;', '&'))
         retVal = reduce(lambda x, y: x.replace(y[0], y[1]), codes, retVal)
-        retVal = re.sub('&#(\d+);', lambda x: getUnicode(chr(x.group(1))), retVal)
     return retVal
diff --git a/lib/request/basic.py b/lib/request/basic.py
index 07b718ced..506b60e47 100644
--- a/lib/request/basic.py
+++ b/lib/request/basic.py
@@ -209,8 +209,11 @@ def decodePage(page, contentEncoding, contentType):
     else:
         kb.pageEncoding = conf.charset
 
+    # can't do for all responses because we need to support binary files too
     if contentType and not isinstance(page, unicode) and any(map(lambda x: x in contentType.lower(), ("text/txt", "text/raw", "text/html", "text/xml"))):
-        # can't do for all responses because we need to support binary files too
+        if "&#" in page:
+            # page is still a byte string here, and chr() raises ValueError above 255, so larger numeric references are left untouched
+            page = re.sub(r'&#(\d+);', lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page)
         kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
         page = getUnicode(page, kb.pageEncoding)
 