Implementing Issue #111

2026-02-20 14:20:48 +03:00 · 2012-07-23 15:14:52 +02:00 · 2012-07-23 15:14:52 +02:00 · ab9cb80602
commit ab9cb80602
parent 6809449e31
2 changed files with 3 additions and 2 deletions
--- a/lib/core/convert.py
+++ b/lib/core/convert.py
@@ -144,5 +144,4 @@ def htmlunescape(value):
    if value and isinstance(value, basestring):
        codes = (('&lt;', '<'), ('&gt;', '>'), ('&quot;', '"'), ('&nbsp;', ' '), ('&amp;', '&'))
        retVal = reduce(lambda x, y: x.replace(y[0], y[1]), codes, retVal)
-        retVal = re.sub('&#(\d+);', lambda x: getUnicode(chr(x.group(1))), retVal)
    return retVal
--- a/lib/request/basic.py
+++ b/lib/request/basic.py
@@ -209,8 +209,10 @@ def decodePage(page, contentEncoding, contentType):
    else:
        kb.pageEncoding = conf.charset

+    # can't do for all responses because we need to support binary files too
    if contentType and not isinstance(page, unicode) and any(map(lambda x: x in contentType.lower(), ("text/txt", "text/raw", "text/html", "text/xml"))):
-        # can't do for all responses because we need to support binary files too
+        if "&#" in page:
+            page = re.sub('&#(\d+);', lambda _: chr(int(_.group(1))), page)
        kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
        page = getUnicode(page, kb.pageEncoding)