Update for Issue #111

2026-02-20 22:30:36 +03:00 · 2012-07-23 18:38:46 +02:00 · 2012-07-23 18:38:46 +02:00 · fccd69721e
commit fccd69721e
parent ab9cb80602
1 changed file with 7 additions and 1 deletion
--- a/lib/request/basic.py
+++ b/lib/request/basic.py
@@ -211,11 +211,17 @@ def decodePage(page, contentEncoding, contentType):

    # can't do for all responses because we need to support binary files too
    if contentType and not isinstance(page, unicode) and any(map(lambda x: x in contentType.lower(), ("text/txt", "text/raw", "text/html", "text/xml"))):
+        # e.g. &#195;&#235;&#224;&#226;&#224;
        if "&#" in page:
-            page = re.sub('&#(\d+);', lambda _: chr(int(_.group(1))), page)
+            page = re.sub('&#(\d+);', lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page)
+
        kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
        page = getUnicode(page, kb.pageEncoding)

+        # e.g. &#8217;&#8230;&#8482;
+        if "&#" in page:
+            page = re.sub('&#(\d+);', lambda _: unichr(int(_.group(1))), page)
+
    return page

 def processResponse(page, responseHeaders):