Minor bug fix (skipping HTML decoding in heuristic mode)

2025-07-03 03:13:07 +03:00 · 2014-11-11 11:23:14 +01:00 · 2014-11-11 11:23:14 +01:00 · c5df45a14f
commit c5df45a14f
parent dfa8e0456d
1 changed file with 26 additions and 22 deletions
--- a/lib/request/basic.py
+++ b/lib/request/basic.py
@@ -268,33 +268,37 @@ def decodePage(page, contentEncoding, contentType):
    # can't do for all responses because we need to support binary files too
    if contentType and not isinstance(page, unicode) and "text/" in contentType.lower():
-        # e.g. &#195;&#235;&#224;&#226;&#224;
+        if kb.heuristicMode:
-        if "&#" in page:
+            kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
-            page = re.sub(r"&#(\d{1,3});", lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page)
+            page = getUnicode(page, kb.pageEncoding)
+        else:
+            # e.g. &#195;&#235;&#224;&#226;&#224;
+            if "&#" in page:
+                page = re.sub(r"&#(\d{1,3});", lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page)
-        # e.g. %20%28%29
+            # e.g. %20%28%29
-        if "%" in page:
+            if "%" in page:
-            page = re.sub(r"%([0-9a-fA-F]{2})", lambda _: _.group(1).decode("hex"), page)
+                page = re.sub(r"%([0-9a-fA-F]{2})", lambda _: _.group(1).decode("hex"), page)
-        # e.g. &amp;
+            # e.g. &amp;
-        page = re.sub(r"&([^;]+);", lambda _: chr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 256) < 256 else _.group(0), page)
+            page = re.sub(r"&([^;]+);", lambda _: chr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 256) < 256 else _.group(0), page)
-        kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
+            kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
-        page = getUnicode(page, kb.pageEncoding)
+            page = getUnicode(page, kb.pageEncoding)
-        # e.g. &#8217;&#8230;&#8482;
+            # e.g. &#8217;&#8230;&#8482;
-        if "&#" in page:
+            if "&#" in page:
-            def _(match):
+                def _(match):
-                retVal = match.group(0)
+                    retVal = match.group(0)
-                try:
+                    try:
-                    retVal = unichr(int(match.group(1)))
+                        retVal = unichr(int(match.group(1)))
-                except ValueError:
+                    except ValueError:
-                    pass
+                        pass
-                return retVal
+                    return retVal
-            page = re.sub(r"&#(\d+);", _, page)
+                page = re.sub(r"&#(\d+);", _, page)
-        # e.g. &zeta;
+            # e.g. &zeta;
-        page = re.sub(r"&([^;]+);", lambda _: unichr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 0) > 255 else _.group(0), page)
+            page = re.sub(r"&([^;]+);", lambda _: unichr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 0) > 255 else _.group(0), page)
    return page