Minor bug fix (skipping HTML decoding in heuristic mode)

2025-12-18 23:54:31 +03:00 · 2014-11-11 11:23:14 +01:00 · 2014-11-11 11:23:14 +01:00 · c5df45a14f
commit c5df45a14f
parent dfa8e0456d
1 changed file with 26 additions and 22 deletions
--- a/lib/request/basic.py
+++ b/lib/request/basic.py
@@ -268,6 +268,10 @@ def decodePage(page, contentEncoding, contentType):
    # can't do for all responses because we need to support binary files too
    if contentType and not isinstance(page, unicode) and "text/" in contentType.lower():
        if kb.heuristicMode:
            kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
            page = getUnicode(page, kb.pageEncoding)
        else:
            # e.g. &#195;&#235;&#224;&#226;&#224;
            if "&#" in page:
                page = re.sub(r"&#(\d{1,3});", lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page)