Minor bug fix (skipping HTML decoding in heuristic mode)

This commit is contained in:
Miroslav Stampar 2014-11-11 11:23:14 +01:00
parent dfa8e0456d
commit c5df45a14f

View File

@@ -268,6 +268,10 @@ def decodePage(page, contentEncoding, contentType):
# can't do for all responses because we need to support binary files too # can't do for all responses because we need to support binary files too
if contentType and not isinstance(page, unicode) and "text/" in contentType.lower(): if contentType and not isinstance(page, unicode) and "text/" in contentType.lower():
if kb.heuristicMode:
kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
page = getUnicode(page, kb.pageEncoding)
else:
# e.g. &#195;&#235;&#224;&#226;&#224; # e.g. &#195;&#235;&#224;&#226;&#224;
if "&#" in page: if "&#" in page:
page = re.sub(r"&#(\d{1,3});", lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page) page = re.sub(r"&#(\d{1,3});", lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page)