Minor update for Issue #111

This commit is contained in:
Miroslav Stampar 2012-07-23 18:44:50 +02:00
parent fccd69721e
commit 1153b4563c

View File

@@ -213,7 +213,7 @@ def decodePage(page, contentEncoding, contentType):
if contentType and not isinstance(page, unicode) and any(map(lambda x: x in contentType.lower(), ("text/txt", "text/raw", "text/html", "text/xml"))):
# e.g. &#195;&#235;&#224;&#226;&#224;
if "&#" in page:
-page = re.sub('&#(\d+);', lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page)
+page = re.sub('&#(\d{1,3});', lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page)
kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
page = getUnicode(page, kb.pageEncoding)