Minor bug fix (skipping HTML decoding in heuristic mode)

This commit is contained in:
Miroslav Stampar 2014-11-11 11:23:14 +01:00
parent dfa8e0456d
commit c5df45a14f

View File

@@ -268,33 +268,37 @@ def decodePage(page, contentEncoding, contentType):
# can't do for all responses because we need to support binary files too # can't do for all responses because we need to support binary files too
if contentType and not isinstance(page, unicode) and "text/" in contentType.lower(): if contentType and not isinstance(page, unicode) and "text/" in contentType.lower():
# e.g. &#195;&#235;&#224;&#226;&#224; if kb.heuristicMode:
if "&#" in page: kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
page = re.sub(r"&#(\d{1,3});", lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page) page = getUnicode(page, kb.pageEncoding)
else:
# e.g. &#195;&#235;&#224;&#226;&#224;
if "&#" in page:
page = re.sub(r"&#(\d{1,3});", lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page)
# e.g. %20%28%29 # e.g. %20%28%29
if "%" in page: if "%" in page:
page = re.sub(r"%([0-9a-fA-F]{2})", lambda _: _.group(1).decode("hex"), page) page = re.sub(r"%([0-9a-fA-F]{2})", lambda _: _.group(1).decode("hex"), page)
# e.g. &amp; # e.g. &amp;
page = re.sub(r"&([^;]+);", lambda _: chr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 256) < 256 else _.group(0), page) page = re.sub(r"&([^;]+);", lambda _: chr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 256) < 256 else _.group(0), page)
kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page)) kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
page = getUnicode(page, kb.pageEncoding) page = getUnicode(page, kb.pageEncoding)
# e.g. &#8217;&#8230;&#8482; # e.g. &#8217;&#8230;&#8482;
if "&#" in page: if "&#" in page:
def _(match): def _(match):
retVal = match.group(0) retVal = match.group(0)
try: try:
retVal = unichr(int(match.group(1))) retVal = unichr(int(match.group(1)))
except ValueError: except ValueError:
pass pass
return retVal return retVal
page = re.sub(r"&#(\d+);", _, page) page = re.sub(r"&#(\d+);", _, page)
# e.g. &zeta; # e.g. &zeta;
page = re.sub(r"&([^;]+);", lambda _: unichr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 0) > 255 else _.group(0), page) page = re.sub(r"&([^;]+);", lambda _: unichr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 0) > 255 else _.group(0), page)
return page return page