From c5df45a14f878d08e0b7c4312cf9feda99c85e85 Mon Sep 17 00:00:00 2001
From: Miroslav Stampar <miroslav.stampar@gmail.com>
Date: Tue, 11 Nov 2014 11:23:14 +0100
Subject: [PATCH] Minor bug fix (skipping HTML decoding in heuristic mode)

---
 lib/request/basic.py | 48 ++++++++++++++++++++++++--------------------
 1 file changed, 26 insertions(+), 22 deletions(-)

diff --git a/lib/request/basic.py b/lib/request/basic.py
index 2fa61b6d2..72a5b03f7 100755
--- a/lib/request/basic.py
+++ b/lib/request/basic.py
@@ -268,33 +268,37 @@ def decodePage(page, contentEncoding, contentType):
 
     # can't do for all responses because we need to support binary files too
     if contentType and not isinstance(page, unicode) and "text/" in contentType.lower():
-        # e.g. &#195;&#235;&#224;&#226;&#224;
-        if "&#" in page:
-            page = re.sub(r"&#(\d{1,3});", lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page)
+        if kb.heuristicMode:
+            kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
+            page = getUnicode(page, kb.pageEncoding)
+        else:
+            # e.g. &#195;&#235;&#224;&#226;&#224;
+            if "&#" in page:
+                page = re.sub(r"&#(\d{1,3});", lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page)
 
-        # e.g. %20%28%29
-        if "%" in page:
-            page = re.sub(r"%([0-9a-fA-F]{2})", lambda _: _.group(1).decode("hex"), page)
+            # e.g. %20%28%29
+            if "%" in page:
+                page = re.sub(r"%([0-9a-fA-F]{2})", lambda _: _.group(1).decode("hex"), page)
 
-        # e.g. &amp;
-        page = re.sub(r"&([^;]+);", lambda _: chr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 256) < 256 else _.group(0), page)
+            # e.g. &amp;
+            page = re.sub(r"&([^;]+);", lambda _: chr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 256) < 256 else _.group(0), page)
 
-        kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
-        page = getUnicode(page, kb.pageEncoding)
+            kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
+            page = getUnicode(page, kb.pageEncoding)
 
-        # e.g. &#8217;&#8230;&#8482;
-        if "&#" in page:
-            def _(match):
-                retVal = match.group(0)
-                try:
-                    retVal = unichr(int(match.group(1)))
-                except ValueError:
-                    pass
-                return retVal
-            page = re.sub(r"&#(\d+);", _, page)
+            # e.g. &#8217;&#8230;&#8482;
+            if "&#" in page:
+                def _(match):
+                    retVal = match.group(0)
+                    try:
+                        retVal = unichr(int(match.group(1)))
+                    except ValueError:
+                        pass
+                    return retVal
+                page = re.sub(r"&#(\d+);", _, page)
 
-        # e.g. &zeta;
-        page = re.sub(r"&([^;]+);", lambda _: unichr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 0) > 255 else _.group(0), page)
+            # e.g. &zeta;
+            page = re.sub(r"&([^;]+);", lambda _: unichr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 0) > 255 else _.group(0), page)
 
     return page