From 648d91d7907d62583faa8c9da5de50bf340a42bf Mon Sep 17 00:00:00 2001
From: Miroslav Stampar <miroslav.stampar@gmail.com>
Date: Thu, 27 Dec 2012 22:43:39 +0100
Subject: [PATCH] Distinguish invalid unicode from safe-encoded characters
 (so that safe-encoded values can be decoded properly later)

---
 extra/safe2bin/safe2bin.py | 12 +++++-------
 lib/core/common.py         |  6 ++++--
 lib/core/settings.py       |  3 +++
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/extra/safe2bin/safe2bin.py b/extra/safe2bin/safe2bin.py
index d4e637d31..7bdcf3346 100755
--- a/extra/safe2bin/safe2bin.py
+++ b/extra/safe2bin/safe2bin.py
@@ -42,14 +42,14 @@ def safecharencode(value):
 
     if isinstance(value, basestring):
         if any(_ not in SAFE_CHARS for _ in value):
-            retVal = re.sub(r'(?i)(?!\\x[0-9A-F]{2})\\', SLASH_MARKER, value)
+            retVal = retVal.replace('\\', SLASH_MARKER)
 
             for char in SAFE_ENCODE_SLASH_REPLACEMENTS:
                 retVal = retVal.replace(char, repr(char).strip('\''))
 
-            retVal = retVal.replace(SLASH_MARKER, '\\\\')
-
             retVal = reduce(lambda x, y: x + (y if (y in string.printable or ord(y) > 255) else '\\x%02x' % ord(y)), retVal, (unicode if isinstance(value, unicode) else str)())
+
+            retVal = retVal.replace(SLASH_MARKER, '\\\\')
     elif isinstance(value, list):
         for i in xrange(len(value)):
             retVal[i] = safecharencode(value[i])
@@ -63,17 +63,15 @@ def safechardecode(value):
 
     retVal = value
     if isinstance(value, basestring):
-        regex = re.compile(HEX_ENCODED_CHAR_REGEX)
+        retVal = retVal.replace('\\\\', SLASH_MARKER)
 
         while True:
-            match = regex.search(retVal)
+            match = re.search(HEX_ENCODED_CHAR_REGEX, retVal)
             if match:
                 retVal = retVal.replace(match.group("result"), (unichr if isinstance(value, unicode) else chr)(ord(binascii.unhexlify(match.group("result").lstrip('\\x')))))
             else:
                 break
 
-        retVal = retVal.replace('\\\\', SLASH_MARKER)
-
         for char in SAFE_ENCODE_SLASH_REPLACEMENTS[::-1]:
             retVal = retVal.replace(repr(char).strip('\''), char)
 
diff --git a/lib/core/common.py b/lib/core/common.py
index ca9bdbcd6..8861f651d 100644
--- a/lib/core/common.py
+++ b/lib/core/common.py
@@ -94,6 +94,7 @@ from lib.core.settings import GENERIC_DOC_ROOT_DIRECTORY_NAMES
 from lib.core.settings import HASHDB_MILESTONE_VALUE
 from lib.core.settings import HOST_ALIASES
 from lib.core.settings import INFERENCE_UNKNOWN_CHAR
+from lib.core.settings import INVALID_UNICODE_CHAR_FORMAT
 from lib.core.settings import ISSUES_PAGE
 from lib.core.settings import IS_WIN
 from lib.core.settings import LARGE_OUTPUT_THRESHOLD
@@ -1811,9 +1812,9 @@ def getUnicode(value, encoding=None, system=False, noneToNull=False):
         elif isinstance(value, basestring):
             while True:
                 try:
-                    return unicode(value, encoding or kb.pageEncoding or UNICODE_ENCODING)
+                    return unicode(value, encoding or kb.get("pageEncoding") or UNICODE_ENCODING)
                 except UnicodeDecodeError, ex:
-                    value = value[:ex.start] + "".join("\\x%02x" % ord(_) for _ in value[ex.start:ex.end]) + value[ex.end:]
+                    value = value[:ex.start] + "".join(INVALID_UNICODE_CHAR_FORMAT % ord(_) for _ in value[ex.start:ex.end]) + value[ex.end:]
         else:
             return unicode(value)  # encoding ignored for non-basestring instances
     else:
@@ -3260,6 +3261,7 @@ def prioritySortColumns(columns):
     Sorts given column names by length in ascending order while those containing
     string 'id' go first
     """
+
     _ = lambda x: x and "id" in x.lower()
     return sorted(sorted(columns, key=len), lambda x, y: -1 if _(x) and not _(y) else 1 if not _(x) and _(y) else 0)
 
diff --git a/lib/core/settings.py b/lib/core/settings.py
index c1f9a89d4..2aaad4658 100644
--- a/lib/core/settings.py
+++ b/lib/core/settings.py
@@ -482,6 +482,9 @@ EVENTVALIDATION_REGEX = r'(?i)(?P<name>__EVENTVALIDATION[^"]*)[^>]+value="(?P<re
 # Number of rows to generate inside the full union test for limited output (mustn't be too large to prevent payload length problems)
 LIMITED_ROWS_TEST_NUMBER = 15
 
+# Format used for representing invalid unicode characters (undecodable bytes are written as \?XX, distinct from the \xXX safe encoding)
+INVALID_UNICODE_CHAR_FORMAT = r"\?%02x"
+
 # Regular expression for SOAP-like POST data
 SOAP_RECOGNITION_REGEX = r"(?s)\A(<\?xml[^>]+>)?\s*<([^> ]+)( [^>]+)?>.+</\2.*>\s*\Z"