From 648d91d7907d62583faa8c9da5de50bf340a42bf Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Thu, 27 Dec 2012 22:43:39 +0100 Subject: [PATCH] Distinguishing invalid unicode from safe encoded characters (for proper potential decoding) --- extra/safe2bin/safe2bin.py | 12 +++++------- lib/core/common.py | 6 ++++-- lib/core/settings.py | 3 +++ 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/extra/safe2bin/safe2bin.py b/extra/safe2bin/safe2bin.py index d4e637d31..7bdcf3346 100755 --- a/extra/safe2bin/safe2bin.py +++ b/extra/safe2bin/safe2bin.py @@ -42,14 +42,14 @@ def safecharencode(value): if isinstance(value, basestring): if any(_ not in SAFE_CHARS for _ in value): - retVal = re.sub(r'(?i)(?!\\x[0-9A-F]{2})\\', SLASH_MARKER, value) + retVal = retVal.replace('\\', SLASH_MARKER) for char in SAFE_ENCODE_SLASH_REPLACEMENTS: retVal = retVal.replace(char, repr(char).strip('\'')) - retVal = retVal.replace(SLASH_MARKER, '\\\\') - retVal = reduce(lambda x, y: x + (y if (y in string.printable or ord(y) > 255) else '\\x%02x' % ord(y)), retVal, (unicode if isinstance(value, unicode) else str)()) + + retVal = retVal.replace(SLASH_MARKER, '\\\\') elif isinstance(value, list): for i in xrange(len(value)): retVal[i] = safecharencode(value[i]) @@ -63,17 +63,15 @@ def safechardecode(value): retVal = value if isinstance(value, basestring): - regex = re.compile(HEX_ENCODED_CHAR_REGEX) + retVal = retVal.replace('\\\\', SLASH_MARKER) while True: - match = regex.search(retVal) + match = re.search(HEX_ENCODED_CHAR_REGEX, retVal) if match: retVal = retVal.replace(match.group("result"), (unichr if isinstance(value, unicode) else chr)(ord(binascii.unhexlify(match.group("result").lstrip('\\x'))))) else: break - retVal = retVal.replace('\\\\', SLASH_MARKER) - for char in SAFE_ENCODE_SLASH_REPLACEMENTS[::-1]: retVal = retVal.replace(repr(char).strip('\''), char) diff --git a/lib/core/common.py b/lib/core/common.py index ca9bdbcd6..8861f651d 100644 --- a/lib/core/common.py +++ b/lib/core/common.py @@ -94,6 +94,7 @@ from lib.core.settings import GENERIC_DOC_ROOT_DIRECTORY_NAMES from lib.core.settings import HASHDB_MILESTONE_VALUE from lib.core.settings import HOST_ALIASES from lib.core.settings import INFERENCE_UNKNOWN_CHAR +from lib.core.settings import INVALID_UNICODE_CHAR_FORMAT from lib.core.settings import ISSUES_PAGE from lib.core.settings import IS_WIN from lib.core.settings import LARGE_OUTPUT_THRESHOLD @@ -1811,9 +1812,9 @@ def getUnicode(value, encoding=None, system=False, noneToNull=False): elif isinstance(value, basestring): while True: try: - return unicode(value, encoding or kb.pageEncoding or UNICODE_ENCODING) + return unicode(value, encoding or kb.get("pageEncoding") or UNICODE_ENCODING) except UnicodeDecodeError, ex: - value = value[:ex.start] + "".join("\\x%02x" % ord(_) for _ in value[ex.start:ex.end]) + value[ex.end:] + value = value[:ex.start] + "".join(INVALID_UNICODE_CHAR_FORMAT % ord(_) for _ in value[ex.start:ex.end]) + value[ex.end:] else: return unicode(value) # encoding ignored for non-basestring instances else: @@ -3260,6 +3261,7 @@ def prioritySortColumns(columns): Sorts given column names by length in ascending order while those containing string 'id' go first """ + _ = lambda x: x and "id" in x.lower() return sorted(sorted(columns, key=len), lambda x, y: -1 if _(x) and not _(y) else 1 if not _(x) and _(y) else 0) diff --git a/lib/core/settings.py b/lib/core/settings.py index c1f9a89d4..2aaad4658 100644 --- a/lib/core/settings.py +++ b/lib/core/settings.py @@ -482,6 +482,9 @@ EVENTVALIDATION_REGEX = r'(?i)(?P__EVENTVALIDATION[^"]*)[^>]+value="(?P]+>)?\s*<([^> ]+)( [^>]+)?>.+\s*\Z"