From 5593bf2fee7c7c2a508d85cee6fd426897295d1a Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Mon, 16 Nov 2015 15:02:30 +0100 Subject: [PATCH] Another patch related to #1539 (simplifying unicode bad chars and preventing double encoding of safe chars) --- extra/safe2bin/safe2bin.py | 17 ++++++++--------- lib/core/common.py | 4 ++++ lib/core/settings.py | 2 +- xml/queries.xml | 2 +- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/extra/safe2bin/safe2bin.py b/extra/safe2bin/safe2bin.py index c91620ec6..ff3f7d8d8 100644 --- a/extra/safe2bin/safe2bin.py +++ b/extra/safe2bin/safe2bin.py @@ -19,15 +19,18 @@ from optparse import OptionParser # Regex used for recognition of hex encoded characters HEX_ENCODED_CHAR_REGEX = r"(?P\\x[0-9A-Fa-f]{2})" -# Regex used for recognition of representation for hex encoded invalid unicode characters -INVALID_UNICODE_CHAR_REGEX = r"(?P\\\?[0-9A-Fa-f]{2})" - # Raw chars that will be safe encoded to their slash (\) representations (e.g. newline to \n) SAFE_ENCODE_SLASH_REPLACEMENTS = "\t\n\r\x0b\x0c" # Characters that don't need to be safe encoded SAFE_CHARS = "".join(filter(lambda x: x not in SAFE_ENCODE_SLASH_REPLACEMENTS, string.printable.replace('\\', ''))) +# Prefix used for hex encoded values +HEX_ENCODED_PREFIX = r"\x" + +# Strings used for temporary marking of hex encoded prefixes (to prevent double encoding) +HEX_ENCODED_PREFIX_MARKER = "__HEX_ENCODED_PREFIX__" + # String used for temporary marking of slash characters SLASH_MARKER = "__SLASH__" @@ -45,6 +48,7 @@ def safecharencode(value): if isinstance(value, basestring): if any(_ not in SAFE_CHARS for _ in value): + retVal = retVal.replace(HEX_ENCODED_PREFIX, HEX_ENCODED_PREFIX_MARKER) retVal = retVal.replace('\\', SLASH_MARKER) for char in SAFE_ENCODE_SLASH_REPLACEMENTS: @@ -53,6 +57,7 @@ def safecharencode(value): retVal = reduce(lambda x, y: x + (y if (y in string.printable or isinstance(value, unicode) and ord(y) >= 160) else '\\x%02x' % ord(y)), retVal, (unicode if isinstance(value, unicode) else str)()) retVal = retVal.replace(SLASH_MARKER, "\\\\") + retVal = retVal.replace(HEX_ENCODED_PREFIX_MARKER, HEX_ENCODED_PREFIX) elif isinstance(value, list): for i in xrange(len(value)): retVal[i] = safecharencode(value[i]) @@ -83,12 +88,6 @@ def safechardecode(value, binary=False): if binary: if isinstance(retVal, unicode): retVal = retVal.encode("utf8") - while True: - match = re.search(INVALID_UNICODE_CHAR_REGEX, retVal) - if match: - retVal = retVal.replace(match.group("result"), chr(ord(binascii.unhexlify(match.group("result").lstrip("\\?"))))) - else: - break elif isinstance(value, (list, tuple)): for i in xrange(len(value)): diff --git a/lib/core/common.py b/lib/core/common.py index 1f28b5f25..a7ddaef35 100755 --- a/lib/core/common.py +++ b/lib/core/common.py @@ -2954,6 +2954,10 @@ def decodeIntToUnicode(value): retVal = unichr(value) else: retVal = getUnicode(raw, conf.charset) + + if Backend.isDbms(DBMS.MYSQL): + import pdb + pdb.set_trace() else: retVal = getUnicode(chr(value)) except: diff --git a/lib/core/settings.py b/lib/core/settings.py index 5e9b6f1a1..d478ae50a 100644 --- a/lib/core/settings.py +++ b/lib/core/settings.py @@ -587,7 +587,7 @@ EVENTVALIDATION_REGEX = r'(?i)(?P__EVENTVALIDATION[^"]*)[^>]+value="(?P]+>(.+>)?\s*\Z" diff --git a/xml/queries.xml b/xml/queries.xml index 98b79cac7..28fd85bfb 100644 --- a/xml/queries.xml +++ b/xml/queries.xml @@ -3,7 +3,7 @@ - +