Distinguish invalid Unicode bytes from safe-encoded characters (so that safe-encoded values can later be decoded correctly)

This commit is contained in:
Miroslav Stampar 2012-12-27 22:43:39 +01:00
parent 3d01890147
commit 648d91d790
3 changed files with 12 additions and 9 deletions

View File

@ -42,14 +42,14 @@ def safecharencode(value):
if isinstance(value, basestring): if isinstance(value, basestring):
if any(_ not in SAFE_CHARS for _ in value): if any(_ not in SAFE_CHARS for _ in value):
retVal = re.sub(r'(?i)(?!\\x[0-9A-F]{2})\\', SLASH_MARKER, value) retVal = retVal.replace('\\', SLASH_MARKER)
for char in SAFE_ENCODE_SLASH_REPLACEMENTS: for char in SAFE_ENCODE_SLASH_REPLACEMENTS:
retVal = retVal.replace(char, repr(char).strip('\'')) retVal = retVal.replace(char, repr(char).strip('\''))
retVal = retVal.replace(SLASH_MARKER, '\\\\')
retVal = reduce(lambda x, y: x + (y if (y in string.printable or ord(y) > 255) else '\\x%02x' % ord(y)), retVal, (unicode if isinstance(value, unicode) else str)()) retVal = reduce(lambda x, y: x + (y if (y in string.printable or ord(y) > 255) else '\\x%02x' % ord(y)), retVal, (unicode if isinstance(value, unicode) else str)())
retVal = retVal.replace(SLASH_MARKER, '\\\\')
elif isinstance(value, list): elif isinstance(value, list):
for i in xrange(len(value)): for i in xrange(len(value)):
retVal[i] = safecharencode(value[i]) retVal[i] = safecharencode(value[i])
@ -63,17 +63,15 @@ def safechardecode(value):
retVal = value retVal = value
if isinstance(value, basestring): if isinstance(value, basestring):
regex = re.compile(HEX_ENCODED_CHAR_REGEX) retVal = retVal.replace('\\\\', SLASH_MARKER)
while True: while True:
match = regex.search(retVal) match = re.search(HEX_ENCODED_CHAR_REGEX, retVal)
if match: if match:
retVal = retVal.replace(match.group("result"), (unichr if isinstance(value, unicode) else chr)(ord(binascii.unhexlify(match.group("result").lstrip('\\x'))))) retVal = retVal.replace(match.group("result"), (unichr if isinstance(value, unicode) else chr)(ord(binascii.unhexlify(match.group("result").lstrip('\\x')))))
else: else:
break break
retVal = retVal.replace('\\\\', SLASH_MARKER)
for char in SAFE_ENCODE_SLASH_REPLACEMENTS[::-1]: for char in SAFE_ENCODE_SLASH_REPLACEMENTS[::-1]:
retVal = retVal.replace(repr(char).strip('\''), char) retVal = retVal.replace(repr(char).strip('\''), char)

View File

@ -94,6 +94,7 @@ from lib.core.settings import GENERIC_DOC_ROOT_DIRECTORY_NAMES
from lib.core.settings import HASHDB_MILESTONE_VALUE from lib.core.settings import HASHDB_MILESTONE_VALUE
from lib.core.settings import HOST_ALIASES from lib.core.settings import HOST_ALIASES
from lib.core.settings import INFERENCE_UNKNOWN_CHAR from lib.core.settings import INFERENCE_UNKNOWN_CHAR
from lib.core.settings import INVALID_UNICODE_CHAR_FORMAT
from lib.core.settings import ISSUES_PAGE from lib.core.settings import ISSUES_PAGE
from lib.core.settings import IS_WIN from lib.core.settings import IS_WIN
from lib.core.settings import LARGE_OUTPUT_THRESHOLD from lib.core.settings import LARGE_OUTPUT_THRESHOLD
@ -1811,9 +1812,9 @@ def getUnicode(value, encoding=None, system=False, noneToNull=False):
elif isinstance(value, basestring): elif isinstance(value, basestring):
while True: while True:
try: try:
return unicode(value, encoding or kb.pageEncoding or UNICODE_ENCODING) return unicode(value, encoding or kb.get("pageEncoding") or UNICODE_ENCODING)
except UnicodeDecodeError, ex: except UnicodeDecodeError, ex:
value = value[:ex.start] + "".join("\\x%02x" % ord(_) for _ in value[ex.start:ex.end]) + value[ex.end:] value = value[:ex.start] + "".join(INVALID_UNICODE_CHAR_FORMAT % ord(_) for _ in value[ex.start:ex.end]) + value[ex.end:]
else: else:
return unicode(value) # encoding ignored for non-basestring instances return unicode(value) # encoding ignored for non-basestring instances
else: else:
@ -3260,6 +3261,7 @@ def prioritySortColumns(columns):
Sorts given column names by length in ascending order while those containing Sorts given column names by length in ascending order while those containing
string 'id' go first string 'id' go first
""" """
_ = lambda x: x and "id" in x.lower() _ = lambda x: x and "id" in x.lower()
return sorted(sorted(columns, key=len), lambda x, y: -1 if _(x) and not _(y) else 1 if not _(x) and _(y) else 0) return sorted(sorted(columns, key=len), lambda x, y: -1 if _(x) and not _(y) else 1 if not _(x) and _(y) else 0)

View File

@ -482,6 +482,9 @@ EVENTVALIDATION_REGEX = r'(?i)(?P<name>__EVENTVALIDATION[^"]*)[^>]+value="(?P<re
# Number of rows to generate inside the full union test for limited output (mustn't be too large to prevent payload length problems) # Number of rows to generate inside the full union test for limited output (mustn't be too large to prevent payload length problems)
LIMITED_ROWS_TEST_NUMBER = 15 LIMITED_ROWS_TEST_NUMBER = 15
# Format used for representing invalid unicode characters
INVALID_UNICODE_CHAR_FORMAT = r"\?%02x"
# Regular expression for SOAP-like POST data # Regular expression for SOAP-like POST data
SOAP_RECOGNITION_REGEX = r"(?s)\A(<\?xml[^>]+>)?\s*<([^> ]+)( [^>]+)?>.+</\2.*>\s*\Z" SOAP_RECOGNITION_REGEX = r"(?s)\A(<\?xml[^>]+>)?\s*<([^> ]+)( [^>]+)?>.+</\2.*>\s*\Z"