From ff5ec48abd4ecdec4dd4cad40f1f35198cdbe11d Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Wed, 16 Jan 2013 14:16:22 +0100 Subject: [PATCH] Minor update for an Issue #8 --- extra/safe2bin/safe2bin.py | 19 ++++++++++++++++--- lib/core/dump.py | 2 +- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/extra/safe2bin/safe2bin.py b/extra/safe2bin/safe2bin.py index 7bdcf3346..4304f0fcc 100644 --- a/extra/safe2bin/safe2bin.py +++ b/extra/safe2bin/safe2bin.py @@ -19,6 +19,9 @@ from optparse import OptionParser # Regex used for recognition of hex encoded characters HEX_ENCODED_CHAR_REGEX = r"(?P<result>\\x[0-9A-Fa-f]{2})" +# Regex used for recognition of representation for hex encoded invalid unicode characters +INVALID_UNICODE_CHAR_REGEX = r"(?P<result>\\\?[0-9A-Fa-f]{2})" + # Raw chars that will be safe encoded to their slash (\) representations (e.g. newline to \n) SAFE_ENCODE_SLASH_REPLACEMENTS = "\t\n\r\x0b\x0c" @@ -49,14 +52,14 @@ def safecharencode(value): retVal = reduce(lambda x, y: x + (y if (y in string.printable or ord(y) > 255) else '\\x%02x' % ord(y)), retVal, (unicode if isinstance(value, unicode) else str)()) - retVal = retVal.replace(SLASH_MARKER, '\\\\') + retVal = retVal.replace(SLASH_MARKER, "\\\\") elif isinstance(value, list): for i in xrange(len(value)): retVal[i] = safecharencode(value[i]) return retVal -def safechardecode(value): +def safechardecode(value, binary=False): """ Reverse function to safecharencode """ @@ -68,7 +71,7 @@ def safechardecode(value): while True: match = re.search(HEX_ENCODED_CHAR_REGEX, retVal) if match: - retVal = retVal.replace(match.group("result"), (unichr if isinstance(value, unicode) else chr)(ord(binascii.unhexlify(match.group("result").lstrip('\\x'))))) + retVal = retVal.replace(match.group("result"), (unichr if isinstance(value, unicode) else chr)(ord(binascii.unhexlify(match.group("result").lstrip("\\x"))))) else: break @@ -77,6 +80,16 @@ def safechardecode(value): retVal = retVal.replace(SLASH_MARKER, '\\') + 
if binary: + if isinstance(retVal, unicode): + retVal = retVal.encode("utf8") + while True: + match = re.search(INVALID_UNICODE_CHAR_REGEX, retVal) + if match: + retVal = retVal.replace(match.group("result"), chr(ord(binascii.unhexlify(match.group("result").lstrip("\\?"))))) + else: + break + elif isinstance(value, (list, tuple)): for i in xrange(len(value)): retVal[i] = safechardecode(value[i]) diff --git a/lib/core/dump.py b/lib/core/dump.py index 8ce184906..3ca145a3b 100644 --- a/lib/core/dump.py +++ b/lib/core/dump.py @@ -483,7 +483,7 @@ class Dump(object): # mimetype = magic.from_buffer(value, mime=True) # if mimetype.startswith("application") or mimetype.startswith("image"): # with codecs.open("%s%s%s" % (dumpDbPath, os.sep, "%s-%d.bin" % (column, randomInt(8))), "wb", UNICODE_ENCODING) as f: - # _ = safechardecode(value) + # _ = safechardecode(value, True) # f.write(_) if conf.dumpFormat == DUMP_FORMAT.CSV: