Some more DREI stuff

This commit is contained in:
Miroslav Stampar 2019-04-19 11:24:34 +02:00
parent da15701a55
commit bb7bd51d94
15 changed files with 94 additions and 71 deletions

View File

@ -20,6 +20,9 @@ from optparse import OptionParser
if sys.version_info >= (3, 0): if sys.version_info >= (3, 0):
xrange = range xrange = range
text_type = str
else:
text_type = unicode
# Regex used for recognition of hex encoded characters # Regex used for recognition of hex encoded characters
HEX_ENCODED_CHAR_REGEX = r"(?P<result>\\x[0-9A-Fa-f]{2})" HEX_ENCODED_CHAR_REGEX = r"(?P<result>\\x[0-9A-Fa-f]{2})"
@ -52,14 +55,14 @@ def safecharencode(value):
retVal = value retVal = value
if isinstance(value, basestring): if isinstance(value, basestring):
if any([_ not in SAFE_CHARS for _ in value]): if any(_ not in SAFE_CHARS for _ in value):
retVal = retVal.replace(HEX_ENCODED_PREFIX, HEX_ENCODED_PREFIX_MARKER) retVal = retVal.replace(HEX_ENCODED_PREFIX, HEX_ENCODED_PREFIX_MARKER)
retVal = retVal.replace('\\', SLASH_MARKER) retVal = retVal.replace('\\', SLASH_MARKER)
for char in SAFE_ENCODE_SLASH_REPLACEMENTS: for char in SAFE_ENCODE_SLASH_REPLACEMENTS:
retVal = retVal.replace(char, repr(char).strip('\'')) retVal = retVal.replace(char, repr(char).strip('\''))
retVal = reduce(lambda x, y: x + (y if (y in string.printable or isinstance(value, unicode) and ord(y) >= 160) else '\\x%02x' % ord(y)), retVal, (unicode if isinstance(value, unicode) else str)()) retVal = reduce(lambda x, y: x + (y if (y in string.printable or isinstance(value, text_type) and ord(y) >= 160) else '\\x%02x' % ord(y)), retVal, type(value)())
retVal = retVal.replace(SLASH_MARKER, "\\\\") retVal = retVal.replace(SLASH_MARKER, "\\\\")
retVal = retVal.replace(HEX_ENCODED_PREFIX_MARKER, HEX_ENCODED_PREFIX) retVal = retVal.replace(HEX_ENCODED_PREFIX_MARKER, HEX_ENCODED_PREFIX)
@ -81,7 +84,7 @@ def safechardecode(value, binary=False):
while True: while True:
match = re.search(HEX_ENCODED_CHAR_REGEX, retVal) match = re.search(HEX_ENCODED_CHAR_REGEX, retVal)
if match: if match:
retVal = retVal.replace(match.group("result"), (unichr if isinstance(value, unicode) else chr)(ord(binascii.unhexlify(match.group("result").lstrip("\\x"))))) retVal = retVal.replace(match.group("result"), (unichr if isinstance(value, text_type) else chr)(ord(binascii.unhexlify(match.group("result").lstrip("\\x")))))
else: else:
break break
@ -91,7 +94,7 @@ def safechardecode(value, binary=False):
retVal = retVal.replace(SLASH_MARKER, '\\') retVal = retVal.replace(SLASH_MARKER, '\\')
if binary: if binary:
if isinstance(retVal, unicode): if isinstance(retVal, text_type):
retVal = retVal.encode("utf8") retVal = retVal.encode("utf8")
elif isinstance(value, (list, tuple)): elif isinstance(value, (list, tuple)):

View File

@ -4,4 +4,4 @@
# See the file 'LICENSE' for copying permission # See the file 'LICENSE' for copying permission
# Runs pyflakes on all python files (prerequisite: apt-get install pyflakes) # Runs pyflakes on all python files (prerequisite: apt-get install pyflakes)
find . -wholename "./thirdparty" -prune -o -type f -iname "*.py" -exec pyflakes '{}' \; find . -wholename "./thirdparty" -prune -o -type f -iname "*.py" -exec pyflakes '{}' \; | grep -v "redefines '_'"

View File

@ -333,7 +333,7 @@ def start():
testSqlInj = False testSqlInj = False
if PLACE.GET in conf.parameters and not any([conf.data, conf.testParameter]): if PLACE.GET in conf.parameters and not any((conf.data, conf.testParameter)):
for parameter in re.findall(r"([^=]+)=([^%s]+%s?|\Z)" % (re.escape(conf.paramDel or "") or DEFAULT_GET_POST_DELIMITER, re.escape(conf.paramDel or "") or DEFAULT_GET_POST_DELIMITER), conf.parameters[PLACE.GET]): for parameter in re.findall(r"([^=]+)=([^%s]+%s?|\Z)" % (re.escape(conf.paramDel or "") or DEFAULT_GET_POST_DELIMITER, re.escape(conf.paramDel or "") or DEFAULT_GET_POST_DELIMITER), conf.parameters[PLACE.GET]):
paramKey = (conf.hostname, conf.path, PLACE.GET, parameter[0]) paramKey = (conf.hostname, conf.path, PLACE.GET, parameter[0])

View File

@ -882,6 +882,16 @@ def singleTimeLogMessage(message, level=logging.INFO, flag=None):
logger.log(level, message) logger.log(level, message)
def boldifyMessage(message): def boldifyMessage(message):
"""
Sets ANSI bold marking on entire message if parts found in predefined BOLD_PATTERNS
>>> boldifyMessage("Hello World")
'Hello World'
>>> boldifyMessage("GET parameter id is not injectable")
'\\x1b[1mGET parameter id is not injectable\\x1b[0m'
"""
retVal = message retVal = message
if any(_ in message for _ in BOLD_PATTERNS): if any(_ in message for _ in BOLD_PATTERNS):
@ -890,6 +900,13 @@ def boldifyMessage(message):
return retVal return retVal
def setColor(message, color=None, bold=False, level=None): def setColor(message, color=None, bold=False, level=None):
"""
Sets ANSI color codes
>>> setColor("Hello World", "red")
'\\x1b[31mHello World\\x1b[0m'
"""
retVal = message retVal = message
level = level or extractRegexResult(r"\[(?P<result>%s)\]" % '|'.join(_[0] for _ in getPublicTypeMembers(LOGGING_LEVELS)), message) level = level or extractRegexResult(r"\[(?P<result>%s)\]" % '|'.join(_[0] for _ in getPublicTypeMembers(LOGGING_LEVELS)), message)
@ -933,7 +950,7 @@ def dataToStdout(data, forceOutput=False, bold=False, content_type=None, status=
if multiThreadMode: if multiThreadMode:
logging._acquireLock() logging._acquireLock()
if isinstance(data, unicode): if isinstance(data, six.text_type):
message = stdoutencode(data) message = stdoutencode(data)
else: else:
message = data message = data
@ -1840,7 +1857,7 @@ def safeFilepathEncode(filepath):
retVal = filepath retVal = filepath
if filepath and isinstance(filepath, unicode): if filepath and isinstance(filepath, six.text_type):
retVal = filepath.encode(sys.getfilesystemencoding() or UNICODE_ENCODING) retVal = filepath.encode(sys.getfilesystemencoding() or UNICODE_ENCODING)
return retVal return retVal
@ -1927,7 +1944,7 @@ def getFilteredPageContent(page, onlyText=True, split=" "):
retVal = page retVal = page
# only if the page's charset has been successfully identified # only if the page's charset has been successfully identified
if isinstance(page, unicode): if isinstance(page, six.text_type):
retVal = re.sub(r"(?si)<script.+?</script>|<!--.+?-->|<style.+?</style>%s" % (r"|<[^>]+>|\t|\n|\r" if onlyText else ""), split, page) retVal = re.sub(r"(?si)<script.+?</script>|<!--.+?-->|<style.+?</style>%s" % (r"|<[^>]+>|\t|\n|\r" if onlyText else ""), split, page)
retVal = re.sub(r"%s{2,}" % split, split, retVal) retVal = re.sub(r"%s{2,}" % split, split, retVal)
retVal = htmlunescape(retVal.strip().strip(split)) retVal = htmlunescape(retVal.strip().strip(split))
@ -1945,7 +1962,7 @@ def getPageWordSet(page):
retVal = set() retVal = set()
# only if the page's charset has been successfully identified # only if the page's charset has been successfully identified
if isinstance(page, unicode): if isinstance(page, six.text_type):
retVal = set(_.group(0) for _ in re.finditer(r"\w+", getFilteredPageContent(page))) retVal = set(_.group(0) for _ in re.finditer(r"\w+", getFilteredPageContent(page)))
return retVal return retVal
@ -2430,7 +2447,7 @@ def getUnicode(value, encoding=None, noneToNull=False):
except UnicodeDecodeError: except UnicodeDecodeError:
return six.text_type(str(value), errors="ignore") # encoding ignored for non-basestring instances return six.text_type(str(value), errors="ignore") # encoding ignored for non-basestring instances
def getBytes(value, encoding=UNICODE_ENCODING): def getBytes(value, encoding=UNICODE_ENCODING, errors="strict"):
""" """
Returns byte representation of provided Unicode value Returns byte representation of provided Unicode value
@ -2445,11 +2462,11 @@ def getBytes(value, encoding=UNICODE_ENCODING):
for char in xrange(0xF0000, 0xF00FF + 1): for char in xrange(0xF0000, 0xF00FF + 1):
value = value.replace(unichr(char), "%s%02x" % (SAFE_HEX_MARKER, char - 0xF0000)) value = value.replace(unichr(char), "%s%02x" % (SAFE_HEX_MARKER, char - 0xF0000))
retVal = value.encode(encoding) retVal = value.encode(encoding, errors)
retVal = re.sub(r"%s([0-9a-f]{2})" % SAFE_HEX_MARKER, lambda _: _.group(1).decode("hex"), retVal) retVal = re.sub(r"%s([0-9a-f]{2})" % SAFE_HEX_MARKER, lambda _: _.group(1).decode("hex"), retVal)
else: else:
retVal = value.encode(encoding) retVal = value.encode(encoding, errors)
retVal = re.sub(r"\\x([0-9a-f]{2})", lambda _: _.group(1).decode("hex"), retVal) retVal = re.sub(r"\\x([0-9a-f]{2})", lambda _: _.group(1).decode("hex"), retVal)
return retVal return retVal
@ -3694,7 +3711,7 @@ def removeReflectiveValues(content, payload, suppressWarning=False):
retVal = content retVal = content
try: try:
if all((content, payload)) and isinstance(content, unicode) and kb.reflectiveMechanism and not kb.heuristicMode: if all((content, payload)) and isinstance(content, six.text_type) and kb.reflectiveMechanism and not kb.heuristicMode:
def _(value): def _(value):
while 2 * REFLECTED_REPLACEMENT_REGEX in value: while 2 * REFLECTED_REPLACEMENT_REGEX in value:
value = value.replace(2 * REFLECTED_REPLACEMENT_REGEX, REFLECTED_REPLACEMENT_REGEX) value = value.replace(2 * REFLECTED_REPLACEMENT_REGEX, REFLECTED_REPLACEMENT_REGEX)
@ -3786,7 +3803,7 @@ def normalizeUnicode(value):
'sucuraj' 'sucuraj'
""" """
return unicodedata.normalize("NFKD", value).encode("ascii", "ignore") if isinstance(value, unicode) else value return unicodedata.normalize("NFKD", value).encode("ascii", "ignore") if isinstance(value, six.text_type) else value
def safeSQLIdentificatorNaming(name, isTable=False): def safeSQLIdentificatorNaming(name, isTable=False):
""" """
@ -4105,7 +4122,7 @@ def asciifyUrl(url, forceQuote=False):
# _urllib.parse.quote(s.replace('%', '')) != s.replace('%', '') # _urllib.parse.quote(s.replace('%', '')) != s.replace('%', '')
# which would trigger on all %-characters, e.g. "&". # which would trigger on all %-characters, e.g. "&".
if getUnicode(s).encode("ascii", "replace") != s or forceQuote: if getUnicode(s).encode("ascii", "replace") != s or forceQuote:
return _urllib.parse.quote(s.encode(UNICODE_ENCODING) if isinstance(s, unicode) else s, safe=safe) return _urllib.parse.quote(s.encode(UNICODE_ENCODING) if isinstance(s, six.text_type) else s, safe=safe)
return s return s
username = quote(parts.username, '') username = quote(parts.username, '')
@ -4459,8 +4476,8 @@ def decodeHexValue(value, raw=False):
retVal = retVal.decode("utf-16-be") retVal = retVal.decode("utf-16-be")
except UnicodeDecodeError: except UnicodeDecodeError:
pass pass
if not isinstance(retVal, unicode): if not isinstance(retVal, six.text_type):
retVal = getUnicode(retVal, conf.encoding or "utf8") retVal = getUnicode(retVal, conf.encoding or UNICODE_ENCODING)
return retVal return retVal

View File

@ -242,7 +242,7 @@ class Dump(object):
if table and isListLike(table): if table and isListLike(table):
table = table[0] table = table[0]
maxlength = max(maxlength, len(unsafeSQLIdentificatorNaming(normalizeUnicode(table) or unicode(table)))) maxlength = max(maxlength, len(unsafeSQLIdentificatorNaming(normalizeUnicode(table) or getUnicode(table))))
lines = "-" * (int(maxlength) + 2) lines = "-" * (int(maxlength) + 2)
@ -263,7 +263,7 @@ class Dump(object):
table = table[0] table = table[0]
table = unsafeSQLIdentificatorNaming(table) table = unsafeSQLIdentificatorNaming(table)
blank = " " * (maxlength - len(normalizeUnicode(table) or unicode(table))) blank = " " * (maxlength - len(normalizeUnicode(table) or getUnicode(table)))
self._write("| %s%s |" % (table, blank)) self._write("| %s%s |" % (table, blank))
self._write("+%s+\n" % lines) self._write("+%s+\n" % lines)
@ -358,7 +358,7 @@ class Dump(object):
for ctables in dbTables.values(): for ctables in dbTables.values():
for tables in ctables.values(): for tables in ctables.values():
for table in tables: for table in tables:
maxlength1 = max(maxlength1, len(normalizeUnicode(table) or unicode(table))) maxlength1 = max(maxlength1, len(normalizeUnicode(table) or getUnicode(table)))
for db, counts in dbTables.items(): for db, counts in dbTables.items():
self._write("Database: %s" % unsafeSQLIdentificatorNaming(db) if db else "Current database") self._write("Database: %s" % unsafeSQLIdentificatorNaming(db) if db else "Current database")
@ -384,7 +384,7 @@ class Dump(object):
tables.sort(key=lambda _: _.lower() if hasattr(_, "lower") else _) tables.sort(key=lambda _: _.lower() if hasattr(_, "lower") else _)
for table in tables: for table in tables:
blank1 = " " * (maxlength1 - len(normalizeUnicode(table) or unicode(table))) blank1 = " " * (maxlength1 - len(normalizeUnicode(table) or getUnicode(table)))
blank2 = " " * (maxlength2 - len(str(count))) blank2 = " " * (maxlength2 - len(str(count)))
self._write("| %s%s | %d%s |" % (table, blank1, count, blank2)) self._write("| %s%s | %d%s |" % (table, blank1, count, blank2))

View File

@ -1716,7 +1716,7 @@ def _cleanupOptions():
except re.error: except re.error:
conf.csrfToken = re.escape(conf.csrfToken) conf.csrfToken = re.escape(conf.csrfToken)
finally: finally:
class _(unicode): class _(six.text_type):
pass pass
conf.csrfToken = _(conf.csrfToken) conf.csrfToken = _(conf.csrfToken)
conf.csrfToken._original = original conf.csrfToken._original = original

View File

@ -17,7 +17,7 @@ from lib.core.enums import DBMS_DIRECTORY_NAME
from lib.core.enums import OS from lib.core.enums import OS
# sqlmap version (<major>.<minor>.<month>.<monthly commit>) # sqlmap version (<major>.<minor>.<month>.<monthly commit>)
VERSION = "1.3.4.28" VERSION = "1.3.4.29"
TYPE = "dev" if VERSION.count('.') > 2 and VERSION.split('.')[-1] != '0' else "stable" TYPE = "dev" if VERSION.count('.') > 2 and VERSION.split('.')[-1] != '0' else "stable"
TYPE_COLORS = {"dev": 33, "stable": 90, "pip": 34} TYPE_COLORS = {"dev": 33, "stable": 90, "pip": 34}
VERSION_STRING = "sqlmap/%s#%s" % ('.'.join(VERSION.split('.')[:-1]) if VERSION.count('.') > 2 and VERSION.split('.')[-1] == '0' else VERSION, TYPE) VERSION_STRING = "sqlmap/%s#%s" % ('.'.join(VERSION.split('.')[:-1]) if VERSION.count('.') > 2 and VERSION.split('.')[-1] == '0' else VERSION, TYPE)

View File

@ -73,6 +73,7 @@ from lib.core.settings import URI_INJECTABLE_REGEX
from lib.core.settings import USER_AGENT_ALIASES from lib.core.settings import USER_AGENT_ALIASES
from lib.core.settings import XML_RECOGNITION_REGEX from lib.core.settings import XML_RECOGNITION_REGEX
from lib.utils.hashdb import HashDB from lib.utils.hashdb import HashDB
from thirdparty import six
from thirdparty.odict import OrderedDict from thirdparty.odict import OrderedDict
from thirdparty.six.moves import urllib as _urllib from thirdparty.six.moves import urllib as _urllib
@ -409,7 +410,7 @@ def _setRequestParams():
message += "Do you want sqlmap to automatically update it in further requests? [y/N] " message += "Do you want sqlmap to automatically update it in further requests? [y/N] "
if readInput(message, default='N', boolean=True): if readInput(message, default='N', boolean=True):
class _(unicode): class _(six.text_type):
pass pass
conf.csrfToken = _(re.escape(getUnicode(parameter))) conf.csrfToken = _(re.escape(getUnicode(parameter)))
conf.csrfToken._original = getUnicode(parameter) conf.csrfToken._original = getUnicode(parameter)
@ -712,7 +713,7 @@ def initTargetEnv():
_setDBMS() _setDBMS()
if conf.data: if conf.data:
class _(unicode): class _(six.text_type):
pass pass
kb.postUrlEncode = True kb.postUrlEncode = True

View File

@ -17,6 +17,7 @@ from lib.core.common import Backend
from lib.core.common import extractErrorMessage from lib.core.common import extractErrorMessage
from lib.core.common import extractRegexResult from lib.core.common import extractRegexResult
from lib.core.common import filterNone from lib.core.common import filterNone
from lib.core.common import getBytes
from lib.core.common import getPublicTypeMembers from lib.core.common import getPublicTypeMembers
from lib.core.common import getSafeExString from lib.core.common import getSafeExString
from lib.core.common import getUnicode from lib.core.common import getUnicode
@ -42,11 +43,11 @@ from lib.core.settings import MAX_CONNECTION_TOTAL_SIZE
from lib.core.settings import META_CHARSET_REGEX from lib.core.settings import META_CHARSET_REGEX
from lib.core.settings import PARSE_HEADERS_LIMIT from lib.core.settings import PARSE_HEADERS_LIMIT
from lib.core.settings import SELECT_FROM_TABLE_REGEX from lib.core.settings import SELECT_FROM_TABLE_REGEX
from lib.core.settings import UNICODE_ENCODING
from lib.core.settings import VIEWSTATE_REGEX from lib.core.settings import VIEWSTATE_REGEX
from lib.parse.headers import headersParser from lib.parse.headers import headersParser
from lib.parse.html import htmlParser from lib.parse.html import htmlParser
from lib.utils.htmlentities import htmlEntities from lib.utils.htmlentities import htmlEntities
from thirdparty import six
from thirdparty.chardet import detect from thirdparty.chardet import detect
from thirdparty.odict import OrderedDict from thirdparty.odict import OrderedDict
@ -219,13 +220,13 @@ def checkCharEncoding(encoding, warn=True):
# Reference: http://www.iana.org/assignments/character-sets # Reference: http://www.iana.org/assignments/character-sets
# Reference: http://docs.python.org/library/codecs.html # Reference: http://docs.python.org/library/codecs.html
try: try:
codecs.lookup(encoding.encode(UNICODE_ENCODING) if isinstance(encoding, unicode) else encoding) codecs.lookup(encoding)
except (LookupError, ValueError): except:
encoding = None encoding = None
if encoding: if encoding:
try: try:
unicode(randomStr(), encoding) six.text_type(getBytes(randomStr()), encoding)
except: except:
if warn: if warn:
warnMsg = "invalid web page charset '%s'" % encoding warnMsg = "invalid web page charset '%s'" % encoding
@ -313,7 +314,7 @@ def decodePage(page, contentEncoding, contentType):
kb.pageEncoding = conf.encoding kb.pageEncoding = conf.encoding
# can't do for all responses because we need to support binary files too # can't do for all responses because we need to support binary files too
if not isinstance(page, unicode) and "text/" in contentType: if isinstance(page, six.binary_type) and "text/" in contentType:
# e.g. &#x9;&#195;&#235;&#224;&#226;&#224; # e.g. &#x9;&#195;&#235;&#224;&#226;&#224;
if "&#" in page: if "&#" in page:
page = re.sub(r"&#x([0-9a-f]{1,2});", lambda _: (_.group(1) if len(_.group(1)) == 2 else "0%s" % _.group(1)).decode("hex"), page) page = re.sub(r"&#x([0-9a-f]{1,2});", lambda _: (_.group(1) if len(_.group(1)) == 2 else "0%s" % _.group(1)).decode("hex"), page)

View File

@ -8,6 +8,7 @@ See the file 'LICENSE' for copying permission
import re import re
from lib.core.common import extractRegexResult from lib.core.common import extractRegexResult
from lib.core.common import getBytes
from lib.core.common import getFilteredPageContent from lib.core.common import getFilteredPageContent
from lib.core.common import listToStrValue from lib.core.common import listToStrValue
from lib.core.common import removeDynamicContent from lib.core.common import removeDynamicContent
@ -28,6 +29,7 @@ from lib.core.settings import LOWER_RATIO_BOUND
from lib.core.settings import UPPER_RATIO_BOUND from lib.core.settings import UPPER_RATIO_BOUND
from lib.core.settings import URI_HTTP_HEADER from lib.core.settings import URI_HTTP_HEADER
from lib.core.threads import getCurrentThreadData from lib.core.threads import getCurrentThreadData
from thirdparty import six
def comparison(page, headers, code=None, getRatioValue=False, pageLength=None): def comparison(page, headers, code=None, getRatioValue=False, pageLength=None):
_ = _adjust(_comparison(page, headers, code, getRatioValue, pageLength), getRatioValue) _ = _adjust(_comparison(page, headers, code, getRatioValue, pageLength), getRatioValue)
@ -105,10 +107,10 @@ def _comparison(page, headers, code, getRatioValue, pageLength):
else: else:
# Preventing "Unicode equal comparison failed to convert both arguments to Unicode" # Preventing "Unicode equal comparison failed to convert both arguments to Unicode"
# (e.g. if one page is PDF and the other is HTML) # (e.g. if one page is PDF and the other is HTML)
if isinstance(seqMatcher.a, str) and isinstance(page, unicode): if isinstance(seqMatcher.a, six.binary_type) and isinstance(page, six.text_type):
page = page.encode(kb.pageEncoding or DEFAULT_PAGE_ENCODING, "ignore") page = getBytes(page, kb.pageEncoding or DEFAULT_PAGE_ENCODING, "ignore")
elif isinstance(seqMatcher.a, unicode) and isinstance(page, str): elif isinstance(seqMatcher.a, six.text_type) and isinstance(page, six.binary_type):
seqMatcher.a = seqMatcher.a.encode(kb.pageEncoding or DEFAULT_PAGE_ENCODING, "ignore") seqMatcher.a = getBytes(seqMatcher.a, kb.pageEncoding or DEFAULT_PAGE_ENCODING, "ignore")
if any(_ is None for _ in (page, seqMatcher.a)): if any(_ is None for _ in (page, seqMatcher.a)):
return None return None

View File

@ -486,7 +486,7 @@ def getValue(expression, blind=True, union=True, error=True, time=True, fromUser
singleTimeWarnMessage(warnMsg) singleTimeWarnMessage(warnMsg)
# Dirty patch (safe-encoded unicode characters) # Dirty patch (safe-encoded unicode characters)
if isinstance(value, unicode) and "\\x" in value: if isinstance(value, six.text_type) and "\\x" in value:
try: try:
candidate = eval(repr(value).replace("\\\\x", "\\x").replace("u'", "'", 1)).decode(conf.encoding or UNICODE_ENCODING) candidate = eval(repr(value).replace("\\\\x", "\\x").replace("u'", "'", 1)).decode(conf.encoding or UNICODE_ENCODING)
if "\\x" not in candidate: if "\\x" not in candidate:

View File

@ -32,6 +32,7 @@ from lib.core.threads import getCurrentThreadData
from lib.core.threads import runThreads from lib.core.threads import runThreads
from lib.parse.sitemap import parseSitemap from lib.parse.sitemap import parseSitemap
from lib.request.connect import Connect as Request from lib.request.connect import Connect as Request
from thirdparty import six
from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup
from thirdparty.six.moves import http_client as _http_client from thirdparty.six.moves import http_client as _http_client
from thirdparty.six.moves import urllib as _urllib from thirdparty.six.moves import urllib as _urllib
@ -79,7 +80,7 @@ def crawl(target):
if not kb.threadContinue: if not kb.threadContinue:
break break
if isinstance(content, unicode): if isinstance(content, six.text_type):
try: try:
match = re.search(r"(?si)<html[^>]*>(.+)</html>", content) match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
if match: if match:

View File

@ -7,8 +7,8 @@ See the file 'LICENSE' for copying permission
import binascii import binascii
from lib.core.common import getBytes
from lib.core.common import isDBMSVersionAtLeast from lib.core.common import isDBMSVersionAtLeast
from lib.core.settings import UNICODE_ENCODING
from plugins.generic.syntax import Syntax as GenericSyntax from plugins.generic.syntax import Syntax as GenericSyntax
class Syntax(GenericSyntax): class Syntax(GenericSyntax):
@ -28,7 +28,7 @@ class Syntax(GenericSyntax):
def escaper(value): def escaper(value):
# Reference: http://stackoverflow.com/questions/3444335/how-do-i-quote-a-utf-8-string-literal-in-sqlite3 # Reference: http://stackoverflow.com/questions/3444335/how-do-i-quote-a-utf-8-string-literal-in-sqlite3
return "CAST(X'%s' AS TEXT)" % binascii.hexlify(value.encode(UNICODE_ENCODING) if isinstance(value, unicode) else value) return "CAST(X'%s' AS TEXT)" % binascii.hexlify(getBytes(value))
retVal = expression retVal = expression

View File

@ -91,6 +91,11 @@ import sys
if sys.version_info >= (3, 0): if sys.version_info >= (3, 0):
xrange = range xrange = range
text_type = str
binary_type = bytes
else:
text_type = unicode
binary_type = str
try: try:
from htmlentitydefs import name2codepoint from htmlentitydefs import name2codepoint
@ -434,19 +439,13 @@ class PageElement(object):
def toEncoding(self, s, encoding=None): def toEncoding(self, s, encoding=None):
"""Encodes an object to a string in some encoding, or to Unicode. """Encodes an object to a string in some encoding, or to Unicode.
.""" ."""
if isinstance(s, unicode): if isinstance(s, text_type):
if encoding: if encoding:
s = s.encode(encoding) s = s.encode(encoding)
elif isinstance(s, str): elif isinstance(s, binary_type):
if encoding: s = s.encode(encoding or "utf8")
s = s.encode(encoding)
else:
s = unicode(s)
else: else:
if encoding: s = self.toEncoding(str(s), encoding or "utf8")
s = self.toEncoding(str(s), encoding)
else:
s = unicode(s)
return s return s
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
@ -459,7 +458,7 @@ class PageElement(object):
return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
class NavigableString(unicode, PageElement): class NavigableString(text_type, PageElement):
def __new__(cls, value): def __new__(cls, value):
"""Create a new NavigableString. """Create a new NavigableString.
@ -469,9 +468,9 @@ class NavigableString(unicode, PageElement):
passed in to the superclass's __new__ or the superclass won't know passed in to the superclass's __new__ or the superclass won't know
how to handle non-ASCII characters. how to handle non-ASCII characters.
""" """
if isinstance(value, unicode): if isinstance(value, text_type):
return unicode.__new__(cls, value) return text_type.__new__(cls, value)
return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) return text_type.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
def __getnewargs__(self): def __getnewargs__(self):
return (NavigableString.__str__(self),) return (NavigableString.__str__(self),)
@ -1006,7 +1005,7 @@ class SoupStrainer:
if isinstance(markup, Tag): if isinstance(markup, Tag):
markup = markup.name markup = markup.name
if markup and not isinstance(markup, basestring): if markup and not isinstance(markup, basestring):
markup = unicode(markup) markup = text_type(markup)
#Now we know that chunk is either a string, or None. #Now we know that chunk is either a string, or None.
if hasattr(matchAgainst, 'match'): if hasattr(matchAgainst, 'match'):
# It's a regexp object. # It's a regexp object.
@ -1016,8 +1015,8 @@ class SoupStrainer:
elif hasattr(matchAgainst, 'items'): elif hasattr(matchAgainst, 'items'):
result = markup.has_key(matchAgainst) result = markup.has_key(matchAgainst)
elif matchAgainst and isinstance(markup, basestring): elif matchAgainst and isinstance(markup, basestring):
if isinstance(markup, unicode): if isinstance(markup, text_type):
matchAgainst = unicode(matchAgainst) matchAgainst = text_type(matchAgainst)
else: else:
matchAgainst = str(matchAgainst) matchAgainst = str(matchAgainst)
@ -1181,7 +1180,7 @@ class BeautifulStoneSoup(Tag, sgmllib.SGMLParser):
def _feed(self, inDocumentEncoding=None, isHTML=False): def _feed(self, inDocumentEncoding=None, isHTML=False):
# Convert the document to Unicode. # Convert the document to Unicode.
markup = self.markup markup = self.markup
if isinstance(markup, unicode): if isinstance(markup, text_type):
if not hasattr(self, 'originalEncoding'): if not hasattr(self, 'originalEncoding'):
self.originalEncoding = None self.originalEncoding = None
else: else:
@ -1792,9 +1791,9 @@ class UnicodeDammit:
self._detectEncoding(markup, isHTML) self._detectEncoding(markup, isHTML)
self.smartQuotesTo = smartQuotesTo self.smartQuotesTo = smartQuotesTo
self.triedEncodings = [] self.triedEncodings = []
if markup == '' or isinstance(markup, unicode): if markup == '' or isinstance(markup, text_type):
self.originalEncoding = None self.originalEncoding = None
self.unicode = unicode(markup) self.unicode = text_type(markup)
return return
u = None u = None
@ -1807,7 +1806,7 @@ class UnicodeDammit:
if u: break if u: break
# If no luck and we have auto-detection library, try that: # If no luck and we have auto-detection library, try that:
if not u and chardet and not isinstance(self.markup, unicode): if not u and chardet and not isinstance(self.markup, text_type):
u = self._convertFrom(chardet.detect(self.markup)['encoding']) u = self._convertFrom(chardet.detect(self.markup)['encoding'])
# As a last resort, try utf-8 and windows-1252: # As a last resort, try utf-8 and windows-1252:
@ -1880,7 +1879,7 @@ class UnicodeDammit:
elif data[:4] == '\xff\xfe\x00\x00': elif data[:4] == '\xff\xfe\x00\x00':
encoding = 'utf-32le' encoding = 'utf-32le'
data = data[4:] data = data[4:]
newdata = unicode(data, encoding) newdata = text_type(data, encoding)
return newdata return newdata
def _detectEncoding(self, xml_data, isHTML=False): def _detectEncoding(self, xml_data, isHTML=False):
@ -1893,41 +1892,41 @@ class UnicodeDammit:
elif xml_data[:4] == '\x00\x3c\x00\x3f': elif xml_data[:4] == '\x00\x3c\x00\x3f':
# UTF-16BE # UTF-16BE
sniffed_xml_encoding = 'utf-16be' sniffed_xml_encoding = 'utf-16be'
xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') xml_data = text_type(xml_data, 'utf-16be').encode('utf-8')
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \ elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
and (xml_data[2:4] != '\x00\x00'): and (xml_data[2:4] != '\x00\x00'):
# UTF-16BE with BOM # UTF-16BE with BOM
sniffed_xml_encoding = 'utf-16be' sniffed_xml_encoding = 'utf-16be'
xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') xml_data = text_type(xml_data[2:], 'utf-16be').encode('utf-8')
elif xml_data[:4] == '\x3c\x00\x3f\x00': elif xml_data[:4] == '\x3c\x00\x3f\x00':
# UTF-16LE # UTF-16LE
sniffed_xml_encoding = 'utf-16le' sniffed_xml_encoding = 'utf-16le'
xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') xml_data = text_type(xml_data, 'utf-16le').encode('utf-8')
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \ elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
(xml_data[2:4] != '\x00\x00'): (xml_data[2:4] != '\x00\x00'):
# UTF-16LE with BOM # UTF-16LE with BOM
sniffed_xml_encoding = 'utf-16le' sniffed_xml_encoding = 'utf-16le'
xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') xml_data = text_type(xml_data[2:], 'utf-16le').encode('utf-8')
elif xml_data[:4] == '\x00\x00\x00\x3c': elif xml_data[:4] == '\x00\x00\x00\x3c':
# UTF-32BE # UTF-32BE
sniffed_xml_encoding = 'utf-32be' sniffed_xml_encoding = 'utf-32be'
xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') xml_data = text_type(xml_data, 'utf-32be').encode('utf-8')
elif xml_data[:4] == '\x3c\x00\x00\x00': elif xml_data[:4] == '\x3c\x00\x00\x00':
# UTF-32LE # UTF-32LE
sniffed_xml_encoding = 'utf-32le' sniffed_xml_encoding = 'utf-32le'
xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') xml_data = text_type(xml_data, 'utf-32le').encode('utf-8')
elif xml_data[:4] == '\x00\x00\xfe\xff': elif xml_data[:4] == '\x00\x00\xfe\xff':
# UTF-32BE with BOM # UTF-32BE with BOM
sniffed_xml_encoding = 'utf-32be' sniffed_xml_encoding = 'utf-32be'
xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') xml_data = text_type(xml_data[4:], 'utf-32be').encode('utf-8')
elif xml_data[:4] == '\xff\xfe\x00\x00': elif xml_data[:4] == '\xff\xfe\x00\x00':
# UTF-32LE with BOM # UTF-32LE with BOM
sniffed_xml_encoding = 'utf-32le' sniffed_xml_encoding = 'utf-32le'
xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') xml_data = text_type(xml_data[4:], 'utf-32le').encode('utf-8')
elif xml_data[:3] == '\xef\xbb\xbf': elif xml_data[:3] == '\xef\xbb\xbf':
# UTF-8 with BOM # UTF-8 with BOM
sniffed_xml_encoding = 'utf-8' sniffed_xml_encoding = 'utf-8'
xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') xml_data = text_type(xml_data[3:], 'utf-8').encode('utf-8')
else: else:
sniffed_xml_encoding = 'ascii' sniffed_xml_encoding = 'ascii'
pass pass

View File

@ -21,7 +21,6 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
""" """
import io import io
import mimetools
import mimetypes import mimetypes
import os import os
import stat import stat