Merge remote-tracking branch 'sqlmapproject/master'

This commit is contained in:
cxh852456 2015-10-16 09:32:45 +08:00
commit 2914574b9b
90 changed files with 2264 additions and 1486 deletions

View File

@ -173,6 +173,9 @@ Ivan Giacomelli, <truemilk(at)insiberia.net>
* for suggesting a minor enhancement * for suggesting a minor enhancement
* for reviewing the documentation * for reviewing the documentation
Dimitris Giannitsaros, <daremon(at)gmail.com>
* for contributing a REST-JSON API client
Nico Golde, <nico(at)ngolde.de> Nico Golde, <nico(at)ngolde.de>
* for reporting a couple of bugs * for reporting a couple of bugs

View File

@ -76,7 +76,7 @@ def main(src, dst):
# Instantiate an IP packets decoder # Instantiate an IP packets decoder
decoder = ImpactDecoder.IPDecoder() decoder = ImpactDecoder.IPDecoder()
while 1: while True:
cmd = '' cmd = ''
# Wait for incoming replies # Wait for incoming replies

View File

@ -22,6 +22,7 @@ from lib.core.common import findDynamicContent
from lib.core.common import Format from lib.core.common import Format
from lib.core.common import getLastRequestHTTPError from lib.core.common import getLastRequestHTTPError
from lib.core.common import getPublicTypeMembers from lib.core.common import getPublicTypeMembers
from lib.core.common import getSafeExString
from lib.core.common import getSortedInjectionTests from lib.core.common import getSortedInjectionTests
from lib.core.common import getUnicode from lib.core.common import getUnicode
from lib.core.common import intersect from lib.core.common import intersect
@ -38,6 +39,7 @@ from lib.core.common import singleTimeWarnMessage
from lib.core.common import urlencode from lib.core.common import urlencode
from lib.core.common import wasLastResponseDBMSError from lib.core.common import wasLastResponseDBMSError
from lib.core.common import wasLastResponseHTTPError from lib.core.common import wasLastResponseHTTPError
from lib.core.defaults import defaults
from lib.core.data import conf from lib.core.data import conf
from lib.core.data import kb from lib.core.data import kb
from lib.core.data import logger from lib.core.data import logger
@ -67,6 +69,7 @@ from lib.core.settings import URI_HTTP_HEADER
from lib.core.settings import UPPER_RATIO_BOUND from lib.core.settings import UPPER_RATIO_BOUND
from lib.core.settings import IDS_WAF_CHECK_PAYLOAD from lib.core.settings import IDS_WAF_CHECK_PAYLOAD
from lib.core.settings import IDS_WAF_CHECK_RATIO from lib.core.settings import IDS_WAF_CHECK_RATIO
from lib.core.settings import IDS_WAF_CHECK_TIMEOUT
from lib.core.threads import getCurrentThreadData from lib.core.threads import getCurrentThreadData
from lib.request.connect import Connect as Request from lib.request.connect import Connect as Request
from lib.request.inject import checkBooleanExpression from lib.request.inject import checkBooleanExpression
@ -204,6 +207,16 @@ def checkSqlInjection(place, parameter, value):
logger.debug(debugMsg) logger.debug(debugMsg)
continue continue
# Skip tests if title, vector or DBMS is included by the
# given skip filter
if conf.testSkip and any(conf.testSkip in str(item) or \
re.search(conf.testSkip, str(item), re.I) for item in \
(test.title, test.vector, payloadDbms)):
debugMsg = "skipping test '%s' because its " % title
debugMsg += "name/vector/DBMS is included by the given skip filter"
logger.debug(debugMsg)
continue
if payloadDbms is not None: if payloadDbms is not None:
# Skip DBMS-specific test if it does not match the user's # Skip DBMS-specific test if it does not match the user's
# provided DBMS # provided DBMS
@ -1139,12 +1152,12 @@ def checkWaf():
Reference: http://seclists.org/nmap-dev/2011/q2/att-1005/http-waf-detect.nse Reference: http://seclists.org/nmap-dev/2011/q2/att-1005/http-waf-detect.nse
""" """
if any((conf.string, conf.notString, conf.regexp, conf.dummy, conf.offline)): if any((conf.string, conf.notString, conf.regexp, conf.dummy, conf.offline, conf.skipWaf)):
return None return None
dbmMsg = "heuristically checking if the target is protected by " infoMsg = "checking if the target is protected by "
dbmMsg += "some kind of WAF/IPS/IDS" infoMsg += "some kind of WAF/IPS/IDS"
logger.debug(dbmMsg) logger.info(infoMsg)
retVal = False retVal = False
payload = "%d %s" % (randomInt(), IDS_WAF_CHECK_PAYLOAD) payload = "%d %s" % (randomInt(), IDS_WAF_CHECK_PAYLOAD)
@ -1152,12 +1165,16 @@ def checkWaf():
value = "" if not conf.parameters.get(PLACE.GET) else conf.parameters[PLACE.GET] + DEFAULT_GET_POST_DELIMITER value = "" if not conf.parameters.get(PLACE.GET) else conf.parameters[PLACE.GET] + DEFAULT_GET_POST_DELIMITER
value += agent.addPayloadDelimiters("%s=%s" % (randomStr(), payload)) value += agent.addPayloadDelimiters("%s=%s" % (randomStr(), payload))
pushValue(conf.timeout)
conf.timeout = IDS_WAF_CHECK_TIMEOUT
try: try:
retVal = Request.queryPage(place=PLACE.GET, value=value, getRatioValue=True, noteResponseTime=False, silent=True)[1] < IDS_WAF_CHECK_RATIO retVal = Request.queryPage(place=PLACE.GET, value=value, getRatioValue=True, noteResponseTime=False, silent=True)[1] < IDS_WAF_CHECK_RATIO
except SqlmapConnectionException: except SqlmapConnectionException:
retVal = True retVal = True
finally: finally:
kb.matchRatio = None kb.matchRatio = None
conf.timeout = popValue()
if retVal: if retVal:
warnMsg = "heuristics detected that the target " warnMsg = "heuristics detected that the target "
@ -1172,6 +1189,10 @@ def checkWaf():
if output and output[0] in ("Y", "y"): if output and output[0] in ("Y", "y"):
conf.identifyWaf = True conf.identifyWaf = True
if conf.timeout == defaults.timeout:
logger.warning("dropping timeout to %d seconds (i.e. '--timeout=%d')" % (IDS_WAF_CHECK_TIMEOUT, IDS_WAF_CHECK_TIMEOUT))
conf.timeout = IDS_WAF_CHECK_TIMEOUT
return retVal return retVal
def identifyWaf(): def identifyWaf():
@ -1278,8 +1299,8 @@ def checkNullConnection():
infoMsg = "NULL connection is supported with 'skip-read' method" infoMsg = "NULL connection is supported with 'skip-read' method"
logger.info(infoMsg) logger.info(infoMsg)
except SqlmapConnectionException, errMsg: except SqlmapConnectionException, ex:
errMsg = getUnicode(errMsg) errMsg = getSafeExString(ex)
raise SqlmapConnectionException(errMsg) raise SqlmapConnectionException(errMsg)
finally: finally:
@ -1298,7 +1319,7 @@ def checkConnection(suppressOutput=False):
raise SqlmapConnectionException(errMsg) raise SqlmapConnectionException(errMsg)
except socket.error, ex: except socket.error, ex:
errMsg = "problem occurred while " errMsg = "problem occurred while "
errMsg += "resolving a host name '%s' ('%s')" % (conf.hostname, ex.message) errMsg += "resolving a host name '%s' ('%s')" % (conf.hostname, getSafeExString(ex))
raise SqlmapConnectionException(errMsg) raise SqlmapConnectionException(errMsg)
if not suppressOutput and not conf.dummy and not conf.offline: if not suppressOutput and not conf.dummy and not conf.offline:
@ -1326,7 +1347,7 @@ def checkConnection(suppressOutput=False):
else: else:
kb.errorIsNone = True kb.errorIsNone = True
except SqlmapConnectionException, errMsg: except SqlmapConnectionException, ex:
if conf.ipv6: if conf.ipv6:
warnMsg = "check connection to a provided " warnMsg = "check connection to a provided "
warnMsg += "IPv6 address with a tool like ping6 " warnMsg += "IPv6 address with a tool like ping6 "
@ -1336,7 +1357,7 @@ def checkConnection(suppressOutput=False):
singleTimeWarnMessage(warnMsg) singleTimeWarnMessage(warnMsg)
if any(code in kb.httpErrorCodes for code in (httplib.NOT_FOUND, )): if any(code in kb.httpErrorCodes for code in (httplib.NOT_FOUND, )):
errMsg = getUnicode(errMsg) errMsg = getSafeExString(ex)
logger.critical(errMsg) logger.critical(errMsg)
if conf.multipleTargets: if conf.multipleTargets:

View File

@ -24,7 +24,7 @@ from lib.core.common import dataToStdout
from lib.core.common import extractRegexResult from lib.core.common import extractRegexResult
from lib.core.common import getFilteredPageContent from lib.core.common import getFilteredPageContent
from lib.core.common import getPublicTypeMembers from lib.core.common import getPublicTypeMembers
from lib.core.common import getUnicode from lib.core.common import getSafeExString
from lib.core.common import hashDBRetrieve from lib.core.common import hashDBRetrieve
from lib.core.common import hashDBWrite from lib.core.common import hashDBWrite
from lib.core.common import intersect from lib.core.common import intersect
@ -421,6 +421,7 @@ def start():
skip |= (place == PLACE.USER_AGENT and intersect(USER_AGENT_ALIASES, conf.skip, True) not in ([], None)) skip |= (place == PLACE.USER_AGENT and intersect(USER_AGENT_ALIASES, conf.skip, True) not in ([], None))
skip |= (place == PLACE.REFERER and intersect(REFERER_ALIASES, conf.skip, True) not in ([], None)) skip |= (place == PLACE.REFERER and intersect(REFERER_ALIASES, conf.skip, True) not in ([], None))
skip |= (place == PLACE.COOKIE and intersect(PLACE.COOKIE, conf.skip, True) not in ([], None)) skip |= (place == PLACE.COOKIE and intersect(PLACE.COOKIE, conf.skip, True) not in ([], None))
skip |= (place == PLACE.HOST and intersect(PLACE.HOST, conf.skip, True) not in ([], None))
skip &= not (place == PLACE.USER_AGENT and intersect(USER_AGENT_ALIASES, conf.testParameter, True)) skip &= not (place == PLACE.USER_AGENT and intersect(USER_AGENT_ALIASES, conf.testParameter, True))
skip &= not (place == PLACE.REFERER and intersect(REFERER_ALIASES, conf.testParameter, True)) skip &= not (place == PLACE.REFERER and intersect(REFERER_ALIASES, conf.testParameter, True))
@ -648,7 +649,7 @@ def start():
raise raise
except SqlmapBaseException, ex: except SqlmapBaseException, ex:
errMsg = getUnicode(ex.message) errMsg = getSafeExString(ex)
if conf.multipleTargets: if conf.multipleTargets:
errMsg += ", skipping to the next %s" % ("form" if conf.forms else "URL") errMsg += ", skipping to the next %s" % ("form" if conf.forms else "URL")

View File

@ -187,12 +187,12 @@ class Agent(object):
if origValue: if origValue:
regex = r"(\A|\b)%s=%s%s" % (re.escape(parameter), re.escape(origValue), r"(\Z|\b)" if origValue[-1].isalnum() else "") regex = r"(\A|\b)%s=%s%s" % (re.escape(parameter), re.escape(origValue), r"(\Z|\b)" if origValue[-1].isalnum() else "")
retVal = _(regex, "%s=%s" % (parameter, self.addPayloadDelimiters(newValue.replace("\\", "\\\\"))), paramString) retVal = _(regex, "%s=%s" % (parameter, self.addPayloadDelimiters(newValue)), paramString)
else: else:
retVal = _(r"(\A|\b)%s=%s(\Z|%s|%s|\s)" % (re.escape(parameter), re.escape(origValue), DEFAULT_GET_POST_DELIMITER, DEFAULT_COOKIE_DELIMITER), "%s=%s\g<2>" % (parameter, self.addPayloadDelimiters(newValue.replace("\\", "\\\\"))), paramString) retVal = _(r"(\A|\b)%s=%s(\Z|%s|%s|\s)" % (re.escape(parameter), re.escape(origValue), DEFAULT_GET_POST_DELIMITER, DEFAULT_COOKIE_DELIMITER), "%s=%s\g<2>" % (parameter, self.addPayloadDelimiters(newValue)), paramString)
if retVal == paramString and urlencode(parameter) != parameter: if retVal == paramString and urlencode(parameter) != parameter:
retVal = _(r"(\A|\b)%s=%s" % (re.escape(urlencode(parameter)), re.escape(origValue)), "%s=%s" % (urlencode(parameter), self.addPayloadDelimiters(newValue.replace("\\", "\\\\"))), paramString) retVal = _(r"(\A|\b)%s=%s" % (re.escape(urlencode(parameter)), re.escape(origValue)), "%s=%s" % (urlencode(parameter), self.addPayloadDelimiters(newValue)), paramString)
if retVal: if retVal:
retVal = retVal.replace(BOUNDARY_BACKSLASH_MARKER, '\\') retVal = retVal.replace(BOUNDARY_BACKSLASH_MARKER, '\\')
@ -308,8 +308,8 @@ class Agent(object):
for _ in set(re.findall(r"\[RANDSTR(?:\d+)?\]", payload, re.I)): for _ in set(re.findall(r"\[RANDSTR(?:\d+)?\]", payload, re.I)):
payload = payload.replace(_, randomStr()) payload = payload.replace(_, randomStr())
if origValue is not None: if origValue is not None and "[ORIGVALUE]" in payload:
payload = payload.replace("[ORIGVALUE]", origValue if origValue.isdigit() else unescaper.escape("'%s'" % origValue)) payload = getUnicode(payload).replace("[ORIGVALUE]", origValue if origValue.isdigit() else unescaper.escape("'%s'" % origValue))
if "[INFERENCE]" in payload: if "[INFERENCE]" in payload:
if Backend.getIdentifiedDbms() is not None: if Backend.getIdentifiedDbms() is not None:
@ -480,7 +480,7 @@ class Agent(object):
@rtype: C{str} @rtype: C{str}
""" """
prefixRegex = r"(?:\s+(?:FIRST|SKIP)\s+\d+)*" prefixRegex = r"(?:\s+(?:FIRST|SKIP|LIMIT \d+)\s+\d+)*"
fieldsSelectTop = re.search(r"\ASELECT\s+TOP\s+[\d]+\s+(.+?)\s+FROM", query, re.I) fieldsSelectTop = re.search(r"\ASELECT\s+TOP\s+[\d]+\s+(.+?)\s+FROM", query, re.I)
fieldsSelectRownum = re.search(r"\ASELECT\s+([^()]+?),\s*ROWNUM AS LIMIT FROM", query, re.I) fieldsSelectRownum = re.search(r"\ASELECT\s+([^()]+?),\s*ROWNUM AS LIMIT FROM", query, re.I)
fieldsSelectDistinct = re.search(r"\ASELECT%s\s+DISTINCT\((.+?)\)\s+FROM" % prefixRegex, query, re.I) fieldsSelectDistinct = re.search(r"\ASELECT%s\s+DISTINCT\((.+?)\)\s+FROM" % prefixRegex, query, re.I)
@ -501,13 +501,17 @@ class Agent(object):
elif fieldsMinMaxstr: elif fieldsMinMaxstr:
fieldsToCastStr = fieldsMinMaxstr.groups()[0] fieldsToCastStr = fieldsMinMaxstr.groups()[0]
elif fieldsExists: elif fieldsExists:
fieldsToCastStr = fieldsSelect.groups()[0] if fieldsSelect:
fieldsToCastStr = fieldsSelect.groups()[0]
elif fieldsSelectTop: elif fieldsSelectTop:
fieldsToCastStr = fieldsSelectTop.groups()[0] fieldsToCastStr = fieldsSelectTop.groups()[0]
elif fieldsSelectRownum: elif fieldsSelectRownum:
fieldsToCastStr = fieldsSelectRownum.groups()[0] fieldsToCastStr = fieldsSelectRownum.groups()[0]
elif fieldsSelectDistinct: elif fieldsSelectDistinct:
fieldsToCastStr = fieldsSelectDistinct.groups()[0] if Backend.getDbms() in (DBMS.HSQLDB,):
fieldsToCastStr = fieldsNoSelect
else:
fieldsToCastStr = fieldsSelectDistinct.groups()[0]
elif fieldsSelectCase: elif fieldsSelectCase:
fieldsToCastStr = fieldsSelectCase.groups()[0] fieldsToCastStr = fieldsSelectCase.groups()[0]
elif fieldsSelectFrom: elif fieldsSelectFrom:
@ -584,7 +588,7 @@ class Agent(object):
else: else:
return query return query
if Backend.getIdentifiedDbms() in (DBMS.MYSQL,): if Backend.isDbms(DBMS.MYSQL):
if fieldsExists: if fieldsExists:
concatenatedQuery = concatenatedQuery.replace("SELECT ", "CONCAT('%s'," % kb.chars.start, 1) concatenatedQuery = concatenatedQuery.replace("SELECT ", "CONCAT('%s'," % kb.chars.start, 1)
concatenatedQuery += ",'%s')" % kb.chars.stop concatenatedQuery += ",'%s')" % kb.chars.stop
@ -611,6 +615,7 @@ class Agent(object):
concatenatedQuery = concatenatedQuery.replace("SELECT ", "'%s'||" % kb.chars.start, 1) concatenatedQuery = concatenatedQuery.replace("SELECT ", "'%s'||" % kb.chars.start, 1)
_ = unArrayizeValue(zeroDepthSearch(concatenatedQuery, " FROM ")) _ = unArrayizeValue(zeroDepthSearch(concatenatedQuery, " FROM "))
concatenatedQuery = "%s||'%s'%s" % (concatenatedQuery[:_], kb.chars.stop, concatenatedQuery[_:]) concatenatedQuery = "%s||'%s'%s" % (concatenatedQuery[:_], kb.chars.stop, concatenatedQuery[_:])
concatenatedQuery = re.sub(r"('%s'\|\|)(.+)(%s)" % (kb.chars.start, re.escape(castedFields)), "\g<2>\g<1>\g<3>", concatenatedQuery)
elif fieldsSelect: elif fieldsSelect:
concatenatedQuery = concatenatedQuery.replace("SELECT ", "'%s'||" % kb.chars.start, 1) concatenatedQuery = concatenatedQuery.replace("SELECT ", "'%s'||" % kb.chars.start, 1)
concatenatedQuery += "||'%s'" % kb.chars.stop concatenatedQuery += "||'%s'" % kb.chars.stop
@ -881,12 +886,30 @@ class Agent(object):
fromIndex = limitedQuery.index(" FROM ") fromIndex = limitedQuery.index(" FROM ")
untilFrom = limitedQuery[:fromIndex] untilFrom = limitedQuery[:fromIndex]
fromFrom = limitedQuery[fromIndex + 1:] fromFrom = limitedQuery[fromIndex + 1:]
orderBy = False orderBy = None
if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL, DBMS.SQLITE): if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL, DBMS.SQLITE):
limitStr = queries[Backend.getIdentifiedDbms()].limit.query % (num, 1) limitStr = queries[Backend.getIdentifiedDbms()].limit.query % (num, 1)
limitedQuery += " %s" % limitStr limitedQuery += " %s" % limitStr
elif Backend.isDbms(DBMS.HSQLDB):
match = re.search(r"ORDER BY [^ ]+", limitedQuery)
if match:
limitedQuery = re.sub(r"\s*%s\s*" % match.group(0), " ", limitedQuery).strip()
limitedQuery += " %s" % match.group(0)
if query.startswith("SELECT "):
limitStr = queries[Backend.getIdentifiedDbms()].limit.query % (num, 1)
limitedQuery = limitedQuery.replace("SELECT ", "SELECT %s " % limitStr, 1)
else:
limitStr = queries[Backend.getIdentifiedDbms()].limit.query2 % (1, num)
limitedQuery += " %s" % limitStr
if not match:
match = re.search(r"%s\s+(\w+)" % re.escape(limitStr), limitedQuery)
if match:
orderBy = " ORDER BY %s" % match.group(1)
elif Backend.isDbms(DBMS.FIREBIRD): elif Backend.isDbms(DBMS.FIREBIRD):
limitStr = queries[Backend.getIdentifiedDbms()].limit.query % (num + 1, num + 1) limitStr = queries[Backend.getIdentifiedDbms()].limit.query % (num + 1, num + 1)
limitedQuery += " %s" % limitStr limitedQuery += " %s" % limitStr

View File

@ -79,7 +79,7 @@ class BigArray(list):
self.chunks[-1] = pickle.load(fp) self.chunks[-1] = pickle.load(fp)
except IOError, ex: except IOError, ex:
errMsg = "exception occurred while retrieving data " errMsg = "exception occurred while retrieving data "
errMsg += "from a temporary file ('%s')" % ex errMsg += "from a temporary file ('%s')" % ex.message
raise SqlmapSystemException, errMsg raise SqlmapSystemException, errMsg
return self.chunks[-1].pop() return self.chunks[-1].pop()
@ -99,7 +99,7 @@ class BigArray(list):
return filename return filename
except (OSError, IOError), ex: except (OSError, IOError), ex:
errMsg = "exception occurred while storing data " errMsg = "exception occurred while storing data "
errMsg += "to a temporary file ('%s'). Please " % ex errMsg += "to a temporary file ('%s'). Please " % ex.message
errMsg += "make sure that there is enough disk space left. If problem persists, " errMsg += "make sure that there is enough disk space left. If problem persists, "
errMsg += "try to set environment variable 'TEMP' to a location " errMsg += "try to set environment variable 'TEMP' to a location "
errMsg += "writeable by the current user" errMsg += "writeable by the current user"
@ -115,7 +115,7 @@ class BigArray(list):
self.cache = Cache(index, pickle.load(fp), False) self.cache = Cache(index, pickle.load(fp), False)
except IOError, ex: except IOError, ex:
errMsg = "exception occurred while retrieving data " errMsg = "exception occurred while retrieving data "
errMsg += "from a temporary file ('%s')" % ex errMsg += "from a temporary file ('%s')" % ex.message
raise SqlmapSystemException, errMsg raise SqlmapSystemException, errMsg
def __getstate__(self): def __getstate__(self):

View File

@ -879,7 +879,7 @@ def dataToOutFile(filename, data):
f.write(data) f.write(data)
except IOError, ex: except IOError, ex:
errMsg = "something went wrong while trying to write " errMsg = "something went wrong while trying to write "
errMsg += "to the output file ('%s')" % ex.message errMsg += "to the output file ('%s')" % getSafeExString(ex)
raise SqlmapGenericException(errMsg) raise SqlmapGenericException(errMsg)
return retVal return retVal
@ -909,14 +909,15 @@ def readInput(message, default=None, checkBatch=True):
answer = item.split('=')[1] if len(item.split('=')) > 1 else None answer = item.split('=')[1] if len(item.split('=')) > 1 else None
if answer and question.lower() in message.lower(): if answer and question.lower() in message.lower():
retVal = getUnicode(answer, UNICODE_ENCODING) retVal = getUnicode(answer, UNICODE_ENCODING)
elif answer is None and retVal:
retVal = "%s,%s" % (retVal, getUnicode(item, UNICODE_ENCODING))
infoMsg = "%s%s" % (message, retVal) if retVal:
logger.info(infoMsg) infoMsg = "%s%s" % (message, retVal)
logger.info(infoMsg)
debugMsg = "used the given answer" debugMsg = "used the given answer"
logger.debug(debugMsg) logger.debug(debugMsg)
break
if retVal is None: if retVal is None:
if checkBatch and conf.get("batch"): if checkBatch and conf.get("batch"):
@ -1369,7 +1370,7 @@ def expandAsteriskForColumns(expression):
return expression return expression
def getLimitRange(count, dump=False, plusOne=False): def getLimitRange(count, plusOne=False):
""" """
Returns range of values used in limit/offset constructs Returns range of values used in limit/offset constructs
@ -1381,12 +1382,11 @@ def getLimitRange(count, dump=False, plusOne=False):
count = int(count) count = int(count)
limitStart, limitStop = 1, count limitStart, limitStop = 1, count
if dump: if isinstance(conf.limitStop, int) and conf.limitStop > 0 and conf.limitStop < limitStop:
if isinstance(conf.limitStop, int) and conf.limitStop > 0 and conf.limitStop < limitStop: limitStop = conf.limitStop
limitStop = conf.limitStop
if isinstance(conf.limitStart, int) and conf.limitStart > 0 and conf.limitStart <= limitStop: if isinstance(conf.limitStart, int) and conf.limitStart > 0 and conf.limitStart <= limitStop:
limitStart = conf.limitStart limitStart = conf.limitStart
retVal = xrange(limitStart, limitStop + 1) if plusOne else xrange(limitStart - 1, limitStop) retVal = xrange(limitStart, limitStop + 1) if plusOne else xrange(limitStart - 1, limitStop)
@ -1622,6 +1622,15 @@ def safeStringFormat(format_, params):
index = retVal.find("%s", start) index = retVal.find("%s", start)
retVal = retVal[:index] + getUnicode(param) + retVal[index + 2:] retVal = retVal[:index] + getUnicode(param) + retVal[index + 2:]
else: else:
if any('%s' in _ for _ in conf.parameters.values()):
parts = format_.split(' ')
for i in xrange(len(parts)):
if PAYLOAD_DELIMITER in parts[i]:
parts[i] = parts[i].replace(PAYLOAD_DELIMITER, "")
parts[i] = "%s%s" % (parts[i], PAYLOAD_DELIMITER)
break
format_ = ' '.join(parts)
count = 0 count = 0
while True: while True:
match = re.search(r"(\A|[^A-Za-z0-9])(%s)([^A-Za-z0-9]|\Z)", retVal) match = re.search(r"(\A|[^A-Za-z0-9])(%s)([^A-Za-z0-9]|\Z)", retVal)
@ -1866,8 +1875,13 @@ def readCachedFileContent(filename, mode='rb'):
with kb.locks.cache: with kb.locks.cache:
if filename not in kb.cache.content: if filename not in kb.cache.content:
checkFile(filename) checkFile(filename)
with openFile(filename, mode) as f: try:
kb.cache.content[filename] = f.read() with openFile(filename, mode) as f:
kb.cache.content[filename] = f.read()
except (IOError, OSError, MemoryError), ex:
errMsg = "something went wrong while trying "
errMsg += "to read the content of file '%s' ('%s')" % (filename, ex)
raise SqlmapSystemException(errMsg)
return kb.cache.content[filename] return kb.cache.content[filename]
@ -2489,7 +2503,10 @@ def extractTextTagContent(page):
page = page or "" page = page or ""
if REFLECTED_VALUE_MARKER in page: if REFLECTED_VALUE_MARKER in page:
page = re.sub(r"(?si)[^\s>]*%s[^\s<]*" % REFLECTED_VALUE_MARKER, "", page) try:
page = re.sub(r"(?i)[^\s>]*%s[^\s<]*" % REFLECTED_VALUE_MARKER, "", page)
except MemoryError:
page = page.replace(REFLECTED_VALUE_MARKER, "")
return filter(None, (_.group('result').strip() for _ in re.finditer(TEXT_TAG_REGEX, page))) return filter(None, (_.group('result').strip() for _ in re.finditer(TEXT_TAG_REGEX, page)))
@ -2681,7 +2698,7 @@ def parseSqliteTableSchema(value):
table = {} table = {}
columns = {} columns = {}
for match in re.finditer(r"(\w+)\s+(INT|INTEGER|TINYINT|SMALLINT|MEDIUMINT|BIGINT|UNSIGNED BIG INT|INT2|INT8|INTEGER|CHARACTER|VARCHAR|VARYING CHARACTER|NCHAR|NATIVE CHARACTER|NVARCHAR|TEXT|CLOB|TEXT|BLOB|NONE|REAL|DOUBLE|DOUBLE PRECISION|FLOAT|REAL|NUMERIC|DECIMAL|BOOLEAN|DATE|DATETIME|NUMERIC)\b", value, re.I): for match in re.finditer(r"(\w+)[\"'`]?\s+(INT|INTEGER|TINYINT|SMALLINT|MEDIUMINT|BIGINT|UNSIGNED BIG INT|INT2|INT8|INTEGER|CHARACTER|VARCHAR|VARYING CHARACTER|NCHAR|NATIVE CHARACTER|NVARCHAR|TEXT|CLOB|TEXT|BLOB|NONE|REAL|DOUBLE|DOUBLE PRECISION|FLOAT|REAL|NUMERIC|DECIMAL|BOOLEAN|DATE|DATETIME|NUMERIC)\b", value, re.I):
columns[match.group(1)] = match.group(2) columns[match.group(1)] = match.group(2)
table[conf.tbl] = columns table[conf.tbl] = columns
@ -2800,7 +2817,13 @@ def unArrayizeValue(value):
""" """
if isListLike(value): if isListLike(value):
value = value[0] if len(value) > 0 else None if not value:
value = None
elif len(value) == 1 and not isListLike(value[0]):
value = value[0]
else:
_ = filter(lambda _: _ is not None, (_ for _ in flattenValue(value)))
value = _[0] if len(_) > 0 else None
return value return value
@ -3008,7 +3031,7 @@ def createGithubIssue(errMsg, excMsg):
else: else:
warnMsg = "something went wrong while creating a Github issue" warnMsg = "something went wrong while creating a Github issue"
if ex: if ex:
warnMsg += " ('%s')" % ex warnMsg += " ('%s')" % getSafeExString(ex)
if "Unauthorized" in warnMsg: if "Unauthorized" in warnMsg:
warnMsg += ". Please update to the latest revision" warnMsg += ". Please update to the latest revision"
logger.warn(warnMsg) logger.warn(warnMsg)
@ -3020,7 +3043,7 @@ def maskSensitiveData(msg):
retVal = getUnicode(msg) retVal = getUnicode(msg)
for item in filter(None, map(lambda x: conf.get(x), ("hostname", "googleDork", "authCred", "proxyCred", "tbl", "db", "col", "user", "cookie", "proxy", "rFile", "wFile", "dFile"))): for item in filter(None, map(lambda x: conf.get(x), ("hostname", "data", "googleDork", "authCred", "proxyCred", "tbl", "db", "col", "user", "cookie", "proxy", "rFile", "wFile", "dFile"))):
regex = SENSITIVE_DATA_REGEX % re.sub("(\W)", r"\\\1", getUnicode(item)) regex = SENSITIVE_DATA_REGEX % re.sub("(\W)", r"\\\1", getUnicode(item))
while extractRegexResult(regex, retVal): while extractRegexResult(regex, retVal):
value = extractRegexResult(regex, retVal) value = extractRegexResult(regex, retVal)
@ -3567,7 +3590,7 @@ def findPageForms(content, url, raise_=False, addToTargets=False):
request = form.click() request = form.click()
except (ValueError, TypeError), ex: except (ValueError, TypeError), ex:
errMsg = "there has been a problem while " errMsg = "there has been a problem while "
errMsg += "processing page forms ('%s')" % ex errMsg += "processing page forms ('%s')" % getSafeExString(ex)
if raise_: if raise_:
raise SqlmapGenericException(errMsg) raise SqlmapGenericException(errMsg)
else: else:
@ -3670,7 +3693,7 @@ def evaluateCode(code, variables=None):
except KeyboardInterrupt: except KeyboardInterrupt:
raise raise
except Exception, ex: except Exception, ex:
errMsg = "an error occurred while evaluating provided code ('%s') " % ex.message errMsg = "an error occurred while evaluating provided code ('%s') " % getSafeExString(ex)
raise SqlmapGenericException(errMsg) raise SqlmapGenericException(errMsg)
def serializeObject(object_): def serializeObject(object_):
@ -3870,13 +3893,18 @@ def decloakToTemp(filename):
""" """
content = decloak(filename) content = decloak(filename)
_ = os.path.split(filename[:-1])[-1]
_ = utf8encode(os.path.split(filename[:-1])[-1])
prefix, suffix = os.path.splitext(_) prefix, suffix = os.path.splitext(_)
prefix = prefix.split(os.extsep)[0] prefix = prefix.split(os.extsep)[0]
handle, filename = tempfile.mkstemp(prefix=prefix, suffix=suffix) handle, filename = tempfile.mkstemp(prefix=prefix, suffix=suffix)
os.close(handle) os.close(handle)
with open(filename, "w+b") as f: with open(filename, "w+b") as f:
f.write(content) f.write(content)
return filename return filename
def prioritySortColumns(columns): def prioritySortColumns(columns):
@ -3977,3 +4005,18 @@ def pollProcess(process, suppress_errors=False):
dataToStdout(" quit unexpectedly with return code %d\n" % returncode) dataToStdout(" quit unexpectedly with return code %d\n" % returncode)
break break
def getSafeExString(ex, encoding=None):
    """
    Returns a unicode string representation of the given exception

    The first non-empty value among the exception's 'message' and 'msg'
    attributes is used; if neither is set, the exception object itself
    is converted. The conversion is done by getUnicode() using the given
    encoding.

    (Note: errors to be avoided: 1) "%s" % Exception(u'\u0161') and 2) "%s" % str(Exception(u'\u0161'))
    """

    text = ex

    # Prefer an explicit textual attribute over the exception object itself
    for attribute in ("message", "msg"):
        candidate = getattr(ex, attribute, None)
        if candidate:
            text = candidate
            break

    return getUnicode(text, encoding=encoding)

View File

@ -223,6 +223,7 @@ DEPRECATED_OPTIONS = {
"--replicate": "use '--dump-format=SQLITE' instead", "--replicate": "use '--dump-format=SQLITE' instead",
"--no-unescape": "use '--no-escape' instead", "--no-unescape": "use '--no-escape' instead",
"--binary": "use '--binary-fields' instead", "--binary": "use '--binary-fields' instead",
"--auth-private": "use '--auth-file' instead",
"--check-payload": None, "--check-payload": None,
"--check-waf": None, "--check-waf": None,
} }

View File

@ -15,6 +15,7 @@ import threading
from lib.core.common import Backend from lib.core.common import Backend
from lib.core.common import dataToDumpFile from lib.core.common import dataToDumpFile
from lib.core.common import dataToStdout from lib.core.common import dataToStdout
from lib.core.common import getSafeExString
from lib.core.common import getUnicode from lib.core.common import getUnicode
from lib.core.common import isListLike from lib.core.common import isListLike
from lib.core.common import normalizeUnicode from lib.core.common import normalizeUnicode
@ -74,7 +75,7 @@ class Dump(object):
try: try:
self._outputFP.write(text) self._outputFP.write(text)
except IOError, ex: except IOError, ex:
errMsg = "error occurred while writing to log file ('%s')" % ex.message errMsg = "error occurred while writing to log file ('%s')" % getSafeExString(ex)
raise SqlmapGenericException(errMsg) raise SqlmapGenericException(errMsg)
if kb.get("multiThreadMode"): if kb.get("multiThreadMode"):
@ -94,7 +95,7 @@ class Dump(object):
try: try:
self._outputFP = openFile(self._outputFile, "ab" if not conf.flushSession else "wb") self._outputFP = openFile(self._outputFile, "ab" if not conf.flushSession else "wb")
except IOError, ex: except IOError, ex:
errMsg = "error occurred while opening log file ('%s')" % ex.message errMsg = "error occurred while opening log file ('%s')" % getSafeExString(ex)
raise SqlmapGenericException(errMsg) raise SqlmapGenericException(errMsg)
def getOutputFile(self): def getOutputFile(self):
@ -159,7 +160,7 @@ class Dump(object):
def currentDb(self, data): def currentDb(self, data):
if Backend.isDbms(DBMS.MAXDB): if Backend.isDbms(DBMS.MAXDB):
self.string("current database (no practical usage on %s)" % Backend.getIdentifiedDbms(), data, content_type=CONTENT_TYPE.CURRENT_DB) self.string("current database (no practical usage on %s)" % Backend.getIdentifiedDbms(), data, content_type=CONTENT_TYPE.CURRENT_DB)
elif Backend.getIdentifiedDbms() in (DBMS.ORACLE, DBMS.PGSQL): elif Backend.getIdentifiedDbms() in (DBMS.ORACLE, DBMS.PGSQL, DBMS.HSQLDB):
self.string("current schema (equivalent to database on %s)" % Backend.getIdentifiedDbms(), data, content_type=CONTENT_TYPE.CURRENT_DB) self.string("current schema (equivalent to database on %s)" % Backend.getIdentifiedDbms(), data, content_type=CONTENT_TYPE.CURRENT_DB)
else: else:
self.string("current database", data, content_type=CONTENT_TYPE.CURRENT_DB) self.string("current database", data, content_type=CONTENT_TYPE.CURRENT_DB)
@ -635,11 +636,11 @@ class Dump(object):
for column in dbColumnsDict.keys(): for column in dbColumnsDict.keys():
if colConsider == "1": if colConsider == "1":
colConsiderStr = "s like '%s' were" % unsafeSQLIdentificatorNaming(column) colConsiderStr = "s LIKE '%s' were" % unsafeSQLIdentificatorNaming(column)
else: else:
colConsiderStr = " '%s' was" % unsafeSQLIdentificatorNaming(column) colConsiderStr = " '%s' was" % unsafeSQLIdentificatorNaming(column)
msg = "Column%s found in the " % colConsiderStr msg = "column%s found in the " % colConsiderStr
msg += "following databases:" msg += "following databases:"
self._write(msg) self._write(msg)

View File

@ -27,6 +27,7 @@ import lib.core.common
import lib.core.threads import lib.core.threads
import lib.core.convert import lib.core.convert
import lib.request.connect import lib.request.connect
import lib.utils.google
from lib.controller.checks import checkConnection from lib.controller.checks import checkConnection
from lib.core.common import Backend from lib.core.common import Backend
@ -34,6 +35,7 @@ from lib.core.common import boldifyMessage
from lib.core.common import checkFile from lib.core.common import checkFile
from lib.core.common import dataToStdout from lib.core.common import dataToStdout
from lib.core.common import getPublicTypeMembers from lib.core.common import getPublicTypeMembers
from lib.core.common import getSafeExString
from lib.core.common import extractRegexResult from lib.core.common import extractRegexResult
from lib.core.common import filterStringValue from lib.core.common import filterStringValue
from lib.core.common import findPageForms from lib.core.common import findPageForms
@ -90,6 +92,7 @@ from lib.core.exception import SqlmapInstallationException
from lib.core.exception import SqlmapMissingDependence from lib.core.exception import SqlmapMissingDependence
from lib.core.exception import SqlmapMissingMandatoryOptionException from lib.core.exception import SqlmapMissingMandatoryOptionException
from lib.core.exception import SqlmapMissingPrivileges from lib.core.exception import SqlmapMissingPrivileges
from lib.core.exception import SqlmapNoneDataException
from lib.core.exception import SqlmapSilentQuitException from lib.core.exception import SqlmapSilentQuitException
from lib.core.exception import SqlmapSyntaxException from lib.core.exception import SqlmapSyntaxException
from lib.core.exception import SqlmapSystemException from lib.core.exception import SqlmapSystemException
@ -638,7 +641,7 @@ def _setBulkMultipleTargets():
for line in getFileItems(conf.bulkFile): for line in getFileItems(conf.bulkFile):
if re.match(r"[^ ]+\?(.+)", line, re.I) or CUSTOM_INJECTION_MARK_CHAR in line: if re.match(r"[^ ]+\?(.+)", line, re.I) or CUSTOM_INJECTION_MARK_CHAR in line:
found = True found = True
kb.targets.add((line.strip(), None, None, None, None)) kb.targets.add((line.strip(), conf.method, conf.data, conf.cookie, None))
if not found and not conf.forms and not conf.crawlDepth: if not found and not conf.forms and not conf.crawlDepth:
warnMsg = "no usable links found (with GET parameters)" warnMsg = "no usable links found (with GET parameters)"
@ -776,6 +779,7 @@ def _setMetasploit():
kb.oldMsf = True kb.oldMsf = True
else: else:
msfEnvPathExists = False msfEnvPathExists = False
conf.msfPath = path conf.msfPath = path
break break
@ -806,7 +810,7 @@ def _setMetasploit():
for envPath in envPaths: for envPath in envPaths:
envPath = envPath.replace(";", "") envPath = envPath.replace(";", "")
if all(os.path.exists(normalizePath(os.path.join(envPath, _))) for _ in ("", "msfcli", "msfconsole")): if any(os.path.exists(normalizePath(os.path.join(envPath, _))) for _ in ("msfcli", "msfconsole")):
msfEnvPathExists = True msfEnvPathExists = True
if all(os.path.exists(normalizePath(os.path.join(envPath, _))) for _ in ("msfvenom",)): if all(os.path.exists(normalizePath(os.path.join(envPath, _))) for _ in ("msfvenom",)):
kb.oldMsf = False kb.oldMsf = False
@ -1083,18 +1087,22 @@ def _setHTTPProxy():
if hasattr(proxyHandler, "%s_open" % _): if hasattr(proxyHandler, "%s_open" % _):
delattr(proxyHandler, "%s_open" % _) delattr(proxyHandler, "%s_open" % _)
if not conf.proxy: if conf.proxyList is not None:
if conf.proxyList: if not conf.proxyList:
conf.proxy = conf.proxyList[0] errMsg = "list of usable proxies is exhausted"
conf.proxyList = conf.proxyList[1:] + conf.proxyList[:1] raise SqlmapNoneDataException(errMsg)
infoMsg = "loading proxy '%s' from a supplied proxy list file" % conf.proxy conf.proxy = conf.proxyList[0]
logger.info(infoMsg) conf.proxyList = conf.proxyList[1:]
else:
if conf.hostname in ('localhost', '127.0.0.1') or conf.ignoreProxy:
proxyHandler.proxies = {}
return infoMsg = "loading proxy '%s' from a supplied proxy list file" % conf.proxy
logger.info(infoMsg)
elif not conf.proxy:
if conf.hostname in ("localhost", "127.0.0.1") or conf.ignoreProxy:
proxyHandler.proxies = {}
return
debugMsg = "setting the HTTP/SOCKS proxy for all HTTP requests" debugMsg = "setting the HTTP/SOCKS proxy for all HTTP requests"
logger.debug(debugMsg) logger.debug(debugMsg)
@ -1126,7 +1134,7 @@ def _setHTTPProxy():
if conf.proxyCred: if conf.proxyCred:
_ = re.search("^(.*?):(.*?)$", conf.proxyCred) _ = re.search("^(.*?):(.*?)$", conf.proxyCred)
if not _: if not _:
errMsg = "Proxy authentication credentials " errMsg = "proxy authentication credentials "
errMsg += "value must be in format username:password" errMsg += "value must be in format username:password"
raise SqlmapSyntaxException(errMsg) raise SqlmapSyntaxException(errMsg)
else: else:
@ -1257,13 +1265,13 @@ def _setHTTPAuthentication():
global authHandler global authHandler
if not conf.authType and not conf.authCred and not conf.authPrivate: if not conf.authType and not conf.authCred and not conf.authFile:
return return
if conf.authPrivate and not conf.authType: if conf.authFile and not conf.authType:
conf.authType = AUTH_TYPE.PKI conf.authType = AUTH_TYPE.PKI
elif conf.authType and not conf.authCred and not conf.authPrivate: elif conf.authType and not conf.authCred and not conf.authFile:
errMsg = "you specified the HTTP authentication type, but " errMsg = "you specified the HTTP authentication type, but "
errMsg += "did not provide the credentials" errMsg += "did not provide the credentials"
raise SqlmapSyntaxException(errMsg) raise SqlmapSyntaxException(errMsg)
@ -1278,7 +1286,7 @@ def _setHTTPAuthentication():
errMsg += "Basic, Digest, NTLM or PKI" errMsg += "Basic, Digest, NTLM or PKI"
raise SqlmapSyntaxException(errMsg) raise SqlmapSyntaxException(errMsg)
if not conf.authPrivate: if not conf.authFile:
debugMsg = "setting the HTTP authentication type and credentials" debugMsg = "setting the HTTP authentication type and credentials"
logger.debug(debugMsg) logger.debug(debugMsg)
@ -1329,7 +1337,7 @@ def _setHTTPAuthentication():
debugMsg = "setting the HTTP(s) authentication PEM private key" debugMsg = "setting the HTTP(s) authentication PEM private key"
logger.debug(debugMsg) logger.debug(debugMsg)
_ = safeExpandUser(conf.authPrivate) _ = safeExpandUser(conf.authFile)
checkFile(_) checkFile(_)
authHandler = HTTPSPKIAuthHandler(_) authHandler = HTTPSPKIAuthHandler(_)
@ -1523,7 +1531,7 @@ def _createTemporaryDirectory():
os.makedirs(tempfile.gettempdir()) os.makedirs(tempfile.gettempdir())
except IOError, ex: except IOError, ex:
errMsg = "there has been a problem while accessing " errMsg = "there has been a problem while accessing "
errMsg += "system's temporary directory location(s) ('%s'). Please " % ex.message errMsg += "system's temporary directory location(s) ('%s'). Please " % getSafeExString(ex)
errMsg += "make sure that there is enough disk space left. If problem persists, " errMsg += "make sure that there is enough disk space left. If problem persists, "
errMsg += "try to set environment variable 'TEMP' to a location " errMsg += "try to set environment variable 'TEMP' to a location "
errMsg += "writeable by the current user" errMsg += "writeable by the current user"
@ -1627,6 +1635,10 @@ def _cleanupOptions():
conf.testFilter = conf.testFilter.strip('*+') conf.testFilter = conf.testFilter.strip('*+')
conf.testFilter = re.sub(r"([^.])([*+])", "\g<1>.\g<2>", conf.testFilter) conf.testFilter = re.sub(r"([^.])([*+])", "\g<1>.\g<2>", conf.testFilter)
if conf.testSkip:
conf.testSkip = conf.testSkip.strip('*+')
conf.testSkip = re.sub(r"([^.])([*+])", "\g<1>.\g<2>", conf.testSkip)
if "timeSec" not in kb.explicitSettings: if "timeSec" not in kb.explicitSettings:
if conf.tor: if conf.tor:
conf.timeSec = 2 * conf.timeSec conf.timeSec = 2 * conf.timeSec
@ -1734,7 +1746,7 @@ def _setConfAttributes():
conf.parameters = {} conf.parameters = {}
conf.path = None conf.path = None
conf.port = None conf.port = None
conf.proxyList = [] conf.proxyList = None
conf.resultsFilename = None conf.resultsFilename = None
conf.resultsFP = None conf.resultsFP = None
conf.scheme = None conf.scheme = None
@ -2071,7 +2083,7 @@ def _mergeOptions(inputOptions, overrideOptions):
inputOptions = base64unpickle(inputOptions.pickledOptions) inputOptions = base64unpickle(inputOptions.pickledOptions)
except Exception, ex: except Exception, ex:
errMsg = "provided invalid value '%s' for option '--pickled-options'" % inputOptions.pickledOptions errMsg = "provided invalid value '%s' for option '--pickled-options'" % inputOptions.pickledOptions
errMsg += " ('%s')" % ex.message if ex.message else "" errMsg += " ('%s')" % ex if ex.message else ""
raise SqlmapSyntaxException(errMsg) raise SqlmapSyntaxException(errMsg)
if inputOptions.configFile: if inputOptions.configFile:
@ -2243,7 +2255,11 @@ def _checkTor():
infoMsg = "checking Tor connection" infoMsg = "checking Tor connection"
logger.info(infoMsg) logger.info(infoMsg)
page, _, _ = Request.getPage(url="https://check.torproject.org/", raise404=False) try:
page, _, _ = Request.getPage(url="https://check.torproject.org/", raise404=False)
except SqlmapConnectionException:
page = None
if not page or 'Congratulations' not in page: if not page or 'Congratulations' not in page:
errMsg = "it seems that Tor is not properly set. Please try using options '--tor-type' and/or '--tor-port'" errMsg = "it seems that Tor is not properly set. Please try using options '--tor-type' and/or '--tor-port'"
raise SqlmapConnectionException(errMsg) raise SqlmapConnectionException(errMsg)
@ -2290,6 +2306,10 @@ def _basicOptionValidation():
errMsg = "option '-d' is incompatible with option '-u' ('--url')" errMsg = "option '-d' is incompatible with option '-u' ('--url')"
raise SqlmapSyntaxException(errMsg) raise SqlmapSyntaxException(errMsg)
if conf.identifyWaf and conf.skipWaf:
errMsg = "switch '--identify-waf' is incompatible with switch '--skip-waf'"
raise SqlmapSyntaxException(errMsg)
if conf.titles and conf.nullConnection: if conf.titles and conf.nullConnection:
errMsg = "switch '--titles' is incompatible with switch '--null-connection'" errMsg = "switch '--titles' is incompatible with switch '--null-connection'"
raise SqlmapSyntaxException(errMsg) raise SqlmapSyntaxException(errMsg)
@ -2404,6 +2424,10 @@ def _basicOptionValidation():
errMsg = "switch '--tor' is incompatible with option '--proxy'" errMsg = "switch '--tor' is incompatible with option '--proxy'"
raise SqlmapSyntaxException(errMsg) raise SqlmapSyntaxException(errMsg)
if conf.proxy and conf.proxyFile:
errMsg = "switch '--proxy' is incompatible with option '--proxy-file'"
raise SqlmapSyntaxException(errMsg)
if conf.checkTor and not any((conf.tor, conf.proxy)): if conf.checkTor and not any((conf.tor, conf.proxy)):
errMsg = "switch '--check-tor' requires usage of switch '--tor' (or option '--proxy' with HTTP proxy address using Tor)" errMsg = "switch '--check-tor' requires usage of switch '--tor' (or option '--proxy' with HTTP proxy address using Tor)"
raise SqlmapSyntaxException(errMsg) raise SqlmapSyntaxException(errMsg)
@ -2471,6 +2495,7 @@ def _resolveCrossReferences():
lib.core.common.getPageTemplate = getPageTemplate lib.core.common.getPageTemplate = getPageTemplate
lib.core.convert.singleTimeWarnMessage = singleTimeWarnMessage lib.core.convert.singleTimeWarnMessage = singleTimeWarnMessage
lib.request.connect.setHTTPProxy = _setHTTPProxy lib.request.connect.setHTTPProxy = _setHTTPProxy
lib.utils.google.setHTTPProxy = _setHTTPProxy
lib.controller.checks.setVerbosity = setVerbosity lib.controller.checks.setVerbosity = setVerbosity
def initOptions(inputOptions=AttribDict(), overrideOptions=False): def initOptions(inputOptions=AttribDict(), overrideOptions=False):

View File

@ -37,7 +37,7 @@ optDict = {
"headers": "string", "headers": "string",
"authType": "string", "authType": "string",
"authCred": "string", "authCred": "string",
"authPrivate": "string", "authFile": "string",
"proxy": "string", "proxy": "string",
"proxyCred": "string", "proxyCred": "string",
"proxyFile": "string", "proxyFile": "string",
@ -205,6 +205,7 @@ optDict = {
"saveConfig": "string", "saveConfig": "string",
"scope": "string", "scope": "string",
"testFilter": "string", "testFilter": "string",
"testSkip": "string",
"updateAll": "boolean", "updateAll": "boolean",
}, },
@ -231,6 +232,7 @@ optDict = {
"cpuThrottle": "integer", "cpuThrottle": "integer",
"forceDns": "boolean", "forceDns": "boolean",
"identifyWaf": "boolean", "identifyWaf": "boolean",
"skipWaf": "boolean",
"ignore401": "boolean", "ignore401": "boolean",
"smokeTest": "boolean", "smokeTest": "boolean",
"liveTest": "boolean", "liveTest": "boolean",

View File

@ -8,9 +8,11 @@ See the file 'doc/COPYING' for copying permission
import sqlite3 import sqlite3
from extra.safe2bin.safe2bin import safechardecode from extra.safe2bin.safe2bin import safechardecode
from lib.core.common import getSafeExString
from lib.core.common import unsafeSQLIdentificatorNaming from lib.core.common import unsafeSQLIdentificatorNaming
from lib.core.exception import SqlmapGenericException from lib.core.exception import SqlmapGenericException
from lib.core.exception import SqlmapValueException from lib.core.exception import SqlmapValueException
from lib.core.settings import UNICODE_ENCODING
class Replication(object): class Replication(object):
""" """
@ -49,11 +51,16 @@ class Replication(object):
self.name = unsafeSQLIdentificatorNaming(name) self.name = unsafeSQLIdentificatorNaming(name)
self.columns = columns self.columns = columns
if create: if create:
self.execute('DROP TABLE IF EXISTS "%s"' % self.name) try:
if not typeless: self.execute('DROP TABLE IF EXISTS "%s"' % self.name)
self.execute('CREATE TABLE "%s" (%s)' % (self.name, ','.join('"%s" %s' % (unsafeSQLIdentificatorNaming(colname), coltype) for colname, coltype in self.columns))) if not typeless:
else: self.execute('CREATE TABLE "%s" (%s)' % (self.name, ','.join('"%s" %s' % (unsafeSQLIdentificatorNaming(colname), coltype) for colname, coltype in self.columns)))
self.execute('CREATE TABLE "%s" (%s)' % (self.name, ','.join('"%s"' % unsafeSQLIdentificatorNaming(colname) for colname in self.columns))) else:
self.execute('CREATE TABLE "%s" (%s)' % (self.name, ','.join('"%s"' % unsafeSQLIdentificatorNaming(colname) for colname in self.columns)))
except Exception, ex:
errMsg = "problem occurred ('%s') while initializing the sqlite database " % getSafeExString(ex, UNICODE_ENCODING)
errMsg += "located at '%s'" % self.parent.dbpath
raise SqlmapGenericException(errMsg)
def insert(self, values): def insert(self, values):
""" """
@ -70,7 +77,7 @@ class Replication(object):
try: try:
self.parent.cursor.execute(sql, parameters) self.parent.cursor.execute(sql, parameters)
except sqlite3.OperationalError, ex: except sqlite3.OperationalError, ex:
errMsg = "problem occurred ('%s') while accessing sqlite database " % unicode(ex) errMsg = "problem occurred ('%s') while accessing sqlite database " % getSafeExString(ex, UNICODE_ENCODING)
errMsg += "located at '%s'. Please make sure that " % self.parent.dbpath errMsg += "located at '%s'. Please make sure that " % self.parent.dbpath
errMsg += "it's not used by some other program" errMsg += "it's not used by some other program"
raise SqlmapGenericException(errMsg) raise SqlmapGenericException(errMsg)

View File

@ -42,6 +42,9 @@ CONSTANT_RATIO = 0.9
# Ratio used in heuristic check for WAF/IDS/IPS protected targets # Ratio used in heuristic check for WAF/IDS/IPS protected targets
IDS_WAF_CHECK_RATIO = 0.5 IDS_WAF_CHECK_RATIO = 0.5
# Timeout used in heuristic check for WAF/IDS/IPS protected targets
IDS_WAF_CHECK_TIMEOUT = 10
# Lower and upper values for match ratio in case of stable page # Lower and upper values for match ratio in case of stable page
LOWER_RATIO_BOUND = 0.02 LOWER_RATIO_BOUND = 0.02
UPPER_RATIO_BOUND = 0.98 UPPER_RATIO_BOUND = 0.98
@ -219,6 +222,8 @@ USER_AGENT_ALIASES = ("ua", "useragent", "user-agent")
REFERER_ALIASES = ("ref", "referer", "referrer") REFERER_ALIASES = ("ref", "referer", "referrer")
HOST_ALIASES = ("host",) HOST_ALIASES = ("host",)
HSQLDB_DEFAULT_SCHEMA = "PUBLIC"
# Names that can't be used to name files on Windows OS # Names that can't be used to name files on Windows OS
WINDOWS_RESERVED_NAMES = ("CON", "PRN", "AUX", "NUL", "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9", "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9") WINDOWS_RESERVED_NAMES = ("CON", "PRN", "AUX", "NUL", "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9", "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9")

View File

@ -39,7 +39,6 @@ from lib.core.enums import POST_HINT
from lib.core.exception import SqlmapFilePathException from lib.core.exception import SqlmapFilePathException
from lib.core.exception import SqlmapGenericException from lib.core.exception import SqlmapGenericException
from lib.core.exception import SqlmapMissingPrivileges from lib.core.exception import SqlmapMissingPrivileges
from lib.core.exception import SqlmapSyntaxException
from lib.core.exception import SqlmapSystemException from lib.core.exception import SqlmapSystemException
from lib.core.exception import SqlmapUserQuitException from lib.core.exception import SqlmapUserQuitException
from lib.core.option import _setDBMS from lib.core.option import _setDBMS

View File

@ -10,7 +10,7 @@ import threading
import time import time
import traceback import traceback
from thread import error as threadError from thread import error as ThreadError
from lib.core.data import conf from lib.core.data import conf
from lib.core.data import kb from lib.core.data import kb
@ -89,9 +89,9 @@ def exceptionHandledFunction(threadFunction):
kb.threadContinue = False kb.threadContinue = False
kb.threadException = True kb.threadException = True
raise raise
except Exception, errMsg: except Exception, ex:
# thread is just going to be silently killed # thread is just going to be silently killed
logger.error("thread %s: %s" % (threading.currentThread().getName(), errMsg)) logger.error("thread %s: %s" % (threading.currentThread().getName(), ex.message))
def setDaemon(thread): def setDaemon(thread):
# Reference: http://stackoverflow.com/questions/190010/daemon-threads-explanation # Reference: http://stackoverflow.com/questions/190010/daemon-threads-explanation
@ -145,8 +145,8 @@ def runThreads(numThreads, threadFunction, cleanupFunction=None, forwardExceptio
try: try:
thread.start() thread.start()
except threadError, errMsg: except ThreadError, ex:
errMsg = "error occurred while starting new thread ('%s')" % errMsg errMsg = "error occurred while starting new thread ('%s')" % ex.message
logger.critical(errMsg) logger.critical(errMsg)
break break
@ -178,10 +178,10 @@ def runThreads(numThreads, threadFunction, cleanupFunction=None, forwardExceptio
if forwardException: if forwardException:
raise raise
except (SqlmapConnectionException, SqlmapValueException), errMsg: except (SqlmapConnectionException, SqlmapValueException), ex:
print print
kb.threadException = True kb.threadException = True
logger.error("thread %s: %s" % (threading.currentThread().getName(), errMsg)) logger.error("thread %s: %s" % (threading.currentThread().getName(), ex.message))
except: except:
from lib.core.common import unhandledExceptionMessage from lib.core.common import unhandledExceptionMessage

View File

@ -30,7 +30,7 @@ def update():
if not os.path.exists(os.path.join(rootDir, ".git")): if not os.path.exists(os.path.join(rootDir, ".git")):
errMsg = "not a git repository. Please checkout the 'sqlmapproject/sqlmap' repository " errMsg = "not a git repository. Please checkout the 'sqlmapproject/sqlmap' repository "
errMsg += "from GitHub (e.g. git clone https://github.com/sqlmapproject/sqlmap.git sqlmap-dev)" errMsg += "from GitHub (e.g. 'git clone https://github.com/sqlmapproject/sqlmap.git sqlmap')"
logger.error(errMsg) logger.error(errMsg)
else: else:
infoMsg = "updating sqlmap to the latest development version from the " infoMsg = "updating sqlmap to the latest development version from the "
@ -51,7 +51,12 @@ def update():
_ = lib.core.settings.REVISION = getRevisionNumber() _ = lib.core.settings.REVISION = getRevisionNumber()
logger.info("%s the latest revision '%s'" % ("already at" if "Already" in stdout else "updated to", _)) logger.info("%s the latest revision '%s'" % ("already at" if "Already" in stdout else "updated to", _))
else: else:
logger.error("update could not be completed ('%s')" % re.sub(r"\W+", " ", stderr).strip()) if "Not a git repository" in stderr:
errMsg = "not a valid git repository. Please checkout the 'sqlmapproject/sqlmap' repository "
errMsg += "from GitHub (e.g. 'git clone https://github.com/sqlmapproject/sqlmap.git sqlmap')"
logger.error(errMsg)
else:
logger.error("update could not be completed ('%s')" % re.sub(r"\W+", " ", stderr).strip())
if not success: if not success:
if IS_WIN: if IS_WIN:

View File

@ -36,14 +36,17 @@ from lib.core.shell import clearHistory
from lib.core.shell import loadHistory from lib.core.shell import loadHistory
from lib.core.shell import saveHistory from lib.core.shell import saveHistory
def cmdLineParser(): def cmdLineParser(argv=None):
""" """
This function parses the command line parameters and arguments This function parses the command line parameters and arguments
""" """
if not argv:
argv = sys.argv
checkSystemEncoding() checkSystemEncoding()
_ = getUnicode(os.path.basename(sys.argv[0]), encoding=sys.getfilesystemencoding()) _ = getUnicode(os.path.basename(argv[0]), encoding=sys.getfilesystemencoding())
usage = "%s%s [options]" % ("python " if not IS_WIN else "", \ usage = "%s%s [options]" % ("python " if not IS_WIN else "", \
"\"%s\"" % _ if " " in _ else _) "\"%s\"" % _ if " " in _ else _)
@ -141,8 +144,8 @@ def cmdLineParser():
help="HTTP authentication credentials " help="HTTP authentication credentials "
"(name:password)") "(name:password)")
request.add_option("--auth-private", dest="authPrivate", request.add_option("--auth-file", dest="authFile",
help="HTTP authentication PEM private key file") help="HTTP authentication PEM cert/private key file")
request.add_option("--ignore-401", dest="ignore401", action="store_true", request.add_option("--ignore-401", dest="ignore401", action="store_true",
help="Ignore HTTP Error 401 (Unauthorized)") help="Ignore HTTP Error 401 (Unauthorized)")
@ -671,6 +674,9 @@ def cmdLineParser():
general.add_option("--test-filter", dest="testFilter", general.add_option("--test-filter", dest="testFilter",
help="Select tests by payloads and/or titles (e.g. ROW)") help="Select tests by payloads and/or titles (e.g. ROW)")
general.add_option("--test-skip", dest="testSkip",
help="Skip tests by payloads and/or titles (e.g. BENCHMARK)")
general.add_option("--update", dest="updateAll", general.add_option("--update", dest="updateAll",
action="store_true", action="store_true",
help="Update sqlmap") help="Update sqlmap")
@ -710,6 +716,10 @@ def cmdLineParser():
action="store_true", action="store_true",
help="Make a thorough testing for a WAF/IPS/IDS protection") help="Make a thorough testing for a WAF/IPS/IDS protection")
miscellaneous.add_option("--skip-waf", dest="skipWaf",
action="store_true",
help="Skip heuristic detection of WAF/IPS/IDS protection")
miscellaneous.add_option("--mobile", dest="mobile", miscellaneous.add_option("--mobile", dest="mobile",
action="store_true", action="store_true",
help="Imitate smartphone through HTTP User-Agent header") help="Imitate smartphone through HTTP User-Agent header")
@ -756,6 +766,9 @@ def cmdLineParser():
parser.add_option("--force-dns", dest="forceDns", action="store_true", parser.add_option("--force-dns", dest="forceDns", action="store_true",
help=SUPPRESS_HELP) help=SUPPRESS_HELP)
parser.add_option("--force-threads", dest="forceThreads", action="store_true",
help=SUPPRESS_HELP)
parser.add_option("--smoke-test", dest="smokeTest", action="store_true", parser.add_option("--smoke-test", dest="smokeTest", action="store_true",
help=SUPPRESS_HELP) help=SUPPRESS_HELP)
@ -767,6 +780,9 @@ def cmdLineParser():
parser.add_option("--run-case", dest="runCase", help=SUPPRESS_HELP) parser.add_option("--run-case", dest="runCase", help=SUPPRESS_HELP)
parser.add_option("--nnc5ed", dest="nnc5ed", action="store_true",
help=SUPPRESS_HELP) # temporary hidden switch :)
parser.add_option_group(target) parser.add_option_group(target)
parser.add_option_group(request) parser.add_option_group(request)
parser.add_option_group(optimization) parser.add_option_group(optimization)
@ -802,14 +818,15 @@ def cmdLineParser():
option = parser.get_option("-h") option = parser.get_option("-h")
option.help = option.help.capitalize().replace("this help", "basic help") option.help = option.help.capitalize().replace("this help", "basic help")
argv = [] _ = []
prompt = False prompt = False
advancedHelp = True advancedHelp = True
extraHeaders = [] extraHeaders = []
for arg in sys.argv: for arg in argv:
argv.append(getUnicode(arg, encoding=sys.getfilesystemencoding())) _.append(getUnicode(arg, encoding=sys.getfilesystemencoding()))
argv = _
checkDeprecatedOptions(argv) checkDeprecatedOptions(argv)
prompt = "--sqlmap-shell" in argv prompt = "--sqlmap-shell" in argv

View File

@ -6,6 +6,7 @@ See the file 'doc/COPYING' for copying permission
""" """
from lib.core.common import checkFile from lib.core.common import checkFile
from lib.core.common import getSafeExString
from lib.core.common import getUnicode from lib.core.common import getUnicode
from lib.core.common import openFile from lib.core.common import openFile
from lib.core.common import unArrayizeValue from lib.core.common import unArrayizeValue
@ -67,7 +68,7 @@ def configFileParser(configFile):
config = UnicodeRawConfigParser() config = UnicodeRawConfigParser()
config.readfp(configFP) config.readfp(configFP)
except Exception, ex: except Exception, ex:
errMsg = "you have provided an invalid and/or unreadable configuration file ('%s')" % ex.message errMsg = "you have provided an invalid and/or unreadable configuration file ('%s')" % getSafeExString(ex)
raise SqlmapSyntaxException(errMsg) raise SqlmapSyntaxException(errMsg)
if not config.has_section("Target"): if not config.has_section("Target"):

View File

@ -128,9 +128,16 @@ def _comparison(page, headers, code, getRatioValue, pageLength):
count += 1 count += 1
else: else:
break break
if count: if count:
seq1 = seq1[count:] try:
seq2 = seq2[count:] _seq1 = seq1[count:]
_seq2 = seq2[count:]
except MemoryError:
pass
else:
seq1 = _seq1
seq2 = _seq2
while True: while True:
try: try:

View File

@ -5,6 +5,7 @@ Copyright (c) 2006-2015 sqlmap developers (http://sqlmap.org/)
See the file 'doc/COPYING' for copying permission See the file 'doc/COPYING' for copying permission
""" """
import binascii
import compiler import compiler
import httplib import httplib
import json import json
@ -40,6 +41,7 @@ from lib.core.common import getCurrentThreadData
from lib.core.common import getHeader from lib.core.common import getHeader
from lib.core.common import getHostHeader from lib.core.common import getHostHeader
from lib.core.common import getRequestHeader from lib.core.common import getRequestHeader
from lib.core.common import getSafeExString
from lib.core.common import getUnicode from lib.core.common import getUnicode
from lib.core.common import logHTTPTraffic from lib.core.common import logHTTPTraffic
from lib.core.common import pushValue from lib.core.common import pushValue
@ -142,6 +144,7 @@ class Connect(object):
warnMsg += "(e.g. '--flush-session --technique=BEUS') or try to " warnMsg += "(e.g. '--flush-session --technique=BEUS') or try to "
warnMsg += "lower the value of option '--time-sec' (e.g. '--time-sec=2')" warnMsg += "lower the value of option '--time-sec' (e.g. '--time-sec=2')"
singleTimeWarnMessage(warnMsg) singleTimeWarnMessage(warnMsg)
elif kb.originalPage is None: elif kb.originalPage is None:
if conf.tor: if conf.tor:
warnMsg = "please make sure that you have " warnMsg = "please make sure that you have "
@ -158,13 +161,12 @@ class Connect(object):
warnMsg += "with the switch '--random-agent' turned on " warnMsg += "with the switch '--random-agent' turned on "
warnMsg += "and/or proxy switches ('--ignore-proxy', '--proxy',...)" warnMsg += "and/or proxy switches ('--ignore-proxy', '--proxy',...)"
singleTimeWarnMessage(warnMsg) singleTimeWarnMessage(warnMsg)
elif conf.threads > 1: elif conf.threads > 1:
warnMsg = "if the problem persists please try to lower " warnMsg = "if the problem persists please try to lower "
warnMsg += "the number of used threads (option '--threads')" warnMsg += "the number of used threads (option '--threads')"
singleTimeWarnMessage(warnMsg) singleTimeWarnMessage(warnMsg)
time.sleep(1)
kwargs['retrying'] = True kwargs['retrying'] = True
return Connect._getPageProxy(**kwargs) return Connect._getPageProxy(**kwargs)
@ -183,7 +185,11 @@ class Connect(object):
kb.pageCompress = False kb.pageCompress = False
else: else:
while True: while True:
_ = conn.read(MAX_CONNECTION_CHUNK_SIZE) if not conn:
break
else:
_ = conn.read(MAX_CONNECTION_CHUNK_SIZE)
if len(_) == MAX_CONNECTION_CHUNK_SIZE: if len(_) == MAX_CONNECTION_CHUNK_SIZE:
warnMsg = "large response detected. This could take a while" warnMsg = "large response detected. This could take a while"
singleTimeWarnMessage(warnMsg) singleTimeWarnMessage(warnMsg)
@ -433,6 +439,11 @@ class Connect(object):
logger.log(CUSTOM_LOGGING.TRAFFIC_OUT, requestMsg) logger.log(CUSTOM_LOGGING.TRAFFIC_OUT, requestMsg)
if conf.cj:
for cookie in conf.cj:
if cookie.value is None:
cookie.value = ""
conn = urllib2.urlopen(req) conn = urllib2.urlopen(req)
if not kb.authHeader and getRequestHeader(req, HTTP_HEADER.AUTHORIZATION) and (conf.authType or "").lower() == AUTH_TYPE.BASIC.lower(): if not kb.authHeader and getRequestHeader(req, HTTP_HEADER.AUTHORIZATION) and (conf.authType or "").lower() == AUTH_TYPE.BASIC.lower():
@ -497,22 +508,22 @@ class Connect(object):
if hasattr(conn.fp, '_sock'): if hasattr(conn.fp, '_sock'):
conn.fp._sock.close() conn.fp._sock.close()
conn.close() conn.close()
except Exception, msg: except Exception, ex:
warnMsg = "problem occurred during connection closing ('%s')" % msg warnMsg = "problem occurred during connection closing ('%s')" % getSafeExString(ex)
logger.warn(warnMsg) logger.warn(warnMsg)
except urllib2.HTTPError, e: except urllib2.HTTPError, ex:
page = None page = None
responseHeaders = None responseHeaders = None
try: try:
page = e.read() if not skipRead else None page = ex.read() if not skipRead else None
responseHeaders = e.info() responseHeaders = ex.info()
responseHeaders[URI_HTTP_HEADER] = e.geturl() responseHeaders[URI_HTTP_HEADER] = ex.geturl()
page = decodePage(page, responseHeaders.get(HTTP_HEADER.CONTENT_ENCODING), responseHeaders.get(HTTP_HEADER.CONTENT_TYPE)) page = decodePage(page, responseHeaders.get(HTTP_HEADER.CONTENT_ENCODING), responseHeaders.get(HTTP_HEADER.CONTENT_TYPE))
except socket.timeout: except socket.timeout:
warnMsg = "connection timed out while trying " warnMsg = "connection timed out while trying "
warnMsg += "to get error page information (%d)" % e.code warnMsg += "to get error page information (%d)" % ex.code
logger.warn(warnMsg) logger.warn(warnMsg)
return None, None, None return None, None, None
except KeyboardInterrupt: except KeyboardInterrupt:
@ -522,13 +533,13 @@ class Connect(object):
finally: finally:
page = page if isinstance(page, unicode) else getUnicode(page) page = page if isinstance(page, unicode) else getUnicode(page)
code = e.code code = ex.code
kb.originalCode = kb.originalCode or code kb.originalCode = kb.originalCode or code
threadData.lastHTTPError = (threadData.lastRequestUID, code) threadData.lastHTTPError = (threadData.lastRequestUID, code)
kb.httpErrorCodes[code] = kb.httpErrorCodes.get(code, 0) + 1 kb.httpErrorCodes[code] = kb.httpErrorCodes.get(code, 0) + 1
status = getUnicode(e.msg) status = getUnicode(ex.msg)
responseMsg += "[#%d] (%d %s):\n" % (threadData.lastRequestUID, code, status) responseMsg += "[#%d] (%d %s):\n" % (threadData.lastRequestUID, code, status)
if responseHeaders: if responseHeaders:
@ -545,11 +556,11 @@ class Connect(object):
logger.log(CUSTOM_LOGGING.TRAFFIC_IN, responseMsg) logger.log(CUSTOM_LOGGING.TRAFFIC_IN, responseMsg)
if e.code == httplib.UNAUTHORIZED and not conf.ignore401: if ex.code == httplib.UNAUTHORIZED and not conf.ignore401:
errMsg = "not authorized, try to provide right HTTP " errMsg = "not authorized, try to provide right HTTP "
errMsg += "authentication type and valid credentials (%d)" % code errMsg += "authentication type and valid credentials (%d)" % code
raise SqlmapConnectionException(errMsg) raise SqlmapConnectionException(errMsg)
elif e.code == httplib.NOT_FOUND: elif ex.code == httplib.NOT_FOUND:
if raise404: if raise404:
errMsg = "page not found (%d)" % code errMsg = "page not found (%d)" % code
raise SqlmapConnectionException(errMsg) raise SqlmapConnectionException(errMsg)
@ -557,11 +568,11 @@ class Connect(object):
debugMsg = "page not found (%d)" % code debugMsg = "page not found (%d)" % code
singleTimeLogMessage(debugMsg, logging.DEBUG) singleTimeLogMessage(debugMsg, logging.DEBUG)
processResponse(page, responseHeaders) processResponse(page, responseHeaders)
elif e.code == httplib.GATEWAY_TIMEOUT: elif ex.code == httplib.GATEWAY_TIMEOUT:
if ignoreTimeout: if ignoreTimeout:
return None, None, None return None, None, None
else: else:
warnMsg = "unable to connect to the target URL (%d - %s)" % (e.code, httplib.responses[e.code]) warnMsg = "unable to connect to the target URL (%d - %s)" % (ex.code, httplib.responses[ex.code])
if threadData.retriesCount < conf.retries and not kb.threadException: if threadData.retriesCount < conf.retries and not kb.threadException:
warnMsg += ". sqlmap is going to retry the request" warnMsg += ". sqlmap is going to retry the request"
logger.critical(warnMsg) logger.critical(warnMsg)
@ -575,7 +586,7 @@ class Connect(object):
debugMsg = "got HTTP error code: %d (%s)" % (code, status) debugMsg = "got HTTP error code: %d (%s)" % (code, status)
logger.debug(debugMsg) logger.debug(debugMsg)
except (urllib2.URLError, socket.error, socket.timeout, httplib.HTTPException, struct.error, ProxyError, SqlmapCompressionException, WebSocketException), e: except (urllib2.URLError, socket.error, socket.timeout, httplib.HTTPException, struct.error, binascii.Error, ProxyError, SqlmapCompressionException, WebSocketException):
tbMsg = traceback.format_exc() tbMsg = traceback.format_exc()
if "no host given" in tbMsg: if "no host given" in tbMsg:
@ -619,7 +630,11 @@ class Connect(object):
return None, None, None return None, None, None
elif threadData.retriesCount < conf.retries and not kb.threadException: elif threadData.retriesCount < conf.retries and not kb.threadException:
warnMsg += ". sqlmap is going to retry the request" warnMsg += ". sqlmap is going to retry the request"
logger.critical(warnMsg) if not retrying:
warnMsg += "(s)"
logger.critical(warnMsg)
else:
logger.debug(warnMsg)
return Connect._retryProxy(**kwargs) return Connect._retryProxy(**kwargs)
elif kb.testMode: elif kb.testMode:
logger.critical(warnMsg) logger.critical(warnMsg)
@ -628,7 +643,7 @@ class Connect(object):
raise SqlmapConnectionException(warnMsg) raise SqlmapConnectionException(warnMsg)
finally: finally:
if not isinstance(page, unicode): if isinstance(page, basestring) and not isinstance(page, unicode):
if HTTP_HEADER.CONTENT_TYPE in (responseHeaders or {}) and not re.search(TEXT_CONTENT_TYPE_REGEX, responseHeaders[HTTP_HEADER.CONTENT_TYPE]): if HTTP_HEADER.CONTENT_TYPE in (responseHeaders or {}) and not re.search(TEXT_CONTENT_TYPE_REGEX, responseHeaders[HTTP_HEADER.CONTENT_TYPE]):
page = unicode(page, errors="ignore") page = unicode(page, errors="ignore")
else: else:
@ -718,7 +733,7 @@ class Connect(object):
payload = function(payload=payload, headers=auxHeaders) payload = function(payload=payload, headers=auxHeaders)
except Exception, ex: except Exception, ex:
errMsg = "error occurred while running tamper " errMsg = "error occurred while running tamper "
errMsg += "function '%s' ('%s')" % (function.func_name, ex) errMsg += "function '%s' ('%s')" % (function.func_name, getSafeExString(ex))
raise SqlmapGenericException(errMsg) raise SqlmapGenericException(errMsg)
if not isinstance(payload, basestring): if not isinstance(payload, basestring):
@ -834,7 +849,7 @@ class Connect(object):
if headers and "text/plain" in headers.get(HTTP_HEADER.CONTENT_TYPE, ""): if headers and "text/plain" in headers.get(HTTP_HEADER.CONTENT_TYPE, ""):
token = page token = page
if not token and any(_.name == conf.csrfToken for _ in conf.cj): if not token and conf.cj and any(_.name == conf.csrfToken for _ in conf.cj):
for _ in conf.cj: for _ in conf.cj:
if _.name == conf.csrfToken: if _.name == conf.csrfToken:
token = _.value token = _.value
@ -889,7 +904,7 @@ class Connect(object):
if conf.evalCode: if conf.evalCode:
delimiter = conf.paramDel or DEFAULT_GET_POST_DELIMITER delimiter = conf.paramDel or DEFAULT_GET_POST_DELIMITER
variables = {"uri": uri, "lastPage": threadData.lastPage} variables = {"uri": uri, "lastPage": threadData.lastPage, "_locals": locals()}
originals = {} originals = {}
keywords = keyword.kwlist keywords = keyword.kwlist
@ -1051,9 +1066,9 @@ class Connect(object):
_, headers, code = Connect.getPage(url=uri, get=get, post=post, method=method, cookie=cookie, ua=ua, referer=referer, host=host, silent=silent, auxHeaders=auxHeaders, raise404=raise404, skipRead=(kb.nullConnection == NULLCONNECTION.SKIP_READ)) _, headers, code = Connect.getPage(url=uri, get=get, post=post, method=method, cookie=cookie, ua=ua, referer=referer, host=host, silent=silent, auxHeaders=auxHeaders, raise404=raise404, skipRead=(kb.nullConnection == NULLCONNECTION.SKIP_READ))
if headers: if headers:
if kb.nullConnection in (NULLCONNECTION.HEAD, NULLCONNECTION.SKIP_READ) and HTTP_HEADER.CONTENT_LENGTH in headers: if kb.nullConnection in (NULLCONNECTION.HEAD, NULLCONNECTION.SKIP_READ) and headers.get(HTTP_HEADER.CONTENT_LENGTH):
pageLength = int(headers[HTTP_HEADER.CONTENT_LENGTH]) pageLength = int(headers[HTTP_HEADER.CONTENT_LENGTH])
elif kb.nullConnection == NULLCONNECTION.RANGE and HTTP_HEADER.CONTENT_RANGE in headers: elif kb.nullConnection == NULLCONNECTION.RANGE and headers.get(HTTP_HEADER.CONTENT_RANGE):
pageLength = int(headers[HTTP_HEADER.CONTENT_RANGE][headers[HTTP_HEADER.CONTENT_RANGE].find('/') + 1:]) pageLength = int(headers[HTTP_HEADER.CONTENT_RANGE][headers[HTTP_HEADER.CONTENT_RANGE].find('/') + 1:])
finally: finally:
kb.pageCompress = popValue() kb.pageCompress = popValue()

View File

@ -9,6 +9,7 @@ import httplib
import socket import socket
import urllib2 import urllib2
from lib.core.common import getSafeExString
from lib.core.data import kb from lib.core.data import kb
from lib.core.data import logger from lib.core.data import logger
from lib.core.exception import SqlmapConnectionException from lib.core.exception import SqlmapConnectionException
@ -55,9 +56,9 @@ class HTTPSConnection(httplib.HTTPSConnection):
break break
else: else:
sock.close() sock.close()
except (ssl.SSLError, socket.error, httplib.BadStatusLine), errMsg: except (ssl.SSLError, socket.error, httplib.BadStatusLine), ex:
self._tunnel_host = None self._tunnel_host = None
logger.debug("SSL connection error occurred ('%s')" % errMsg) logger.debug("SSL connection error occurred ('%s')" % getSafeExString(ex))
# Reference(s): https://docs.python.org/2/library/ssl.html#ssl.SSLContext # Reference(s): https://docs.python.org/2/library/ssl.html#ssl.SSLContext
# https://www.mnot.net/blog/2014/12/27/python_2_and_tls_sni # https://www.mnot.net/blog/2014/12/27/python_2_and_tls_sni
@ -75,9 +76,9 @@ class HTTPSConnection(httplib.HTTPSConnection):
break break
else: else:
sock.close() sock.close()
except (ssl.SSLError, socket.error, httplib.BadStatusLine), errMsg: except (ssl.SSLError, socket.error, httplib.BadStatusLine), ex:
self._tunnel_host = None self._tunnel_host = None
logger.debug("SSL connection error occurred ('%s')" % errMsg) logger.debug("SSL connection error occurred ('%s')" % getSafeExString(ex))
if not success: if not success:
raise SqlmapConnectionException("can't establish SSL connection") raise SqlmapConnectionException("can't establish SSL connection")

View File

@ -39,6 +39,7 @@ from lib.core.enums import DBMS
from lib.core.enums import EXPECTED from lib.core.enums import EXPECTED
from lib.core.enums import PAYLOAD from lib.core.enums import PAYLOAD
from lib.core.exception import SqlmapConnectionException from lib.core.exception import SqlmapConnectionException
from lib.core.exception import SqlmapDataException
from lib.core.exception import SqlmapNotVulnerableException from lib.core.exception import SqlmapNotVulnerableException
from lib.core.exception import SqlmapUserQuitException from lib.core.exception import SqlmapUserQuitException
from lib.core.settings import MAX_TECHNIQUES_PER_VALUE from lib.core.settings import MAX_TECHNIQUES_PER_VALUE
@ -78,7 +79,7 @@ def _goInference(payload, expression, charsetType=None, firstChar=None, lastChar
timeBasedCompare = (kb.technique in (PAYLOAD.TECHNIQUE.TIME, PAYLOAD.TECHNIQUE.STACKED)) timeBasedCompare = (kb.technique in (PAYLOAD.TECHNIQUE.TIME, PAYLOAD.TECHNIQUE.STACKED))
if not (timeBasedCompare and kb.dnsTest): if not (timeBasedCompare and kb.dnsTest):
if (conf.eta or conf.threads > 1) and Backend.getIdentifiedDbms() and not re.search("(COUNT|LTRIM)\(", expression, re.I) and not timeBasedCompare: if (conf.eta or conf.threads > 1) and Backend.getIdentifiedDbms() and not re.search("(COUNT|LTRIM)\(", expression, re.I) and not (timeBasedCompare and not conf.forceThreads):
if field and re.search("\ASELECT\s+DISTINCT\((.+?)\)\s+FROM", expression, re.I): if field and re.search("\ASELECT\s+DISTINCT\((.+?)\)\s+FROM", expression, re.I):
expression = "SELECT %s FROM (%s)" % (field, expression) expression = "SELECT %s FROM (%s)" % (field, expression)
@ -262,9 +263,14 @@ def _goInferenceProxy(expression, fromUser=False, batch=False, unpack=True, char
return None return None
try: try:
for num in xrange(startLimit, stopLimit): try:
output = _goInferenceFields(expression, expressionFields, expressionFieldsList, payload, num=num, charsetType=charsetType, firstChar=firstChar, lastChar=lastChar, dump=dump) for num in xrange(startLimit, stopLimit):
outputs.append(output) output = _goInferenceFields(expression, expressionFields, expressionFieldsList, payload, num=num, charsetType=charsetType, firstChar=firstChar, lastChar=lastChar, dump=dump)
outputs.append(output)
except OverflowError:
errMsg = "boundary limits (%d,%d) are too large. Please rerun " % (startLimit, stopLimit)
errMsg += "with switch '--fresh-queries'"
raise SqlmapDataException(errMsg)
except KeyboardInterrupt: except KeyboardInterrupt:
print print

View File

@ -11,12 +11,13 @@ import urllib2
from lib.core.data import conf from lib.core.data import conf
class HTTPSPKIAuthHandler(urllib2.HTTPSHandler): class HTTPSPKIAuthHandler(urllib2.HTTPSHandler):
def __init__(self, key_file): def __init__(self, auth_file):
urllib2.HTTPSHandler.__init__(self) urllib2.HTTPSHandler.__init__(self)
self.key_file = key_file self.auth_file = auth_file
def https_open(self, req): def https_open(self, req):
return self.do_open(self.getConnection, req) return self.do_open(self.getConnection, req)
def getConnection(self, host, timeout=None): def getConnection(self, host, timeout=None):
return httplib.HTTPSConnection(host, key_file=self.key_file, timeout=conf.timeout) # Reference: https://docs.python.org/2/library/ssl.html#ssl.SSLContext.load_cert_chain
return httplib.HTTPSConnection(host, cert_file=self.auth_file, key_file=self.auth_file, timeout=conf.timeout)

View File

@ -30,6 +30,7 @@ from lib.core.settings import MAX_SINGLE_URL_REDIRECTIONS
from lib.core.settings import MAX_TOTAL_REDIRECTIONS from lib.core.settings import MAX_TOTAL_REDIRECTIONS
from lib.core.threads import getCurrentThreadData from lib.core.threads import getCurrentThreadData
from lib.request.basic import decodePage from lib.request.basic import decodePage
from lib.request.basic import parseResponse
class SmartRedirectHandler(urllib2.HTTPRedirectHandler): class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
def _get_header_redirect(self, headers): def _get_header_redirect(self, headers):
@ -118,6 +119,8 @@ class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
result = fp result = fp
if redurl and kb.redirectChoice == REDIRECTION.YES: if redurl and kb.redirectChoice == REDIRECTION.YES:
parseResponse(content, headers)
req.headers[HTTP_HEADER.HOST] = getHostHeader(redurl) req.headers[HTTP_HEADER.HOST] = getHostHeader(redurl)
if headers and HTTP_HEADER.SET_COOKIE in headers: if headers and HTTP_HEADER.SET_COOKIE in headers:
req.headers[HTTP_HEADER.COOKIE] = headers[HTTP_HEADER.SET_COOKIE].split(conf.cookieDel or DEFAULT_COOKIE_DELIMITER)[0] req.headers[HTTP_HEADER.COOKIE] = headers[HTTP_HEADER.SET_COOKIE].split(conf.cookieDel or DEFAULT_COOKIE_DELIMITER)[0]

View File

@ -18,6 +18,7 @@ from lib.core.common import readInput
from lib.core.data import conf from lib.core.data import conf
from lib.core.data import logger from lib.core.data import logger
from lib.core.data import paths from lib.core.data import paths
from lib.core.exception import SqlmapDataException
class ICMPsh: class ICMPsh:
""" """
@ -41,6 +42,9 @@ class ICMPsh:
while not address: while not address:
address = readInput(message, default=self.remoteIP) address = readInput(message, default=self.remoteIP)
if conf.batch and not address:
raise SqlmapDataException("remote host address is missing")
return address return address
def _selectLhost(self): def _selectLhost(self):
@ -53,6 +57,9 @@ class ICMPsh:
while not address: while not address:
address = readInput(message, default=self.localIP) address = readInput(message, default=self.localIP)
if conf.batch and not address:
raise SqlmapDataException("local host address is missing")
return address return address
def _prepareIngredients(self, encode=True): def _prepareIngredients(self, encode=True):

View File

@ -258,7 +258,7 @@ class UDF:
else: else:
logger.warn("invalid value, only digits are allowed") logger.warn("invalid value, only digits are allowed")
for x in range(0, udfCount): for x in xrange(0, udfCount):
while True: while True:
msg = "what is the name of the UDF number %d? " % (x + 1) msg = "what is the name of the UDF number %d? " % (x + 1)
udfName = readInput(msg) udfName = readInput(msg)
@ -293,7 +293,7 @@ class UDF:
else: else:
logger.warn("invalid value, only digits >= 0 are allowed") logger.warn("invalid value, only digits >= 0 are allowed")
for y in range(0, parCount): for y in xrange(0, parCount):
msg = "what is the data-type of input parameter " msg = "what is the data-type of input parameter "
msg += "number %d? (default: %s) " % ((y + 1), defaultType) msg += "number %d? (default: %s) " % ((y + 1), defaultType)

View File

@ -146,12 +146,12 @@ def bisection(payload, expression, length=None, charsetType=None, firstChar=None
if showEta: if showEta:
progress = ProgressBar(maxValue=length) progress = ProgressBar(maxValue=length)
if timeBasedCompare and conf.threads > 1: if timeBasedCompare and conf.threads > 1 and not conf.forceThreads:
warnMsg = "multi-threading is considered unsafe in time-based data retrieval. Going to switch it off automatically" warnMsg = "multi-threading is considered unsafe in time-based data retrieval. Going to switch it off automatically"
singleTimeWarnMessage(warnMsg) singleTimeWarnMessage(warnMsg)
if numThreads > 1: if numThreads > 1:
if not timeBasedCompare: if not timeBasedCompare or conf.forceThreads:
debugMsg = "starting %d thread%s" % (numThreads, ("s" if numThreads > 1 else "")) debugMsg = "starting %d thread%s" % (numThreads, ("s" if numThreads > 1 else ""))
logger.debug(debugMsg) logger.debug(debugMsg)
else: else:
@ -232,8 +232,10 @@ def bisection(payload, expression, length=None, charsetType=None, firstChar=None
# Used for gradual expanding into unicode charspace # Used for gradual expanding into unicode charspace
shiftTable = [2, 2, 3, 3, 5, 4] shiftTable = [2, 2, 3, 3, 5, 4]
if CHAR_INFERENCE_MARK in payload and ord('\n') in charTbl: if "'%s'" % CHAR_INFERENCE_MARK in payload:
charTbl.remove(ord('\n')) for char in ('\n', '\r'):
if ord(char) in charTbl:
charTbl.remove(ord(char))
if not charTbl: if not charTbl:
return None return None
@ -597,8 +599,9 @@ def queryOutputLength(expression, payload):
infoMsg = "retrieving the length of query output" infoMsg = "retrieving the length of query output"
logger.info(infoMsg) logger.info(infoMsg)
lengthExprUnescaped = agent.forgeQueryOutputLength(expression)
start = time.time() start = time.time()
lengthExprUnescaped = agent.forgeQueryOutputLength(expression)
count, length = bisection(payload, lengthExprUnescaped, charsetType=CHARSET_TYPE.DIGITS) count, length = bisection(payload, lengthExprUnescaped, charsetType=CHARSET_TYPE.DIGITS)
debugMsg = "performed %d queries in %.2f seconds" % (count, calculateDeltaSeconds(start)) debugMsg = "performed %d queries in %.2f seconds" % (count, calculateDeltaSeconds(start))

View File

@ -28,9 +28,9 @@ from lib.core.enums import HASHDB_KEYS
from lib.core.enums import PAYLOAD from lib.core.enums import PAYLOAD
from lib.core.exception import SqlmapDataException from lib.core.exception import SqlmapDataException
from lib.core.exception import SqlmapMissingMandatoryOptionException from lib.core.exception import SqlmapMissingMandatoryOptionException
from lib.core.settings import METADB_SUFFIX
from lib.core.settings import BRUTE_COLUMN_EXISTS_TEMPLATE from lib.core.settings import BRUTE_COLUMN_EXISTS_TEMPLATE
from lib.core.settings import BRUTE_TABLE_EXISTS_TEMPLATE from lib.core.settings import BRUTE_TABLE_EXISTS_TEMPLATE
from lib.core.settings import METADB_SUFFIX
from lib.core.threads import getCurrentThreadData from lib.core.threads import getCurrentThreadData
from lib.core.threads import runThreads from lib.core.threads import runThreads
from lib.request import inject from lib.request import inject
@ -102,7 +102,7 @@ def tableExists(tableFile, regex=None):
break break
if conf.db and METADB_SUFFIX not in conf.db and Backend.getIdentifiedDbms() not in (DBMS.SQLITE, DBMS.ACCESS, DBMS.FIREBIRD): if conf.db and METADB_SUFFIX not in conf.db and Backend.getIdentifiedDbms() not in (DBMS.SQLITE, DBMS.ACCESS, DBMS.FIREBIRD):
fullTableName = "%s%s%s" % (conf.db, '..' if Backend.getIdentifiedDbms() in (DBMS.MSSQL, DBMS.SYBASE) else '.', table) fullTableName = "%s.%s" % (conf.db, table)
else: else:
fullTableName = table fullTableName = table

View File

@ -165,74 +165,78 @@ def _unionPosition(comment, place, parameter, prefix, suffix, count, where=PAYLO
# Unbiased approach for searching appropriate usable column # Unbiased approach for searching appropriate usable column
random.shuffle(positions) random.shuffle(positions)
# For each column of the table (# of NULL) perform a request using for charCount in (UNION_MIN_RESPONSE_CHARS << 2, UNION_MIN_RESPONSE_CHARS):
# the UNION ALL SELECT statement to test it the target URL is if vector:
# affected by an exploitable union SQL injection vulnerability break
for position in positions:
# Prepare expression with delimiters
randQuery = randomStr(UNION_MIN_RESPONSE_CHARS)
phrase = "%s%s%s".lower() % (kb.chars.start, randQuery, kb.chars.stop)
randQueryProcessed = agent.concatQuery("\'%s\'" % randQuery)
randQueryUnescaped = unescaper.escape(randQueryProcessed)
# Forge the union SQL injection request # For each column of the table (# of NULL) perform a request using
query = agent.forgeUnionQuery(randQueryUnescaped, position, count, comment, prefix, suffix, kb.uChar, where) # the UNION ALL SELECT statement to test it the target URL is
payload = agent.payload(place=place, parameter=parameter, newValue=query, where=where) # affected by an exploitable union SQL injection vulnerability
for position in positions:
# Prepare expression with delimiters
randQuery = randomStr(charCount)
phrase = "%s%s%s".lower() % (kb.chars.start, randQuery, kb.chars.stop)
randQueryProcessed = agent.concatQuery("\'%s\'" % randQuery)
randQueryUnescaped = unescaper.escape(randQueryProcessed)
# Perform the request # Forge the union SQL injection request
page, headers = Request.queryPage(payload, place=place, content=True, raise404=False) query = agent.forgeUnionQuery(randQueryUnescaped, position, count, comment, prefix, suffix, kb.uChar, where)
content = "%s%s".lower() % (removeReflectiveValues(page, payload) or "", \ payload = agent.payload(place=place, parameter=parameter, newValue=query, where=where)
removeReflectiveValues(listToStrValue(headers.headers if headers else None), \
payload, True) or "")
if content and phrase in content: # Perform the request
validPayload = payload page, headers = Request.queryPage(payload, place=place, content=True, raise404=False)
kb.unionDuplicates = len(re.findall(phrase, content, re.I)) > 1 content = "%s%s".lower() % (removeReflectiveValues(page, payload) or "", \
vector = (position, count, comment, prefix, suffix, kb.uChar, where, kb.unionDuplicates, False) removeReflectiveValues(listToStrValue(headers.headers if headers else None), \
payload, True) or "")
if where == PAYLOAD.WHERE.ORIGINAL: if content and phrase in content:
# Prepare expression with delimiters validPayload = payload
randQuery2 = randomStr(UNION_MIN_RESPONSE_CHARS) kb.unionDuplicates = len(re.findall(phrase, content, re.I)) > 1
phrase2 = "%s%s%s".lower() % (kb.chars.start, randQuery2, kb.chars.stop) vector = (position, count, comment, prefix, suffix, kb.uChar, where, kb.unionDuplicates, False)
randQueryProcessed2 = agent.concatQuery("\'%s\'" % randQuery2)
randQueryUnescaped2 = unescaper.escape(randQueryProcessed2)
# Confirm that it is a full union SQL injection if where == PAYLOAD.WHERE.ORIGINAL:
query = agent.forgeUnionQuery(randQueryUnescaped, position, count, comment, prefix, suffix, kb.uChar, where, multipleUnions=randQueryUnescaped2) # Prepare expression with delimiters
payload = agent.payload(place=place, parameter=parameter, newValue=query, where=where) randQuery2 = randomStr(charCount)
phrase2 = "%s%s%s".lower() % (kb.chars.start, randQuery2, kb.chars.stop)
randQueryProcessed2 = agent.concatQuery("\'%s\'" % randQuery2)
randQueryUnescaped2 = unescaper.escape(randQueryProcessed2)
# Perform the request # Confirm that it is a full union SQL injection
page, headers = Request.queryPage(payload, place=place, content=True, raise404=False) query = agent.forgeUnionQuery(randQueryUnescaped, position, count, comment, prefix, suffix, kb.uChar, where, multipleUnions=randQueryUnescaped2)
content = "%s%s".lower() % (page or "", listToStrValue(headers.headers if headers else None) or "")
if not all(_ in content for _ in (phrase, phrase2)):
vector = (position, count, comment, prefix, suffix, kb.uChar, where, kb.unionDuplicates, True)
elif not kb.unionDuplicates:
fromTable = " FROM (%s) AS %s" % (" UNION ".join("SELECT %d%s%s" % (_, FROM_DUMMY_TABLE.get(Backend.getIdentifiedDbms(), ""), " AS %s" % randomStr() if _ == 0 else "") for _ in xrange(LIMITED_ROWS_TEST_NUMBER)), randomStr())
# Check for limited row output
query = agent.forgeUnionQuery(randQueryUnescaped, position, count, comment, prefix, suffix, kb.uChar, where, fromTable=fromTable)
payload = agent.payload(place=place, parameter=parameter, newValue=query, where=where) payload = agent.payload(place=place, parameter=parameter, newValue=query, where=where)
# Perform the request # Perform the request
page, headers = Request.queryPage(payload, place=place, content=True, raise404=False) page, headers = Request.queryPage(payload, place=place, content=True, raise404=False)
content = "%s%s".lower() % (removeReflectiveValues(page, payload) or "", \ content = "%s%s".lower() % (page or "", listToStrValue(headers.headers if headers else None) or "")
removeReflectiveValues(listToStrValue(headers.headers if headers else None), \
payload, True) or "")
if content.count(phrase) > 0 and content.count(phrase) < LIMITED_ROWS_TEST_NUMBER:
warnMsg = "output with limited number of rows detected. Switching to partial mode"
logger.warn(warnMsg)
vector = (position, count, comment, prefix, suffix, kb.uChar, PAYLOAD.WHERE.NEGATIVE, kb.unionDuplicates, False)
unionErrorCase = kb.errorIsNone and wasLastResponseDBMSError() if not all(_ in content for _ in (phrase, phrase2)):
vector = (position, count, comment, prefix, suffix, kb.uChar, where, kb.unionDuplicates, True)
elif not kb.unionDuplicates:
fromTable = " FROM (%s) AS %s" % (" UNION ".join("SELECT %d%s%s" % (_, FROM_DUMMY_TABLE.get(Backend.getIdentifiedDbms(), ""), " AS %s" % randomStr() if _ == 0 else "") for _ in xrange(LIMITED_ROWS_TEST_NUMBER)), randomStr())
if unionErrorCase and count > 1: # Check for limited row output
warnMsg = "combined UNION/error-based SQL injection case found on " query = agent.forgeUnionQuery(randQueryUnescaped, position, count, comment, prefix, suffix, kb.uChar, where, fromTable=fromTable)
warnMsg += "column %d. sqlmap will try to find another " % (position + 1) payload = agent.payload(place=place, parameter=parameter, newValue=query, where=where)
warnMsg += "column with better characteristics"
logger.warn(warnMsg) # Perform the request
else: page, headers = Request.queryPage(payload, place=place, content=True, raise404=False)
break content = "%s%s".lower() % (removeReflectiveValues(page, payload) or "", \
removeReflectiveValues(listToStrValue(headers.headers if headers else None), \
payload, True) or "")
if content.count(phrase) > 0 and content.count(phrase) < LIMITED_ROWS_TEST_NUMBER:
warnMsg = "output with limited number of rows detected. Switching to partial mode"
logger.warn(warnMsg)
vector = (position, count, comment, prefix, suffix, kb.uChar, PAYLOAD.WHERE.NEGATIVE, kb.unionDuplicates, False)
unionErrorCase = kb.errorIsNone and wasLastResponseDBMSError()
if unionErrorCase and count > 1:
warnMsg = "combined UNION/error-based SQL injection case found on "
warnMsg += "column %d. sqlmap will try to find another " % (position + 1)
warnMsg += "column with better characteristics"
logger.warn(warnMsg)
else:
break
return validPayload, vector return validPayload, vector

View File

@ -8,11 +8,16 @@ See the file 'doc/COPYING' for copying permission
import logging import logging
import os import os
import re
import shlex
import sqlite3 import sqlite3
import sys import sys
import tempfile import tempfile
import time import time
import urllib2
from lib.core.common import dataToStdout
from lib.core.common import getSafeExString
from lib.core.common import unArrayizeValue from lib.core.common import unArrayizeValue
from lib.core.convert import base64pickle from lib.core.convert import base64pickle
from lib.core.convert import hexencode from lib.core.convert import hexencode
@ -31,6 +36,7 @@ from lib.core.log import LOGGER_HANDLER
from lib.core.optiondict import optDict from lib.core.optiondict import optDict
from lib.core.settings import IS_WIN from lib.core.settings import IS_WIN
from lib.core.subprocessng import Popen from lib.core.subprocessng import Popen
from lib.parse.cmdline import cmdLineParser
from thirdparty.bottle.bottle import error as return_error from thirdparty.bottle.bottle import error as return_error
from thirdparty.bottle.bottle import get from thirdparty.bottle.bottle import get
from thirdparty.bottle.bottle import hook from thirdparty.bottle.bottle import hook
@ -82,7 +88,7 @@ class Database(object):
else: else:
self.cursor.execute(statement) self.cursor.execute(statement)
except sqlite3.OperationalError, ex: except sqlite3.OperationalError, ex:
if not "locked" in ex.message: if not "locked" in getSafeExString(ex):
raise raise
else: else:
break break
@ -110,7 +116,8 @@ class Database(object):
class Task(object): class Task(object):
def __init__(self, taskid): def __init__(self, taskid, remote_addr):
self.remote_addr = remote_addr
self.process = None self.process = None
self.output_directory = None self.output_directory = None
self.options = None self.options = None
@ -152,8 +159,10 @@ class Task(object):
self.options = AttribDict(self._original_options) self.options = AttribDict(self._original_options)
def engine_start(self): def engine_start(self):
self.process = Popen(["python", "sqlmap.py", "--pickled-options", base64pickle(self.options)], if os.path.exists("sqlmap.py"):
shell=False, close_fds=not IS_WIN) self.process = Popen(["python", "sqlmap.py", "--pickled-options", base64pickle(self.options)], shell=False, close_fds=not IS_WIN)
else:
self.process = Popen(["sqlmap", "--pickled-options", base64pickle(self.options)], shell=False, close_fds=not IS_WIN)
def engine_stop(self): def engine_stop(self):
if self.process: if self.process:
@ -335,7 +344,9 @@ def task_new():
Create new task ID Create new task ID
""" """
taskid = hexencode(os.urandom(8)) taskid = hexencode(os.urandom(8))
DataStore.tasks[taskid] = Task(taskid) remote_addr = request.remote_addr
DataStore.tasks[taskid] = Task(taskid, remote_addr)
logger.debug("Created new task: '%s'" % taskid) logger.debug("Created new task: '%s'" % taskid)
return jsonize({"success": True, "taskid": taskid}) return jsonize({"success": True, "taskid": taskid})
@ -361,18 +372,18 @@ def task_delete(taskid):
@get("/admin/<taskid>/list") @get("/admin/<taskid>/list")
def task_list(taskid): def task_list(taskid=None):
""" """
List task pull List task pull
""" """
if is_admin(taskid): tasks = {}
logger.debug("[%s] Listed task pool" % taskid)
tasks = list(DataStore.tasks)
return jsonize({"success": True, "tasks": tasks, "tasks_num": len(tasks)})
else:
logger.warning("[%s] Unauthorized call to task_list()" % taskid)
return jsonize({"success": False, "message": "Unauthorized"})
for key in DataStore.tasks:
if is_admin(taskid) or DataStore.tasks[key].remote_addr == request.remote_addr:
tasks[key] = dejsonize(scan_status(key))["status"]
logger.debug("[%s] Listed task pool (%s)" % (taskid, "admin" if is_admin(taskid) else request.remote_addr))
return jsonize({"success": True, "tasks": tasks, "tasks_num": len(tasks)})
@get("/admin/<taskid>/flush") @get("/admin/<taskid>/flush")
def task_flush(taskid): def task_flush(taskid):
@ -381,11 +392,13 @@ def task_flush(taskid):
""" """
if is_admin(taskid): if is_admin(taskid):
DataStore.tasks = dict() DataStore.tasks = dict()
logger.debug("[%s] Flushed task pool" % taskid)
return jsonize({"success": True})
else: else:
logger.warning("[%s] Unauthorized call to task_flush()" % taskid) for key in list(DataStore.tasks):
return jsonize({"success": False, "message": "Unauthorized"}) if DataStore.tasks[key].remote_addr == request.remote_addr:
del DataStore.tasks[key]
logger.debug("[%s] Flushed task pool (%s)" % (taskid, "admin" if is_admin(taskid) else request.remote_addr))
return jsonize({"success": True})
################################## ##################################
# sqlmap core interact functions # # sqlmap core interact functions #
@ -467,7 +480,9 @@ def scan_stop(taskid):
""" """
Stop a scan Stop a scan
""" """
if taskid not in DataStore.tasks: if (taskid not in DataStore.tasks or
DataStore.tasks[taskid].engine_process() is None or
DataStore.tasks[taskid].engine_has_terminated()):
logger.warning("[%s] Invalid task ID provided to scan_stop()" % taskid) logger.warning("[%s] Invalid task ID provided to scan_stop()" % taskid)
return jsonize({"success": False, "message": "Invalid task ID"}) return jsonize({"success": False, "message": "Invalid task ID"})
@ -482,7 +497,9 @@ def scan_kill(taskid):
""" """
Kill a scan Kill a scan
""" """
if taskid not in DataStore.tasks: if (taskid not in DataStore.tasks or
DataStore.tasks[taskid].engine_process() is None or
DataStore.tasks[taskid].engine_has_terminated()):
logger.warning("[%s] Invalid task ID provided to scan_kill()" % taskid) logger.warning("[%s] Invalid task ID provided to scan_kill()" % taskid)
return jsonize({"success": False, "message": "Invalid task ID"}) return jsonize({"success": False, "message": "Invalid task ID"})
@ -552,7 +569,7 @@ def scan_log_limited(taskid, start, end):
json_log_messages = list() json_log_messages = list()
if taskid not in DataStore.tasks: if taskid not in DataStore.tasks:
logger.warning("[%s] Invalid task ID provided to scan_log_limited()") logger.warning("[%s] Invalid task ID provided to scan_log_limited()" % taskid)
return jsonize({"success": False, "message": "Invalid task ID"}) return jsonize({"success": False, "message": "Invalid task ID"})
if not start.isdigit() or not end.isdigit() or end < start: if not start.isdigit() or not end.isdigit() or end < start:
@ -581,7 +598,7 @@ def scan_log(taskid):
json_log_messages = list() json_log_messages = list()
if taskid not in DataStore.tasks: if taskid not in DataStore.tasks:
logger.warning("[%s] Invalid task ID provided to scan_log()") logger.warning("[%s] Invalid task ID provided to scan_log()" % taskid)
return jsonize({"success": False, "message": "Invalid task ID"}) return jsonize({"success": False, "message": "Invalid task ID"})
# Read all log messages from the IPC database # Read all log messages from the IPC database
@ -640,6 +657,22 @@ def server(host="0.0.0.0", port=RESTAPI_SERVER_PORT):
run(host=host, port=port, quiet=True, debug=False) run(host=host, port=port, quiet=True, debug=False)
def _client(url, options=None):
    """
    Perform a single HTTP request against the REST-JSON API server

    When options is None the request carries no body (urllib2 issues a GET);
    otherwise options is serialized with jsonize() and sent as the request
    body (urllib2 issues a POST). The 'Content-Type: application/json'
    header is sent in both cases.

    Returns the raw response body as read from the server (not decoded).
    Any exception raised while building, sending or reading the request is
    re-raised to the caller; an error is logged first only when non-empty
    options were supplied.
    """

    logger.debug("Calling %s" % url)

    try:
        data = None
        if options is not None:
            data = jsonize(options)

        req = urllib2.Request(url, data, {'Content-Type': 'application/json'})
        response = urllib2.urlopen(req)

        try:
            text = response.read()
        finally:
            # Release the connection even when read() fails, instead of
            # leaving it open until the response object is garbage collected
            response.close()
    except:
        # Deliberately broad: nothing is swallowed, the exception is always
        # re-raised after the (conditional) log entry
        if options:
            logger.error("Failed to load and parse %s" % url)
        raise

    return text
def client(host=RESTAPI_SERVER_HOST, port=RESTAPI_SERVER_PORT): def client(host=RESTAPI_SERVER_HOST, port=RESTAPI_SERVER_PORT):
""" """
REST-JSON API client REST-JSON API client
@ -647,11 +680,106 @@ def client(host=RESTAPI_SERVER_HOST, port=RESTAPI_SERVER_PORT):
addr = "http://%s:%d" % (host, port) addr = "http://%s:%d" % (host, port)
logger.info("Starting REST-JSON API client to '%s'..." % addr) logger.info("Starting REST-JSON API client to '%s'..." % addr)
# TODO: write a simple client with requests, for now use curl from command line try:
logger.error("Not yet implemented, use curl from command line instead for now, for example:") _client(addr)
print "\n\t$ taskid=$(curl http://%s:%d/task/new 2>1 | grep -o -I '[a-f0-9]\{16\}') && echo $taskid" % (host, port) except Exception, ex:
print ("\t$ curl -H \"Content-Type: application/json\" " if not isinstance(ex, urllib2.HTTPError):
"-X POST -d '{\"url\": \"http://testphp.vulnweb.com/artists.php?artist=1\"}' " errMsg = "there has been a problem while connecting to the "
"http://%s:%d/scan/$taskid/start") % (host, port) errMsg += "REST-JSON API server at '%s' " % addr
print "\t$ curl http://%s:%d/scan/$taskid/data" % (host, port) errMsg += "(%s)" % ex
print "\t$ curl http://%s:%d/scan/$taskid/log\n" % (host, port) logger.critical(errMsg)
return
taskid = None
logger.info("Type 'help' or '?' for list of available commands")
while True:
try:
command = raw_input("api%s> " % (" (%s)" % taskid if taskid else "")).strip().lower()
except (EOFError, KeyboardInterrupt):
print
break
if command in ("data", "log", "status", "stop", "kill"):
if not taskid:
logger.error("No task ID in use")
continue
raw = _client("%s/scan/%s/%s" % (addr, taskid, command))
res = dejsonize(raw)
if not res["success"]:
logger.error("Failed to execute command %s" % command)
dataToStdout("%s\n" % raw)
elif command.startswith("new"):
if ' ' not in command:
logger.error("Program arguments are missing")
continue
argv = ["sqlmap.py"] + shlex.split(command)[1:]
try:
cmdLineOptions = cmdLineParser(argv).__dict__
except:
taskid = None
continue
for key in list(cmdLineOptions):
if cmdLineOptions[key] is None:
del cmdLineOptions[key]
raw = _client("%s/task/new" % addr)
res = dejsonize(raw)
if not res["success"]:
logger.error("Failed to create new task")
continue
taskid = res["taskid"]
logger.info("New task ID is '%s'" % taskid)
raw = _client("%s/scan/%s/start" % (addr, taskid), cmdLineOptions)
res = dejsonize(raw)
if not res["success"]:
logger.error("Failed to start scan")
continue
logger.info("Scanning started")
elif command.startswith("use"):
taskid = (command.split()[1] if ' ' in command else "").strip("'\"")
if not taskid:
logger.error("Task ID is missing")
taskid = None
continue
elif not re.search(r"\A[0-9a-fA-F]{16}\Z", taskid):
logger.error("Invalid task ID '%s'" % taskid)
taskid = None
continue
logger.info("Switching to task ID '%s' " % taskid)
elif command in ("list", "flush"):
raw = _client("%s/admin/%s/%s" % (addr, taskid or 0, command))
res = dejsonize(raw)
if not res["success"]:
logger.error("Failed to execute command %s" % command)
elif command == "flush":
taskid = None
dataToStdout("%s\n" % raw)
elif command in ("exit", "bye", "quit", 'q'):
return
elif command in ("help", "?"):
msg = "help Show this help message\n"
msg += "new ARGS Start a new scan task with provided arguments (e.g. 'new -u \"http://testphp.vulnweb.com/artists.php?artist=1\"')\n"
msg += "use TASKID Switch current context to different task (e.g. 'use c04d8c5c7582efb4')\n"
msg += "data Retrieve and show data for current task\n"
msg += "log Retrieve and show log for current task\n"
msg += "status Retrieve and show status for current task\n"
msg += "stop Stop current task\n"
msg += "kill Kill current task\n"
msg += "list Display all tasks\n"
msg += "flush Flush tasks (delete all tasks)\n"
msg += "exit Exit this client\n"
dataToStdout(msg)
elif command:
logger.error("Unknown command '%s'" % command)

View File

@ -22,6 +22,7 @@ from lib.core.data import conf
from lib.core.data import kb from lib.core.data import kb
from lib.core.data import logger from lib.core.data import logger
from lib.core.exception import SqlmapConnectionException from lib.core.exception import SqlmapConnectionException
from lib.core.exception import SqlmapSyntaxException
from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS
from lib.core.threads import getCurrentThreadData from lib.core.threads import getCurrentThreadData
from lib.core.threads import runThreads from lib.core.threads import runThreads
@ -58,12 +59,15 @@ def crawl(target):
try: try:
if current: if current:
content = Request.getPage(url=current, crawling=True, raise404=False)[0] content = Request.getPage(url=current, crawling=True, raise404=False)[0]
except SqlmapConnectionException, e: except SqlmapConnectionException, ex:
errMsg = "connection exception detected (%s). skipping " % e errMsg = "connection exception detected (%s). skipping " % ex
errMsg += "URL '%s'" % current errMsg += "URL '%s'" % current
logger.critical(errMsg) logger.critical(errMsg)
except httplib.InvalidURL, e: except SqlmapSyntaxException:
errMsg = "invalid URL detected (%s). skipping " % e errMsg = "invalid URL detected. skipping '%s'" % current
logger.critical(errMsg)
except httplib.InvalidURL, ex:
errMsg = "invalid URL detected (%s). skipping " % ex
errMsg += "URL '%s'" % current errMsg += "URL '%s'" % current
logger.critical(errMsg) logger.critical(errMsg)

View File

@ -12,6 +12,7 @@ import socket
import urllib import urllib
import urllib2 import urllib2
from lib.core.common import getSafeExString
from lib.core.common import getUnicode from lib.core.common import getUnicode
from lib.core.common import readInput from lib.core.common import readInput
from lib.core.common import urlencode from lib.core.common import urlencode
@ -30,6 +31,8 @@ from lib.core.settings import HTTP_ACCEPT_ENCODING_HEADER_VALUE
from lib.core.settings import UNICODE_ENCODING from lib.core.settings import UNICODE_ENCODING
from lib.request.basic import decodePage from lib.request.basic import decodePage
from lib.request.httpshandler import HTTPSHandler from lib.request.httpshandler import HTTPSHandler
from thirdparty.socks import socks
class Google(object): class Google(object):
""" """
@ -47,10 +50,10 @@ class Google(object):
self.opener.addheaders = conf.httpHeaders self.opener.addheaders = conf.httpHeaders
try: try:
conn = self.opener.open("http://www.google.com/ncr") conn = self.opener.open("https://www.google.com/ncr")
conn.info() # retrieve session cookie conn.info() # retrieve session cookie
except Exception, ex: except Exception, ex:
errMsg = "unable to connect to Google ('%s')" % ex errMsg = "unable to connect to Google ('%s')" % getSafeExString(ex)
raise SqlmapConnectionException(errMsg) raise SqlmapConnectionException(errMsg)
def search(self, dork): def search(self, dork):
@ -65,7 +68,7 @@ class Google(object):
if not dork: if not dork:
return None return None
url = "http://www.google.com/search?" url = "https://www.google.com/search?"
url += "q=%s&" % urlencode(dork, convall=True) url += "q=%s&" % urlencode(dork, convall=True)
url += "num=100&hl=en&complete=0&safe=off&filter=0&btnG=Search" url += "num=100&hl=en&complete=0&safe=off&filter=0&btnG=Search"
url += "&start=%d" % ((gpage - 1) * 100) url += "&start=%d" % ((gpage - 1) * 100)
@ -94,12 +97,12 @@ class Google(object):
except urllib2.HTTPError, e: except urllib2.HTTPError, e:
try: try:
page = e.read() page = e.read()
except socket.timeout: except Exception, ex:
warnMsg = "connection timed out while trying " warnMsg = "problem occurred while trying to get "
warnMsg += "to get error page information (%d)" % e.code warnMsg += "an error page information (%s)" % getSafeExString(ex)
logger.critical(warnMsg) logger.critical(warnMsg)
return None return None
except (urllib2.URLError, httplib.error, socket.error, socket.timeout): except (urllib2.URLError, httplib.error, socket.error, socket.timeout, socks.ProxyError):
errMsg = "unable to connect to Google" errMsg = "unable to connect to Google"
raise SqlmapConnectionException(errMsg) raise SqlmapConnectionException(errMsg)
@ -175,3 +178,6 @@ class Google(object):
retVal = [urllib.unquote(match.group(1)) for match in re.finditer(regex, page, re.I | re.S)] retVal = [urllib.unquote(match.group(1)) for match in re.finditer(regex, page, re.I | re.S)]
return retVal return retVal
def setHTTPProxy(): # Cross-linked function
    """
    Placeholder that always raises NotImplementedError

    NOTE(review): per the "cross-linked" marker this name is presumably
    rebound at runtime to the real implementation defined elsewhere in the
    project - confirm where the binding takes place.
    """
    raise NotImplementedError

View File

@ -44,6 +44,7 @@ from lib.core.common import clearConsoleLine
from lib.core.common import dataToStdout from lib.core.common import dataToStdout
from lib.core.common import getFileItems from lib.core.common import getFileItems
from lib.core.common import getPublicTypeMembers from lib.core.common import getPublicTypeMembers
from lib.core.common import getSafeExString
from lib.core.common import hashDBRetrieve from lib.core.common import hashDBRetrieve
from lib.core.common import hashDBWrite from lib.core.common import hashDBWrite
from lib.core.common import normalizeUnicode from lib.core.common import normalizeUnicode
@ -326,8 +327,10 @@ def wordpress_passwd(password, salt, count, prefix, uppercase=False):
return output return output
password = password.encode(UNICODE_ENCODING)
cipher = md5(salt) cipher = md5(salt)
cipher.update(password.encode(UNICODE_ENCODING)) cipher.update(password)
hash_ = cipher.digest() hash_ = cipher.digest()
for i in xrange(count): for i in xrange(count):
@ -706,14 +709,18 @@ def dictionaryAttack(attack_dict):
item = [(user, hash_), {}] item = [(user, hash_), {}]
elif hash_regex in (HASH.ORACLE_OLD, HASH.POSTGRES): elif hash_regex in (HASH.ORACLE_OLD, HASH.POSTGRES):
item = [(user, hash_), {'username': user}] item = [(user, hash_), {'username': user}]
elif hash_regex in (HASH.ORACLE): elif hash_regex in (HASH.ORACLE,):
item = [(user, hash_), {'salt': hash_[-20:]}] item = [(user, hash_), {'salt': hash_[-20:]}]
elif hash_regex in (HASH.MSSQL, HASH.MSSQL_OLD, HASH.MSSQL_NEW): elif hash_regex in (HASH.MSSQL, HASH.MSSQL_OLD, HASH.MSSQL_NEW):
item = [(user, hash_), {'salt': hash_[6:14]}] item = [(user, hash_), {'salt': hash_[6:14]}]
elif hash_regex in (HASH.CRYPT_GENERIC): elif hash_regex in (HASH.CRYPT_GENERIC,):
item = [(user, hash_), {'salt': hash_[0:2]}] item = [(user, hash_), {'salt': hash_[0:2]}]
elif hash_regex in (HASH.WORDPRESS): elif hash_regex in (HASH.WORDPRESS,):
item = [(user, hash_), {'salt': hash_[4:12], 'count': 1 << ITOA64.index(hash_[3]), 'prefix': hash_[:12]}] if ITOA64.index(hash_[3]) < 32:
item = [(user, hash_), {'salt': hash_[4:12], 'count': 1 << ITOA64.index(hash_[3]), 'prefix': hash_[:12]}]
else:
warnMsg = "invalid hash '%s'" % hash_
logger.warn(warnMsg)
if item and hash_ not in keys: if item and hash_ not in keys:
resumed = hashDBRetrieve(hash_) resumed = hashDBRetrieve(hash_)
@ -771,7 +778,7 @@ def dictionaryAttack(attack_dict):
except Exception, ex: except Exception, ex:
warnMsg = "there was a problem while loading dictionaries" warnMsg = "there was a problem while loading dictionaries"
warnMsg += " ('%s')" % ex.message warnMsg += " ('%s')" % getSafeExString(ex)
logger.critical(warnMsg) logger.critical(warnMsg)
message = "do you want to use common password suffixes? (slow!) [y/N] " message = "do you want to use common password suffixes? (slow!) [y/N] "

View File

@ -11,6 +11,7 @@ import sqlite3
import threading import threading
import time import time
from lib.core.common import getSafeExString
from lib.core.common import getUnicode from lib.core.common import getUnicode
from lib.core.common import serializeObject from lib.core.common import serializeObject
from lib.core.common import unserializeObject from lib.core.common import unserializeObject
@ -77,7 +78,7 @@ class HashDB(object):
for row in self.cursor.execute("SELECT value FROM storage WHERE id=?", (hash_,)): for row in self.cursor.execute("SELECT value FROM storage WHERE id=?", (hash_,)):
retVal = row[0] retVal = row[0]
except sqlite3.OperationalError, ex: except sqlite3.OperationalError, ex:
if not "locked" in ex.message: if not "locked" in getSafeExString(ex):
raise raise
except sqlite3.DatabaseError, ex: except sqlite3.DatabaseError, ex:
errMsg = "error occurred while accessing session file '%s' ('%s'). " % (self.filepath, ex) errMsg = "error occurred while accessing session file '%s' ('%s'). " % (self.filepath, ex)
@ -127,7 +128,7 @@ class HashDB(object):
if retries == 0: if retries == 0:
warnMsg = "there has been a problem while writing to " warnMsg = "there has been a problem while writing to "
warnMsg += "the session file ('%s')" % ex.message warnMsg += "the session file ('%s')" % getSafeExString(ex)
logger.warn(warnMsg) logger.warn(warnMsg)
if retries >= HASHDB_FLUSH_RETRIES: if retries >= HASHDB_FLUSH_RETRIES:

View File

@ -12,6 +12,7 @@ from lib.core.data import logger
from lib.core.data import queries from lib.core.data import queries
from lib.core.common import Backend from lib.core.common import Backend
from lib.core.common import unArrayizeValue from lib.core.common import unArrayizeValue
from lib.core.settings import HSQLDB_DEFAULT_SCHEMA
from lib.request import inject from lib.request import inject
class Enumeration(GenericEnumeration): class Enumeration(GenericEnumeration):
@ -40,3 +41,6 @@ class Enumeration(GenericEnumeration):
def getHostname(self): def getHostname(self):
warnMsg = "on HSQLDB it is not possible to enumerate the hostname" warnMsg = "on HSQLDB it is not possible to enumerate the hostname"
logger.warn(warnMsg) logger.warn(warnMsg)
def getCurrentDb(self):
    """
    Return the current database (schema) name for HSQLDB

    No query is issued against the target: the constant
    HSQLDB_DEFAULT_SCHEMA is returned unconditionally.
    """
    return HSQLDB_DEFAULT_SCHEMA

View File

@ -152,7 +152,7 @@ class Enumeration(GenericEnumeration):
warnMsg += "for database '%s'" % db warnMsg += "for database '%s'" % db
logger.warn(warnMsg) logger.warn(warnMsg)
if not kb.data.cachedTables: if not kb.data.cachedTables and not conf.search:
errMsg = "unable to retrieve the tables for any database" errMsg = "unable to retrieve the tables for any database"
raise SqlmapNoneDataException(errMsg) raise SqlmapNoneDataException(errMsg)
else: else:
@ -184,7 +184,7 @@ class Enumeration(GenericEnumeration):
infoMsg = "searching table" infoMsg = "searching table"
if tblConsider == "1": if tblConsider == "1":
infoMsg += "s like" infoMsg += "s LIKE"
infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(tbl) infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(tbl)
logger.info(infoMsg) logger.info(infoMsg)
@ -217,7 +217,7 @@ class Enumeration(GenericEnumeration):
else: else:
infoMsg = "fetching number of table" infoMsg = "fetching number of table"
if tblConsider == "1": if tblConsider == "1":
infoMsg += "s like" infoMsg += "s LIKE"
infoMsg += " '%s' in database '%s'" % (unsafeSQLIdentificatorNaming(tbl), unsafeSQLIdentificatorNaming(db)) infoMsg += " '%s' in database '%s'" % (unsafeSQLIdentificatorNaming(tbl), unsafeSQLIdentificatorNaming(db))
logger.info(infoMsg) logger.info(infoMsg)
@ -229,7 +229,7 @@ class Enumeration(GenericEnumeration):
if not isNumPosStrValue(count): if not isNumPosStrValue(count):
warnMsg = "no table" warnMsg = "no table"
if tblConsider == "1": if tblConsider == "1":
warnMsg += "s like" warnMsg += "s LIKE"
warnMsg += " '%s' " % unsafeSQLIdentificatorNaming(tbl) warnMsg += " '%s' " % unsafeSQLIdentificatorNaming(tbl)
warnMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(db) warnMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(db)
logger.warn(warnMsg) logger.warn(warnMsg)
@ -295,7 +295,7 @@ class Enumeration(GenericEnumeration):
infoMsg = "searching column" infoMsg = "searching column"
if colConsider == "1": if colConsider == "1":
infoMsg += "s like" infoMsg += "s LIKE"
infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(column) infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(column)
foundCols[column] = {} foundCols[column] = {}
@ -336,7 +336,7 @@ class Enumeration(GenericEnumeration):
values = [values] values = [values]
for foundTbl in values: for foundTbl in values:
foundTbl = safeSQLIdentificatorNaming(foundTbl, True) foundTbl = safeSQLIdentificatorNaming(unArrayizeValue(foundTbl), True)
if foundTbl is None: if foundTbl is None:
continue continue
@ -367,7 +367,7 @@ class Enumeration(GenericEnumeration):
infoMsg = "fetching number of tables containing column" infoMsg = "fetching number of tables containing column"
if colConsider == "1": if colConsider == "1":
infoMsg += "s like" infoMsg += "s LIKE"
infoMsg += " '%s' in database '%s'" % (column, db) infoMsg += " '%s' in database '%s'" % (column, db)
logger.info("%s%s" % (infoMsg, infoMsgTbl)) logger.info("%s%s" % (infoMsg, infoMsgTbl))
@ -380,7 +380,7 @@ class Enumeration(GenericEnumeration):
if not isNumPosStrValue(count): if not isNumPosStrValue(count):
warnMsg = "no tables contain column" warnMsg = "no tables contain column"
if colConsider == "1": if colConsider == "1":
warnMsg += "s like" warnMsg += "s LIKE"
warnMsg += " '%s' " % column warnMsg += " '%s' " % column
warnMsg += "in database '%s'" % db warnMsg += "in database '%s'" % db
logger.warn(warnMsg) logger.warn(warnMsg)

View File

@ -169,7 +169,7 @@ class Fingerprint(GenericFingerprint):
infoMsg = "confirming %s" % DBMS.MYSQL infoMsg = "confirming %s" % DBMS.MYSQL
logger.info(infoMsg) logger.info(infoMsg)
result = inject.checkBooleanExpression("USER() LIKE USER()") result = inject.checkBooleanExpression("SESSION_USER() LIKE USER()")
if not result: if not result:
warnMsg = "the back-end DBMS is not %s" % DBMS.MYSQL warnMsg = "the back-end DBMS is not %s" % DBMS.MYSQL

View File

@ -358,7 +358,7 @@ class Databases:
if bruteForce is None: if bruteForce is None:
logger.error(errMsg) logger.error(errMsg)
return self.getTables(bruteForce=True) return self.getTables(bruteForce=True)
else: elif not conf.search:
raise SqlmapNoneDataException(errMsg) raise SqlmapNoneDataException(errMsg)
else: else:
for db, tables in kb.data.cachedTables.items(): for db, tables in kb.data.cachedTables.items():
@ -370,7 +370,7 @@ class Databases:
return kb.data.cachedTables return kb.data.cachedTables
def getColumns(self, onlyColNames=False, colTuple=None, bruteForce=None): def getColumns(self, onlyColNames=False, colTuple=None, bruteForce=None, dumpMode=False):
self.forceDbmsEnum() self.forceDbmsEnum()
if conf.db is None or conf.db == CURRENT_DB: if conf.db is None or conf.db == CURRENT_DB:
@ -415,7 +415,7 @@ class Databases:
colList = filter(None, colList) colList = filter(None, colList)
if conf.tbl: if conf.tbl:
if Backend.getIdentifiedDbms() in (DBMS.ORACLE, DBMS.DB2): if Backend.getIdentifiedDbms() in (DBMS.ORACLE, DBMS.DB2, DBMS.HSQLDB):
conf.tbl = conf.tbl.upper() conf.tbl = conf.tbl.upper()
tblList = conf.tbl.split(",") tblList = conf.tbl.split(",")
@ -432,10 +432,12 @@ class Databases:
tblList = tblList[0] tblList = tblList[0]
tblList = list(tblList) tblList = list(tblList)
else: elif not conf.search:
errMsg = "unable to retrieve the tables " errMsg = "unable to retrieve the tables "
errMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db) errMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db)
raise SqlmapNoneDataException(errMsg) raise SqlmapNoneDataException(errMsg)
else:
return kb.data.cachedColumns
tblList = filter(None, (safeSQLIdentificatorNaming(_, True) for _ in tblList)) tblList = filter(None, (safeSQLIdentificatorNaming(_, True) for _ in tblList))
@ -509,7 +511,7 @@ class Databases:
if len(colList) > 0: if len(colList) > 0:
if colTuple: if colTuple:
_, colCondParam = colTuple _, colCondParam = colTuple
infoMsg += "like '%s' " % ", ".join(unsafeSQLIdentificatorNaming(col) for col in sorted(colList)) infoMsg += "LIKE '%s' " % ", ".join(unsafeSQLIdentificatorNaming(col) for col in sorted(colList))
else: else:
colCondParam = "='%s'" colCondParam = "='%s'"
infoMsg += "'%s' " % ", ".join(unsafeSQLIdentificatorNaming(col) for col in sorted(colList)) infoMsg += "'%s' " % ", ".join(unsafeSQLIdentificatorNaming(col) for col in sorted(colList))
@ -517,10 +519,6 @@ class Databases:
condQueryStr = "%%s%s" % colCondParam condQueryStr = "%%s%s" % colCondParam
condQuery = " AND (%s)" % " OR ".join(condQueryStr % (condition, unsafeSQLIdentificatorNaming(col)) for col in sorted(colList)) condQuery = " AND (%s)" % " OR ".join(condQueryStr % (condition, unsafeSQLIdentificatorNaming(col)) for col in sorted(colList))
infoMsg += "for table '%s' " % unsafeSQLIdentificatorNaming(tbl)
infoMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db)
logger.info(infoMsg)
if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL, DBMS.HSQLDB): if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL, DBMS.HSQLDB):
query = rootQuery.inband.query % (unsafeSQLIdentificatorNaming(tbl), unsafeSQLIdentificatorNaming(conf.db)) query = rootQuery.inband.query % (unsafeSQLIdentificatorNaming(tbl), unsafeSQLIdentificatorNaming(conf.db))
query += condQuery query += condQuery
@ -534,7 +532,14 @@ class Databases:
elif Backend.getIdentifiedDbms() in (DBMS.SQLITE, DBMS.FIREBIRD): elif Backend.getIdentifiedDbms() in (DBMS.SQLITE, DBMS.FIREBIRD):
query = rootQuery.inband.query % tbl query = rootQuery.inband.query % tbl
values = inject.getValue(query, blind=False, time=False) if dumpMode and colList:
values = [(_,) for _ in colList]
else:
infoMsg += "for table '%s' " % unsafeSQLIdentificatorNaming(tbl)
infoMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db)
logger.info(infoMsg)
values = inject.getValue(query, blind=False, time=False)
if Backend.isDbms(DBMS.MSSQL) and isNoneValue(values): if Backend.isDbms(DBMS.MSSQL) and isNoneValue(values):
index, values = 1, [] index, values = 1, []
@ -604,7 +609,7 @@ class Databases:
if len(colList) > 0: if len(colList) > 0:
if colTuple: if colTuple:
_, colCondParam = colTuple _, colCondParam = colTuple
infoMsg += "like '%s' " % ", ".join(unsafeSQLIdentificatorNaming(col) for col in sorted(colList)) infoMsg += "LIKE '%s' " % ", ".join(unsafeSQLIdentificatorNaming(col) for col in sorted(colList))
else: else:
colCondParam = "='%s'" colCondParam = "='%s'"
infoMsg += "'%s' " % ", ".join(unsafeSQLIdentificatorNaming(col) for col in sorted(colList)) infoMsg += "'%s' " % ", ".join(unsafeSQLIdentificatorNaming(col) for col in sorted(colList))
@ -612,10 +617,6 @@ class Databases:
condQueryStr = "%%s%s" % colCondParam condQueryStr = "%%s%s" % colCondParam
condQuery = " AND (%s)" % " OR ".join(condQueryStr % (condition, unsafeSQLIdentificatorNaming(col)) for col in sorted(colList)) condQuery = " AND (%s)" % " OR ".join(condQueryStr % (condition, unsafeSQLIdentificatorNaming(col)) for col in sorted(colList))
infoMsg += "for table '%s' " % unsafeSQLIdentificatorNaming(tbl)
infoMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db)
logger.info(infoMsg)
if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL, DBMS.HSQLDB): if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL, DBMS.HSQLDB):
query = rootQuery.blind.count % (unsafeSQLIdentificatorNaming(tbl), unsafeSQLIdentificatorNaming(conf.db)) query = rootQuery.blind.count % (unsafeSQLIdentificatorNaming(tbl), unsafeSQLIdentificatorNaming(conf.db))
query += condQuery query += condQuery
@ -639,32 +640,41 @@ class Databases:
parseSqliteTableSchema(value) parseSqliteTableSchema(value)
return kb.data.cachedColumns return kb.data.cachedColumns
count = inject.getValue(query, union=False, error=False, expected=EXPECTED.INT, charsetType=CHARSET_TYPE.DIGITS)
table = {} table = {}
columns = {} columns = {}
if not isNumPosStrValue(count): if dumpMode and colList:
if Backend.isDbms(DBMS.MSSQL): count = 0
count, index, values = 0, 1, [] for value in colList:
while True: columns[safeSQLIdentificatorNaming(value)] = None
query = rootQuery.blind.query3 % (conf.db, tbl, index) else:
value = unArrayizeValue(inject.getValue(query, union=False, error=False)) infoMsg += "for table '%s' " % unsafeSQLIdentificatorNaming(tbl)
if isNoneValue(value) or value == " ": infoMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db)
break logger.info(infoMsg)
else:
columns[safeSQLIdentificatorNaming(value)] = None
index += 1
if not columns: count = inject.getValue(query, union=False, error=False, expected=EXPECTED.INT, charsetType=CHARSET_TYPE.DIGITS)
errMsg = "unable to retrieve the %scolumns " % ("number of " if not Backend.isDbms(DBMS.MSSQL) else "")
errMsg += "for table '%s' " % unsafeSQLIdentificatorNaming(tbl) if not isNumPosStrValue(count):
errMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db) if Backend.isDbms(DBMS.MSSQL):
logger.error(errMsg) count, index, values = 0, 1, []
continue while True:
query = rootQuery.blind.query3 % (conf.db, tbl, index)
value = unArrayizeValue(inject.getValue(query, union=False, error=False))
if isNoneValue(value) or value == " ":
break
else:
columns[safeSQLIdentificatorNaming(value)] = None
index += 1
if not columns:
errMsg = "unable to retrieve the %scolumns " % ("number of " if not Backend.isDbms(DBMS.MSSQL) else "")
errMsg += "for table '%s' " % unsafeSQLIdentificatorNaming(tbl)
errMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db)
logger.error(errMsg)
continue
for index in getLimitRange(count): for index in getLimitRange(count):
if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL): if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL, DBMS.HSQLDB):
query = rootQuery.blind.query % (unsafeSQLIdentificatorNaming(tbl), unsafeSQLIdentificatorNaming(conf.db)) query = rootQuery.blind.query % (unsafeSQLIdentificatorNaming(tbl), unsafeSQLIdentificatorNaming(conf.db))
query += condQuery query += condQuery
field = None field = None
@ -805,7 +815,7 @@ class Databases:
elif "." in conf.tbl: elif "." in conf.tbl:
if not conf.db: if not conf.db:
conf.db, conf.tbl = conf.tbl.split(".") conf.db, conf.tbl = conf.tbl.split('.', 1)
if conf.tbl is not None and conf.db is None and Backend.getIdentifiedDbms() not in (DBMS.SQLITE, DBMS.ACCESS, DBMS.FIREBIRD): if conf.tbl is not None and conf.db is None and Backend.getIdentifiedDbms() not in (DBMS.SQLITE, DBMS.ACCESS, DBMS.FIREBIRD):
warnMsg = "missing database parameter. sqlmap is going to " warnMsg = "missing database parameter. sqlmap is going to "

View File

@ -12,6 +12,7 @@ from lib.core.bigarray import BigArray
from lib.core.common import Backend from lib.core.common import Backend
from lib.core.common import clearConsoleLine from lib.core.common import clearConsoleLine
from lib.core.common import getLimitRange from lib.core.common import getLimitRange
from lib.core.common import getSafeExString
from lib.core.common import getUnicode from lib.core.common import getUnicode
from lib.core.common import isInferenceAvailable from lib.core.common import isInferenceAvailable
from lib.core.common import isListLike from lib.core.common import isListLike
@ -88,10 +89,12 @@ class Entries:
if isinstance(tblList[0], (set, tuple, list)): if isinstance(tblList[0], (set, tuple, list)):
tblList = tblList[0] tblList = tblList[0]
else: elif not conf.search:
errMsg = "unable to retrieve the tables " errMsg = "unable to retrieve the tables "
errMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db) errMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db)
raise SqlmapNoneDataException(errMsg) raise SqlmapNoneDataException(errMsg)
else:
return
for tbl in tblList: for tbl in tblList:
tblList[tblList.index(tbl)] = safeSQLIdentificatorNaming(tbl, True) tblList[tblList.index(tbl)] = safeSQLIdentificatorNaming(tbl, True)
@ -102,7 +105,7 @@ class Entries:
if foundData is None: if foundData is None:
kb.data.cachedColumns = {} kb.data.cachedColumns = {}
self.getColumns(onlyColNames=True) self.getColumns(onlyColNames=True, dumpMode=True)
else: else:
kb.data.cachedColumns = foundData kb.data.cachedColumns = foundData
@ -272,7 +275,7 @@ class Entries:
else: else:
emptyColumns = [] emptyColumns = []
plusOne = Backend.getIdentifiedDbms() in (DBMS.ORACLE, DBMS.DB2) plusOne = Backend.getIdentifiedDbms() in (DBMS.ORACLE, DBMS.DB2)
indexRange = getLimitRange(count, dump=True, plusOne=plusOne) indexRange = getLimitRange(count, plusOne=plusOne)
if len(colList) < len(indexRange) > CHECK_ZERO_COLUMNS_THRESHOLD: if len(colList) < len(indexRange) > CHECK_ZERO_COLUMNS_THRESHOLD:
for column in colList: for column in colList:
@ -293,7 +296,7 @@ class Entries:
if column not in entries: if column not in entries:
entries[column] = BigArray() entries[column] = BigArray()
if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL): if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL, DBMS.HSQLDB):
query = rootQuery.blind.query % (agent.preprocessField(tbl, column), conf.db, conf.tbl, sorted(colList, key=len)[0], index) query = rootQuery.blind.query % (agent.preprocessField(tbl, column), conf.db, conf.tbl, sorted(colList, key=len)[0], index)
elif Backend.getIdentifiedDbms() in (DBMS.ORACLE, DBMS.DB2): elif Backend.getIdentifiedDbms() in (DBMS.ORACLE, DBMS.DB2):
query = rootQuery.blind.query % (agent.preprocessField(tbl, column), query = rootQuery.blind.query % (agent.preprocessField(tbl, column),
@ -341,13 +344,13 @@ class Entries:
attackDumpedTable() attackDumpedTable()
except (IOError, OSError), ex: except (IOError, OSError), ex:
errMsg = "an error occurred while attacking " errMsg = "an error occurred while attacking "
errMsg += "table dump ('%s')" % ex.message errMsg += "table dump ('%s')" % getSafeExString(ex)
logger.critical(errMsg) logger.critical(errMsg)
conf.dumper.dbTableValues(kb.data.dumpedTable) conf.dumper.dbTableValues(kb.data.dumpedTable)
except SqlmapConnectionException, ex: except SqlmapConnectionException, ex:
errMsg = "connection exception detected in dumping phase " errMsg = "connection exception detected in dumping phase "
errMsg += "('%s')" % ex.message errMsg += "('%s')" % getSafeExString(ex)
logger.critical(errMsg) logger.critical(errMsg)
finally: finally:

View File

@ -65,7 +65,7 @@ class Search:
infoMsg = "searching database" infoMsg = "searching database"
if dbConsider == "1": if dbConsider == "1":
infoMsg += "s like" infoMsg += "s LIKE"
infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(db) infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(db)
logger.info(infoMsg) logger.info(infoMsg)
@ -98,7 +98,7 @@ class Search:
if not values and isInferenceAvailable() and not conf.direct: if not values and isInferenceAvailable() and not conf.direct:
infoMsg = "fetching number of database" infoMsg = "fetching number of database"
if dbConsider == "1": if dbConsider == "1":
infoMsg += "s like" infoMsg += "s LIKE"
infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(db) infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(db)
logger.info(infoMsg) logger.info(infoMsg)
@ -113,7 +113,7 @@ class Search:
if not isNumPosStrValue(count): if not isNumPosStrValue(count):
warnMsg = "no database" warnMsg = "no database"
if dbConsider == "1": if dbConsider == "1":
warnMsg += "s like" warnMsg += "s LIKE"
warnMsg += " '%s' found" % unsafeSQLIdentificatorNaming(db) warnMsg += " '%s' found" % unsafeSQLIdentificatorNaming(db)
logger.warn(warnMsg) logger.warn(warnMsg)
@ -172,7 +172,7 @@ class Search:
infoMsg = "searching table" infoMsg = "searching table"
if tblConsider == "1": if tblConsider == "1":
infoMsg += "s like" infoMsg += "s LIKE"
infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(tbl) infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(tbl)
if dbCond and conf.db and conf.db != CURRENT_DB: if dbCond and conf.db and conf.db != CURRENT_DB:
@ -225,7 +225,7 @@ class Search:
if len(whereDbsQuery) == 0: if len(whereDbsQuery) == 0:
infoMsg = "fetching number of databases with table" infoMsg = "fetching number of databases with table"
if tblConsider == "1": if tblConsider == "1":
infoMsg += "s like" infoMsg += "s LIKE"
infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(tbl) infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(tbl)
logger.info(infoMsg) logger.info(infoMsg)
@ -236,7 +236,7 @@ class Search:
if not isNumPosStrValue(count): if not isNumPosStrValue(count):
warnMsg = "no databases have table" warnMsg = "no databases have table"
if tblConsider == "1": if tblConsider == "1":
warnMsg += "s like" warnMsg += "s LIKE"
warnMsg += " '%s'" % unsafeSQLIdentificatorNaming(tbl) warnMsg += " '%s'" % unsafeSQLIdentificatorNaming(tbl)
logger.warn(warnMsg) logger.warn(warnMsg)
@ -274,7 +274,7 @@ class Search:
infoMsg = "fetching number of table" infoMsg = "fetching number of table"
if tblConsider == "1": if tblConsider == "1":
infoMsg += "s like" infoMsg += "s LIKE"
infoMsg += " '%s' in database '%s'" % (unsafeSQLIdentificatorNaming(tbl), unsafeSQLIdentificatorNaming(db)) infoMsg += " '%s' in database '%s'" % (unsafeSQLIdentificatorNaming(tbl), unsafeSQLIdentificatorNaming(db))
logger.info(infoMsg) logger.info(infoMsg)
@ -288,7 +288,7 @@ class Search:
if not isNumPosStrValue(count): if not isNumPosStrValue(count):
warnMsg = "no table" warnMsg = "no table"
if tblConsider == "1": if tblConsider == "1":
warnMsg += "s like" warnMsg += "s LIKE"
warnMsg += " '%s' " % unsafeSQLIdentificatorNaming(tbl) warnMsg += " '%s' " % unsafeSQLIdentificatorNaming(tbl)
warnMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(db) warnMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(db)
logger.warn(warnMsg) logger.warn(warnMsg)
@ -390,7 +390,7 @@ class Search:
infoMsg = "searching column" infoMsg = "searching column"
if colConsider == "1": if colConsider == "1":
infoMsg += "s like" infoMsg += "s LIKE"
infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(column) infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(column)
foundCols[column] = {} foundCols[column] = {}
@ -468,7 +468,7 @@ class Search:
if not conf.db: if not conf.db:
infoMsg = "fetching number of databases with tables containing column" infoMsg = "fetching number of databases with tables containing column"
if colConsider == "1": if colConsider == "1":
infoMsg += "s like" infoMsg += "s LIKE"
infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(column) infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(column)
logger.info("%s%s%s" % (infoMsg, infoMsgTbl, infoMsgDb)) logger.info("%s%s%s" % (infoMsg, infoMsgTbl, infoMsgDb))
@ -479,7 +479,7 @@ class Search:
if not isNumPosStrValue(count): if not isNumPosStrValue(count):
warnMsg = "no databases have tables containing column" warnMsg = "no databases have tables containing column"
if colConsider == "1": if colConsider == "1":
warnMsg += "s like" warnMsg += "s LIKE"
warnMsg += " '%s'" % unsafeSQLIdentificatorNaming(column) warnMsg += " '%s'" % unsafeSQLIdentificatorNaming(column)
logger.warn("%s%s" % (warnMsg, infoMsgTbl)) logger.warn("%s%s" % (warnMsg, infoMsgTbl))
@ -519,7 +519,7 @@ class Search:
infoMsg = "fetching number of tables containing column" infoMsg = "fetching number of tables containing column"
if colConsider == "1": if colConsider == "1":
infoMsg += "s like" infoMsg += "s LIKE"
infoMsg += " '%s' in database '%s'" % (unsafeSQLIdentificatorNaming(column), unsafeSQLIdentificatorNaming(db)) infoMsg += " '%s' in database '%s'" % (unsafeSQLIdentificatorNaming(column), unsafeSQLIdentificatorNaming(db))
logger.info(infoMsg) logger.info(infoMsg)
@ -533,7 +533,7 @@ class Search:
if not isNumPosStrValue(count): if not isNumPosStrValue(count):
warnMsg = "no tables contain column" warnMsg = "no tables contain column"
if colConsider == "1": if colConsider == "1":
warnMsg += "s like" warnMsg += "s LIKE"
warnMsg += " '%s' " % unsafeSQLIdentificatorNaming(column) warnMsg += " '%s' " % unsafeSQLIdentificatorNaming(column)
warnMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(db) warnMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(db)
logger.warn(warnMsg) logger.warn(warnMsg)

View File

@ -1,4 +1,5 @@
DROP TABLE IF EXISTS %RANDSTR1%; DROP TABLE IF EXISTS %RANDSTR1%;
# https://wiki.postgresql.org/wiki/CREATE_OR_REPLACE_LANGUAGE <- if "CREATE LANGUAGE plpgsql" is required
CREATE TABLE %RANDSTR1%(%RANDSTR2% text); CREATE TABLE %RANDSTR1%(%RANDSTR2% text);
CREATE OR REPLACE FUNCTION %RANDSTR3%() CREATE OR REPLACE FUNCTION %RANDSTR3%()
RETURNS VOID AS $$ RETURNS VOID AS $$

View File

@ -93,10 +93,10 @@ authType =
# Syntax: username:password # Syntax: username:password
authCred = authCred =
# HTTP Authentication PEM private key. Useful only if the target URL requires # HTTP Authentication PEM private/cert key file. Useful only if the target URL requires
# PKI authentication and you have such data. # PKI authentication and you have such data.
# Syntax: key_file # Syntax: key_file
authPrivate = authFile =
# Use a proxy to connect to the target URL. # Use a proxy to connect to the target URL.
# Syntax: (http|https|socks4|socks5)://address:port # Syntax: (http|https|socks4|socks5)://address:port
@ -708,6 +708,9 @@ scope =
# Select tests by payloads and/or titles (e.g. ROW) # Select tests by payloads and/or titles (e.g. ROW)
testFilter = testFilter =
# Skip tests by payloads and/or titles (e.g. BENCHMARK)
testSkip =
# Update sqlmap. # Update sqlmap.
# Valid: True or False # Valid: True or False
updateAll = False updateAll = False
@ -750,6 +753,10 @@ googlePage = 1
# Valid: True or False # Valid: True or False
identifyWaf = False identifyWaf = False
# Skip heuristic detection of WAF/IPS/IDS protection.
# Valid: True or False
skipWaf = False
# Imitate smartphone through HTTP User-Agent header. # Imitate smartphone through HTTP User-Agent header.
# Valid: True or False # Valid: True or False
mobile = False mobile = False

View File

@ -25,6 +25,7 @@ from lib.controller.controller import start
from lib.core.common import banner from lib.core.common import banner
from lib.core.common import createGithubIssue from lib.core.common import createGithubIssue
from lib.core.common import dataToStdout from lib.core.common import dataToStdout
from lib.core.common import getSafeExString
from lib.core.common import getUnicode from lib.core.common import getUnicode
from lib.core.common import maskSensitiveData from lib.core.common import maskSensitiveData
from lib.core.common import setPaths from lib.core.common import setPaths
@ -76,7 +77,7 @@ def main():
errMsg = "your system does not properly handle non-ASCII paths. " errMsg = "your system does not properly handle non-ASCII paths. "
errMsg += "Please move the sqlmap's directory to the other location" errMsg += "Please move the sqlmap's directory to the other location"
logger.error(errMsg) logger.error(errMsg)
exit() raise SystemExit
setPaths() setPaths()
@ -119,9 +120,9 @@ def main():
cmdLineOptions.sqlmapShell = False cmdLineOptions.sqlmapShell = False
except SqlmapBaseException as ex: except SqlmapBaseException as ex:
errMsg = getUnicode(ex.message) errMsg = getSafeExString(ex)
logger.critical(errMsg) logger.critical(errMsg)
sys.exit(1) raise SystemExit
except KeyboardInterrupt: except KeyboardInterrupt:
print print
@ -141,6 +142,11 @@ def main():
errMsg = unhandledExceptionMessage() errMsg = unhandledExceptionMessage()
excMsg = traceback.format_exc() excMsg = traceback.format_exc()
if "No space left" in excMsg:
errMsg = "no space left on output device"
logger.error(errMsg)
raise SystemExit
for match in re.finditer(r'File "(.+?)", line', excMsg): for match in re.finditer(r'File "(.+?)", line', excMsg):
file_ = match.group(1) file_ = match.group(1)
file_ = os.path.relpath(file_, os.path.dirname(__file__)) file_ = os.path.relpath(file_, os.path.dirname(__file__))

View File

@ -35,15 +35,9 @@ def tamper(payload, **kwargs):
'SELECT * FROM users WHERE id LIKE 1' 'SELECT * FROM users WHERE id LIKE 1'
""" """
def process(match):
word = match.group()
word = "%sLIKE%s" % (" " if word[0] != " " else "", " " if word[-1] != " " else "")
return word
retVal = payload retVal = payload
if payload: if payload:
retVal = re.sub(r"\s*=\s*", lambda match: process(match), retVal) retVal = re.sub(r"\s*=\s*", " LIKE ", retVal)
return retVal return retVal

View File

@ -19,7 +19,7 @@ def tamper(payload, **kwargs):
Replaces AND and OR logical operators with their symbolic counterparts (&& and ||) Replaces AND and OR logical operators with their symbolic counterparts (&& and ||)
>>> tamper("1 AND '1'='1") >>> tamper("1 AND '1'='1")
'1 && '1'='1' "1 %26%26 '1'='1"
""" """
retVal = payload retVal = payload

46
tamper/uppercase.py Normal file
View File

@ -0,0 +1,46 @@
#!/usr/bin/env python
"""
Copyright (c) 2006-2015 sqlmap developers (http://sqlmap.org/)
See the file 'doc/COPYING' for copying permission
"""
import re
from lib.core.data import kb
from lib.core.enums import PRIORITY
# Priority declared by this tamper script (presumably consulted when several
# tamper scripts are chained - confirm against the code that loads them)
__priority__ = PRIORITY.NORMAL
def dependencies():
    """
    Performs no checks for this tamper script and returns None
    """

    return None
def tamper(payload, **kwargs):
    """
    Replaces each keyword character with upper case value

    Tested against:
        * Microsoft SQL Server 2005
        * MySQL 4, 5.0 and 5.5
        * Oracle 10g
        * PostgreSQL 8.3, 8.4, 9.0

    Notes:
        * Useful to bypass very weak and bespoke web application firewalls
          that have poorly written permissive regular expressions
        * This tamper script should work against all (?) databases
        * Only whole runs of [A-Za-z_] characters whose upper-cased form is
          found in kb.keywords are changed; the same letters appearing
          inside a longer, non-keyword word are left untouched

    Returns the payload unchanged when it is empty or None.

    >>> tamper('insert')
    'INSERT'
    """

    def _upperIfKeyword(match):
        # Decide per matched token, so that a keyword such as "or" does not
        # also rewrite the "or" inside an unrelated word such as "origin"
        # (which a plain str.replace() over the whole payload would do)
        word = match.group()
        upper = word.upper()
        return upper if upper in kb.keywords else word

    retVal = payload

    if payload:
        retVal = re.sub(r"[A-Za-z_]+", _upperIfKeyword, retVal)

    return retVal

View File

@ -3,22 +3,28 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
__version__ = "2.0.1" __version__ = "2.3.0"
from sys import version_info
def detect(aBuf): def detect(aBuf):
import universaldetector if ((version_info < (3, 0) and isinstance(aBuf, unicode)) or
(version_info >= (3, 0) and not isinstance(aBuf, bytes))):
raise ValueError('Expected a bytes object, not a unicode object')
from . import universaldetector
u = universaldetector.UniversalDetector() u = universaldetector.UniversalDetector()
u.reset() u.reset()
u.feed(aBuf) u.feed(aBuf)

View File

@ -1,11 +1,11 @@
######################## BEGIN LICENSE BLOCK ######################## ######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Communicator client code. # The Original Code is Mozilla Communicator client code.
# #
# The Initial Developer of the Original Code is # The Initial Developer of the Original Code is
# Netscape Communications Corporation. # Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998 # Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved. # the Initial Developer. All Rights Reserved.
# #
# Contributor(s): # Contributor(s):
# Mark Pilgrim - port to Python # Mark Pilgrim - port to Python
# #
@ -13,12 +13,12 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
@ -26,18 +26,18 @@
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
# Big5 frequency table # Big5 frequency table
# by Taiwan's Mandarin Promotion Council # by Taiwan's Mandarin Promotion Council
# <http://www.edu.tw:81/mandr/> # <http://www.edu.tw:81/mandr/>
# #
# 128 --> 0.42261 # 128 --> 0.42261
# 256 --> 0.57851 # 256 --> 0.57851
# 512 --> 0.74851 # 512 --> 0.74851
# 1024 --> 0.89384 # 1024 --> 0.89384
# 2048 --> 0.97583 # 2048 --> 0.97583
# #
# Ideal Distribution Ratio = 0.74851/(1-0.74851) =2.98 # Ideal Distribution Ratio = 0.74851/(1-0.74851) =2.98
# Random Distribution Ration = 512/(5401-512)=0.105 # Random Distribution Ration = 512/(5401-512)=0.105
# #
# Typical Distribution Ratio about 25% of Ideal one, still much higher than RDR # Typical Distribution Ratio about 25% of Ideal one, still much higher than RDR
BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75 BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75
@ -45,7 +45,7 @@ BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75
#Char to FreqOrder table #Char to FreqOrder table
BIG5_TABLE_SIZE = 5376 BIG5_TABLE_SIZE = 5376
Big5CharToFreqOrder = ( \ Big5CharToFreqOrder = (
1,1801,1506, 255,1431, 198, 9, 82, 6,5008, 177, 202,3681,1256,2821, 110, # 16 1,1801,1506, 255,1431, 198, 9, 82, 6,5008, 177, 202,3681,1256,2821, 110, # 16
3814, 33,3274, 261, 76, 44,2114, 16,2946,2187,1176, 659,3971, 26,3451,2653, # 32 3814, 33,3274, 261, 76, 44,2114, 16,2946,2187,1176, 659,3971, 26,3451,2653, # 32
1198,3972,3350,4202, 410,2215, 302, 590, 361,1964, 8, 204, 58,4510,5009,1932, # 48 1198,3972,3350,4202, 410,2215, 302, 590, 361,1964, 8, 204, 58,4510,5009,1932, # 48
@ -921,3 +921,5 @@ Big5CharToFreqOrder = ( \
13936,13937,13938,13939,13940,13941,13942,13943,13944,13945,13946,13947,13948,13949,13950,13951, #13952 13936,13937,13938,13939,13940,13941,13942,13943,13944,13945,13946,13947,13948,13949,13950,13951, #13952
13952,13953,13954,13955,13956,13957,13958,13959,13960,13961,13962,13963,13964,13965,13966,13967, #13968 13952,13953,13954,13955,13956,13957,13958,13959,13960,13961,13962,13963,13964,13965,13966,13967, #13968
13968,13969,13970,13971,13972) #13973 13968,13969,13970,13971,13972) #13973
# flake8: noqa

View File

@ -1,11 +1,11 @@
######################## BEGIN LICENSE BLOCK ######################## ######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Communicator client code. # The Original Code is Mozilla Communicator client code.
# #
# The Initial Developer of the Original Code is # The Initial Developer of the Original Code is
# Netscape Communications Corporation. # Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998 # Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved. # the Initial Developer. All Rights Reserved.
# #
# Contributor(s): # Contributor(s):
# Mark Pilgrim - port to Python # Mark Pilgrim - port to Python
# #
@ -13,22 +13,23 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from mbcharsetprober import MultiByteCharSetProber from .mbcharsetprober import MultiByteCharSetProber
from codingstatemachine import CodingStateMachine from .codingstatemachine import CodingStateMachine
from chardistribution import Big5DistributionAnalysis from .chardistribution import Big5DistributionAnalysis
from mbcssm import Big5SMModel from .mbcssm import Big5SMModel
class Big5Prober(MultiByteCharSetProber): class Big5Prober(MultiByteCharSetProber):
def __init__(self): def __init__(self):

80
thirdparty/chardet/chardetect.py vendored Normal file
View File

@ -0,0 +1,80 @@
#!/usr/bin/env python
"""
Script which takes one or more file paths and reports on their detected
encodings
Example::
% chardetect somefile someotherfile
somefile: windows-1252 with confidence 0.5
someotherfile: ascii with confidence 1.0
If no paths are provided, it takes its input from stdin.
"""
from __future__ import absolute_import, print_function, unicode_literals
import argparse
import sys
from io import open
from chardet import __version__
from chardet.universaldetector import UniversalDetector
def description_of(lines, name='stdin'):
    """
    Build a one-line, human readable report of the encoding detected for
    a file or a list of strings.

    Every item of ``lines`` is fed to a fresh ``UniversalDetector``; once
    the input is exhausted the detector is closed and its result is
    formatted.

    :param lines: The lines to get the encoding of.
    :type lines: Iterable of bytes
    :param name: Name of file or collection of lines
    :type name: str
    :returns: ``"<name>: <encoding> with confidence <confidence>"`` when an
              encoding was detected, ``"<name>: no result"`` otherwise.
    """
    detector = UniversalDetector()
    for chunk in lines:
        detector.feed(chunk)
    detector.close()

    outcome = detector.result
    if not outcome['encoding']:
        return '{0}: no result'.format(name)
    return '{0}: {1} with confidence {2}'.format(name, outcome['encoding'],
                                                 outcome['confidence'])
def main(argv=None):
    '''
    Handles command line arguments and gets things started.

    Each input file is opened in binary mode by argparse, described with
    ``description_of`` and the resulting line is printed to stdout. When no
    path is given, standard input is read instead.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    '''
    # The detector works on bytes. On Python 3 ``sys.stdin`` is a text stream
    # that yields ``str``, so use its underlying binary buffer when there is
    # one; Python 2 (no ``buffer`` attribute) keeps using ``sys.stdin``.
    default_input = getattr(sys.stdin, 'buffer', sys.stdin)

    # Get command line arguments
    parser = argparse.ArgumentParser(
        # Implicit concatenation instead of a backslash continuation, so
        # that source indentation can never leak into the help text
        description=("Takes one or more file paths and reports their detected "
                     "encodings"),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        conflict_handler='resolve')
    parser.add_argument('input',
                        help='File whose encoding we would like to determine.',
                        type=argparse.FileType('rb'), nargs='*',
                        default=[default_input])
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    for f in args.input:
        # Warn interactive users, who would otherwise see a silent prompt
        if f.isatty():
            print("You are running chardetect interactively. Press " +
                  "CTRL-D twice at the start of a blank line to signal the " +
                  "end of your input. If you want help, run chardetect " +
                  "--help\n", file=sys.stderr)
        print(description_of(f, f.name))
# Run the command line interface only when this file is executed as a script
if __name__ == '__main__':
    main()

View File

@ -1,11 +1,11 @@
######################## BEGIN LICENSE BLOCK ######################## ######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Communicator client code. # The Original Code is Mozilla Communicator client code.
# #
# The Initial Developer of the Original Code is # The Initial Developer of the Original Code is
# Netscape Communications Corporation. # Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998 # Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved. # the Initial Developer. All Rights Reserved.
# #
# Contributor(s): # Contributor(s):
# Mark Pilgrim - port to Python # Mark Pilgrim - port to Python
# #
@ -13,47 +13,63 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants from .euctwfreq import (EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE,
from euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO EUCTW_TYPICAL_DISTRIBUTION_RATIO)
from euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO from .euckrfreq import (EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE,
from gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO EUCKR_TYPICAL_DISTRIBUTION_RATIO)
from big5freq import Big5CharToFreqOrder, BIG5_TABLE_SIZE, BIG5_TYPICAL_DISTRIBUTION_RATIO from .gb2312freq import (GB2312CharToFreqOrder, GB2312_TABLE_SIZE,
from jisfreq import JISCharToFreqOrder, JIS_TABLE_SIZE, JIS_TYPICAL_DISTRIBUTION_RATIO GB2312_TYPICAL_DISTRIBUTION_RATIO)
from .big5freq import (Big5CharToFreqOrder, BIG5_TABLE_SIZE,
BIG5_TYPICAL_DISTRIBUTION_RATIO)
from .jisfreq import (JISCharToFreqOrder, JIS_TABLE_SIZE,
JIS_TYPICAL_DISTRIBUTION_RATIO)
from .compat import wrap_ord
ENOUGH_DATA_THRESHOLD = 1024 ENOUGH_DATA_THRESHOLD = 1024
SURE_YES = 0.99 SURE_YES = 0.99
SURE_NO = 0.01 SURE_NO = 0.01
MINIMUM_DATA_THRESHOLD = 3
class CharDistributionAnalysis: class CharDistributionAnalysis:
def __init__(self): def __init__(self):
self._mCharToFreqOrder = None # Mapping table to get frequency order from char order (get from GetOrder()) # Mapping table to get frequency order from char order (get from
self._mTableSize = None # Size of above table # GetOrder())
self._mTypicalDistributionRatio = None # This is a constant value which varies from language to language, used in calculating confidence. See http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html for further detail. self._mCharToFreqOrder = None
self._mTableSize = None # Size of above table
# This is a constant value which varies from language to language,
# used in calculating confidence. See
# http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
# for further detail.
self._mTypicalDistributionRatio = None
self.reset() self.reset()
def reset(self): def reset(self):
"""reset analyser, clear any state""" """reset analyser, clear any state"""
self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made # If this flag is set to True, detection is done and conclusion has
self._mTotalChars = 0 # Total characters encountered # been made
self._mFreqChars = 0 # The number of characters whose frequency order is less than 512 self._mDone = False
self._mTotalChars = 0 # Total characters encountered
# The number of characters whose frequency order is less than 512
self._mFreqChars = 0
def feed(self, aStr, aCharLen): def feed(self, aBuf, aCharLen):
"""feed a character with known length""" """feed a character with known length"""
if aCharLen == 2: if aCharLen == 2:
# we only care about 2-bytes character in our distribution analysis # we only care about 2-bytes character in our distribution analysis
order = self.get_order(aStr) order = self.get_order(aBuf)
else: else:
order = -1 order = -1
if order >= 0: if order >= 0:
@ -65,12 +81,14 @@ class CharDistributionAnalysis:
def get_confidence(self): def get_confidence(self):
"""return confidence based on existing data""" """return confidence based on existing data"""
# if we didn't receive any character in our consideration range, return negative answer # if we didn't receive any character in our consideration range,
if self._mTotalChars <= 0: # return negative answer
if self._mTotalChars <= 0 or self._mFreqChars <= MINIMUM_DATA_THRESHOLD:
return SURE_NO return SURE_NO
if self._mTotalChars != self._mFreqChars: if self._mTotalChars != self._mFreqChars:
r = self._mFreqChars / ((self._mTotalChars - self._mFreqChars) * self._mTypicalDistributionRatio) r = (self._mFreqChars / ((self._mTotalChars - self._mFreqChars)
* self._mTypicalDistributionRatio))
if r < SURE_YES: if r < SURE_YES:
return r return r
@ -78,16 +96,18 @@ class CharDistributionAnalysis:
return SURE_YES return SURE_YES
def got_enough_data(self): def got_enough_data(self):
# It is not necessary to receive all data to draw conclusion. For charset detection, # It is not necessary to receive all data to draw conclusion.
# certain amount of data is enough # For charset detection, certain amount of data is enough
return self._mTotalChars > ENOUGH_DATA_THRESHOLD return self._mTotalChars > ENOUGH_DATA_THRESHOLD
def get_order(self, aStr): def get_order(self, aBuf):
# We do not handle characters based on the original encoding string, but # We do not handle characters based on the original encoding string,
# convert this encoding string to a number, here called order. # but convert this encoding string to a number, here called order.
# This allows multiple encodings of a language to share one frequency table. # This allows multiple encodings of a language to share one frequency
# table.
return -1 return -1
class EUCTWDistributionAnalysis(CharDistributionAnalysis): class EUCTWDistributionAnalysis(CharDistributionAnalysis):
def __init__(self): def __init__(self):
CharDistributionAnalysis.__init__(self) CharDistributionAnalysis.__init__(self)
@ -95,16 +115,18 @@ class EUCTWDistributionAnalysis(CharDistributionAnalysis):
self._mTableSize = EUCTW_TABLE_SIZE self._mTableSize = EUCTW_TABLE_SIZE
self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aStr): def get_order(self, aBuf):
# for euc-TW encoding, we are interested # for euc-TW encoding, we are interested
# first byte range: 0xc4 -- 0xfe # first byte range: 0xc4 -- 0xfe
# second byte range: 0xa1 -- 0xfe # second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that # no validation needed here. State machine has done that
if aStr[0] >= '\xC4': first_char = wrap_ord(aBuf[0])
return 94 * (ord(aStr[0]) - 0xC4) + ord(aStr[1]) - 0xA1 if first_char >= 0xC4:
return 94 * (first_char - 0xC4) + wrap_ord(aBuf[1]) - 0xA1
else: else:
return -1 return -1
class EUCKRDistributionAnalysis(CharDistributionAnalysis): class EUCKRDistributionAnalysis(CharDistributionAnalysis):
def __init__(self): def __init__(self):
CharDistributionAnalysis.__init__(self) CharDistributionAnalysis.__init__(self)
@ -112,15 +134,17 @@ class EUCKRDistributionAnalysis(CharDistributionAnalysis):
self._mTableSize = EUCKR_TABLE_SIZE self._mTableSize = EUCKR_TABLE_SIZE
self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aStr): def get_order(self, aBuf):
# for euc-KR encoding, we are interested # for euc-KR encoding, we are interested
# first byte range: 0xb0 -- 0xfe # first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe # second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that # no validation needed here. State machine has done that
if aStr[0] >= '\xB0': first_char = wrap_ord(aBuf[0])
return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1 if first_char >= 0xB0:
return 94 * (first_char - 0xB0) + wrap_ord(aBuf[1]) - 0xA1
else: else:
return -1; return -1
class GB2312DistributionAnalysis(CharDistributionAnalysis): class GB2312DistributionAnalysis(CharDistributionAnalysis):
def __init__(self): def __init__(self):
@ -129,15 +153,17 @@ class GB2312DistributionAnalysis(CharDistributionAnalysis):
self._mTableSize = GB2312_TABLE_SIZE self._mTableSize = GB2312_TABLE_SIZE
self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aStr): def get_order(self, aBuf):
# for GB2312 encoding, we are interested # for GB2312 encoding, we are interested
# first byte range: 0xb0 -- 0xfe # first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe # second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that # no validation needed here. State machine has done that
if (aStr[0] >= '\xB0') and (aStr[1] >= '\xA1'): first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1 if (first_char >= 0xB0) and (second_char >= 0xA1):
return 94 * (first_char - 0xB0) + second_char - 0xA1
else: else:
return -1; return -1
class Big5DistributionAnalysis(CharDistributionAnalysis): class Big5DistributionAnalysis(CharDistributionAnalysis):
def __init__(self): def __init__(self):
@ -146,19 +172,21 @@ class Big5DistributionAnalysis(CharDistributionAnalysis):
self._mTableSize = BIG5_TABLE_SIZE self._mTableSize = BIG5_TABLE_SIZE
self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aStr): def get_order(self, aBuf):
# for big5 encoding, we are interested # for big5 encoding, we are interested
# first byte range: 0xa4 -- 0xfe # first byte range: 0xa4 -- 0xfe
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe # second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
# no validation needed here. State machine has done that # no validation needed here. State machine has done that
if aStr[0] >= '\xA4': first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
if aStr[1] >= '\xA1': if first_char >= 0xA4:
return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0xA1 + 63 if second_char >= 0xA1:
return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63
else: else:
return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0x40 return 157 * (first_char - 0xA4) + second_char - 0x40
else: else:
return -1 return -1
class SJISDistributionAnalysis(CharDistributionAnalysis): class SJISDistributionAnalysis(CharDistributionAnalysis):
def __init__(self): def __init__(self):
CharDistributionAnalysis.__init__(self) CharDistributionAnalysis.__init__(self)
@ -166,22 +194,24 @@ class SJISDistributionAnalysis(CharDistributionAnalysis):
self._mTableSize = JIS_TABLE_SIZE self._mTableSize = JIS_TABLE_SIZE
self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aStr): def get_order(self, aBuf):
# for sjis encoding, we are interested # for sjis encoding, we are interested
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe # first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
# second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe # second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
# no validation needed here. State machine has done that # no validation needed here. State machine has done that
if (aStr[0] >= '\x81') and (aStr[0] <= '\x9F'): first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
order = 188 * (ord(aStr[0]) - 0x81) if (first_char >= 0x81) and (first_char <= 0x9F):
elif (aStr[0] >= '\xE0') and (aStr[0] <= '\xEF'): order = 188 * (first_char - 0x81)
order = 188 * (ord(aStr[0]) - 0xE0 + 31) elif (first_char >= 0xE0) and (first_char <= 0xEF):
order = 188 * (first_char - 0xE0 + 31)
else: else:
return -1; return -1
order = order + ord(aStr[1]) - 0x40 order = order + second_char - 0x40
if aStr[1] > '\x7F': if second_char > 0x7F:
order =- 1 order = -1
return order return order
class EUCJPDistributionAnalysis(CharDistributionAnalysis): class EUCJPDistributionAnalysis(CharDistributionAnalysis):
def __init__(self): def __init__(self):
CharDistributionAnalysis.__init__(self) CharDistributionAnalysis.__init__(self)
@ -189,12 +219,13 @@ class EUCJPDistributionAnalysis(CharDistributionAnalysis):
self._mTableSize = JIS_TABLE_SIZE self._mTableSize = JIS_TABLE_SIZE
self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aStr): def get_order(self, aBuf):
# for euc-JP encoding, we are interested # for euc-JP encoding, we are interested
# first byte range: 0xa0 -- 0xfe # first byte range: 0xa0 -- 0xfe
# second byte range: 0xa1 -- 0xfe # second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that # no validation needed here. State machine has done that
if aStr[0] >= '\xA0': char = wrap_ord(aBuf[0])
return 94 * (ord(aStr[0]) - 0xA1) + ord(aStr[1]) - 0xa1 if char >= 0xA0:
return 94 * (char - 0xA1) + wrap_ord(aBuf[1]) - 0xa1
else: else:
return -1 return -1

View File

@ -25,8 +25,10 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants, sys from . import constants
from charsetprober import CharSetProber import sys
from .charsetprober import CharSetProber
class CharSetGroupProber(CharSetProber): class CharSetGroupProber(CharSetProber):
def __init__(self): def __init__(self):
@ -41,28 +43,32 @@ class CharSetGroupProber(CharSetProber):
for prober in self._mProbers: for prober in self._mProbers:
if prober: if prober:
prober.reset() prober.reset()
prober.active = constants.True prober.active = True
self._mActiveNum += 1 self._mActiveNum += 1
self._mBestGuessProber = None self._mBestGuessProber = None
def get_charset_name(self): def get_charset_name(self):
if not self._mBestGuessProber: if not self._mBestGuessProber:
self.get_confidence() self.get_confidence()
if not self._mBestGuessProber: return None if not self._mBestGuessProber:
return None
# self._mBestGuessProber = self._mProbers[0] # self._mBestGuessProber = self._mProbers[0]
return self._mBestGuessProber.get_charset_name() return self._mBestGuessProber.get_charset_name()
def feed(self, aBuf): def feed(self, aBuf):
for prober in self._mProbers: for prober in self._mProbers:
if not prober: continue if not prober:
if not prober.active: continue continue
if not prober.active:
continue
st = prober.feed(aBuf) st = prober.feed(aBuf)
if not st: continue if not st:
continue
if st == constants.eFoundIt: if st == constants.eFoundIt:
self._mBestGuessProber = prober self._mBestGuessProber = prober
return self.get_state() return self.get_state()
elif st == constants.eNotMe: elif st == constants.eNotMe:
prober.active = constants.False prober.active = False
self._mActiveNum -= 1 self._mActiveNum -= 1
if self._mActiveNum <= 0: if self._mActiveNum <= 0:
self._mState = constants.eNotMe self._mState = constants.eNotMe
@ -78,18 +84,22 @@ class CharSetGroupProber(CharSetProber):
bestConf = 0.0 bestConf = 0.0
self._mBestGuessProber = None self._mBestGuessProber = None
for prober in self._mProbers: for prober in self._mProbers:
if not prober: continue if not prober:
continue
if not prober.active: if not prober.active:
if constants._debug: if constants._debug:
sys.stderr.write(prober.get_charset_name() + ' not active\n') sys.stderr.write(prober.get_charset_name()
+ ' not active\n')
continue continue
cf = prober.get_confidence() cf = prober.get_confidence()
if constants._debug: if constants._debug:
sys.stderr.write('%s confidence = %s\n' % (prober.get_charset_name(), cf)) sys.stderr.write('%s confidence = %s\n' %
(prober.get_charset_name(), cf))
if bestConf < cf: if bestConf < cf:
bestConf = cf bestConf = cf
self._mBestGuessProber = prober self._mBestGuessProber = prober
if not self._mBestGuessProber: return 0.0 if not self._mBestGuessProber:
return 0.0
return bestConf return bestConf
# else: # else:
# self._mBestGuessProber = self._mProbers[0] # self._mBestGuessProber = self._mProbers[0]

View File

@ -1,11 +1,11 @@
######################## BEGIN LICENSE BLOCK ######################## ######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code. # The Original Code is Mozilla Universal charset detector code.
# #
# The Initial Developer of the Original Code is # The Initial Developer of the Original Code is
# Netscape Communications Corporation. # Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001 # Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved. # the Initial Developer. All Rights Reserved.
# #
# Contributor(s): # Contributor(s):
# Mark Pilgrim - port to Python # Mark Pilgrim - port to Python
# Shy Shalom - original C code # Shy Shalom - original C code
@ -14,19 +14,21 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants, re from . import constants
import re
class CharSetProber: class CharSetProber:
def __init__(self): def __init__(self):
@ -48,11 +50,11 @@ class CharSetProber:
return 0.0 return 0.0
def filter_high_bit_only(self, aBuf): def filter_high_bit_only(self, aBuf):
aBuf = re.sub(r'([\x00-\x7F])+', ' ', aBuf) aBuf = re.sub(b'([\x00-\x7F])+', b' ', aBuf)
return aBuf return aBuf
def filter_without_english_letters(self, aBuf): def filter_without_english_letters(self, aBuf):
aBuf = re.sub(r'([A-Za-z])+', ' ', aBuf) aBuf = re.sub(b'([A-Za-z])+', b' ', aBuf)
return aBuf return aBuf
def filter_with_english_letters(self, aBuf): def filter_with_english_letters(self, aBuf):

View File

@ -13,19 +13,21 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from constants import eStart, eError, eItsMe from .constants import eStart
from .compat import wrap_ord
class CodingStateMachine: class CodingStateMachine:
def __init__(self, sm): def __init__(self, sm):
@ -40,12 +42,15 @@ class CodingStateMachine:
def next_state(self, c): def next_state(self, c):
# for each byte we get its class # for each byte we get its class
# if it is first byte, we also get byte length # if it is first byte, we also get byte length
byteCls = self._mModel['classTable'][ord(c)] # PY3K: aBuf is a byte stream, so c is an int, not a byte
byteCls = self._mModel['classTable'][wrap_ord(c)]
if self._mCurrentState == eStart: if self._mCurrentState == eStart:
self._mCurrentBytePos = 0 self._mCurrentBytePos = 0
self._mCurrentCharLen = self._mModel['charLenTable'][byteCls] self._mCurrentCharLen = self._mModel['charLenTable'][byteCls]
# from byte's class and stateTable, we get its next state # from byte's class and stateTable, we get its next state
self._mCurrentState = self._mModel['stateTable'][self._mCurrentState * self._mModel['classFactor'] + byteCls] curr_state = (self._mCurrentState * self._mModel['classFactor']
+ byteCls)
self._mCurrentState = self._mModel['stateTable'][curr_state]
self._mCurrentBytePos += 1 self._mCurrentBytePos += 1
return self._mCurrentState return self._mCurrentState

34
thirdparty/chardet/compat.py vendored Normal file
View File

@ -0,0 +1,34 @@
######################## BEGIN LICENSE BLOCK ########################
# Contributor(s):
# Ian Cordasco - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import sys
# Text-like types that wrap_ord() converts with ord() on Python 2.
# The conditional expression is lazy, so the Python 2-only name
# ``unicode`` is never evaluated when running on Python 3.
base_str = (str, unicode) if sys.version_info < (3, 0) else (bytes, str)
def wrap_ord(a):
    """Return ``ord(a)`` for string input on Python 2, else ``a`` unchanged.

    On Python 2, an ``a`` that is an instance of one of the ``base_str``
    types is converted with ``ord()``.  In every other case -- any value
    on Python 3, or a non-string value on Python 2 -- ``a`` is returned
    as is.
    """
    running_py2 = sys.version_info < (3, 0)
    # ``base_str`` is only consulted on Python 2 (short-circuit ``and``).
    if not (running_py2 and isinstance(a, base_str)):
        return a
    return ord(a)

View File

@ -37,11 +37,3 @@ eError = 1
eItsMe = 2 eItsMe = 2
SHORTCUT_THRESHOLD = 0.95 SHORTCUT_THRESHOLD = 0.95
import __builtin__
if not hasattr(__builtin__, 'False'):
False = 0
True = 1
else:
False = __builtin__.False
True = __builtin__.True

44
thirdparty/chardet/cp949prober.py vendored Normal file
View File

@ -0,0 +1,44 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import EUCKRDistributionAnalysis
from .mbcssm import CP949SMModel
class CP949Prober(MultiByteCharSetProber):
    """Multi-byte charset prober that reports its charset as "CP949".

    It is configured with a coding state machine built from CP949SMModel
    and with the EUC-KR character distribution analyzer.
    """

    def __init__(self):
        # Base initialisation runs first; the CP949-specific state machine
        # and analyzer are installed afterwards, before reset() is called.
        MultiByteCharSetProber.__init__(self)
        self._mCodingSM = CodingStateMachine(CP949SMModel)
        # NOTE: CP949 is a superset of EUC-KR, so the distribution should
        #       not be different.
        self._mDistributionAnalyzer = EUCKRDistributionAnalysis()
        self.reset()

    def get_charset_name(self):
        """Return the charset name reported by this prober ("CP949")."""
        return "CP949"

View File

@ -13,39 +13,43 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants, sys from . import constants
from escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel from .escsm import (HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel,
from charsetprober import CharSetProber ISO2022KRSMModel)
from codingstatemachine import CodingStateMachine from .charsetprober import CharSetProber
from .codingstatemachine import CodingStateMachine
from .compat import wrap_ord
class EscCharSetProber(CharSetProber): class EscCharSetProber(CharSetProber):
def __init__(self): def __init__(self):
CharSetProber.__init__(self) CharSetProber.__init__(self)
self._mCodingSM = [ \ self._mCodingSM = [
CodingStateMachine(HZSMModel), CodingStateMachine(HZSMModel),
CodingStateMachine(ISO2022CNSMModel), CodingStateMachine(ISO2022CNSMModel),
CodingStateMachine(ISO2022JPSMModel), CodingStateMachine(ISO2022JPSMModel),
CodingStateMachine(ISO2022KRSMModel) CodingStateMachine(ISO2022KRSMModel)
] ]
self.reset() self.reset()
def reset(self): def reset(self):
CharSetProber.reset(self) CharSetProber.reset(self)
for codingSM in self._mCodingSM: for codingSM in self._mCodingSM:
if not codingSM: continue if not codingSM:
codingSM.active = constants.True continue
codingSM.active = True
codingSM.reset() codingSM.reset()
self._mActiveSM = len(self._mCodingSM) self._mActiveSM = len(self._mCodingSM)
self._mDetectedCharset = None self._mDetectedCharset = None
@ -61,19 +65,22 @@ class EscCharSetProber(CharSetProber):
def feed(self, aBuf): def feed(self, aBuf):
for c in aBuf: for c in aBuf:
# PY3K: aBuf is a byte array, so c is an int, not a byte
for codingSM in self._mCodingSM: for codingSM in self._mCodingSM:
if not codingSM: continue if not codingSM:
if not codingSM.active: continue continue
codingState = codingSM.next_state(c) if not codingSM.active:
continue
codingState = codingSM.next_state(wrap_ord(c))
if codingState == constants.eError: if codingState == constants.eError:
codingSM.active = constants.False codingSM.active = False
self._mActiveSM -= 1 self._mActiveSM -= 1
if self._mActiveSM <= 0: if self._mActiveSM <= 0:
self._mState = constants.eNotMe self._mState = constants.eNotMe
return self.get_state() return self.get_state()
elif codingState == constants.eItsMe: elif codingState == constants.eItsMe:
self._mState = constants.eFoundIt self._mState = constants.eFoundIt
self._mDetectedCharset = codingSM.get_coding_state_machine() self._mDetectedCharset = codingSM.get_coding_state_machine() # nopep8
return self.get_state() return self.get_state()
return self.get_state() return self.get_state()

View File

@ -13,62 +13,62 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from constants import eStart, eError, eItsMe from .constants import eStart, eError, eItsMe
HZ_cls = ( \ HZ_cls = (
1,0,0,0,0,0,0,0, # 00 - 07 1,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,0,0, # 08 - 0f 0,0,0,0,0,0,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17 0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,1,0,0,0,0, # 18 - 1f 0,0,0,1,0,0,0,0, # 18 - 1f
0,0,0,0,0,0,0,0, # 20 - 27 0,0,0,0,0,0,0,0, # 20 - 27
0,0,0,0,0,0,0,0, # 28 - 2f 0,0,0,0,0,0,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37 0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f 0,0,0,0,0,0,0,0, # 38 - 3f
0,0,0,0,0,0,0,0, # 40 - 47 0,0,0,0,0,0,0,0, # 40 - 47
0,0,0,0,0,0,0,0, # 48 - 4f 0,0,0,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57 0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f 0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67 0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f 0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77 0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,4,0,5,2,0, # 78 - 7f 0,0,0,4,0,5,2,0, # 78 - 7f
1,1,1,1,1,1,1,1, # 80 - 87 1,1,1,1,1,1,1,1, # 80 - 87
1,1,1,1,1,1,1,1, # 88 - 8f 1,1,1,1,1,1,1,1, # 88 - 8f
1,1,1,1,1,1,1,1, # 90 - 97 1,1,1,1,1,1,1,1, # 90 - 97
1,1,1,1,1,1,1,1, # 98 - 9f 1,1,1,1,1,1,1,1, # 98 - 9f
1,1,1,1,1,1,1,1, # a0 - a7 1,1,1,1,1,1,1,1, # a0 - a7
1,1,1,1,1,1,1,1, # a8 - af 1,1,1,1,1,1,1,1, # a8 - af
1,1,1,1,1,1,1,1, # b0 - b7 1,1,1,1,1,1,1,1, # b0 - b7
1,1,1,1,1,1,1,1, # b8 - bf 1,1,1,1,1,1,1,1, # b8 - bf
1,1,1,1,1,1,1,1, # c0 - c7 1,1,1,1,1,1,1,1, # c0 - c7
1,1,1,1,1,1,1,1, # c8 - cf 1,1,1,1,1,1,1,1, # c8 - cf
1,1,1,1,1,1,1,1, # d0 - d7 1,1,1,1,1,1,1,1, # d0 - d7
1,1,1,1,1,1,1,1, # d8 - df 1,1,1,1,1,1,1,1, # d8 - df
1,1,1,1,1,1,1,1, # e0 - e7 1,1,1,1,1,1,1,1, # e0 - e7
1,1,1,1,1,1,1,1, # e8 - ef 1,1,1,1,1,1,1,1, # e8 - ef
1,1,1,1,1,1,1,1, # f0 - f7 1,1,1,1,1,1,1,1, # f0 - f7
1,1,1,1,1,1,1,1, # f8 - ff 1,1,1,1,1,1,1,1, # f8 - ff
) )
HZ_st = ( \ HZ_st = (
eStart,eError, 3,eStart,eStart,eStart,eError,eError,# 00-07 eStart,eError, 3,eStart,eStart,eStart,eError,eError,# 00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f
eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError,# 10-17 eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError,# 10-17
5,eError, 6,eError, 5, 5, 4,eError,# 18-1f 5,eError, 6,eError, 5, 5, 4,eError,# 18-1f
4,eError, 4, 4, 4,eError, 4,eError,# 20-27 4,eError, 4, 4, 4,eError, 4,eError,# 20-27
4,eItsMe,eStart,eStart,eStart,eStart,eStart,eStart,# 28-2f 4,eItsMe,eStart,eStart,eStart,eStart,eStart,eStart,# 28-2f
) )
HZCharLenTable = (0, 0, 0, 0, 0, 0) HZCharLenTable = (0, 0, 0, 0, 0, 0)
@ -79,50 +79,50 @@ HZSMModel = {'classTable': HZ_cls,
'charLenTable': HZCharLenTable, 'charLenTable': HZCharLenTable,
'name': "HZ-GB-2312"} 'name': "HZ-GB-2312"}
ISO2022CN_cls = ( \ ISO2022CN_cls = (
2,0,0,0,0,0,0,0, # 00 - 07 2,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,0,0, # 08 - 0f 0,0,0,0,0,0,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17 0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,1,0,0,0,0, # 18 - 1f 0,0,0,1,0,0,0,0, # 18 - 1f
0,0,0,0,0,0,0,0, # 20 - 27 0,0,0,0,0,0,0,0, # 20 - 27
0,3,0,0,0,0,0,0, # 28 - 2f 0,3,0,0,0,0,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37 0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f 0,0,0,0,0,0,0,0, # 38 - 3f
0,0,0,4,0,0,0,0, # 40 - 47 0,0,0,4,0,0,0,0, # 40 - 47
0,0,0,0,0,0,0,0, # 48 - 4f 0,0,0,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57 0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f 0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67 0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f 0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77 0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,0,0,0,0,0, # 78 - 7f 0,0,0,0,0,0,0,0, # 78 - 7f
2,2,2,2,2,2,2,2, # 80 - 87 2,2,2,2,2,2,2,2, # 80 - 87
2,2,2,2,2,2,2,2, # 88 - 8f 2,2,2,2,2,2,2,2, # 88 - 8f
2,2,2,2,2,2,2,2, # 90 - 97 2,2,2,2,2,2,2,2, # 90 - 97
2,2,2,2,2,2,2,2, # 98 - 9f 2,2,2,2,2,2,2,2, # 98 - 9f
2,2,2,2,2,2,2,2, # a0 - a7 2,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,2,2,2, # a8 - af 2,2,2,2,2,2,2,2, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7 2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf 2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7 2,2,2,2,2,2,2,2, # c0 - c7
2,2,2,2,2,2,2,2, # c8 - cf 2,2,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7 2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df 2,2,2,2,2,2,2,2, # d8 - df
2,2,2,2,2,2,2,2, # e0 - e7 2,2,2,2,2,2,2,2, # e0 - e7
2,2,2,2,2,2,2,2, # e8 - ef 2,2,2,2,2,2,2,2, # e8 - ef
2,2,2,2,2,2,2,2, # f0 - f7 2,2,2,2,2,2,2,2, # f0 - f7
2,2,2,2,2,2,2,2, # f8 - ff 2,2,2,2,2,2,2,2, # f8 - ff
) )
ISO2022CN_st = ( \ ISO2022CN_st = (
eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07 eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07
eStart,eError,eError,eError,eError,eError,eError,eError,# 08-0f eStart,eError,eError,eError,eError,eError,eError,eError,# 08-0f
eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17 eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17
eItsMe,eItsMe,eItsMe,eError,eError,eError, 4,eError,# 18-1f eItsMe,eItsMe,eItsMe,eError,eError,eError, 4,eError,# 18-1f
eError,eError,eError,eItsMe,eError,eError,eError,eError,# 20-27 eError,eError,eError,eItsMe,eError,eError,eError,eError,# 20-27
5, 6,eError,eError,eError,eError,eError,eError,# 28-2f 5, 6,eError,eError,eError,eError,eError,eError,# 28-2f
eError,eError,eError,eItsMe,eError,eError,eError,eError,# 30-37 eError,eError,eError,eItsMe,eError,eError,eError,eError,# 30-37
eError,eError,eError,eError,eError,eItsMe,eError,eStart,# 38-3f eError,eError,eError,eError,eError,eItsMe,eError,eStart,# 38-3f
) )
ISO2022CNCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0, 0) ISO2022CNCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0, 0)
@ -133,51 +133,51 @@ ISO2022CNSMModel = {'classTable': ISO2022CN_cls,
'charLenTable': ISO2022CNCharLenTable, 'charLenTable': ISO2022CNCharLenTable,
'name': "ISO-2022-CN"} 'name': "ISO-2022-CN"}
ISO2022JP_cls = ( \ ISO2022JP_cls = (
2,0,0,0,0,0,0,0, # 00 - 07 2,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,2,2, # 08 - 0f 0,0,0,0,0,0,2,2, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17 0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,1,0,0,0,0, # 18 - 1f 0,0,0,1,0,0,0,0, # 18 - 1f
0,0,0,0,7,0,0,0, # 20 - 27 0,0,0,0,7,0,0,0, # 20 - 27
3,0,0,0,0,0,0,0, # 28 - 2f 3,0,0,0,0,0,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37 0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f 0,0,0,0,0,0,0,0, # 38 - 3f
6,0,4,0,8,0,0,0, # 40 - 47 6,0,4,0,8,0,0,0, # 40 - 47
0,9,5,0,0,0,0,0, # 48 - 4f 0,9,5,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57 0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f 0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67 0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f 0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77 0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,0,0,0,0,0, # 78 - 7f 0,0,0,0,0,0,0,0, # 78 - 7f
2,2,2,2,2,2,2,2, # 80 - 87 2,2,2,2,2,2,2,2, # 80 - 87
2,2,2,2,2,2,2,2, # 88 - 8f 2,2,2,2,2,2,2,2, # 88 - 8f
2,2,2,2,2,2,2,2, # 90 - 97 2,2,2,2,2,2,2,2, # 90 - 97
2,2,2,2,2,2,2,2, # 98 - 9f 2,2,2,2,2,2,2,2, # 98 - 9f
2,2,2,2,2,2,2,2, # a0 - a7 2,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,2,2,2, # a8 - af 2,2,2,2,2,2,2,2, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7 2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf 2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7 2,2,2,2,2,2,2,2, # c0 - c7
2,2,2,2,2,2,2,2, # c8 - cf 2,2,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7 2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df 2,2,2,2,2,2,2,2, # d8 - df
2,2,2,2,2,2,2,2, # e0 - e7 2,2,2,2,2,2,2,2, # e0 - e7
2,2,2,2,2,2,2,2, # e8 - ef 2,2,2,2,2,2,2,2, # e8 - ef
2,2,2,2,2,2,2,2, # f0 - f7 2,2,2,2,2,2,2,2, # f0 - f7
2,2,2,2,2,2,2,2, # f8 - ff 2,2,2,2,2,2,2,2, # f8 - ff
) )
ISO2022JP_st = ( \ ISO2022JP_st = (
eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07 eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07
eStart,eStart,eError,eError,eError,eError,eError,eError,# 08-0f eStart,eStart,eError,eError,eError,eError,eError,eError,# 08-0f
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17 eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,# 18-1f eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,# 18-1f
eError, 5,eError,eError,eError, 4,eError,eError,# 20-27 eError, 5,eError,eError,eError, 4,eError,eError,# 20-27
eError,eError,eError, 6,eItsMe,eError,eItsMe,eError,# 28-2f eError,eError,eError, 6,eItsMe,eError,eItsMe,eError,# 28-2f
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,# 30-37 eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,# 30-37
eError,eError,eError,eItsMe,eError,eError,eError,eError,# 38-3f eError,eError,eError,eItsMe,eError,eError,eError,eError,# 38-3f
eError,eError,eError,eError,eItsMe,eError,eStart,eStart,# 40-47 eError,eError,eError,eError,eItsMe,eError,eStart,eStart,# 40-47
) )
ISO2022JPCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0) ISO2022JPCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
@ -188,47 +188,47 @@ ISO2022JPSMModel = {'classTable': ISO2022JP_cls,
'charLenTable': ISO2022JPCharLenTable, 'charLenTable': ISO2022JPCharLenTable,
'name': "ISO-2022-JP"} 'name': "ISO-2022-JP"}
ISO2022KR_cls = ( \ ISO2022KR_cls = (
2,0,0,0,0,0,0,0, # 00 - 07 2,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,0,0, # 08 - 0f 0,0,0,0,0,0,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17 0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,1,0,0,0,0, # 18 - 1f 0,0,0,1,0,0,0,0, # 18 - 1f
0,0,0,0,3,0,0,0, # 20 - 27 0,0,0,0,3,0,0,0, # 20 - 27
0,4,0,0,0,0,0,0, # 28 - 2f 0,4,0,0,0,0,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37 0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f 0,0,0,0,0,0,0,0, # 38 - 3f
0,0,0,5,0,0,0,0, # 40 - 47 0,0,0,5,0,0,0,0, # 40 - 47
0,0,0,0,0,0,0,0, # 48 - 4f 0,0,0,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57 0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f 0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67 0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f 0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77 0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,0,0,0,0,0, # 78 - 7f 0,0,0,0,0,0,0,0, # 78 - 7f
2,2,2,2,2,2,2,2, # 80 - 87 2,2,2,2,2,2,2,2, # 80 - 87
2,2,2,2,2,2,2,2, # 88 - 8f 2,2,2,2,2,2,2,2, # 88 - 8f
2,2,2,2,2,2,2,2, # 90 - 97 2,2,2,2,2,2,2,2, # 90 - 97
2,2,2,2,2,2,2,2, # 98 - 9f 2,2,2,2,2,2,2,2, # 98 - 9f
2,2,2,2,2,2,2,2, # a0 - a7 2,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,2,2,2, # a8 - af 2,2,2,2,2,2,2,2, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7 2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf 2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7 2,2,2,2,2,2,2,2, # c0 - c7
2,2,2,2,2,2,2,2, # c8 - cf 2,2,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7 2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df 2,2,2,2,2,2,2,2, # d8 - df
2,2,2,2,2,2,2,2, # e0 - e7 2,2,2,2,2,2,2,2, # e0 - e7
2,2,2,2,2,2,2,2, # e8 - ef 2,2,2,2,2,2,2,2, # e8 - ef
2,2,2,2,2,2,2,2, # f0 - f7 2,2,2,2,2,2,2,2, # f0 - f7
2,2,2,2,2,2,2,2, # f8 - ff 2,2,2,2,2,2,2,2, # f8 - ff
) )
ISO2022KR_st = ( \ ISO2022KR_st = (
eStart, 3,eError,eStart,eStart,eStart,eError,eError,# 00-07 eStart, 3,eError,eStart,eStart,eStart,eError,eError,# 00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f
eItsMe,eItsMe,eError,eError,eError, 4,eError,eError,# 10-17 eItsMe,eItsMe,eError,eError,eError, 4,eError,eError,# 10-17
eError,eError,eError,eError, 5,eError,eError,eError,# 18-1f eError,eError,eError,eError, 5,eError,eError,eError,# 18-1f
eError,eError,eError,eItsMe,eStart,eStart,eStart,eStart,# 20-27 eError,eError,eError,eItsMe,eStart,eStart,eStart,eStart,# 20-27
) )
ISO2022KRCharLenTable = (0, 0, 0, 0, 0, 0) ISO2022KRCharLenTable = (0, 0, 0, 0, 0, 0)
@ -238,3 +238,5 @@ ISO2022KRSMModel = {'classTable': ISO2022KR_cls,
'stateTable': ISO2022KR_st, 'stateTable': ISO2022KR_st,
'charLenTable': ISO2022KRCharLenTable, 'charLenTable': ISO2022KRCharLenTable,
'name': "ISO-2022-KR"} 'name': "ISO-2022-KR"}
# flake8: noqa

View File

@ -13,25 +13,26 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants, sys import sys
from constants import eStart, eError, eItsMe from . import constants
from mbcharsetprober import MultiByteCharSetProber from .mbcharsetprober import MultiByteCharSetProber
from codingstatemachine import CodingStateMachine from .codingstatemachine import CodingStateMachine
from chardistribution import EUCJPDistributionAnalysis from .chardistribution import EUCJPDistributionAnalysis
from jpcntx import EUCJPContextAnalysis from .jpcntx import EUCJPContextAnalysis
from mbcssm import EUCJPSMModel from .mbcssm import EUCJPSMModel
class EUCJPProber(MultiByteCharSetProber): class EUCJPProber(MultiByteCharSetProber):
def __init__(self): def __init__(self):
@ -51,30 +52,34 @@ class EUCJPProber(MultiByteCharSetProber):
def feed(self, aBuf): def feed(self, aBuf):
aLen = len(aBuf) aLen = len(aBuf)
for i in xrange(0, aLen): for i in xrange(0, aLen):
# PY3K: aBuf is a byte array, so aBuf[i] is an int, not a byte
codingState = self._mCodingSM.next_state(aBuf[i]) codingState = self._mCodingSM.next_state(aBuf[i])
if codingState == eError: if codingState == constants.eError:
if constants._debug: if constants._debug:
sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n') sys.stderr.write(self.get_charset_name()
+ ' prober hit error at byte ' + str(i)
+ '\n')
self._mState = constants.eNotMe self._mState = constants.eNotMe
break break
elif codingState == eItsMe: elif codingState == constants.eItsMe:
self._mState = constants.eFoundIt self._mState = constants.eFoundIt
break break
elif codingState == eStart: elif codingState == constants.eStart:
charLen = self._mCodingSM.get_current_charlen() charLen = self._mCodingSM.get_current_charlen()
if i == 0: if i == 0:
self._mLastChar[1] = aBuf[0] self._mLastChar[1] = aBuf[0]
self._mContextAnalyzer.feed(self._mLastChar, charLen) self._mContextAnalyzer.feed(self._mLastChar, charLen)
self._mDistributionAnalyzer.feed(self._mLastChar, charLen) self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
else: else:
self._mContextAnalyzer.feed(aBuf[i-1:i+1], charLen) self._mContextAnalyzer.feed(aBuf[i - 1:i + 1], charLen)
self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen) self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
charLen)
self._mLastChar[0] = aBuf[aLen - 1] self._mLastChar[0] = aBuf[aLen - 1]
if self.get_state() == constants.eDetecting: if self.get_state() == constants.eDetecting:
if self._mContextAnalyzer.got_enough_data() and \ if (self._mContextAnalyzer.got_enough_data() and
(self.get_confidence() > constants.SHORTCUT_THRESHOLD): (self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
self._mState = constants.eFoundIt self._mState = constants.eFoundIt
return self.get_state() return self.get_state()

View File

@ -592,3 +592,5 @@ EUCKRCharToFreqOrder = ( \
8704,8705,8706,8707,8708,8709,8710,8711,8712,8713,8714,8715,8716,8717,8718,8719, 8704,8705,8706,8707,8708,8709,8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,
8720,8721,8722,8723,8724,8725,8726,8727,8728,8729,8730,8731,8732,8733,8734,8735, 8720,8721,8722,8723,8724,8725,8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,
8736,8737,8738,8739,8740,8741) 8736,8737,8738,8739,8740,8741)
# flake8: noqa

View File

@ -13,22 +13,23 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from mbcharsetprober import MultiByteCharSetProber from .mbcharsetprober import MultiByteCharSetProber
from codingstatemachine import CodingStateMachine from .codingstatemachine import CodingStateMachine
from chardistribution import EUCKRDistributionAnalysis from .chardistribution import EUCKRDistributionAnalysis
from mbcssm import EUCKRSMModel from .mbcssm import EUCKRSMModel
class EUCKRProber(MultiByteCharSetProber): class EUCKRProber(MultiByteCharSetProber):
def __init__(self): def __init__(self):

View File

@ -13,12 +13,12 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
@ -26,8 +26,8 @@
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
# EUCTW frequency table # EUCTW frequency table
# Converted from big5 work # Converted from big5 work
# by Taiwan's Mandarin Promotion Council # by Taiwan's Mandarin Promotion Council
# <http://www.edu.tw:81/mandr/> # <http://www.edu.tw:81/mandr/>
# 128 --> 0.42261 # 128 --> 0.42261
@ -38,15 +38,15 @@
# #
# Ideal Distribution Ratio = 0.74851/(1-0.74851) =2.98 # Ideal Distribution Ratio = 0.74851/(1-0.74851) =2.98
# Random Distribution Ratio = 512/(5401-512)=0.105 # Random Distribution Ratio = 512/(5401-512)=0.105
# #
# Typical Distribution Ratio about 25% of Ideal one, still much higher than RDR # Typical Distribution Ratio about 25% of Ideal one, still much higher than RDR
EUCTW_TYPICAL_DISTRIBUTION_RATIO = 0.75 EUCTW_TYPICAL_DISTRIBUTION_RATIO = 0.75
# Char to FreqOrder table , # Char to FreqOrder table ,
EUCTW_TABLE_SIZE = 8102 EUCTW_TABLE_SIZE = 8102
EUCTWCharToFreqOrder = ( \ EUCTWCharToFreqOrder = (
1,1800,1506, 255,1431, 198, 9, 82, 6,7310, 177, 202,3615,1256,2808, 110, # 2742 1,1800,1506, 255,1431, 198, 9, 82, 6,7310, 177, 202,3615,1256,2808, 110, # 2742
3735, 33,3241, 261, 76, 44,2113, 16,2931,2184,1176, 659,3868, 26,3404,2643, # 2758 3735, 33,3241, 261, 76, 44,2113, 16,2931,2184,1176, 659,3868, 26,3404,2643, # 2758
1198,3869,3313,4060, 410,2211, 302, 590, 361,1963, 8, 204, 58,4296,7311,1931, # 2774 1198,3869,3313,4060, 410,2211, 302, 590, 361,1963, 8, 204, 58,4296,7311,1931, # 2774
@ -424,3 +424,5 @@ EUCTWCharToFreqOrder = ( \
8694,8695,8696,8697,8698,8699,8700,8701,8702,8703,8704,8705,8706,8707,8708,8709, # 8710 8694,8695,8696,8697,8698,8699,8700,8701,8702,8703,8704,8705,8706,8707,8708,8709, # 8710
8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,8720,8721,8722,8723,8724,8725, # 8726 8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,8720,8721,8722,8723,8724,8725, # 8726
8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,8736,8737,8738,8739,8740,8741) # 8742 8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,8736,8737,8738,8739,8740,8741) # 8742
# flake8: noqa

View File

@ -25,10 +25,10 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from mbcharsetprober import MultiByteCharSetProber from .mbcharsetprober import MultiByteCharSetProber
from codingstatemachine import CodingStateMachine from .codingstatemachine import CodingStateMachine
from chardistribution import EUCTWDistributionAnalysis from .chardistribution import EUCTWDistributionAnalysis
from mbcssm import EUCTWSMModel from .mbcssm import EUCTWSMModel
class EUCTWProber(MultiByteCharSetProber): class EUCTWProber(MultiByteCharSetProber):
def __init__(self): def __init__(self):

View File

@ -13,12 +13,12 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
@ -36,14 +36,14 @@
# #
# Ideal Distribution Ratio = 0.79135/(1-0.79135) = 3.79 # Ideal Distribution Ratio = 0.79135/(1-0.79135) = 3.79
# Random Distribution Ratio = 512 / (3755 - 512) = 0.157 # Random Distribution Ratio = 512 / (3755 - 512) = 0.157
# #
# Typical Distribution Ratio about 25% of Ideal one, still much higher than RDR # Typical Distribution Ratio about 25% of Ideal one, still much higher than RDR
GB2312_TYPICAL_DISTRIBUTION_RATIO = 0.9 GB2312_TYPICAL_DISTRIBUTION_RATIO = 0.9
GB2312_TABLE_SIZE = 3760 GB2312_TABLE_SIZE = 3760
GB2312CharToFreqOrder = ( \ GB2312CharToFreqOrder = (
1671, 749,1443,2364,3924,3807,2330,3921,1704,3463,2691,1511,1515, 572,3191,2205, 1671, 749,1443,2364,3924,3807,2330,3921,1704,3463,2691,1511,1515, 572,3191,2205,
2361, 224,2558, 479,1711, 963,3162, 440,4060,1905,2966,2947,3580,2647,3961,3842, 2361, 224,2558, 479,1711, 963,3162, 440,4060,1905,2966,2947,3580,2647,3961,3842,
2204, 869,4207, 970,2678,5626,2944,2956,1479,4048, 514,3595, 588,1346,2820,3409, 2204, 869,4207, 970,2678,5626,2944,2956,1479,4048, 514,3595, 588,1346,2820,3409,
@ -469,3 +469,4 @@ GB2312CharToFreqOrder = ( \
5867,5507,6273,4206,6274,4789,6098,6764,3619,3646,3833,3804,2394,3788,4936,3978, 5867,5507,6273,4206,6274,4789,6098,6764,3619,3646,3833,3804,2394,3788,4936,3978,
4866,4899,6099,6100,5559,6478,6765,3599,5868,6101,5869,5870,6275,6766,4527,6767) 4866,4899,6099,6100,5559,6478,6765,3599,5868,6101,5869,5870,6275,6766,4527,6767)
# flake8: noqa

View File

@ -25,10 +25,10 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from mbcharsetprober import MultiByteCharSetProber from .mbcharsetprober import MultiByteCharSetProber
from codingstatemachine import CodingStateMachine from .codingstatemachine import CodingStateMachine
from chardistribution import GB2312DistributionAnalysis from .chardistribution import GB2312DistributionAnalysis
from mbcssm import GB2312SMModel from .mbcssm import GB2312SMModel
class GB2312Prober(MultiByteCharSetProber): class GB2312Prober(MultiByteCharSetProber):
def __init__(self): def __init__(self):

View File

@ -13,20 +13,21 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from charsetprober import CharSetProber from .charsetprober import CharSetProber
import constants from .constants import eNotMe, eDetecting
from .compat import wrap_ord
# This prober doesn't actually recognize a language or a charset. # This prober doesn't actually recognize a language or a charset.
# It is a helper prober for the use of the Hebrew model probers # It is a helper prober for the use of the Hebrew model probers
@ -35,40 +36,40 @@ import constants
# #
# Four main charsets exist in Hebrew: # Four main charsets exist in Hebrew:
# "ISO-8859-8" - Visual Hebrew # "ISO-8859-8" - Visual Hebrew
# "windows-1255" - Logical Hebrew # "windows-1255" - Logical Hebrew
# "ISO-8859-8-I" - Logical Hebrew # "ISO-8859-8-I" - Logical Hebrew
# "x-mac-hebrew" - ?? Logical Hebrew ?? # "x-mac-hebrew" - ?? Logical Hebrew ??
# #
# Both "ISO" charsets use a completely identical set of code points, whereas # Both "ISO" charsets use a completely identical set of code points, whereas
# "windows-1255" and "x-mac-hebrew" are two different proper supersets of # "windows-1255" and "x-mac-hebrew" are two different proper supersets of
# these code points. windows-1255 defines additional characters in the range # these code points. windows-1255 defines additional characters in the range
# 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific # 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific
# diacritics and additional 'Yiddish' ligature letters in the range 0xc0-0xd6. # diacritics and additional 'Yiddish' ligature letters in the range 0xc0-0xd6.
# x-mac-hebrew defines similar additional code points but with a different # x-mac-hebrew defines similar additional code points but with a different
# mapping. # mapping.
# #
# As far as an average Hebrew text with no diacritics is concerned, all four # As far as an average Hebrew text with no diacritics is concerned, all four
# charsets are identical with respect to code points. Meaning that for the # charsets are identical with respect to code points. Meaning that for the
# main Hebrew alphabet, all four map the same values to all 27 Hebrew letters # main Hebrew alphabet, all four map the same values to all 27 Hebrew letters
# (including final letters). # (including final letters).
# #
# The dominant difference between these charsets is their directionality. # The dominant difference between these charsets is their directionality.
# "Visual" directionality means that the text is ordered as if the renderer is # "Visual" directionality means that the text is ordered as if the renderer is
# not aware of a BIDI rendering algorithm. The renderer sees the text and # not aware of a BIDI rendering algorithm. The renderer sees the text and
# draws it from left to right. The text itself when ordered naturally is read # draws it from left to right. The text itself when ordered naturally is read
# backwards. A buffer of Visual Hebrew generally looks like so: # backwards. A buffer of Visual Hebrew generally looks like so:
# "[last word of first line spelled backwards] [whole line ordered backwards # "[last word of first line spelled backwards] [whole line ordered backwards
# and spelled backwards] [first word of first line spelled backwards] # and spelled backwards] [first word of first line spelled backwards]
# [end of line] [last word of second line] ... etc' " # [end of line] [last word of second line] ... etc' "
# adding punctuation marks, numbers and English text to visual text is # adding punctuation marks, numbers and English text to visual text is
# naturally also "visual" and from left to right. # naturally also "visual" and from left to right.
# #
# "Logical" directionality means the text is ordered "naturally" according to # "Logical" directionality means the text is ordered "naturally" according to
# the order it is read. It is the responsibility of the renderer to display # the order it is read. It is the responsibility of the renderer to display
# the text from right to left. A BIDI algorithm is used to place general # the text from right to left. A BIDI algorithm is used to place general
# punctuation marks, numbers and English text in the text. # punctuation marks, numbers and English text in the text.
# #
# Texts in x-mac-hebrew are almost impossible to find on the Internet. From # Texts in x-mac-hebrew are almost impossible to find on the Internet. From
# what little evidence I could find, it seems that its general directionality # what little evidence I could find, it seems that its general directionality
# is Logical. # is Logical.
# #
@ -76,17 +77,17 @@ import constants
# charsets: # charsets:
# Visual Hebrew - "ISO-8859-8" - backwards text - Words and sentences are # Visual Hebrew - "ISO-8859-8" - backwards text - Words and sentences are
# backwards while line order is natural. For charset recognition purposes # backwards while line order is natural. For charset recognition purposes
# the line order is unimportant (In fact, for this implementation, even # the line order is unimportant (In fact, for this implementation, even
# word order is unimportant). # word order is unimportant).
# Logical Hebrew - "windows-1255" - normal, naturally ordered text. # Logical Hebrew - "windows-1255" - normal, naturally ordered text.
# #
# "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be # "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be
# specifically identified. # specifically identified.
# "x-mac-hebrew" is also identified as windows-1255. A text in x-mac-hebrew # "x-mac-hebrew" is also identified as windows-1255. A text in x-mac-hebrew
# that contain special punctuation marks or diacritics is displayed with # that contain special punctuation marks or diacritics is displayed with
# some unconverted characters showing as question marks. This problem might # some unconverted characters showing as question marks. This problem might
# be corrected using another model prober for x-mac-hebrew. Due to the fact # be corrected using another model prober for x-mac-hebrew. Due to the fact
# that x-mac-hebrew texts are so rare, writing another model prober isn't # that x-mac-hebrew texts are so rare, writing another model prober isn't
# worth the effort and performance hit. # worth the effort and performance hit.
# #
#### The Prober #### #### The Prober ####
@ -126,28 +127,31 @@ import constants
# charset identified, either "windows-1255" or "ISO-8859-8". # charset identified, either "windows-1255" or "ISO-8859-8".
# windows-1255 / ISO-8859-8 code points of interest # windows-1255 / ISO-8859-8 code points of interest
FINAL_KAF = '\xea' FINAL_KAF = 0xea
NORMAL_KAF = '\xeb' NORMAL_KAF = 0xeb
FINAL_MEM = '\xed' FINAL_MEM = 0xed
NORMAL_MEM = '\xee' NORMAL_MEM = 0xee
FINAL_NUN = '\xef' FINAL_NUN = 0xef
NORMAL_NUN = '\xf0' NORMAL_NUN = 0xf0
FINAL_PE = '\xf3' FINAL_PE = 0xf3
NORMAL_PE = '\xf4' NORMAL_PE = 0xf4
FINAL_TSADI = '\xf5' FINAL_TSADI = 0xf5
NORMAL_TSADI = '\xf6' NORMAL_TSADI = 0xf6
# Minimum Visual vs Logical final letter score difference. # Minimum Visual vs Logical final letter score difference.
# If the difference is below this, don't rely solely on the final letter score distance. # If the difference is below this, don't rely solely on the final letter score
# distance.
MIN_FINAL_CHAR_DISTANCE = 5 MIN_FINAL_CHAR_DISTANCE = 5
# Minimum Visual vs Logical model score difference. # Minimum Visual vs Logical model score difference.
# If the difference is below this, don't rely at all on the model score distance. # If the difference is below this, don't rely at all on the model score
# distance.
MIN_MODEL_DISTANCE = 0.01 MIN_MODEL_DISTANCE = 0.01
VISUAL_HEBREW_NAME = "ISO-8859-8" VISUAL_HEBREW_NAME = "ISO-8859-8"
LOGICAL_HEBREW_NAME = "windows-1255" LOGICAL_HEBREW_NAME = "windows-1255"
class HebrewProber(CharSetProber): class HebrewProber(CharSetProber):
def __init__(self): def __init__(self):
CharSetProber.__init__(self) CharSetProber.__init__(self)
@ -159,8 +163,8 @@ class HebrewProber(CharSetProber):
self._mFinalCharLogicalScore = 0 self._mFinalCharLogicalScore = 0
self._mFinalCharVisualScore = 0 self._mFinalCharVisualScore = 0
# The two last characters seen in the previous buffer, # The two last characters seen in the previous buffer,
# mPrev and mBeforePrev are initialized to space in order to simulate a word # mPrev and mBeforePrev are initialized to space in order to simulate
# delimiter at the beginning of the data # a word delimiter at the beginning of the data
self._mPrev = ' ' self._mPrev = ' '
self._mBeforePrev = ' ' self._mBeforePrev = ' '
# These probers are owned by the group prober. # These probers are owned by the group prober.
@ -170,49 +174,52 @@ class HebrewProber(CharSetProber):
self._mVisualProber = visualProber self._mVisualProber = visualProber
def is_final(self, c): def is_final(self, c):
return c in [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI] return wrap_ord(c) in [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE,
FINAL_TSADI]
def is_non_final(self, c): def is_non_final(self, c):
# The normal Tsadi is not a good Non-Final letter due to words like # The normal Tsadi is not a good Non-Final letter due to words like
# 'lechotet' (to chat) containing an apostrophe after the tsadi. This # 'lechotet' (to chat) containing an apostrophe after the tsadi. This
# apostrophe is converted to a space in FilterWithoutEnglishLetters causing # apostrophe is converted to a space in FilterWithoutEnglishLetters
# the Non-Final tsadi to appear at an end of a word even though this is not # causing the Non-Final tsadi to appear at an end of a word even
# the case in the original text. # though this is not the case in the original text.
# The letters Pe and Kaf rarely display a related behavior of not being a # The letters Pe and Kaf rarely display a related behavior of not being
# good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for # a good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak'
# example legally end with a Non-Final Pe or Kaf. However, the benefit of # for example legally end with a Non-Final Pe or Kaf. However, the
# these letters as Non-Final letters outweighs the damage since these words # benefit of these letters as Non-Final letters outweighs the damage
# are quite rare. # since these words are quite rare.
return c in [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE] return wrap_ord(c) in [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
def feed(self, aBuf): def feed(self, aBuf):
# Final letter analysis for logical-visual decision. # Final letter analysis for logical-visual decision.
# Look for evidence that the received buffer is either logical Hebrew or # Look for evidence that the received buffer is either logical Hebrew
# visual Hebrew. # or visual Hebrew.
# The following cases are checked: # The following cases are checked:
# 1) A word longer than 1 letter, ending with a final letter. This is an # 1) A word longer than 1 letter, ending with a final letter. This is
# indication that the text is laid out "naturally" since the final letter # an indication that the text is laid out "naturally" since the
# really appears at the end. +1 for logical score. # final letter really appears at the end. +1 for logical score.
# 2) A word longer than 1 letter, ending with a Non-Final letter. In normal # 2) A word longer than 1 letter, ending with a Non-Final letter. In
# Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with # normal Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi,
# the Non-Final form of that letter. Exceptions to this rule are mentioned # should not end with the Non-Final form of that letter. Exceptions
# above in isNonFinal(). This is an indication that the text is laid out # to this rule are mentioned above in isNonFinal(). This is an
# backwards. +1 for visual score # indication that the text is laid out backwards. +1 for visual
# 3) A word longer than 1 letter, starting with a final letter. Final letters # score
# should not appear at the beginning of a word. This is an indication that # 3) A word longer than 1 letter, starting with a final letter. Final
# the text is laid out backwards. +1 for visual score. # letters should not appear at the beginning of a word. This is an
# # indication that the text is laid out backwards. +1 for visual
# The visual score and logical score are accumulated throughout the text and # score.
# are finally checked against each other in GetCharSetName(). #
# No checking for final letters in the middle of words is done since that case # The visual score and logical score are accumulated throughout the
# is not an indication for either Logical or Visual text. # text and are finally checked against each other in GetCharSetName().
# # No checking for final letters in the middle of words is done since
# We automatically filter out all 7-bit characters (replace them with spaces) # that case is not an indication for either Logical or Visual text.
# so the word boundary detection works properly. [MAP] #
# We automatically filter out all 7-bit characters (replace them with
# spaces) so the word boundary detection works properly. [MAP]
if self.get_state() == constants.eNotMe: if self.get_state() == eNotMe:
# Both model probers say it's not them. No reason to continue. # Both model probers say it's not them. No reason to continue.
return constants.eNotMe return eNotMe
aBuf = self.filter_high_bit_only(aBuf) aBuf = self.filter_high_bit_only(aBuf)
@ -220,23 +227,27 @@ class HebrewProber(CharSetProber):
if cur == ' ': if cur == ' ':
# We stand on a space - a word just ended # We stand on a space - a word just ended
if self._mBeforePrev != ' ': if self._mBeforePrev != ' ':
# next-to-last char was not a space so self._mPrev is not a 1 letter word # next-to-last char was not a space so self._mPrev is not a
# 1 letter word
if self.is_final(self._mPrev): if self.is_final(self._mPrev):
# case (1) [-2:not space][-1:final letter][cur:space] # case (1) [-2:not space][-1:final letter][cur:space]
self._mFinalCharLogicalScore += 1 self._mFinalCharLogicalScore += 1
elif self.is_non_final(self._mPrev): elif self.is_non_final(self._mPrev):
# case (2) [-2:not space][-1:Non-Final letter][cur:space] # case (2) [-2:not space][-1:Non-Final letter][
# cur:space]
self._mFinalCharVisualScore += 1 self._mFinalCharVisualScore += 1
else: else:
# Not standing on a space # Not standing on a space
if (self._mBeforePrev == ' ') and (self.is_final(self._mPrev)) and (cur != ' '): if ((self._mBeforePrev == ' ') and
(self.is_final(self._mPrev)) and (cur != ' ')):
# case (3) [-2:space][-1:final letter][cur:not space] # case (3) [-2:space][-1:final letter][cur:not space]
self._mFinalCharVisualScore += 1 self._mFinalCharVisualScore += 1
self._mBeforePrev = self._mPrev self._mBeforePrev = self._mPrev
self._mPrev = cur self._mPrev = cur
# Forever detecting, till the end or until both model probers return eNotMe (handled above) # Forever detecting, till the end or until both model probers return
return constants.eDetecting # eNotMe (handled above)
return eDetecting
def get_charset_name(self): def get_charset_name(self):
# Make the decision: is it Logical or Visual? # Make the decision: is it Logical or Visual?
@ -248,22 +259,25 @@ class HebrewProber(CharSetProber):
return VISUAL_HEBREW_NAME return VISUAL_HEBREW_NAME
# It's not dominant enough, try to rely on the model scores instead. # It's not dominant enough, try to rely on the model scores instead.
modelsub = self._mLogicalProber.get_confidence() - self._mVisualProber.get_confidence() modelsub = (self._mLogicalProber.get_confidence()
- self._mVisualProber.get_confidence())
if modelsub > MIN_MODEL_DISTANCE: if modelsub > MIN_MODEL_DISTANCE:
return LOGICAL_HEBREW_NAME return LOGICAL_HEBREW_NAME
if modelsub < -MIN_MODEL_DISTANCE: if modelsub < -MIN_MODEL_DISTANCE:
return VISUAL_HEBREW_NAME return VISUAL_HEBREW_NAME
# Still no good, back to final letter distance, maybe it'll save the day. # Still no good, back to final letter distance, maybe it'll save the
# day.
if finalsub < 0.0: if finalsub < 0.0:
return VISUAL_HEBREW_NAME return VISUAL_HEBREW_NAME
# (finalsub > 0 - Logical) or (don't know what to do) default to Logical. # (finalsub > 0 - Logical) or (don't know what to do) default to
# Logical.
return LOGICAL_HEBREW_NAME return LOGICAL_HEBREW_NAME
def get_state(self): def get_state(self):
# Remain active as long as any of the model probers are active. # Remain active as long as any of the model probers are active.
if (self._mLogicalProber.get_state() == constants.eNotMe) and \ if (self._mLogicalProber.get_state() == eNotMe) and \
(self._mVisualProber.get_state() == constants.eNotMe): (self._mVisualProber.get_state() == eNotMe):
return constants.eNotMe return eNotMe
return constants.eDetecting return eDetecting

View File

@ -13,12 +13,12 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
@ -28,7 +28,7 @@
# Sampling from about 20M text materials include literature and computer technology # Sampling from about 20M text materials include literature and computer technology
# #
# Japanese frequency table, applied to both S-JIS and EUC-JP # Japanese frequency table, applied to both S-JIS and EUC-JP
# They are sorted in order. # They are sorted in order.
# 128 --> 0.77094 # 128 --> 0.77094
# 256 --> 0.85710 # 256 --> 0.85710
@ -38,15 +38,15 @@
# #
# Ideal Distribution Ratio = 0.92635 / (1-0.92635) = 12.58 # Ideal Distribution Ratio = 0.92635 / (1-0.92635) = 12.58
# Random Distribution Ratio = 512 / (2965+62+83+86-512) = 0.191 # Random Distribution Ratio = 512 / (2965+62+83+86-512) = 0.191
# #
# Typical Distribution Ratio, 25% of IDR # Typical Distribution Ratio, 25% of IDR
JIS_TYPICAL_DISTRIBUTION_RATIO = 3.0 JIS_TYPICAL_DISTRIBUTION_RATIO = 3.0
# Char to FreqOrder table , # Char to FreqOrder table ,
JIS_TABLE_SIZE = 4368 JIS_TABLE_SIZE = 4368
JISCharToFreqOrder = ( \ JISCharToFreqOrder = (
40, 1, 6, 182, 152, 180, 295,2127, 285, 381,3295,4304,3068,4606,3165,3510, # 16 40, 1, 6, 182, 152, 180, 295,2127, 285, 381,3295,4304,3068,4606,3165,3510, # 16
3511,1822,2785,4607,1193,2226,5070,4608, 171,2996,1247, 18, 179,5071, 856,1661, # 32 3511,1822,2785,4607,1193,2226,5070,4608, 171,2996,1247, 18, 179,5071, 856,1661, # 32
1262,5072, 619, 127,3431,3512,3230,1899,1700, 232, 228,1294,1298, 284, 283,2041, # 48 1262,5072, 619, 127,3431,3512,3230,1899,1700, 232, 228,1294,1298, 284, 283,2041, # 48
@ -565,3 +565,5 @@ JISCharToFreqOrder = ( \
8224,8225,8226,8227,8228,8229,8230,8231,8232,8233,8234,8235,8236,8237,8238,8239, # 8240 8224,8225,8226,8227,8228,8229,8230,8231,8232,8233,8234,8235,8236,8237,8238,8239, # 8240
8240,8241,8242,8243,8244,8245,8246,8247,8248,8249,8250,8251,8252,8253,8254,8255, # 8256 8240,8241,8242,8243,8244,8245,8246,8247,8248,8249,8250,8251,8252,8253,8254,8255, # 8256
8256,8257,8258,8259,8260,8261,8262,8263,8264,8265,8266,8267,8268,8269,8270,8271) # 8272 8256,8257,8258,8259,8260,8261,8262,8263,8264,8265,8266,8267,8268,8269,8270,8271) # 8272
# flake8: noqa

View File

@ -13,19 +13,19 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants from .compat import wrap_ord
NUM_OF_CATEGORY = 6 NUM_OF_CATEGORY = 6
DONT_KNOW = -1 DONT_KNOW = -1
@ -34,7 +34,7 @@ MAX_REL_THRESHOLD = 1000
MINIMUM_DATA_THRESHOLD = 4 MINIMUM_DATA_THRESHOLD = 4
# This is hiragana 2-char sequence table, the number in each cell represents its frequency category # This is hiragana 2-char sequence table, the number in each cell represents its frequency category
jp2CharContext = ( \ jp2CharContext = (
(0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1), (0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1),
(2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4), (2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4),
(0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2), (0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2),
@ -125,24 +125,31 @@ class JapaneseContextAnalysis:
self.reset() self.reset()
def reset(self): def reset(self):
self._mTotalRel = 0 # total sequence received self._mTotalRel = 0 # total sequence received
self._mRelSample = [0] * NUM_OF_CATEGORY # category counters, each integer counts sequence in its category # category counters, each integer counts sequence in its category
self._mNeedToSkipCharNum = 0 # if last byte in current buffer is not the last byte of a character, we need to know how many bytes to skip in next buffer self._mRelSample = [0] * NUM_OF_CATEGORY
self._mLastCharOrder = -1 # The order of previous char # if last byte in current buffer is not the last byte of a character,
self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made # we need to know how many bytes to skip in next buffer
self._mNeedToSkipCharNum = 0
self._mLastCharOrder = -1 # The order of previous char
# If this flag is set to True, detection is done and conclusion has
# been made
self._mDone = False
def feed(self, aBuf, aLen): def feed(self, aBuf, aLen):
if self._mDone: return if self._mDone:
return
# The buffer we got is byte oriented, and a character may span in more than one # The buffer we got is byte oriented, and a character may span in more than one
# buffers. In case the last one or two byte in last buffer is not complete, we # buffers. In case the last one or two byte in last buffer is not
# record how many byte needed to complete that character and skip these bytes here. # complete, we record how many byte needed to complete that character
# We can choose to record those bytes as well and analyse the character once it # and skip these bytes here. We can choose to record those bytes as
# is complete, but since a character will not make much difference, by simply skipping # well and analyse the character once it is complete, but since a
# character will not make much difference, by simply skipping
# this character will simplify our logic and improve performance. # this character will simplify our logic and improve performance.
i = self._mNeedToSkipCharNum i = self._mNeedToSkipCharNum
while i < aLen: while i < aLen:
order, charLen = self.get_order(aBuf[i:i+2]) order, charLen = self.get_order(aBuf[i:i + 2])
i += charLen i += charLen
if i > aLen: if i > aLen:
self._mNeedToSkipCharNum = i - aLen self._mNeedToSkipCharNum = i - aLen
@ -151,7 +158,7 @@ class JapaneseContextAnalysis:
if (order != -1) and (self._mLastCharOrder != -1): if (order != -1) and (self._mLastCharOrder != -1):
self._mTotalRel += 1 self._mTotalRel += 1
if self._mTotalRel > MAX_REL_THRESHOLD: if self._mTotalRel > MAX_REL_THRESHOLD:
self._mDone = constants.True self._mDone = True
break break
self._mRelSample[jp2CharContext[self._mLastCharOrder][order]] += 1 self._mRelSample[jp2CharContext[self._mLastCharOrder][order]] += 1
self._mLastCharOrder = order self._mLastCharOrder = order
@ -166,45 +173,55 @@ class JapaneseContextAnalysis:
else: else:
return DONT_KNOW return DONT_KNOW
def get_order(self, aStr): def get_order(self, aBuf):
return -1, 1 return -1, 1
class SJISContextAnalysis(JapaneseContextAnalysis): class SJISContextAnalysis(JapaneseContextAnalysis):
def get_order(self, aStr): def __init__(self):
if not aStr: return -1, 1 self.charset_name = "SHIFT_JIS"
def get_charset_name(self):
return self.charset_name
def get_order(self, aBuf):
if not aBuf:
return -1, 1
# find out current char's byte length # find out current char's byte length
if ((aStr[0] >= '\x81') and (aStr[0] <= '\x9F')) or \ first_char = wrap_ord(aBuf[0])
((aStr[0] >= '\xE0') and (aStr[0] <= '\xFC')): if ((0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC)):
charLen = 2 charLen = 2
if (first_char == 0x87) or (0xFA <= first_char <= 0xFC):
self.charset_name = "CP932"
else: else:
charLen = 1 charLen = 1
# return its order if it is hiragana # return its order if it is hiragana
if len(aStr) > 1: if len(aBuf) > 1:
if (aStr[0] == '\202') and \ second_char = wrap_ord(aBuf[1])
(aStr[1] >= '\x9F') and \ if (first_char == 0x82) and (0x9F <= second_char <= 0xF1):
(aStr[1] <= '\xF1'): return second_char - 0x9F, charLen
return ord(aStr[1]) - 0x9F, charLen
return -1, charLen return -1, charLen
class EUCJPContextAnalysis(JapaneseContextAnalysis): class EUCJPContextAnalysis(JapaneseContextAnalysis):
def get_order(self, aStr): def get_order(self, aBuf):
if not aStr: return -1, 1 if not aBuf:
return -1, 1
# find out current char's byte length # find out current char's byte length
if (aStr[0] == '\x8E') or \ first_char = wrap_ord(aBuf[0])
((aStr[0] >= '\xA1') and (aStr[0] <= '\xFE')): if (first_char == 0x8E) or (0xA1 <= first_char <= 0xFE):
charLen = 2 charLen = 2
elif aStr[0] == '\x8F': elif first_char == 0x8F:
charLen = 3 charLen = 3
else: else:
charLen = 1 charLen = 1
# return its order if it is hiragana # return its order if it is hiragana
if len(aStr) > 1: if len(aBuf) > 1:
if (aStr[0] == '\xA4') and \ second_char = wrap_ord(aBuf[1])
(aStr[1] >= '\xA1') and \ if (first_char == 0xA4) and (0xA1 <= second_char <= 0xF3):
(aStr[1] <= '\xF3'): return second_char - 0xA1, charLen
return ord(aStr[1]) - 0xA1, charLen
return -1, charLen return -1, charLen
# flake8: noqa

View File

@ -13,30 +13,28 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants
# 255: Control characters that usually do not exist in any text # 255: Control characters that usually do not exist in any text
# 254: Carriage/Return # 254: Carriage/Return
# 253: symbol (punctuation) that does not belong to word # 253: symbol (punctuation) that does not belong to word
# 252: 0 - 9 # 252: 0 - 9
# Character Mapping Table: # Character Mapping Table:
# this table is modified based on win1251BulgarianCharToOrderMap, so # this table is modified based on win1251BulgarianCharToOrderMap, so
# only numbers < 64 are sure to be valid # only numbers < 64 are sure to be valid
Latin5_BulgarianCharToOrderMap = ( \ Latin5_BulgarianCharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -55,7 +53,7 @@ Latin5_BulgarianCharToOrderMap = ( \
62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, # f0 62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, # f0
) )
win1251BulgarianCharToOrderMap = ( \ win1251BulgarianCharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -74,13 +72,13 @@ win1251BulgarianCharToOrderMap = ( \
7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,253, 42, 16, # f0 7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,253, 42, 16, # f0
) )
# Model Table: # Model Table:
# total sequences: 100% # total sequences: 100%
# first 512 sequences: 96.9392% # first 512 sequences: 96.9392%
# first 1024 sequences:3.0618% # first 1024 sequences:3.0618%
# rest sequences: 0.2992% # rest sequences: 0.2992%
# negative sequences: 0.0020% # negative sequences: 0.0020%
BulgarianLangModel = ( \ BulgarianLangModel = (
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3, 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2,
3,1,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,0,1, 3,1,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,0,1,
@ -211,18 +209,21 @@ BulgarianLangModel = ( \
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
) )
Latin5BulgarianModel = { \ Latin5BulgarianModel = {
'charToOrderMap': Latin5_BulgarianCharToOrderMap, 'charToOrderMap': Latin5_BulgarianCharToOrderMap,
'precedenceMatrix': BulgarianLangModel, 'precedenceMatrix': BulgarianLangModel,
'mTypicalPositiveRatio': 0.969392, 'mTypicalPositiveRatio': 0.969392,
'keepEnglishLetter': constants.False, 'keepEnglishLetter': False,
'charsetName': "ISO-8859-5" 'charsetName': "ISO-8859-5"
} }
Win1251BulgarianModel = { \ Win1251BulgarianModel = {
'charToOrderMap': win1251BulgarianCharToOrderMap, 'charToOrderMap': win1251BulgarianCharToOrderMap,
'precedenceMatrix': BulgarianLangModel, 'precedenceMatrix': BulgarianLangModel,
'mTypicalPositiveRatio': 0.969392, 'mTypicalPositiveRatio': 0.969392,
'keepEnglishLetter': constants.False, 'keepEnglishLetter': False,
'charsetName': "windows-1251" 'charsetName': "windows-1251"
} }
# flake8: noqa

View File

@ -13,23 +13,21 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants
# KOI8-R language model # KOI8-R language model
# Character Mapping Table: # Character Mapping Table:
KOI8R_CharToOrderMap = ( \ KOI8R_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -48,7 +46,7 @@ KOI8R_CharToOrderMap = ( \
35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, # f0 35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, # f0
) )
win1251_CharToOrderMap = ( \ win1251_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -67,7 +65,7 @@ win1251_CharToOrderMap = ( \
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
) )
latin5_CharToOrderMap = ( \ latin5_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -86,7 +84,7 @@ latin5_CharToOrderMap = ( \
239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255, 239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255,
) )
macCyrillic_CharToOrderMap = ( \ macCyrillic_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -105,7 +103,7 @@ macCyrillic_CharToOrderMap = ( \
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255, 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255,
) )
IBM855_CharToOrderMap = ( \ IBM855_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -124,7 +122,7 @@ IBM855_CharToOrderMap = ( \
250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255, 250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255,
) )
IBM866_CharToOrderMap = ( \ IBM866_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -143,13 +141,13 @@ IBM866_CharToOrderMap = ( \
239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255, 239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255,
) )
# Model Table: # Model Table:
# total sequences: 100% # total sequences: 100%
# first 512 sequences: 97.6601% # first 512 sequences: 97.6601%
# first 1024 sequences: 2.3389% # first 1024 sequences: 2.3389%
# rest sequences: 0.1237% # rest sequences: 0.1237%
# negative sequences: 0.0009% # negative sequences: 0.0009%
RussianLangModel = ( \ RussianLangModel = (
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3, 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2,
3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,3,3,3,3,2,3,2,0, 3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,3,3,3,3,2,3,2,0,
@ -280,50 +278,52 @@ RussianLangModel = ( \
0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0, 0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
) )
Koi8rModel = { \ Koi8rModel = {
'charToOrderMap': KOI8R_CharToOrderMap, 'charToOrderMap': KOI8R_CharToOrderMap,
'precedenceMatrix': RussianLangModel, 'precedenceMatrix': RussianLangModel,
'mTypicalPositiveRatio': 0.976601, 'mTypicalPositiveRatio': 0.976601,
'keepEnglishLetter': constants.False, 'keepEnglishLetter': False,
'charsetName': "KOI8-R" 'charsetName': "KOI8-R"
} }
Win1251CyrillicModel = { \ Win1251CyrillicModel = {
'charToOrderMap': win1251_CharToOrderMap, 'charToOrderMap': win1251_CharToOrderMap,
'precedenceMatrix': RussianLangModel, 'precedenceMatrix': RussianLangModel,
'mTypicalPositiveRatio': 0.976601, 'mTypicalPositiveRatio': 0.976601,
'keepEnglishLetter': constants.False, 'keepEnglishLetter': False,
'charsetName': "windows-1251" 'charsetName': "windows-1251"
} }
Latin5CyrillicModel = { \ Latin5CyrillicModel = {
'charToOrderMap': latin5_CharToOrderMap, 'charToOrderMap': latin5_CharToOrderMap,
'precedenceMatrix': RussianLangModel, 'precedenceMatrix': RussianLangModel,
'mTypicalPositiveRatio': 0.976601, 'mTypicalPositiveRatio': 0.976601,
'keepEnglishLetter': constants.False, 'keepEnglishLetter': False,
'charsetName': "ISO-8859-5" 'charsetName': "ISO-8859-5"
} }
MacCyrillicModel = { \ MacCyrillicModel = {
'charToOrderMap': macCyrillic_CharToOrderMap, 'charToOrderMap': macCyrillic_CharToOrderMap,
'precedenceMatrix': RussianLangModel, 'precedenceMatrix': RussianLangModel,
'mTypicalPositiveRatio': 0.976601, 'mTypicalPositiveRatio': 0.976601,
'keepEnglishLetter': constants.False, 'keepEnglishLetter': False,
'charsetName': "MacCyrillic" 'charsetName': "MacCyrillic"
}; };
Ibm866Model = { \ Ibm866Model = {
'charToOrderMap': IBM866_CharToOrderMap, 'charToOrderMap': IBM866_CharToOrderMap,
'precedenceMatrix': RussianLangModel, 'precedenceMatrix': RussianLangModel,
'mTypicalPositiveRatio': 0.976601, 'mTypicalPositiveRatio': 0.976601,
'keepEnglishLetter': constants.False, 'keepEnglishLetter': False,
'charsetName': "IBM866" 'charsetName': "IBM866"
} }
Ibm855Model = { \ Ibm855Model = {
'charToOrderMap': IBM855_CharToOrderMap, 'charToOrderMap': IBM855_CharToOrderMap,
'precedenceMatrix': RussianLangModel, 'precedenceMatrix': RussianLangModel,
'mTypicalPositiveRatio': 0.976601, 'mTypicalPositiveRatio': 0.976601,
'keepEnglishLetter': constants.False, 'keepEnglishLetter': False,
'charsetName': "IBM855" 'charsetName': "IBM855"
} }
# flake8: noqa

View File

@ -13,27 +13,25 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants
# 255: Control characters that usually do not exist in any text # 255: Control characters that usually do not exist in any text
# 254: Carriage/Return # 254: Carriage/Return
# 253: symbol (punctuation) that does not belong to word # 253: symbol (punctuation) that does not belong to word
# 252: 0 - 9 # 252: 0 - 9
# Character Mapping Table: # Character Mapping Table:
Latin7_CharToOrderMap = ( \ Latin7_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -52,7 +50,7 @@ Latin7_CharToOrderMap = ( \
9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, # f0 9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, # f0
) )
win1253_CharToOrderMap = ( \ win1253_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -71,13 +69,13 @@ win1253_CharToOrderMap = ( \
9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, # f0 9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, # f0
) )
# Model Table: # Model Table:
# total sequences: 100% # total sequences: 100%
# first 512 sequences: 98.2851% # first 512 sequences: 98.2851%
# first 1024 sequences:1.7001% # first 1024 sequences:1.7001%
# rest sequences: 0.0359% # rest sequences: 0.0359%
# negative sequences: 0.0148% # negative sequences: 0.0148%
GreekLangModel = ( \ GreekLangModel = (
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,3,2,2,3,3,3,3,3,3,3,3,1,3,3,3,0,2,2,3,3,0,3,0,3,2,0,3,3,3,0, 0,0,3,2,2,3,3,3,3,3,3,3,3,1,3,3,3,0,2,2,3,3,0,3,0,3,2,0,3,3,3,0,
@ -208,18 +206,20 @@ GreekLangModel = ( \
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
) )
Latin7GreekModel = { \ Latin7GreekModel = {
'charToOrderMap': Latin7_CharToOrderMap, 'charToOrderMap': Latin7_CharToOrderMap,
'precedenceMatrix': GreekLangModel, 'precedenceMatrix': GreekLangModel,
'mTypicalPositiveRatio': 0.982851, 'mTypicalPositiveRatio': 0.982851,
'keepEnglishLetter': constants.False, 'keepEnglishLetter': False,
'charsetName': "ISO-8859-7" 'charsetName': "ISO-8859-7"
} }
Win1253GreekModel = { \ Win1253GreekModel = {
'charToOrderMap': win1253_CharToOrderMap, 'charToOrderMap': win1253_CharToOrderMap,
'precedenceMatrix': GreekLangModel, 'precedenceMatrix': GreekLangModel,
'mTypicalPositiveRatio': 0.982851, 'mTypicalPositiveRatio': 0.982851,
'keepEnglishLetter': constants.False, 'keepEnglishLetter': False,
'charsetName': "windows-1253" 'charsetName': "windows-1253"
} }
# flake8: noqa

View File

@ -15,20 +15,18 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants
# 255: Control characters that usually does not exist in any text # 255: Control characters that usually does not exist in any text
# 254: Carriage/Return # 254: Carriage/Return
# 253: symbol (punctuation) that does not belong to word # 253: symbol (punctuation) that does not belong to word
@ -36,7 +34,7 @@ import constants
# Windows-1255 language model # Windows-1255 language model
# Character Mapping Table: # Character Mapping Table:
win1255_CharToOrderMap = ( \ win1255_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -55,13 +53,13 @@ win1255_CharToOrderMap = ( \
12, 19, 13, 26, 18, 27, 21, 17, 7, 10, 5,251,252,128, 96,253, 12, 19, 13, 26, 18, 27, 21, 17, 7, 10, 5,251,252,128, 96,253,
) )
# Model Table: # Model Table:
# total sequences: 100% # total sequences: 100%
# first 512 sequences: 98.4004% # first 512 sequences: 98.4004%
# first 1024 sequences: 1.5981% # first 1024 sequences: 1.5981%
# rest sequences: 0.087% # rest sequences: 0.087%
# negative sequences: 0.0015% # negative sequences: 0.0015%
HebrewLangModel = ( \ HebrewLangModel = (
0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0, 0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0,
3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1, 3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,
@ -192,10 +190,12 @@ HebrewLangModel = ( \
0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0, 0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0,
) )
Win1255HebrewModel = { \ Win1255HebrewModel = {
'charToOrderMap': win1255_CharToOrderMap, 'charToOrderMap': win1255_CharToOrderMap,
'precedenceMatrix': HebrewLangModel, 'precedenceMatrix': HebrewLangModel,
'mTypicalPositiveRatio': 0.984004, 'mTypicalPositiveRatio': 0.984004,
'keepEnglishLetter': constants.False, 'keepEnglishLetter': False,
'charsetName': "windows-1255" 'charsetName': "windows-1255"
} }
# flake8: noqa

View File

@ -13,27 +13,25 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants
# 255: Control characters that usually does not exist in any text # 255: Control characters that usually does not exist in any text
# 254: Carriage/Return # 254: Carriage/Return
# 253: symbol (punctuation) that does not belong to word # 253: symbol (punctuation) that does not belong to word
# 252: 0 - 9 # 252: 0 - 9
# Character Mapping Table: # Character Mapping Table:
Latin2_HungarianCharToOrderMap = ( \ Latin2_HungarianCharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -52,7 +50,7 @@ Latin2_HungarianCharToOrderMap = ( \
245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253, 245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253,
) )
win1250HungarianCharToOrderMap = ( \ win1250HungarianCharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -71,13 +69,13 @@ win1250HungarianCharToOrderMap = ( \
245,246,247, 25, 74, 42, 24,248,249,250, 31, 56, 29,251,252,253, 245,246,247, 25, 74, 42, 24,248,249,250, 31, 56, 29,251,252,253,
) )
# Model Table: # Model Table:
# total sequences: 100% # total sequences: 100%
# first 512 sequences: 94.7368% # first 512 sequences: 94.7368%
# first 1024 sequences:5.2623% # first 1024 sequences:5.2623%
# rest sequences: 0.8894% # rest sequences: 0.8894%
# negative sequences: 0.0009% # negative sequences: 0.0009%
HungarianLangModel = ( \ HungarianLangModel = (
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2, 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2,
3,2,2,3,3,3,3,3,2,3,3,3,3,3,3,1,2,3,3,3,3,2,3,3,1,1,3,3,0,1,1,1, 3,2,2,3,3,3,3,3,2,3,3,3,3,3,3,1,2,3,3,3,3,2,3,3,1,1,3,3,0,1,1,1,
@ -208,18 +206,20 @@ HungarianLangModel = ( \
0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0, 0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
) )
Latin2HungarianModel = { \ Latin2HungarianModel = {
'charToOrderMap': Latin2_HungarianCharToOrderMap, 'charToOrderMap': Latin2_HungarianCharToOrderMap,
'precedenceMatrix': HungarianLangModel, 'precedenceMatrix': HungarianLangModel,
'mTypicalPositiveRatio': 0.947368, 'mTypicalPositiveRatio': 0.947368,
'keepEnglishLetter': constants.True, 'keepEnglishLetter': True,
'charsetName': "ISO-8859-2" 'charsetName': "ISO-8859-2"
} }
Win1250HungarianModel = { \ Win1250HungarianModel = {
'charToOrderMap': win1250HungarianCharToOrderMap, 'charToOrderMap': win1250HungarianCharToOrderMap,
'precedenceMatrix': HungarianLangModel, 'precedenceMatrix': HungarianLangModel,
'mTypicalPositiveRatio': 0.947368, 'mTypicalPositiveRatio': 0.947368,
'keepEnglishLetter': constants.True, 'keepEnglishLetter': True,
'charsetName': "windows-1250" 'charsetName': "windows-1250"
} }
# flake8: noqa

View File

@ -13,29 +13,27 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants
# 255: Control characters that usually does not exist in any text # 255: Control characters that usually does not exist in any text
# 254: Carriage/Return # 254: Carriage/Return
# 253: symbol (punctuation) that does not belong to word # 253: symbol (punctuation) that does not belong to word
# 252: 0 - 9 # 252: 0 - 9
# The following result for thai was collected from a limited sample (1M). # The following result for thai was collected from a limited sample (1M).
# Character Mapping Table: # Character Mapping Table:
TIS620CharToOrderMap = ( \ TIS620CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -54,13 +52,13 @@ TIS620CharToOrderMap = ( \
68, 56, 59, 65, 69, 60, 70, 80, 71, 87,248,249,250,251,252,253, 68, 56, 59, 65, 69, 60, 70, 80, 71, 87,248,249,250,251,252,253,
) )
# Model Table: # Model Table:
# total sequences: 100% # total sequences: 100%
# first 512 sequences: 92.6386% # first 512 sequences: 92.6386%
# first 1024 sequences:7.3177% # first 1024 sequences:7.3177%
# rest sequences: 1.0230% # rest sequences: 1.0230%
# negative sequences: 0.0436% # negative sequences: 0.0436%
ThaiLangModel = ( \ ThaiLangModel = (
0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3, 0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3,
0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2, 0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2,
3,0,3,3,2,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,3,0,3,2,3,0,2,2,2,3, 3,0,3,3,2,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,3,0,3,2,3,0,2,2,2,3,
@ -191,10 +189,12 @@ ThaiLangModel = ( \
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
) )
TIS620ThaiModel = { \ TIS620ThaiModel = {
'charToOrderMap': TIS620CharToOrderMap, 'charToOrderMap': TIS620CharToOrderMap,
'precedenceMatrix': ThaiLangModel, 'precedenceMatrix': ThaiLangModel,
'mTypicalPositiveRatio': 0.926386, 'mTypicalPositiveRatio': 0.926386,
'keepEnglishLetter': constants.False, 'keepEnglishLetter': False,
'charsetName': "TIS-620" 'charsetName': "TIS-620"
} }
# flake8: noqa

View File

@ -14,85 +14,86 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from charsetprober import CharSetProber from .charsetprober import CharSetProber
import constants from .constants import eNotMe
import operator from .compat import wrap_ord
FREQ_CAT_NUM = 4 FREQ_CAT_NUM = 4
UDF = 0 # undefined UDF = 0 # undefined
OTH = 1 # other OTH = 1 # other
ASC = 2 # ascii capital letter ASC = 2 # ascii capital letter
ASS = 3 # ascii small letter ASS = 3 # ascii small letter
ACV = 4 # accent capital vowel ACV = 4 # accent capital vowel
ACO = 5 # accent capital other ACO = 5 # accent capital other
ASV = 6 # accent small vowel ASV = 6 # accent small vowel
ASO = 7 # accent small other ASO = 7 # accent small other
CLASS_NUM = 8 # total classes CLASS_NUM = 8 # total classes
Latin1_CharToClass = ( \ Latin1_CharToClass = (
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F
OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47 OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57
ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F
OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67 OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77 ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77
ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F
OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, # 80 - 87 OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, # 80 - 87
OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, # 88 - 8F OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, # 88 - 8F
UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 90 - 97 UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 90 - 97
OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, # 98 - 9F OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, # 98 - 9F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A0 - A7 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A0 - A7
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A8 - AF OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A8 - AF
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B8 - BF OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B8 - BF
ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, # C0 - C7 ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, # C0 - C7
ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # C8 - CF ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # C8 - CF
ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, # D0 - D7 ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, # D0 - D7
ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, # D8 - DF ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, # D8 - DF
ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, # E0 - E7 ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, # E0 - E7
ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # E8 - EF ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # E8 - EF
ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, # F0 - F7 ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, # F0 - F7
ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, # F8 - FF ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, # F8 - FF
) )
# 0 : illegal # 0 : illegal
# 1 : very unlikely # 1 : very unlikely
# 2 : normal # 2 : normal
# 3 : very likely # 3 : very likely
Latin1ClassModel = ( \ Latin1ClassModel = (
# UDF OTH ASC ASS ACV ACO ASV ASO # UDF OTH ASC ASS ACV ACO ASV ASO
0, 0, 0, 0, 0, 0, 0, 0, # UDF 0, 0, 0, 0, 0, 0, 0, 0, # UDF
0, 3, 3, 3, 3, 3, 3, 3, # OTH 0, 3, 3, 3, 3, 3, 3, 3, # OTH
0, 3, 3, 3, 3, 3, 3, 3, # ASC 0, 3, 3, 3, 3, 3, 3, 3, # ASC
0, 3, 3, 3, 1, 1, 3, 3, # ASS 0, 3, 3, 3, 1, 1, 3, 3, # ASS
0, 3, 3, 3, 1, 2, 1, 2, # ACV 0, 3, 3, 3, 1, 2, 1, 2, # ACV
0, 3, 3, 3, 3, 3, 3, 3, # ACO 0, 3, 3, 3, 3, 3, 3, 3, # ACO
0, 3, 1, 3, 1, 1, 1, 3, # ASV 0, 3, 1, 3, 1, 1, 1, 3, # ASV
0, 3, 1, 3, 1, 1, 3, 3, # ASO 0, 3, 1, 3, 1, 1, 3, 3, # ASO
) )
class Latin1Prober(CharSetProber): class Latin1Prober(CharSetProber):
def __init__(self): def __init__(self):
CharSetProber.__init__(self) CharSetProber.__init__(self)
@ -109,10 +110,11 @@ class Latin1Prober(CharSetProber):
def feed(self, aBuf): def feed(self, aBuf):
aBuf = self.filter_with_english_letters(aBuf) aBuf = self.filter_with_english_letters(aBuf)
for c in aBuf: for c in aBuf:
charClass = Latin1_CharToClass[ord(c)] charClass = Latin1_CharToClass[wrap_ord(c)]
freq = Latin1ClassModel[(self._mLastCharClass * CLASS_NUM) + charClass] freq = Latin1ClassModel[(self._mLastCharClass * CLASS_NUM)
+ charClass]
if freq == 0: if freq == 0:
self._mState = constants.eNotMe self._mState = eNotMe
break break
self._mFreqCounter[freq] += 1 self._mFreqCounter[freq] += 1
self._mLastCharClass = charClass self._mLastCharClass = charClass
@ -120,17 +122,18 @@ class Latin1Prober(CharSetProber):
return self.get_state() return self.get_state()
def get_confidence(self): def get_confidence(self):
if self.get_state() == constants.eNotMe: if self.get_state() == eNotMe:
return 0.01 return 0.01
total = reduce(operator.add, self._mFreqCounter) total = sum(self._mFreqCounter)
if total < 0.01: if total < 0.01:
confidence = 0.0 confidence = 0.0
else: else:
confidence = (self._mFreqCounter[3] / total) - (self._mFreqCounter[1] * 20.0 / total) confidence = ((self._mFreqCounter[3] - self._mFreqCounter[1] * 20.0)
/ total)
if confidence < 0.0: if confidence < 0.0:
confidence = 0.0 confidence = 0.0
# lower the confidence of latin1 so that other more accurate detector # lower the confidence of latin1 so that other more accurate
# can take priority. # detector can take priority.
confidence = confidence * 0.5 confidence = confidence * 0.73
return confidence return confidence

View File

@ -15,28 +15,29 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants, sys import sys
from constants import eStart, eError, eItsMe from . import constants
from charsetprober import CharSetProber from .charsetprober import CharSetProber
class MultiByteCharSetProber(CharSetProber): class MultiByteCharSetProber(CharSetProber):
def __init__(self): def __init__(self):
CharSetProber.__init__(self) CharSetProber.__init__(self)
self._mDistributionAnalyzer = None self._mDistributionAnalyzer = None
self._mCodingSM = None self._mCodingSM = None
self._mLastChar = ['\x00', '\x00'] self._mLastChar = [0, 0]
def reset(self): def reset(self):
CharSetProber.reset(self) CharSetProber.reset(self)
@ -44,7 +45,7 @@ class MultiByteCharSetProber(CharSetProber):
self._mCodingSM.reset() self._mCodingSM.reset()
if self._mDistributionAnalyzer: if self._mDistributionAnalyzer:
self._mDistributionAnalyzer.reset() self._mDistributionAnalyzer.reset()
self._mLastChar = ['\x00', '\x00'] self._mLastChar = [0, 0]
def get_charset_name(self): def get_charset_name(self):
pass pass
@ -53,27 +54,30 @@ class MultiByteCharSetProber(CharSetProber):
aLen = len(aBuf) aLen = len(aBuf)
for i in xrange(0, aLen): for i in xrange(0, aLen):
codingState = self._mCodingSM.next_state(aBuf[i]) codingState = self._mCodingSM.next_state(aBuf[i])
if codingState == eError: if codingState == constants.eError:
if constants._debug: if constants._debug:
sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n') sys.stderr.write(self.get_charset_name()
+ ' prober hit error at byte ' + str(i)
+ '\n')
self._mState = constants.eNotMe self._mState = constants.eNotMe
break break
elif codingState == eItsMe: elif codingState == constants.eItsMe:
self._mState = constants.eFoundIt self._mState = constants.eFoundIt
break break
elif codingState == eStart: elif codingState == constants.eStart:
charLen = self._mCodingSM.get_current_charlen() charLen = self._mCodingSM.get_current_charlen()
if i == 0: if i == 0:
self._mLastChar[1] = aBuf[0] self._mLastChar[1] = aBuf[0]
self._mDistributionAnalyzer.feed(self._mLastChar, charLen) self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
else: else:
self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen) self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
charLen)
self._mLastChar[0] = aBuf[aLen - 1] self._mLastChar[0] = aBuf[aLen - 1]
if self.get_state() == constants.eDetecting: if self.get_state() == constants.eDetecting:
if self._mDistributionAnalyzer.got_enough_data() and \ if (self._mDistributionAnalyzer.got_enough_data() and
(self.get_confidence() > constants.SHORTCUT_THRESHOLD): (self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
self._mState = constants.eFoundIt self._mState = constants.eFoundIt
return self.get_state() return self.get_state()

View File

@ -15,36 +15,40 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from charsetgroupprober import CharSetGroupProber from .charsetgroupprober import CharSetGroupProber
from utf8prober import UTF8Prober from .utf8prober import UTF8Prober
from sjisprober import SJISProber from .sjisprober import SJISProber
from eucjpprober import EUCJPProber from .eucjpprober import EUCJPProber
from gb2312prober import GB2312Prober from .gb2312prober import GB2312Prober
from euckrprober import EUCKRProber from .euckrprober import EUCKRProber
from big5prober import Big5Prober from .cp949prober import CP949Prober
from euctwprober import EUCTWProber from .big5prober import Big5Prober
from .euctwprober import EUCTWProber
class MBCSGroupProber(CharSetGroupProber): class MBCSGroupProber(CharSetGroupProber):
def __init__(self): def __init__(self):
CharSetGroupProber.__init__(self) CharSetGroupProber.__init__(self)
self._mProbers = [ \ self._mProbers = [
UTF8Prober(), UTF8Prober(),
SJISProber(), SJISProber(),
EUCJPProber(), EUCJPProber(),
GB2312Prober(), GB2312Prober(),
EUCKRProber(), EUCKRProber(),
CP949Prober(),
Big5Prober(), Big5Prober(),
EUCTWProber()] EUCTWProber()
]
self.reset() self.reset()

View File

@ -13,60 +13,62 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from constants import eStart, eError, eItsMe from .constants import eStart, eError, eItsMe
# BIG5 # BIG5
BIG5_cls = ( \ BIG5_cls = (
1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as legal value 1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as legal value
1,1,1,1,1,1,0,0, # 08 - 0f 1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17 1,1,1,1,1,1,1,1, # 10 - 17
1,1,1,0,1,1,1,1, # 18 - 1f 1,1,1,0,1,1,1,1, # 18 - 1f
1,1,1,1,1,1,1,1, # 20 - 27 1,1,1,1,1,1,1,1, # 20 - 27
1,1,1,1,1,1,1,1, # 28 - 2f 1,1,1,1,1,1,1,1, # 28 - 2f
1,1,1,1,1,1,1,1, # 30 - 37 1,1,1,1,1,1,1,1, # 30 - 37
1,1,1,1,1,1,1,1, # 38 - 3f 1,1,1,1,1,1,1,1, # 38 - 3f
2,2,2,2,2,2,2,2, # 40 - 47 2,2,2,2,2,2,2,2, # 40 - 47
2,2,2,2,2,2,2,2, # 48 - 4f 2,2,2,2,2,2,2,2, # 48 - 4f
2,2,2,2,2,2,2,2, # 50 - 57 2,2,2,2,2,2,2,2, # 50 - 57
2,2,2,2,2,2,2,2, # 58 - 5f 2,2,2,2,2,2,2,2, # 58 - 5f
2,2,2,2,2,2,2,2, # 60 - 67 2,2,2,2,2,2,2,2, # 60 - 67
2,2,2,2,2,2,2,2, # 68 - 6f 2,2,2,2,2,2,2,2, # 68 - 6f
2,2,2,2,2,2,2,2, # 70 - 77 2,2,2,2,2,2,2,2, # 70 - 77
2,2,2,2,2,2,2,1, # 78 - 7f 2,2,2,2,2,2,2,1, # 78 - 7f
4,4,4,4,4,4,4,4, # 80 - 87 4,4,4,4,4,4,4,4, # 80 - 87
4,4,4,4,4,4,4,4, # 88 - 8f 4,4,4,4,4,4,4,4, # 88 - 8f
4,4,4,4,4,4,4,4, # 90 - 97 4,4,4,4,4,4,4,4, # 90 - 97
4,4,4,4,4,4,4,4, # 98 - 9f 4,4,4,4,4,4,4,4, # 98 - 9f
4,3,3,3,3,3,3,3, # a0 - a7 4,3,3,3,3,3,3,3, # a0 - a7
3,3,3,3,3,3,3,3, # a8 - af 3,3,3,3,3,3,3,3, # a8 - af
3,3,3,3,3,3,3,3, # b0 - b7 3,3,3,3,3,3,3,3, # b0 - b7
3,3,3,3,3,3,3,3, # b8 - bf 3,3,3,3,3,3,3,3, # b8 - bf
3,3,3,3,3,3,3,3, # c0 - c7 3,3,3,3,3,3,3,3, # c0 - c7
3,3,3,3,3,3,3,3, # c8 - cf 3,3,3,3,3,3,3,3, # c8 - cf
3,3,3,3,3,3,3,3, # d0 - d7 3,3,3,3,3,3,3,3, # d0 - d7
3,3,3,3,3,3,3,3, # d8 - df 3,3,3,3,3,3,3,3, # d8 - df
3,3,3,3,3,3,3,3, # e0 - e7 3,3,3,3,3,3,3,3, # e0 - e7
3,3,3,3,3,3,3,3, # e8 - ef 3,3,3,3,3,3,3,3, # e8 - ef
3,3,3,3,3,3,3,3, # f0 - f7 3,3,3,3,3,3,3,3, # f0 - f7
3,3,3,3,3,3,3,0) # f8 - ff 3,3,3,3,3,3,3,0 # f8 - ff
)
BIG5_st = ( \ BIG5_st = (
eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07 eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07
eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,#08-0f eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,#08-0f
eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart)#10-17 eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart#10-17
)
Big5CharLenTable = (0, 1, 1, 2, 0) Big5CharLenTable = (0, 1, 1, 2, 0)
@ -76,48 +78,90 @@ Big5SMModel = {'classTable': BIG5_cls,
'charLenTable': Big5CharLenTable, 'charLenTable': Big5CharLenTable,
'name': 'Big5'} 'name': 'Big5'}
# CP949
CP949_cls = (
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,0,0, # 00 - 0f
1,1,1,1,1,1,1,1, 1,1,1,0,1,1,1,1, # 10 - 1f
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 20 - 2f
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 30 - 3f
1,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, # 40 - 4f
4,4,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 50 - 5f
1,5,5,5,5,5,5,5, 5,5,5,5,5,5,5,5, # 60 - 6f
5,5,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 70 - 7f
0,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 80 - 8f
6,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 90 - 9f
6,7,7,7,7,7,7,7, 7,7,7,7,7,8,8,8, # a0 - af
7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7, # b0 - bf
7,7,7,7,7,7,9,2, 2,3,2,2,2,2,2,2, # c0 - cf
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # d0 - df
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # e0 - ef
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,0, # f0 - ff
)
CP949_st = (
#cls= 0 1 2 3 4 5 6 7 8 9 # previous state =
eError,eStart, 3,eError,eStart,eStart, 4, 5,eError, 6, # eStart
eError,eError,eError,eError,eError,eError,eError,eError,eError,eError, # eError
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe, # eItsMe
eError,eError,eStart,eStart,eError,eError,eError,eStart,eStart,eStart, # 3
eError,eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart, # 4
eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart, # 5
eError,eStart,eStart,eStart,eStart,eError,eError,eStart,eStart,eStart, # 6
)
CP949CharLenTable = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2)
CP949SMModel = {'classTable': CP949_cls,
'classFactor': 10,
'stateTable': CP949_st,
'charLenTable': CP949CharLenTable,
'name': 'CP949'}
# EUC-JP # EUC-JP
EUCJP_cls = ( \ EUCJP_cls = (
4,4,4,4,4,4,4,4, # 00 - 07 4,4,4,4,4,4,4,4, # 00 - 07
4,4,4,4,4,4,5,5, # 08 - 0f 4,4,4,4,4,4,5,5, # 08 - 0f
4,4,4,4,4,4,4,4, # 10 - 17 4,4,4,4,4,4,4,4, # 10 - 17
4,4,4,5,4,4,4,4, # 18 - 1f 4,4,4,5,4,4,4,4, # 18 - 1f
4,4,4,4,4,4,4,4, # 20 - 27 4,4,4,4,4,4,4,4, # 20 - 27
4,4,4,4,4,4,4,4, # 28 - 2f 4,4,4,4,4,4,4,4, # 28 - 2f
4,4,4,4,4,4,4,4, # 30 - 37 4,4,4,4,4,4,4,4, # 30 - 37
4,4,4,4,4,4,4,4, # 38 - 3f 4,4,4,4,4,4,4,4, # 38 - 3f
4,4,4,4,4,4,4,4, # 40 - 47 4,4,4,4,4,4,4,4, # 40 - 47
4,4,4,4,4,4,4,4, # 48 - 4f 4,4,4,4,4,4,4,4, # 48 - 4f
4,4,4,4,4,4,4,4, # 50 - 57 4,4,4,4,4,4,4,4, # 50 - 57
4,4,4,4,4,4,4,4, # 58 - 5f 4,4,4,4,4,4,4,4, # 58 - 5f
4,4,4,4,4,4,4,4, # 60 - 67 4,4,4,4,4,4,4,4, # 60 - 67
4,4,4,4,4,4,4,4, # 68 - 6f 4,4,4,4,4,4,4,4, # 68 - 6f
4,4,4,4,4,4,4,4, # 70 - 77 4,4,4,4,4,4,4,4, # 70 - 77
4,4,4,4,4,4,4,4, # 78 - 7f 4,4,4,4,4,4,4,4, # 78 - 7f
5,5,5,5,5,5,5,5, # 80 - 87 5,5,5,5,5,5,5,5, # 80 - 87
5,5,5,5,5,5,1,3, # 88 - 8f 5,5,5,5,5,5,1,3, # 88 - 8f
5,5,5,5,5,5,5,5, # 90 - 97 5,5,5,5,5,5,5,5, # 90 - 97
5,5,5,5,5,5,5,5, # 98 - 9f 5,5,5,5,5,5,5,5, # 98 - 9f
5,2,2,2,2,2,2,2, # a0 - a7 5,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,2,2,2, # a8 - af 2,2,2,2,2,2,2,2, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7 2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf 2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7 2,2,2,2,2,2,2,2, # c0 - c7
2,2,2,2,2,2,2,2, # c8 - cf 2,2,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7 2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df 2,2,2,2,2,2,2,2, # d8 - df
0,0,0,0,0,0,0,0, # e0 - e7 0,0,0,0,0,0,0,0, # e0 - e7
0,0,0,0,0,0,0,0, # e8 - ef 0,0,0,0,0,0,0,0, # e8 - ef
0,0,0,0,0,0,0,0, # f0 - f7 0,0,0,0,0,0,0,0, # f0 - f7
0,0,0,0,0,0,0,5) # f8 - ff 0,0,0,0,0,0,0,5 # f8 - ff
)
EUCJP_st = ( \ EUCJP_st = (
3, 4, 3, 5,eStart,eError,eError,eError,#00-07 3, 4, 3, 5,eStart,eError,eError,eError,#00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError,#10-17 eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError,#10-17
eError,eError,eStart,eError,eError,eError, 3,eError,#18-1f eError,eError,eStart,eError,eError,eError, 3,eError,#18-1f
3,eError,eError,eError,eStart,eStart,eStart,eStart)#20-27 3,eError,eError,eError,eStart,eStart,eStart,eStart#20-27
)
EUCJPCharLenTable = (2, 2, 2, 3, 1, 0) EUCJPCharLenTable = (2, 2, 2, 3, 1, 0)
@ -129,43 +173,45 @@ EUCJPSMModel = {'classTable': EUCJP_cls,
# EUC-KR # EUC-KR
EUCKR_cls = ( \ EUCKR_cls = (
1,1,1,1,1,1,1,1, # 00 - 07 1,1,1,1,1,1,1,1, # 00 - 07
1,1,1,1,1,1,0,0, # 08 - 0f 1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17 1,1,1,1,1,1,1,1, # 10 - 17
1,1,1,0,1,1,1,1, # 18 - 1f 1,1,1,0,1,1,1,1, # 18 - 1f
1,1,1,1,1,1,1,1, # 20 - 27 1,1,1,1,1,1,1,1, # 20 - 27
1,1,1,1,1,1,1,1, # 28 - 2f 1,1,1,1,1,1,1,1, # 28 - 2f
1,1,1,1,1,1,1,1, # 30 - 37 1,1,1,1,1,1,1,1, # 30 - 37
1,1,1,1,1,1,1,1, # 38 - 3f 1,1,1,1,1,1,1,1, # 38 - 3f
1,1,1,1,1,1,1,1, # 40 - 47 1,1,1,1,1,1,1,1, # 40 - 47
1,1,1,1,1,1,1,1, # 48 - 4f 1,1,1,1,1,1,1,1, # 48 - 4f
1,1,1,1,1,1,1,1, # 50 - 57 1,1,1,1,1,1,1,1, # 50 - 57
1,1,1,1,1,1,1,1, # 58 - 5f 1,1,1,1,1,1,1,1, # 58 - 5f
1,1,1,1,1,1,1,1, # 60 - 67 1,1,1,1,1,1,1,1, # 60 - 67
1,1,1,1,1,1,1,1, # 68 - 6f 1,1,1,1,1,1,1,1, # 68 - 6f
1,1,1,1,1,1,1,1, # 70 - 77 1,1,1,1,1,1,1,1, # 70 - 77
1,1,1,1,1,1,1,1, # 78 - 7f 1,1,1,1,1,1,1,1, # 78 - 7f
0,0,0,0,0,0,0,0, # 80 - 87 0,0,0,0,0,0,0,0, # 80 - 87
0,0,0,0,0,0,0,0, # 88 - 8f 0,0,0,0,0,0,0,0, # 88 - 8f
0,0,0,0,0,0,0,0, # 90 - 97 0,0,0,0,0,0,0,0, # 90 - 97
0,0,0,0,0,0,0,0, # 98 - 9f 0,0,0,0,0,0,0,0, # 98 - 9f
0,2,2,2,2,2,2,2, # a0 - a7 0,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,3,3,3, # a8 - af 2,2,2,2,2,3,3,3, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7 2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf 2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7 2,2,2,2,2,2,2,2, # c0 - c7
2,3,2,2,2,2,2,2, # c8 - cf 2,3,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7 2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df 2,2,2,2,2,2,2,2, # d8 - df
2,2,2,2,2,2,2,2, # e0 - e7 2,2,2,2,2,2,2,2, # e0 - e7
2,2,2,2,2,2,2,2, # e8 - ef 2,2,2,2,2,2,2,2, # e8 - ef
2,2,2,2,2,2,2,2, # f0 - f7 2,2,2,2,2,2,2,2, # f0 - f7
2,2,2,2,2,2,2,0) # f8 - ff 2,2,2,2,2,2,2,0 # f8 - ff
)
EUCKR_st = ( EUCKR_st = (
eError,eStart, 3,eError,eError,eError,eError,eError,#00-07 eError,eStart, 3,eError,eError,eError,eError,eError,#00-07
eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart)#08-0f eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart #08-0f
)
EUCKRCharLenTable = (0, 1, 2, 0) EUCKRCharLenTable = (0, 1, 2, 0)
@ -177,47 +223,49 @@ EUCKRSMModel = {'classTable': EUCKR_cls,
# EUC-TW # EUC-TW
EUCTW_cls = ( \ EUCTW_cls = (
2,2,2,2,2,2,2,2, # 00 - 07 2,2,2,2,2,2,2,2, # 00 - 07
2,2,2,2,2,2,0,0, # 08 - 0f 2,2,2,2,2,2,0,0, # 08 - 0f
2,2,2,2,2,2,2,2, # 10 - 17 2,2,2,2,2,2,2,2, # 10 - 17
2,2,2,0,2,2,2,2, # 18 - 1f 2,2,2,0,2,2,2,2, # 18 - 1f
2,2,2,2,2,2,2,2, # 20 - 27 2,2,2,2,2,2,2,2, # 20 - 27
2,2,2,2,2,2,2,2, # 28 - 2f 2,2,2,2,2,2,2,2, # 28 - 2f
2,2,2,2,2,2,2,2, # 30 - 37 2,2,2,2,2,2,2,2, # 30 - 37
2,2,2,2,2,2,2,2, # 38 - 3f 2,2,2,2,2,2,2,2, # 38 - 3f
2,2,2,2,2,2,2,2, # 40 - 47 2,2,2,2,2,2,2,2, # 40 - 47
2,2,2,2,2,2,2,2, # 48 - 4f 2,2,2,2,2,2,2,2, # 48 - 4f
2,2,2,2,2,2,2,2, # 50 - 57 2,2,2,2,2,2,2,2, # 50 - 57
2,2,2,2,2,2,2,2, # 58 - 5f 2,2,2,2,2,2,2,2, # 58 - 5f
2,2,2,2,2,2,2,2, # 60 - 67 2,2,2,2,2,2,2,2, # 60 - 67
2,2,2,2,2,2,2,2, # 68 - 6f 2,2,2,2,2,2,2,2, # 68 - 6f
2,2,2,2,2,2,2,2, # 70 - 77 2,2,2,2,2,2,2,2, # 70 - 77
2,2,2,2,2,2,2,2, # 78 - 7f 2,2,2,2,2,2,2,2, # 78 - 7f
0,0,0,0,0,0,0,0, # 80 - 87 0,0,0,0,0,0,0,0, # 80 - 87
0,0,0,0,0,0,6,0, # 88 - 8f 0,0,0,0,0,0,6,0, # 88 - 8f
0,0,0,0,0,0,0,0, # 90 - 97 0,0,0,0,0,0,0,0, # 90 - 97
0,0,0,0,0,0,0,0, # 98 - 9f 0,0,0,0,0,0,0,0, # 98 - 9f
0,3,4,4,4,4,4,4, # a0 - a7 0,3,4,4,4,4,4,4, # a0 - a7
5,5,1,1,1,1,1,1, # a8 - af 5,5,1,1,1,1,1,1, # a8 - af
1,1,1,1,1,1,1,1, # b0 - b7 1,1,1,1,1,1,1,1, # b0 - b7
1,1,1,1,1,1,1,1, # b8 - bf 1,1,1,1,1,1,1,1, # b8 - bf
1,1,3,1,3,3,3,3, # c0 - c7 1,1,3,1,3,3,3,3, # c0 - c7
3,3,3,3,3,3,3,3, # c8 - cf 3,3,3,3,3,3,3,3, # c8 - cf
3,3,3,3,3,3,3,3, # d0 - d7 3,3,3,3,3,3,3,3, # d0 - d7
3,3,3,3,3,3,3,3, # d8 - df 3,3,3,3,3,3,3,3, # d8 - df
3,3,3,3,3,3,3,3, # e0 - e7 3,3,3,3,3,3,3,3, # e0 - e7
3,3,3,3,3,3,3,3, # e8 - ef 3,3,3,3,3,3,3,3, # e8 - ef
3,3,3,3,3,3,3,3, # f0 - f7 3,3,3,3,3,3,3,3, # f0 - f7
3,3,3,3,3,3,3,0) # f8 - ff 3,3,3,3,3,3,3,0 # f8 - ff
)
EUCTW_st = ( \ EUCTW_st = (
eError,eError,eStart, 3, 3, 3, 4,eError,#00-07 eError,eError,eStart, 3, 3, 3, 4,eError,#00-07
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError,#10-17 eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError,#10-17
eStart,eStart,eStart,eError,eError,eError,eError,eError,#18-1f eStart,eStart,eStart,eError,eError,eError,eError,eError,#18-1f
5,eError,eError,eError,eStart,eError,eStart,eStart,#20-27 5,eError,eError,eError,eStart,eError,eStart,eStart,#20-27
eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart)#28-2f eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart #28-2f
)
EUCTWCharLenTable = (0, 0, 1, 2, 2, 2, 3) EUCTWCharLenTable = (0, 0, 1, 2, 2, 2, 3)
@ -229,53 +277,55 @@ EUCTWSMModel = {'classTable': EUCTW_cls,
# GB2312 # GB2312
GB2312_cls = ( \ GB2312_cls = (
1,1,1,1,1,1,1,1, # 00 - 07 1,1,1,1,1,1,1,1, # 00 - 07
1,1,1,1,1,1,0,0, # 08 - 0f 1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17 1,1,1,1,1,1,1,1, # 10 - 17
1,1,1,0,1,1,1,1, # 18 - 1f 1,1,1,0,1,1,1,1, # 18 - 1f
1,1,1,1,1,1,1,1, # 20 - 27 1,1,1,1,1,1,1,1, # 20 - 27
1,1,1,1,1,1,1,1, # 28 - 2f 1,1,1,1,1,1,1,1, # 28 - 2f
3,3,3,3,3,3,3,3, # 30 - 37 3,3,3,3,3,3,3,3, # 30 - 37
3,3,1,1,1,1,1,1, # 38 - 3f 3,3,1,1,1,1,1,1, # 38 - 3f
2,2,2,2,2,2,2,2, # 40 - 47 2,2,2,2,2,2,2,2, # 40 - 47
2,2,2,2,2,2,2,2, # 48 - 4f 2,2,2,2,2,2,2,2, # 48 - 4f
2,2,2,2,2,2,2,2, # 50 - 57 2,2,2,2,2,2,2,2, # 50 - 57
2,2,2,2,2,2,2,2, # 58 - 5f 2,2,2,2,2,2,2,2, # 58 - 5f
2,2,2,2,2,2,2,2, # 60 - 67 2,2,2,2,2,2,2,2, # 60 - 67
2,2,2,2,2,2,2,2, # 68 - 6f 2,2,2,2,2,2,2,2, # 68 - 6f
2,2,2,2,2,2,2,2, # 70 - 77 2,2,2,2,2,2,2,2, # 70 - 77
2,2,2,2,2,2,2,4, # 78 - 7f 2,2,2,2,2,2,2,4, # 78 - 7f
5,6,6,6,6,6,6,6, # 80 - 87 5,6,6,6,6,6,6,6, # 80 - 87
6,6,6,6,6,6,6,6, # 88 - 8f 6,6,6,6,6,6,6,6, # 88 - 8f
6,6,6,6,6,6,6,6, # 90 - 97 6,6,6,6,6,6,6,6, # 90 - 97
6,6,6,6,6,6,6,6, # 98 - 9f 6,6,6,6,6,6,6,6, # 98 - 9f
6,6,6,6,6,6,6,6, # a0 - a7 6,6,6,6,6,6,6,6, # a0 - a7
6,6,6,6,6,6,6,6, # a8 - af 6,6,6,6,6,6,6,6, # a8 - af
6,6,6,6,6,6,6,6, # b0 - b7 6,6,6,6,6,6,6,6, # b0 - b7
6,6,6,6,6,6,6,6, # b8 - bf 6,6,6,6,6,6,6,6, # b8 - bf
6,6,6,6,6,6,6,6, # c0 - c7 6,6,6,6,6,6,6,6, # c0 - c7
6,6,6,6,6,6,6,6, # c8 - cf 6,6,6,6,6,6,6,6, # c8 - cf
6,6,6,6,6,6,6,6, # d0 - d7 6,6,6,6,6,6,6,6, # d0 - d7
6,6,6,6,6,6,6,6, # d8 - df 6,6,6,6,6,6,6,6, # d8 - df
6,6,6,6,6,6,6,6, # e0 - e7 6,6,6,6,6,6,6,6, # e0 - e7
6,6,6,6,6,6,6,6, # e8 - ef 6,6,6,6,6,6,6,6, # e8 - ef
6,6,6,6,6,6,6,6, # f0 - f7 6,6,6,6,6,6,6,6, # f0 - f7
6,6,6,6,6,6,6,0) # f8 - ff 6,6,6,6,6,6,6,0 # f8 - ff
)
GB2312_st = ( \ GB2312_st = (
eError,eStart,eStart,eStart,eStart,eStart, 3,eError,#00-07 eError,eStart,eStart,eStart,eStart,eStart, 3,eError,#00-07
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,#10-17 eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,#10-17
4,eError,eStart,eStart,eError,eError,eError,eError,#18-1f 4,eError,eStart,eStart,eError,eError,eError,eError,#18-1f
eError,eError, 5,eError,eError,eError,eItsMe,eError,#20-27 eError,eError, 5,eError,eError,eError,eItsMe,eError,#20-27
eError,eError,eStart,eStart,eStart,eStart,eStart,eStart)#28-2f eError,eError,eStart,eStart,eStart,eStart,eStart,eStart #28-2f
)
# To be accurate, the length of class 6 can be either 2 or 4. # To be accurate, the length of class 6 can be either 2 or 4.
# But it is not necessary to discriminate between the two since # But it is not necessary to discriminate between the two since
# it is used for frequency analysis only, and we are validating # it is used for frequency analysis only, and we are validating
# each code range there as well. So it is safe to set it to be # each code range there as well. So it is safe to set it to be
# 2 here. # 2 here.
GB2312CharLenTable = (0, 1, 1, 1, 1, 1, 2) GB2312CharLenTable = (0, 1, 1, 1, 1, 1, 2)
GB2312SMModel = {'classTable': GB2312_cls, GB2312SMModel = {'classTable': GB2312_cls,
@ -286,46 +336,48 @@ GB2312SMModel = {'classTable': GB2312_cls,
# Shift_JIS # Shift_JIS
SJIS_cls = ( \ SJIS_cls = (
1,1,1,1,1,1,1,1, # 00 - 07 1,1,1,1,1,1,1,1, # 00 - 07
1,1,1,1,1,1,0,0, # 08 - 0f 1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17 1,1,1,1,1,1,1,1, # 10 - 17
1,1,1,0,1,1,1,1, # 18 - 1f 1,1,1,0,1,1,1,1, # 18 - 1f
1,1,1,1,1,1,1,1, # 20 - 27 1,1,1,1,1,1,1,1, # 20 - 27
1,1,1,1,1,1,1,1, # 28 - 2f 1,1,1,1,1,1,1,1, # 28 - 2f
1,1,1,1,1,1,1,1, # 30 - 37 1,1,1,1,1,1,1,1, # 30 - 37
1,1,1,1,1,1,1,1, # 38 - 3f 1,1,1,1,1,1,1,1, # 38 - 3f
2,2,2,2,2,2,2,2, # 40 - 47 2,2,2,2,2,2,2,2, # 40 - 47
2,2,2,2,2,2,2,2, # 48 - 4f 2,2,2,2,2,2,2,2, # 48 - 4f
2,2,2,2,2,2,2,2, # 50 - 57 2,2,2,2,2,2,2,2, # 50 - 57
2,2,2,2,2,2,2,2, # 58 - 5f 2,2,2,2,2,2,2,2, # 58 - 5f
2,2,2,2,2,2,2,2, # 60 - 67 2,2,2,2,2,2,2,2, # 60 - 67
2,2,2,2,2,2,2,2, # 68 - 6f 2,2,2,2,2,2,2,2, # 68 - 6f
2,2,2,2,2,2,2,2, # 70 - 77 2,2,2,2,2,2,2,2, # 70 - 77
2,2,2,2,2,2,2,1, # 78 - 7f 2,2,2,2,2,2,2,1, # 78 - 7f
3,3,3,3,3,3,3,3, # 80 - 87 3,3,3,3,3,2,2,3, # 80 - 87
3,3,3,3,3,3,3,3, # 88 - 8f 3,3,3,3,3,3,3,3, # 88 - 8f
3,3,3,3,3,3,3,3, # 90 - 97 3,3,3,3,3,3,3,3, # 90 - 97
3,3,3,3,3,3,3,3, # 98 - 9f 3,3,3,3,3,3,3,3, # 98 - 9f
#0xa0 is illegal in sjis encoding, but some pages do #0xa0 is illegal in sjis encoding, but some pages do
#contain such bytes. We need to be more forgiving of errors. #contain such bytes. We need to be more forgiving of errors.
2,2,2,2,2,2,2,2, # a0 - a7 2,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,2,2,2, # a8 - af 2,2,2,2,2,2,2,2, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7 2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf 2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7 2,2,2,2,2,2,2,2, # c0 - c7
2,2,2,2,2,2,2,2, # c8 - cf 2,2,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7 2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df 2,2,2,2,2,2,2,2, # d8 - df
3,3,3,3,3,3,3,3, # e0 - e7 3,3,3,3,3,3,3,3, # e0 - e7
3,3,3,3,3,4,4,4, # e8 - ef 3,3,3,3,3,4,4,4, # e8 - ef
4,4,4,4,4,4,4,4, # f0 - f7 3,3,3,3,3,3,3,3, # f0 - f7
4,4,4,4,4,0,0,0) # f8 - ff 3,3,3,3,3,0,0,0) # f8 - ff
SJIS_st = ( \
eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07 SJIS_st = (
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07
eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart)#10-17 eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart #10-17
)
SJISCharLenTable = (0, 1, 1, 2, 0, 0) SJISCharLenTable = (0, 1, 1, 2, 0, 0)
@ -337,48 +389,50 @@ SJISSMModel = {'classTable': SJIS_cls,
# UCS2-BE # UCS2-BE
UCS2BE_cls = ( \ UCS2BE_cls = (
0,0,0,0,0,0,0,0, # 00 - 07 0,0,0,0,0,0,0,0, # 00 - 07
0,0,1,0,0,2,0,0, # 08 - 0f 0,0,1,0,0,2,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17 0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,3,0,0,0,0, # 18 - 1f 0,0,0,3,0,0,0,0, # 18 - 1f
0,0,0,0,0,0,0,0, # 20 - 27 0,0,0,0,0,0,0,0, # 20 - 27
0,3,3,3,3,3,0,0, # 28 - 2f 0,3,3,3,3,3,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37 0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f 0,0,0,0,0,0,0,0, # 38 - 3f
0,0,0,0,0,0,0,0, # 40 - 47 0,0,0,0,0,0,0,0, # 40 - 47
0,0,0,0,0,0,0,0, # 48 - 4f 0,0,0,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57 0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f 0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67 0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f 0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77 0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,0,0,0,0,0, # 78 - 7f 0,0,0,0,0,0,0,0, # 78 - 7f
0,0,0,0,0,0,0,0, # 80 - 87 0,0,0,0,0,0,0,0, # 80 - 87
0,0,0,0,0,0,0,0, # 88 - 8f 0,0,0,0,0,0,0,0, # 88 - 8f
0,0,0,0,0,0,0,0, # 90 - 97 0,0,0,0,0,0,0,0, # 90 - 97
0,0,0,0,0,0,0,0, # 98 - 9f 0,0,0,0,0,0,0,0, # 98 - 9f
0,0,0,0,0,0,0,0, # a0 - a7 0,0,0,0,0,0,0,0, # a0 - a7
0,0,0,0,0,0,0,0, # a8 - af 0,0,0,0,0,0,0,0, # a8 - af
0,0,0,0,0,0,0,0, # b0 - b7 0,0,0,0,0,0,0,0, # b0 - b7
0,0,0,0,0,0,0,0, # b8 - bf 0,0,0,0,0,0,0,0, # b8 - bf
0,0,0,0,0,0,0,0, # c0 - c7 0,0,0,0,0,0,0,0, # c0 - c7
0,0,0,0,0,0,0,0, # c8 - cf 0,0,0,0,0,0,0,0, # c8 - cf
0,0,0,0,0,0,0,0, # d0 - d7 0,0,0,0,0,0,0,0, # d0 - d7
0,0,0,0,0,0,0,0, # d8 - df 0,0,0,0,0,0,0,0, # d8 - df
0,0,0,0,0,0,0,0, # e0 - e7 0,0,0,0,0,0,0,0, # e0 - e7
0,0,0,0,0,0,0,0, # e8 - ef 0,0,0,0,0,0,0,0, # e8 - ef
0,0,0,0,0,0,0,0, # f0 - f7 0,0,0,0,0,0,0,0, # f0 - f7
0,0,0,0,0,0,4,5) # f8 - ff 0,0,0,0,0,0,4,5 # f8 - ff
)
UCS2BE_st = ( \ UCS2BE_st = (
5, 7, 7,eError, 4, 3,eError,eError,#00-07 5, 7, 7,eError, 4, 3,eError,eError,#00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe, 6, 6, 6, 6,eError,eError,#10-17 eItsMe,eItsMe, 6, 6, 6, 6,eError,eError,#10-17
6, 6, 6, 6, 6,eItsMe, 6, 6,#18-1f 6, 6, 6, 6, 6,eItsMe, 6, 6,#18-1f
6, 6, 6, 6, 5, 7, 7,eError,#20-27 6, 6, 6, 6, 5, 7, 7,eError,#20-27
5, 8, 6, 6,eError, 6, 6, 6,#28-2f 5, 8, 6, 6,eError, 6, 6, 6,#28-2f
6, 6, 6, 6,eError,eError,eStart,eStart)#30-37 6, 6, 6, 6,eError,eError,eStart,eStart #30-37
)
UCS2BECharLenTable = (2, 2, 2, 0, 2, 2) UCS2BECharLenTable = (2, 2, 2, 0, 2, 2)
@ -390,48 +444,50 @@ UCS2BESMModel = {'classTable': UCS2BE_cls,
# UCS2-LE # UCS2-LE
UCS2LE_cls = ( \ UCS2LE_cls = (
0,0,0,0,0,0,0,0, # 00 - 07 0,0,0,0,0,0,0,0, # 00 - 07
0,0,1,0,0,2,0,0, # 08 - 0f 0,0,1,0,0,2,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17 0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,3,0,0,0,0, # 18 - 1f 0,0,0,3,0,0,0,0, # 18 - 1f
0,0,0,0,0,0,0,0, # 20 - 27 0,0,0,0,0,0,0,0, # 20 - 27
0,3,3,3,3,3,0,0, # 28 - 2f 0,3,3,3,3,3,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37 0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f 0,0,0,0,0,0,0,0, # 38 - 3f
0,0,0,0,0,0,0,0, # 40 - 47 0,0,0,0,0,0,0,0, # 40 - 47
0,0,0,0,0,0,0,0, # 48 - 4f 0,0,0,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57 0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f 0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67 0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f 0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77 0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,0,0,0,0,0, # 78 - 7f 0,0,0,0,0,0,0,0, # 78 - 7f
0,0,0,0,0,0,0,0, # 80 - 87 0,0,0,0,0,0,0,0, # 80 - 87
0,0,0,0,0,0,0,0, # 88 - 8f 0,0,0,0,0,0,0,0, # 88 - 8f
0,0,0,0,0,0,0,0, # 90 - 97 0,0,0,0,0,0,0,0, # 90 - 97
0,0,0,0,0,0,0,0, # 98 - 9f 0,0,0,0,0,0,0,0, # 98 - 9f
0,0,0,0,0,0,0,0, # a0 - a7 0,0,0,0,0,0,0,0, # a0 - a7
0,0,0,0,0,0,0,0, # a8 - af 0,0,0,0,0,0,0,0, # a8 - af
0,0,0,0,0,0,0,0, # b0 - b7 0,0,0,0,0,0,0,0, # b0 - b7
0,0,0,0,0,0,0,0, # b8 - bf 0,0,0,0,0,0,0,0, # b8 - bf
0,0,0,0,0,0,0,0, # c0 - c7 0,0,0,0,0,0,0,0, # c0 - c7
0,0,0,0,0,0,0,0, # c8 - cf 0,0,0,0,0,0,0,0, # c8 - cf
0,0,0,0,0,0,0,0, # d0 - d7 0,0,0,0,0,0,0,0, # d0 - d7
0,0,0,0,0,0,0,0, # d8 - df 0,0,0,0,0,0,0,0, # d8 - df
0,0,0,0,0,0,0,0, # e0 - e7 0,0,0,0,0,0,0,0, # e0 - e7
0,0,0,0,0,0,0,0, # e8 - ef 0,0,0,0,0,0,0,0, # e8 - ef
0,0,0,0,0,0,0,0, # f0 - f7 0,0,0,0,0,0,0,0, # f0 - f7
0,0,0,0,0,0,4,5) # f8 - ff 0,0,0,0,0,0,4,5 # f8 - ff
)
UCS2LE_st = ( \ UCS2LE_st = (
6, 6, 7, 6, 4, 3,eError,eError,#00-07 6, 6, 7, 6, 4, 3,eError,eError,#00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe, 5, 5, 5,eError,eItsMe,eError,#10-17 eItsMe,eItsMe, 5, 5, 5,eError,eItsMe,eError,#10-17
5, 5, 5,eError, 5,eError, 6, 6,#18-1f 5, 5, 5,eError, 5,eError, 6, 6,#18-1f
7, 6, 8, 8, 5, 5, 5,eError,#20-27 7, 6, 8, 8, 5, 5, 5,eError,#20-27
5, 5, 5,eError,eError,eError, 5, 5,#28-2f 5, 5, 5,eError,eError,eError, 5, 5,#28-2f
5, 5, 5,eError, 5,eError,eStart,eStart)#30-37 5, 5, 5,eError, 5,eError,eStart,eStart #30-37
)
UCS2LECharLenTable = (2, 2, 2, 2, 2, 2) UCS2LECharLenTable = (2, 2, 2, 2, 2, 2)
@ -443,67 +499,69 @@ UCS2LESMModel = {'classTable': UCS2LE_cls,
# UTF-8 # UTF-8
UTF8_cls = ( \ UTF8_cls = (
1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as a legal value 1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as a legal value
1,1,1,1,1,1,0,0, # 08 - 0f 1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17 1,1,1,1,1,1,1,1, # 10 - 17
1,1,1,0,1,1,1,1, # 18 - 1f 1,1,1,0,1,1,1,1, # 18 - 1f
1,1,1,1,1,1,1,1, # 20 - 27 1,1,1,1,1,1,1,1, # 20 - 27
1,1,1,1,1,1,1,1, # 28 - 2f 1,1,1,1,1,1,1,1, # 28 - 2f
1,1,1,1,1,1,1,1, # 30 - 37 1,1,1,1,1,1,1,1, # 30 - 37
1,1,1,1,1,1,1,1, # 38 - 3f 1,1,1,1,1,1,1,1, # 38 - 3f
1,1,1,1,1,1,1,1, # 40 - 47 1,1,1,1,1,1,1,1, # 40 - 47
1,1,1,1,1,1,1,1, # 48 - 4f 1,1,1,1,1,1,1,1, # 48 - 4f
1,1,1,1,1,1,1,1, # 50 - 57 1,1,1,1,1,1,1,1, # 50 - 57
1,1,1,1,1,1,1,1, # 58 - 5f 1,1,1,1,1,1,1,1, # 58 - 5f
1,1,1,1,1,1,1,1, # 60 - 67 1,1,1,1,1,1,1,1, # 60 - 67
1,1,1,1,1,1,1,1, # 68 - 6f 1,1,1,1,1,1,1,1, # 68 - 6f
1,1,1,1,1,1,1,1, # 70 - 77 1,1,1,1,1,1,1,1, # 70 - 77
1,1,1,1,1,1,1,1, # 78 - 7f 1,1,1,1,1,1,1,1, # 78 - 7f
2,2,2,2,3,3,3,3, # 80 - 87 2,2,2,2,3,3,3,3, # 80 - 87
4,4,4,4,4,4,4,4, # 88 - 8f 4,4,4,4,4,4,4,4, # 88 - 8f
4,4,4,4,4,4,4,4, # 90 - 97 4,4,4,4,4,4,4,4, # 90 - 97
4,4,4,4,4,4,4,4, # 98 - 9f 4,4,4,4,4,4,4,4, # 98 - 9f
5,5,5,5,5,5,5,5, # a0 - a7 5,5,5,5,5,5,5,5, # a0 - a7
5,5,5,5,5,5,5,5, # a8 - af 5,5,5,5,5,5,5,5, # a8 - af
5,5,5,5,5,5,5,5, # b0 - b7 5,5,5,5,5,5,5,5, # b0 - b7
5,5,5,5,5,5,5,5, # b8 - bf 5,5,5,5,5,5,5,5, # b8 - bf
0,0,6,6,6,6,6,6, # c0 - c7 0,0,6,6,6,6,6,6, # c0 - c7
6,6,6,6,6,6,6,6, # c8 - cf 6,6,6,6,6,6,6,6, # c8 - cf
6,6,6,6,6,6,6,6, # d0 - d7 6,6,6,6,6,6,6,6, # d0 - d7
6,6,6,6,6,6,6,6, # d8 - df 6,6,6,6,6,6,6,6, # d8 - df
7,8,8,8,8,8,8,8, # e0 - e7 7,8,8,8,8,8,8,8, # e0 - e7
8,8,8,8,8,9,8,8, # e8 - ef 8,8,8,8,8,9,8,8, # e8 - ef
10,11,11,11,11,11,11,11, # f0 - f7 10,11,11,11,11,11,11,11, # f0 - f7
12,13,13,13,14,15,0,0) # f8 - ff 12,13,13,13,14,15,0,0 # f8 - ff
)
UTF8_st = ( \ UTF8_st = (
eError,eStart,eError,eError,eError,eError, 12, 10,#00-07 eError,eStart,eError,eError,eError,eError, 12, 10,#00-07
9, 11, 8, 7, 6, 5, 4, 3,#08-0f 9, 11, 8, 7, 6, 5, 4, 3,#08-0f
eError,eError,eError,eError,eError,eError,eError,eError,#10-17 eError,eError,eError,eError,eError,eError,eError,eError,#10-17
eError,eError,eError,eError,eError,eError,eError,eError,#18-1f eError,eError,eError,eError,eError,eError,eError,eError,#18-1f
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,#20-27 eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,#20-27
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,#28-2f eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,#28-2f
eError,eError, 5, 5, 5, 5,eError,eError,#30-37 eError,eError, 5, 5, 5, 5,eError,eError,#30-37
eError,eError,eError,eError,eError,eError,eError,eError,#38-3f eError,eError,eError,eError,eError,eError,eError,eError,#38-3f
eError,eError,eError, 5, 5, 5,eError,eError,#40-47 eError,eError,eError, 5, 5, 5,eError,eError,#40-47
eError,eError,eError,eError,eError,eError,eError,eError,#48-4f eError,eError,eError,eError,eError,eError,eError,eError,#48-4f
eError,eError, 7, 7, 7, 7,eError,eError,#50-57 eError,eError, 7, 7, 7, 7,eError,eError,#50-57
eError,eError,eError,eError,eError,eError,eError,eError,#58-5f eError,eError,eError,eError,eError,eError,eError,eError,#58-5f
eError,eError,eError,eError, 7, 7,eError,eError,#60-67 eError,eError,eError,eError, 7, 7,eError,eError,#60-67
eError,eError,eError,eError,eError,eError,eError,eError,#68-6f eError,eError,eError,eError,eError,eError,eError,eError,#68-6f
eError,eError, 9, 9, 9, 9,eError,eError,#70-77 eError,eError, 9, 9, 9, 9,eError,eError,#70-77
eError,eError,eError,eError,eError,eError,eError,eError,#78-7f eError,eError,eError,eError,eError,eError,eError,eError,#78-7f
eError,eError,eError,eError,eError, 9,eError,eError,#80-87 eError,eError,eError,eError,eError, 9,eError,eError,#80-87
eError,eError,eError,eError,eError,eError,eError,eError,#88-8f eError,eError,eError,eError,eError,eError,eError,eError,#88-8f
eError,eError, 12, 12, 12, 12,eError,eError,#90-97 eError,eError, 12, 12, 12, 12,eError,eError,#90-97
eError,eError,eError,eError,eError,eError,eError,eError,#98-9f eError,eError,eError,eError,eError,eError,eError,eError,#98-9f
eError,eError,eError,eError,eError, 12,eError,eError,#a0-a7 eError,eError,eError,eError,eError, 12,eError,eError,#a0-a7
eError,eError,eError,eError,eError,eError,eError,eError,#a8-af eError,eError,eError,eError,eError,eError,eError,eError,#a8-af
eError,eError, 12, 12, 12,eError,eError,eError,#b0-b7 eError,eError, 12, 12, 12,eError,eError,eError,#b0-b7
eError,eError,eError,eError,eError,eError,eError,eError,#b8-bf eError,eError,eError,eError,eError,eError,eError,eError,#b8-bf
eError,eError,eStart,eStart,eStart,eStart,eError,eError,#c0-c7 eError,eError,eStart,eStart,eStart,eStart,eError,eError,#c0-c7
eError,eError,eError,eError,eError,eError,eError,eError)#c8-cf eError,eError,eError,eError,eError,eError,eError,eError #c8-cf
)
UTF8CharLenTable = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6) UTF8CharLenTable = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6)

View File

@ -14,20 +14,22 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants, sys import sys
from charsetprober import CharSetProber from . import constants
from .charsetprober import CharSetProber
from .compat import wrap_ord
SAMPLE_SIZE = 64 SAMPLE_SIZE = 64
SB_ENOUGH_REL_THRESHOLD = 1024 SB_ENOUGH_REL_THRESHOLD = 1024
@ -38,21 +40,26 @@ NUMBER_OF_SEQ_CAT = 4
POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1 POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1
#NEGATIVE_CAT = 0 #NEGATIVE_CAT = 0
class SingleByteCharSetProber(CharSetProber): class SingleByteCharSetProber(CharSetProber):
def __init__(self, model, reversed=constants.False, nameProber=None): def __init__(self, model, reversed=False, nameProber=None):
CharSetProber.__init__(self) CharSetProber.__init__(self)
self._mModel = model self._mModel = model
self._mReversed = reversed # TRUE if we need to reverse every pair in the model lookup # TRUE if we need to reverse every pair in the model lookup
self._mNameProber = nameProber # Optional auxiliary prober for name decision self._mReversed = reversed
# Optional auxiliary prober for name decision
self._mNameProber = nameProber
self.reset() self.reset()
def reset(self): def reset(self):
CharSetProber.reset(self) CharSetProber.reset(self)
self._mLastOrder = 255 # char order of last character # char order of last character
self._mLastOrder = 255
self._mSeqCounters = [0] * NUMBER_OF_SEQ_CAT self._mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
self._mTotalSeqs = 0 self._mTotalSeqs = 0
self._mTotalChar = 0 self._mTotalChar = 0
self._mFreqChar = 0 # characters that fall in our sampling range # characters that fall in our sampling range
self._mFreqChar = 0
def get_charset_name(self): def get_charset_name(self):
if self._mNameProber: if self._mNameProber:
@ -67,7 +74,7 @@ class SingleByteCharSetProber(CharSetProber):
if not aLen: if not aLen:
return self.get_state() return self.get_state()
for c in aBuf: for c in aBuf:
order = self._mModel['charToOrderMap'][ord(c)] order = self._mModel['charToOrderMap'][wrap_ord(c)]
if order < SYMBOL_CAT_ORDER: if order < SYMBOL_CAT_ORDER:
self._mTotalChar += 1 self._mTotalChar += 1
if order < SAMPLE_SIZE: if order < SAMPLE_SIZE:
@ -75,9 +82,12 @@ class SingleByteCharSetProber(CharSetProber):
if self._mLastOrder < SAMPLE_SIZE: if self._mLastOrder < SAMPLE_SIZE:
self._mTotalSeqs += 1 self._mTotalSeqs += 1
if not self._mReversed: if not self._mReversed:
self._mSeqCounters[self._mModel['precedenceMatrix'][(self._mLastOrder * SAMPLE_SIZE) + order]] += 1 i = (self._mLastOrder * SAMPLE_SIZE) + order
else: # reverse the order of the letters in the lookup model = self._mModel['precedenceMatrix'][i]
self._mSeqCounters[self._mModel['precedenceMatrix'][(order * SAMPLE_SIZE) + self._mLastOrder]] += 1 else: # reverse the order of the letters in the lookup
i = (order * SAMPLE_SIZE) + self._mLastOrder
model = self._mModel['precedenceMatrix'][i]
self._mSeqCounters[model] += 1
self._mLastOrder = order self._mLastOrder = order
if self.get_state() == constants.eDetecting: if self.get_state() == constants.eDetecting:
@ -85,11 +95,16 @@ class SingleByteCharSetProber(CharSetProber):
cf = self.get_confidence() cf = self.get_confidence()
if cf > POSITIVE_SHORTCUT_THRESHOLD: if cf > POSITIVE_SHORTCUT_THRESHOLD:
if constants._debug: if constants._debug:
sys.stderr.write('%s confidence = %s, we have a winner\n' % (self._mModel['charsetName'], cf)) sys.stderr.write('%s confidence = %s, we have a'
'winner\n' %
(self._mModel['charsetName'], cf))
self._mState = constants.eFoundIt self._mState = constants.eFoundIt
elif cf < NEGATIVE_SHORTCUT_THRESHOLD: elif cf < NEGATIVE_SHORTCUT_THRESHOLD:
if constants._debug: if constants._debug:
sys.stderr.write('%s confidence = %s, below negative shortcut threshhold %s\n' % (self._mModel['charsetName'], cf, NEGATIVE_SHORTCUT_THRESHOLD)) sys.stderr.write('%s confidence = %s, below negative'
'shortcut threshhold %s\n' %
(self._mModel['charsetName'], cf,
NEGATIVE_SHORTCUT_THRESHOLD))
self._mState = constants.eNotMe self._mState = constants.eNotMe
return self.get_state() return self.get_state()
@ -97,9 +112,8 @@ class SingleByteCharSetProber(CharSetProber):
def get_confidence(self): def get_confidence(self):
r = 0.01 r = 0.01
if self._mTotalSeqs > 0: if self._mTotalSeqs > 0:
# print self._mSeqCounters[POSITIVE_CAT], self._mTotalSeqs, self._mModel['mTypicalPositiveRatio'] r = ((1.0 * self._mSeqCounters[POSITIVE_CAT]) / self._mTotalSeqs
r = (1.0 * self._mSeqCounters[POSITIVE_CAT]) / self._mTotalSeqs / self._mModel['mTypicalPositiveRatio'] / self._mModel['mTypicalPositiveRatio'])
# print r, self._mFreqChar, self._mTotalChar
r = r * self._mFreqChar / self._mTotalChar r = r * self._mFreqChar / self._mTotalChar
if r >= 1.0: if r >= 1.0:
r = 0.99 r = 0.99

View File

@ -14,33 +14,35 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants, sys from .charsetgroupprober import CharSetGroupProber
from charsetgroupprober import CharSetGroupProber from .sbcharsetprober import SingleByteCharSetProber
from sbcharsetprober import SingleByteCharSetProber from .langcyrillicmodel import (Win1251CyrillicModel, Koi8rModel,
from langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model Latin5CyrillicModel, MacCyrillicModel,
from langgreekmodel import Latin7GreekModel, Win1253GreekModel Ibm866Model, Ibm855Model)
from langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel from .langgreekmodel import Latin7GreekModel, Win1253GreekModel
from langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
from langthaimodel import TIS620ThaiModel from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
from langhebrewmodel import Win1255HebrewModel from .langthaimodel import TIS620ThaiModel
from hebrewprober import HebrewProber from .langhebrewmodel import Win1255HebrewModel
from .hebrewprober import HebrewProber
class SBCSGroupProber(CharSetGroupProber): class SBCSGroupProber(CharSetGroupProber):
def __init__(self): def __init__(self):
CharSetGroupProber.__init__(self) CharSetGroupProber.__init__(self)
self._mProbers = [ \ self._mProbers = [
SingleByteCharSetProber(Win1251CyrillicModel), SingleByteCharSetProber(Win1251CyrillicModel),
SingleByteCharSetProber(Koi8rModel), SingleByteCharSetProber(Koi8rModel),
SingleByteCharSetProber(Latin5CyrillicModel), SingleByteCharSetProber(Latin5CyrillicModel),
@ -54,11 +56,14 @@ class SBCSGroupProber(CharSetGroupProber):
SingleByteCharSetProber(Latin2HungarianModel), SingleByteCharSetProber(Latin2HungarianModel),
SingleByteCharSetProber(Win1250HungarianModel), SingleByteCharSetProber(Win1250HungarianModel),
SingleByteCharSetProber(TIS620ThaiModel), SingleByteCharSetProber(TIS620ThaiModel),
] ]
hebrewProber = HebrewProber() hebrewProber = HebrewProber()
logicalHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, constants.False, hebrewProber) logicalHebrewProber = SingleByteCharSetProber(Win1255HebrewModel,
visualHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, constants.True, hebrewProber) False, hebrewProber)
visualHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, True,
hebrewProber)
hebrewProber.set_model_probers(logicalHebrewProber, visualHebrewProber) hebrewProber.set_model_probers(logicalHebrewProber, visualHebrewProber)
self._mProbers.extend([hebrewProber, logicalHebrewProber, visualHebrewProber]) self._mProbers.extend([hebrewProber, logicalHebrewProber,
visualHebrewProber])
self.reset() self.reset()

View File

@ -13,25 +13,26 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from mbcharsetprober import MultiByteCharSetProber import sys
from codingstatemachine import CodingStateMachine from .mbcharsetprober import MultiByteCharSetProber
from chardistribution import SJISDistributionAnalysis from .codingstatemachine import CodingStateMachine
from jpcntx import SJISContextAnalysis from .chardistribution import SJISDistributionAnalysis
from mbcssm import SJISSMModel from .jpcntx import SJISContextAnalysis
import constants, sys from .mbcssm import SJISSMModel
from constants import eStart, eError, eItsMe from . import constants
class SJISProber(MultiByteCharSetProber): class SJISProber(MultiByteCharSetProber):
def __init__(self): def __init__(self):
@ -46,35 +47,40 @@ class SJISProber(MultiByteCharSetProber):
self._mContextAnalyzer.reset() self._mContextAnalyzer.reset()
def get_charset_name(self): def get_charset_name(self):
return "SHIFT_JIS" return self._mContextAnalyzer.get_charset_name()
def feed(self, aBuf): def feed(self, aBuf):
aLen = len(aBuf) aLen = len(aBuf)
for i in xrange(0, aLen): for i in xrange(0, aLen):
codingState = self._mCodingSM.next_state(aBuf[i]) codingState = self._mCodingSM.next_state(aBuf[i])
if codingState == eError: if codingState == constants.eError:
if constants._debug: if constants._debug:
sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n') sys.stderr.write(self.get_charset_name()
+ ' prober hit error at byte ' + str(i)
+ '\n')
self._mState = constants.eNotMe self._mState = constants.eNotMe
break break
elif codingState == eItsMe: elif codingState == constants.eItsMe:
self._mState = constants.eFoundIt self._mState = constants.eFoundIt
break break
elif codingState == eStart: elif codingState == constants.eStart:
charLen = self._mCodingSM.get_current_charlen() charLen = self._mCodingSM.get_current_charlen()
if i == 0: if i == 0:
self._mLastChar[1] = aBuf[0] self._mLastChar[1] = aBuf[0]
self._mContextAnalyzer.feed(self._mLastChar[2 - charLen :], charLen) self._mContextAnalyzer.feed(self._mLastChar[2 - charLen:],
charLen)
self._mDistributionAnalyzer.feed(self._mLastChar, charLen) self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
else: else:
self._mContextAnalyzer.feed(aBuf[i + 1 - charLen : i + 3 - charLen], charLen) self._mContextAnalyzer.feed(aBuf[i + 1 - charLen:i + 3
self._mDistributionAnalyzer.feed(aBuf[i - 1 : i + 1], charLen) - charLen], charLen)
self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
charLen)
self._mLastChar[0] = aBuf[aLen - 1] self._mLastChar[0] = aBuf[aLen - 1]
if self.get_state() == constants.eDetecting: if self.get_state() == constants.eDetecting:
if self._mContextAnalyzer.got_enough_data() and \ if (self._mContextAnalyzer.got_enough_data() and
(self.get_confidence() > constants.SHORTCUT_THRESHOLD): (self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
self._mState = constants.eFoundIt self._mState = constants.eFoundIt
return self.get_state() return self.get_state()

View File

@ -1,20 +0,0 @@
import sys, glob
sys.path.insert(0, '..')
from chardet.universaldetector import UniversalDetector
count = 0
u = UniversalDetector()
for f in glob.glob(sys.argv[1]):
print f.ljust(60),
u.reset()
for line in file(f, 'rb'):
u.feed(line)
if u.done: break
u.close()
result = u.result
if result['encoding']:
print result['encoding'], 'with confidence', result['confidence']
else:
print '******** no result'
count += 1
print count, 'tests'

View File

@ -14,23 +14,25 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants, sys from . import constants
from latin1prober import Latin1Prober # windows-1252 import sys
from mbcsgroupprober import MBCSGroupProber # multi-byte character sets import codecs
from sbcsgroupprober import SBCSGroupProber # single-byte character sets from .latin1prober import Latin1Prober # windows-1252
from escprober import EscCharSetProber # ISO-2122, etc. from .mbcsgroupprober import MBCSGroupProber # multi-byte character sets
from .sbcsgroupprober import SBCSGroupProber # single-byte character sets
from .escprober import EscCharSetProber # ISO-2122, etc.
import re import re
MINIMUM_THRESHOLD = 0.20 MINIMUM_THRESHOLD = 0.20
@ -38,68 +40,78 @@ ePureAscii = 0
eEscAscii = 1 eEscAscii = 1
eHighbyte = 2 eHighbyte = 2
class UniversalDetector: class UniversalDetector:
def __init__(self): def __init__(self):
self._highBitDetector = re.compile(r'[\x80-\xFF]') self._highBitDetector = re.compile(b'[\x80-\xFF]')
self._escDetector = re.compile(r'(\033|~{)') self._escDetector = re.compile(b'(\033|~{)')
self._mEscCharSetProber = None self._mEscCharSetProber = None
self._mCharSetProbers = [] self._mCharSetProbers = []
self.reset() self.reset()
def reset(self): def reset(self):
self.result = {'encoding': None, 'confidence': 0.0} self.result = {'encoding': None, 'confidence': 0.0}
self.done = constants.False self.done = False
self._mStart = constants.True self._mStart = True
self._mGotData = constants.False self._mGotData = False
self._mInputState = ePureAscii self._mInputState = ePureAscii
self._mLastChar = '' self._mLastChar = b''
if self._mEscCharSetProber: if self._mEscCharSetProber:
self._mEscCharSetProber.reset() self._mEscCharSetProber.reset()
for prober in self._mCharSetProbers: for prober in self._mCharSetProbers:
prober.reset() prober.reset()
def feed(self, aBuf): def feed(self, aBuf):
if self.done: return if self.done:
return
aLen = len(aBuf) aLen = len(aBuf)
if not aLen: return if not aLen:
return
if not self._mGotData: if not self._mGotData:
# If the data starts with BOM, we know it is UTF # If the data starts with BOM, we know it is UTF
if aBuf[:3] == '\xEF\xBB\xBF': if aBuf[:3] == codecs.BOM_UTF8:
# EF BB BF UTF-8 with BOM # EF BB BF UTF-8 with BOM
self.result = {'encoding': "UTF-8", 'confidence': 1.0} self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0}
elif aBuf[:4] == '\xFF\xFE\x00\x00': elif aBuf[:4] == codecs.BOM_UTF32_LE:
# FF FE 00 00 UTF-32, little-endian BOM # FF FE 00 00 UTF-32, little-endian BOM
self.result = {'encoding': "UTF-32LE", 'confidence': 1.0} self.result = {'encoding': "UTF-32LE", 'confidence': 1.0}
elif aBuf[:4] == '\x00\x00\xFE\xFF': elif aBuf[:4] == codecs.BOM_UTF32_BE:
# 00 00 FE FF UTF-32, big-endian BOM # 00 00 FE FF UTF-32, big-endian BOM
self.result = {'encoding': "UTF-32BE", 'confidence': 1.0} self.result = {'encoding': "UTF-32BE", 'confidence': 1.0}
elif aBuf[:4] == '\xFE\xFF\x00\x00': elif aBuf[:4] == b'\xFE\xFF\x00\x00':
# FE FF 00 00 UCS-4, unusual octet order BOM (3412) # FE FF 00 00 UCS-4, unusual octet order BOM (3412)
self.result = {'encoding': "X-ISO-10646-UCS-4-3412", 'confidence': 1.0} self.result = {
elif aBuf[:4] == '\x00\x00\xFF\xFE': 'encoding': "X-ISO-10646-UCS-4-3412",
'confidence': 1.0
}
elif aBuf[:4] == b'\x00\x00\xFF\xFE':
# 00 00 FF FE UCS-4, unusual octet order BOM (2143) # 00 00 FF FE UCS-4, unusual octet order BOM (2143)
self.result = {'encoding': "X-ISO-10646-UCS-4-2143", 'confidence': 1.0} self.result = {
elif aBuf[:2] == '\xFF\xFE': 'encoding': "X-ISO-10646-UCS-4-2143",
'confidence': 1.0
}
elif aBuf[:2] == codecs.BOM_LE:
# FF FE UTF-16, little endian BOM # FF FE UTF-16, little endian BOM
self.result = {'encoding': "UTF-16LE", 'confidence': 1.0} self.result = {'encoding': "UTF-16LE", 'confidence': 1.0}
elif aBuf[:2] == '\xFE\xFF': elif aBuf[:2] == codecs.BOM_BE:
# FE FF UTF-16, big endian BOM # FE FF UTF-16, big endian BOM
self.result = {'encoding': "UTF-16BE", 'confidence': 1.0} self.result = {'encoding': "UTF-16BE", 'confidence': 1.0}
self._mGotData = constants.True self._mGotData = True
if self.result['encoding'] and (self.result['confidence'] > 0.0): if self.result['encoding'] and (self.result['confidence'] > 0.0):
self.done = constants.True self.done = True
return return
if self._mInputState == ePureAscii: if self._mInputState == ePureAscii:
if self._highBitDetector.search(aBuf): if self._highBitDetector.search(aBuf):
self._mInputState = eHighbyte self._mInputState = eHighbyte
elif (self._mInputState == ePureAscii) and self._escDetector.search(self._mLastChar + aBuf): elif ((self._mInputState == ePureAscii) and
self._escDetector.search(self._mLastChar + aBuf)):
self._mInputState = eEscAscii self._mInputState = eEscAscii
self._mLastChar = aBuf[-1] self._mLastChar = aBuf[-1:]
if self._mInputState == eEscAscii: if self._mInputState == eEscAscii:
if not self._mEscCharSetProber: if not self._mEscCharSetProber:
@ -107,24 +119,26 @@ class UniversalDetector:
if self._mEscCharSetProber.feed(aBuf) == constants.eFoundIt: if self._mEscCharSetProber.feed(aBuf) == constants.eFoundIt:
self.result = {'encoding': self._mEscCharSetProber.get_charset_name(), self.result = {'encoding': self._mEscCharSetProber.get_charset_name(),
'confidence': self._mEscCharSetProber.get_confidence()} 'confidence': self._mEscCharSetProber.get_confidence()}
self.done = constants.True self.done = True
elif self._mInputState == eHighbyte: elif self._mInputState == eHighbyte:
if not self._mCharSetProbers: if not self._mCharSetProbers:
self._mCharSetProbers = [MBCSGroupProber(), SBCSGroupProber(), Latin1Prober()] self._mCharSetProbers = [MBCSGroupProber(), SBCSGroupProber(),
Latin1Prober()]
for prober in self._mCharSetProbers: for prober in self._mCharSetProbers:
if prober.feed(aBuf) == constants.eFoundIt: if prober.feed(aBuf) == constants.eFoundIt:
self.result = {'encoding': prober.get_charset_name(), self.result = {'encoding': prober.get_charset_name(),
'confidence': prober.get_confidence()} 'confidence': prober.get_confidence()}
self.done = constants.True self.done = True
break break
def close(self): def close(self):
if self.done: return if self.done:
return
if not self._mGotData: if not self._mGotData:
if constants._debug: if constants._debug:
sys.stderr.write('no data received!\n') sys.stderr.write('no data received!\n')
return return
self.done = constants.True self.done = True
if self._mInputState == ePureAscii: if self._mInputState == ePureAscii:
self.result = {'encoding': 'ascii', 'confidence': 1.0} self.result = {'encoding': 'ascii', 'confidence': 1.0}
@ -135,7 +149,8 @@ class UniversalDetector:
maxProberConfidence = 0.0 maxProberConfidence = 0.0
maxProber = None maxProber = None
for prober in self._mCharSetProbers: for prober in self._mCharSetProbers:
if not prober: continue if not prober:
continue
proberConfidence = prober.get_confidence() proberConfidence = prober.get_confidence()
if proberConfidence > maxProberConfidence: if proberConfidence > maxProberConfidence:
maxProberConfidence = proberConfidence maxProberConfidence = proberConfidence
@ -148,7 +163,8 @@ class UniversalDetector:
if constants._debug: if constants._debug:
sys.stderr.write('no probers hit minimum threshhold\n') sys.stderr.write('no probers hit minimum threshhold\n')
for prober in self._mCharSetProbers[0].mProbers: for prober in self._mCharSetProbers[0].mProbers:
if not prober: continue if not prober:
sys.stderr.write('%s confidence = %s\n' % \ continue
(prober.get_charset_name(), \ sys.stderr.write('%s confidence = %s\n' %
(prober.get_charset_name(),
prober.get_confidence())) prober.get_confidence()))

View File

@ -13,26 +13,26 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants, sys from . import constants
from constants import eStart, eError, eItsMe from .charsetprober import CharSetProber
from charsetprober import CharSetProber from .codingstatemachine import CodingStateMachine
from codingstatemachine import CodingStateMachine from .mbcssm import UTF8SMModel
from mbcssm import UTF8SMModel
ONE_CHAR_PROB = 0.5 ONE_CHAR_PROB = 0.5
class UTF8Prober(CharSetProber): class UTF8Prober(CharSetProber):
def __init__(self): def __init__(self):
CharSetProber.__init__(self) CharSetProber.__init__(self)
@ -50,13 +50,13 @@ class UTF8Prober(CharSetProber):
def feed(self, aBuf): def feed(self, aBuf):
for c in aBuf: for c in aBuf:
codingState = self._mCodingSM.next_state(c) codingState = self._mCodingSM.next_state(c)
if codingState == eError: if codingState == constants.eError:
self._mState = constants.eNotMe self._mState = constants.eNotMe
break break
elif codingState == eItsMe: elif codingState == constants.eItsMe:
self._mState = constants.eFoundIt self._mState = constants.eFoundIt
break break
elif codingState == eStart: elif codingState == constants.eStart:
if self._mCodingSM.get_current_charlen() >= 2: if self._mCodingSM.get_current_charlen() >= 2:
self._mNumOfMBChar += 1 self._mNumOfMBChar += 1

View File

@ -73,7 +73,7 @@ class MultipartPostHandler(urllib2.BaseHandler):
request.add_data(data) request.add_data(data)
return request return request
def multipart_encode(vars, files, boundary = None, buf = None): def multipart_encode(vars, files, boundary=None, buf=None):
if boundary is None: if boundary is None:
boundary = mimetools.choose_boundary() boundary = mimetools.choose_boundary()
@ -100,7 +100,7 @@ class MultipartPostHandler(urllib2.BaseHandler):
# buf += 'Content-Length: %s\r\n' % file_size # buf += 'Content-Length: %s\r\n' % file_size
fd.seek(0) fd.seek(0)
buf = str(buf) buf = str(buf) if not isinstance(buf, unicode) else buf.encode("utf8")
buf += '\r\n%s\r\n' % fd.read() buf += '\r\n%s\r\n' % fd.read()
buf += '--%s--\r\n\r\n' % boundary buf += '--%s--\r\n\r\n' % boundary

View File

@ -2596,3 +2596,7 @@ tmp_lahir
universitas universitas
urut urut
waktu waktu
# WebGoat
cookie
login_count

View File

@ -3366,3 +3366,6 @@ tuser
tusers tusers
userstbl userstbl
usertbl usertbl
# WebGoat
user_data

View File

@ -104,6 +104,8 @@
<!-- HSQLDB --> <!-- HSQLDB -->
<dbms value="HSQLDB"> <dbms value="HSQLDB">
<error regexp="org\.hsqldb\.jdbc"/> <error regexp="org\.hsqldb\.jdbc"/>
<error regexp="Unexpected end of command in statement \["/>
<error regexp="Unexpected token.*in statement \["/>
</dbms> </dbms>
</root> </root>

View File

@ -651,8 +651,8 @@
<cast query="CAST(%s AS LONGVARCHAR)"/> <cast query="CAST(%s AS LONGVARCHAR)"/>
<length query="CHAR_LENGTH(%s)"/> <length query="CHAR_LENGTH(%s)"/>
<isnull query="IFNULL(%s,' ')"/> <isnull query="IFNULL(%s,' ')"/>
<delimiter query=","/> <delimiter query="||"/>
<limit query="LIMIT %d %d"/> <limit query="LIMIT %d %d" query2="LIMIT %d OFFSET %d"/>
<limitregexp query="\s+LIMIT\s+([\d]+)\s*\,\s*([\d]+)" query2="\s+LIMIT\s+([\d]+)"/> <limitregexp query="\s+LIMIT\s+([\d]+)\s*\,\s*([\d]+)" query2="\s+LIMIT\s+([\d]+)"/>
<limitgroupstart query="1"/> <limitgroupstart query="1"/>
<limitgroupstop query="2"/> <limitgroupstop query="2"/>
@ -675,30 +675,30 @@
<check_udf/> <check_udf/>
<users> <users>
<!-- LIMIT is needed at start for v1.7 this gets mangled unless no-cast is used --> <!-- LIMIT is needed at start for v1.7 this gets mangled unless no-cast is used -->
<blind query="SELECT LIMIT %d 1 DISTINCT(user) FROM INFORMATION_SCHEMA.SYSTEM_USERS" count="SELECT COUNT(DISTINCT(user)) FROM INFORMATION_SCHEMA.SYSTEM_USERS"/> <blind query="SELECT LIMIT %d 1 DISTINCT(user) FROM INFORMATION_SCHEMA.SYSTEM_USERS ORDER BY user" count="SELECT COUNT(DISTINCT(user)) FROM INFORMATION_SCHEMA.SYSTEM_USERS"/>
<inband query="SELECT user FROM INFORMATION_SCHEMA.SYSTEM_USERS"/> <inband query="SELECT user FROM INFORMATION_SCHEMA.SYSTEM_USERS ORDER BY user"/>
</users> </users>
<passwords> <passwords>
<!-- Passwords only shown in later versions &gt;=2.0 --> <!-- Passwords only shown in later versions &gt;=2.0 -->
<blind query="SELECT LIMIT %d 1 DISTINCT(password_digest) FROM INFORMATION_SCHEMA.SYSTEM_USERS WHERE user_name='%s'" count="SELECT COUNT(DISTINCT(password_digest)) FROM INFORMATION_SCHEMA.SYSTEM_USERS WHERE user_name='%s'"/> <blind query="SELECT LIMIT %d 1 DISTINCT(password_digest) FROM INFORMATION_SCHEMA.SYSTEM_USERS WHERE user_name='%s' ORDER BY password_digest" count="SELECT COUNT(DISTINCT(password_digest)) FROM INFORMATION_SCHEMA.SYSTEM_USERS WHERE user_name='%s'"/>
<inband query="SELECT user_name,password_digest FROM INFORMATION_SCHEMA.SYSTEM_USERS" condition="user_name"/> <inband query="SELECT user_name,password_digest FROM INFORMATION_SCHEMA.SYSTEM_USERS ORDER BY user_name" condition="user_name"/>
</passwords> </passwords>
<privileges/> <privileges/>
<roles/> <roles/>
<dbs> <dbs>
<blind query="SELECT LIMIT %d 1 DISTINCT(table_schem) FROM INFORMATION_SCHEMA.SYSTEM_SCHEMAS" count="SELECT COUNT(table_schem) FROM INFORMATION_SCHEMA.SYSTEM_SCHEMAS"/> <blind query="SELECT LIMIT %d 1 DISTINCT(table_schem) FROM INFORMATION_SCHEMA.SYSTEM_SCHEMAS ORDER BY table_schem" count="SELECT COUNT(table_schem) FROM INFORMATION_SCHEMA.SYSTEM_SCHEMAS"/>
<inband query="SELECT table_schem FROM INFORMATION_SCHEMA.SYSTEM_SCHEMAS" /> <inband query="SELECT table_schem FROM INFORMATION_SCHEMA.SYSTEM_SCHEMAS ORDER BY table_schem" />
</dbs> </dbs>
<tables> <tables>
<blind query="SELECT LIMIT %d 1 table_name FROM INFORMATION_SCHEMA.SYSTEM_TABLES WHERE table_schem='%s' " count="SELECT COUNT(table_name) FROM INFORMATION_SCHEMA.SYSTEM_TABLES WHERE table_schem='%s'"/> <blind query="SELECT LIMIT %d 1 table_name FROM INFORMATION_SCHEMA.SYSTEM_TABLES WHERE table_schem='%s' ORDER BY table_name" count="SELECT COUNT(table_name) FROM INFORMATION_SCHEMA.SYSTEM_TABLES WHERE table_schem='%s'"/>
<inband query="SELECT table_schem,table_name FROM INFORMATION_SCHEMA.SYSTEM_TABLES" condition="table_schem"/> <inband query="SELECT table_schem,table_name FROM INFORMATION_SCHEMA.SYSTEM_TABLES ORDER BY table_schem" condition="table_schem"/>
</tables> </tables>
<columns> <columns>
<blind query="SELECT column_name FROM INFORMATION_SCHEMA.SYSTEM_COLUMNS WHERE table_name='%s' AND table_schem='%s'" query2="SELECT column_type FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name='%s' AND column_name='%s' AND table_schema='%s'" count="SELECT COUNT(column_name) FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name='%s' AND table_schema='%s'" condition="column_name"/> <blind query="SELECT column_name FROM INFORMATION_SCHEMA.SYSTEM_COLUMNS WHERE table_name='%s' AND table_schem='%s' ORDER BY column_name" query2="SELECT column_type FROM INFORMATION_SCHEMA.SYSTEM_COLUMNS WHERE table_name='%s' AND column_name='%s' AND table_schem='%s'" count="SELECT COUNT(column_name) FROM INFORMATION_SCHEMA.SYSTEM_COLUMNS WHERE table_name='%s' AND table_schem='%s'" condition="column_name"/>
<inband query="SELECT column_name,type_name FROM INFORMATION_SCHEMA.SYSTEM_COLUMNS WHERE table_name='%s' AND table_schem='%s'" condition="column_name"/> <inband query="SELECT column_name,type_name FROM INFORMATION_SCHEMA.SYSTEM_COLUMNS WHERE table_name='%s' AND table_schem='%s' ORDER BY column_name" condition="column_name"/>
</columns> </columns>
<dump_table> <dump_table>
<blind query="SELECT LIMIT %d 1 %s FROM %s.%s ORDER BY %s " count="SELECT COUNT(*) FROM %s.%s"/> <blind query="SELECT %s FROM %s.%s ORDER BY %s LIMIT 1 OFFSET %d" count="SELECT COUNT(*) FROM %s.%s"/>
<inband query="SELECT %s FROM %s.%s ORDER BY %s"/> <inband query="SELECT %s FROM %s.%s ORDER BY %s"/>
</dump_table> </dump_table>
<search_db> <search_db>