mirror of
https://github.com/sqlmapproject/sqlmap.git
synced 2025-07-29 17:39:56 +03:00
Merge remote-tracking branch 'sqlmapproject/master'
This commit is contained in:
commit
2914574b9b
|
@ -173,6 +173,9 @@ Ivan Giacomelli, <truemilk(at)insiberia.net>
|
|||
* for suggesting a minor enhancement
|
||||
* for reviewing the documentation
|
||||
|
||||
Dimitris Giannitsaros, <daremon(at)gmail.com>
|
||||
* for contributing a REST-JSON API client
|
||||
|
||||
Nico Golde, <nico(at)ngolde.de>
|
||||
* for reporting a couple of bugs
|
||||
|
||||
|
|
|
@ -76,7 +76,7 @@ def main(src, dst):
|
|||
# Instantiate an IP packets decoder
|
||||
decoder = ImpactDecoder.IPDecoder()
|
||||
|
||||
while 1:
|
||||
while True:
|
||||
cmd = ''
|
||||
|
||||
# Wait for incoming replies
|
||||
|
|
|
@ -22,6 +22,7 @@ from lib.core.common import findDynamicContent
|
|||
from lib.core.common import Format
|
||||
from lib.core.common import getLastRequestHTTPError
|
||||
from lib.core.common import getPublicTypeMembers
|
||||
from lib.core.common import getSafeExString
|
||||
from lib.core.common import getSortedInjectionTests
|
||||
from lib.core.common import getUnicode
|
||||
from lib.core.common import intersect
|
||||
|
@ -38,6 +39,7 @@ from lib.core.common import singleTimeWarnMessage
|
|||
from lib.core.common import urlencode
|
||||
from lib.core.common import wasLastResponseDBMSError
|
||||
from lib.core.common import wasLastResponseHTTPError
|
||||
from lib.core.defaults import defaults
|
||||
from lib.core.data import conf
|
||||
from lib.core.data import kb
|
||||
from lib.core.data import logger
|
||||
|
@ -67,6 +69,7 @@ from lib.core.settings import URI_HTTP_HEADER
|
|||
from lib.core.settings import UPPER_RATIO_BOUND
|
||||
from lib.core.settings import IDS_WAF_CHECK_PAYLOAD
|
||||
from lib.core.settings import IDS_WAF_CHECK_RATIO
|
||||
from lib.core.settings import IDS_WAF_CHECK_TIMEOUT
|
||||
from lib.core.threads import getCurrentThreadData
|
||||
from lib.request.connect import Connect as Request
|
||||
from lib.request.inject import checkBooleanExpression
|
||||
|
@ -204,6 +207,16 @@ def checkSqlInjection(place, parameter, value):
|
|||
logger.debug(debugMsg)
|
||||
continue
|
||||
|
||||
# Skip tests if title, vector or DBMS is included by the
|
||||
# given skip filter
|
||||
if conf.testSkip and any(conf.testSkip in str(item) or \
|
||||
re.search(conf.testSkip, str(item), re.I) for item in \
|
||||
(test.title, test.vector, payloadDbms)):
|
||||
debugMsg = "skipping test '%s' because its " % title
|
||||
debugMsg += "name/vector/DBMS is included by the given skip filter"
|
||||
logger.debug(debugMsg)
|
||||
continue
|
||||
|
||||
if payloadDbms is not None:
|
||||
# Skip DBMS-specific test if it does not match the user's
|
||||
# provided DBMS
|
||||
|
@ -1139,12 +1152,12 @@ def checkWaf():
|
|||
Reference: http://seclists.org/nmap-dev/2011/q2/att-1005/http-waf-detect.nse
|
||||
"""
|
||||
|
||||
if any((conf.string, conf.notString, conf.regexp, conf.dummy, conf.offline)):
|
||||
if any((conf.string, conf.notString, conf.regexp, conf.dummy, conf.offline, conf.skipWaf)):
|
||||
return None
|
||||
|
||||
dbmMsg = "heuristically checking if the target is protected by "
|
||||
dbmMsg += "some kind of WAF/IPS/IDS"
|
||||
logger.debug(dbmMsg)
|
||||
infoMsg = "checking if the target is protected by "
|
||||
infoMsg += "some kind of WAF/IPS/IDS"
|
||||
logger.info(infoMsg)
|
||||
|
||||
retVal = False
|
||||
payload = "%d %s" % (randomInt(), IDS_WAF_CHECK_PAYLOAD)
|
||||
|
@ -1152,12 +1165,16 @@ def checkWaf():
|
|||
value = "" if not conf.parameters.get(PLACE.GET) else conf.parameters[PLACE.GET] + DEFAULT_GET_POST_DELIMITER
|
||||
value += agent.addPayloadDelimiters("%s=%s" % (randomStr(), payload))
|
||||
|
||||
pushValue(conf.timeout)
|
||||
conf.timeout = IDS_WAF_CHECK_TIMEOUT
|
||||
|
||||
try:
|
||||
retVal = Request.queryPage(place=PLACE.GET, value=value, getRatioValue=True, noteResponseTime=False, silent=True)[1] < IDS_WAF_CHECK_RATIO
|
||||
except SqlmapConnectionException:
|
||||
retVal = True
|
||||
finally:
|
||||
kb.matchRatio = None
|
||||
conf.timeout = popValue()
|
||||
|
||||
if retVal:
|
||||
warnMsg = "heuristics detected that the target "
|
||||
|
@ -1172,6 +1189,10 @@ def checkWaf():
|
|||
if output and output[0] in ("Y", "y"):
|
||||
conf.identifyWaf = True
|
||||
|
||||
if conf.timeout == defaults.timeout:
|
||||
logger.warning("dropping timeout to %d seconds (i.e. '--timeout=%d')" % (IDS_WAF_CHECK_TIMEOUT, IDS_WAF_CHECK_TIMEOUT))
|
||||
conf.timeout = IDS_WAF_CHECK_TIMEOUT
|
||||
|
||||
return retVal
|
||||
|
||||
def identifyWaf():
|
||||
|
@ -1278,8 +1299,8 @@ def checkNullConnection():
|
|||
infoMsg = "NULL connection is supported with 'skip-read' method"
|
||||
logger.info(infoMsg)
|
||||
|
||||
except SqlmapConnectionException, errMsg:
|
||||
errMsg = getUnicode(errMsg)
|
||||
except SqlmapConnectionException, ex:
|
||||
errMsg = getSafeExString(ex)
|
||||
raise SqlmapConnectionException(errMsg)
|
||||
|
||||
finally:
|
||||
|
@ -1298,7 +1319,7 @@ def checkConnection(suppressOutput=False):
|
|||
raise SqlmapConnectionException(errMsg)
|
||||
except socket.error, ex:
|
||||
errMsg = "problem occurred while "
|
||||
errMsg += "resolving a host name '%s' ('%s')" % (conf.hostname, ex.message)
|
||||
errMsg += "resolving a host name '%s' ('%s')" % (conf.hostname, getSafeExString(ex))
|
||||
raise SqlmapConnectionException(errMsg)
|
||||
|
||||
if not suppressOutput and not conf.dummy and not conf.offline:
|
||||
|
@ -1326,7 +1347,7 @@ def checkConnection(suppressOutput=False):
|
|||
else:
|
||||
kb.errorIsNone = True
|
||||
|
||||
except SqlmapConnectionException, errMsg:
|
||||
except SqlmapConnectionException, ex:
|
||||
if conf.ipv6:
|
||||
warnMsg = "check connection to a provided "
|
||||
warnMsg += "IPv6 address with a tool like ping6 "
|
||||
|
@ -1336,7 +1357,7 @@ def checkConnection(suppressOutput=False):
|
|||
singleTimeWarnMessage(warnMsg)
|
||||
|
||||
if any(code in kb.httpErrorCodes for code in (httplib.NOT_FOUND, )):
|
||||
errMsg = getUnicode(errMsg)
|
||||
errMsg = getSafeExString(ex)
|
||||
logger.critical(errMsg)
|
||||
|
||||
if conf.multipleTargets:
|
||||
|
|
|
@ -24,7 +24,7 @@ from lib.core.common import dataToStdout
|
|||
from lib.core.common import extractRegexResult
|
||||
from lib.core.common import getFilteredPageContent
|
||||
from lib.core.common import getPublicTypeMembers
|
||||
from lib.core.common import getUnicode
|
||||
from lib.core.common import getSafeExString
|
||||
from lib.core.common import hashDBRetrieve
|
||||
from lib.core.common import hashDBWrite
|
||||
from lib.core.common import intersect
|
||||
|
@ -421,6 +421,7 @@ def start():
|
|||
skip |= (place == PLACE.USER_AGENT and intersect(USER_AGENT_ALIASES, conf.skip, True) not in ([], None))
|
||||
skip |= (place == PLACE.REFERER and intersect(REFERER_ALIASES, conf.skip, True) not in ([], None))
|
||||
skip |= (place == PLACE.COOKIE and intersect(PLACE.COOKIE, conf.skip, True) not in ([], None))
|
||||
skip |= (place == PLACE.HOST and intersect(PLACE.HOST, conf.skip, True) not in ([], None))
|
||||
|
||||
skip &= not (place == PLACE.USER_AGENT and intersect(USER_AGENT_ALIASES, conf.testParameter, True))
|
||||
skip &= not (place == PLACE.REFERER and intersect(REFERER_ALIASES, conf.testParameter, True))
|
||||
|
@ -648,7 +649,7 @@ def start():
|
|||
raise
|
||||
|
||||
except SqlmapBaseException, ex:
|
||||
errMsg = getUnicode(ex.message)
|
||||
errMsg = getSafeExString(ex)
|
||||
|
||||
if conf.multipleTargets:
|
||||
errMsg += ", skipping to the next %s" % ("form" if conf.forms else "URL")
|
||||
|
|
|
@ -187,12 +187,12 @@ class Agent(object):
|
|||
|
||||
if origValue:
|
||||
regex = r"(\A|\b)%s=%s%s" % (re.escape(parameter), re.escape(origValue), r"(\Z|\b)" if origValue[-1].isalnum() else "")
|
||||
retVal = _(regex, "%s=%s" % (parameter, self.addPayloadDelimiters(newValue.replace("\\", "\\\\"))), paramString)
|
||||
retVal = _(regex, "%s=%s" % (parameter, self.addPayloadDelimiters(newValue)), paramString)
|
||||
else:
|
||||
retVal = _(r"(\A|\b)%s=%s(\Z|%s|%s|\s)" % (re.escape(parameter), re.escape(origValue), DEFAULT_GET_POST_DELIMITER, DEFAULT_COOKIE_DELIMITER), "%s=%s\g<2>" % (parameter, self.addPayloadDelimiters(newValue.replace("\\", "\\\\"))), paramString)
|
||||
retVal = _(r"(\A|\b)%s=%s(\Z|%s|%s|\s)" % (re.escape(parameter), re.escape(origValue), DEFAULT_GET_POST_DELIMITER, DEFAULT_COOKIE_DELIMITER), "%s=%s\g<2>" % (parameter, self.addPayloadDelimiters(newValue)), paramString)
|
||||
|
||||
if retVal == paramString and urlencode(parameter) != parameter:
|
||||
retVal = _(r"(\A|\b)%s=%s" % (re.escape(urlencode(parameter)), re.escape(origValue)), "%s=%s" % (urlencode(parameter), self.addPayloadDelimiters(newValue.replace("\\", "\\\\"))), paramString)
|
||||
retVal = _(r"(\A|\b)%s=%s" % (re.escape(urlencode(parameter)), re.escape(origValue)), "%s=%s" % (urlencode(parameter), self.addPayloadDelimiters(newValue)), paramString)
|
||||
|
||||
if retVal:
|
||||
retVal = retVal.replace(BOUNDARY_BACKSLASH_MARKER, '\\')
|
||||
|
@ -308,8 +308,8 @@ class Agent(object):
|
|||
for _ in set(re.findall(r"\[RANDSTR(?:\d+)?\]", payload, re.I)):
|
||||
payload = payload.replace(_, randomStr())
|
||||
|
||||
if origValue is not None:
|
||||
payload = payload.replace("[ORIGVALUE]", origValue if origValue.isdigit() else unescaper.escape("'%s'" % origValue))
|
||||
if origValue is not None and "[ORIGVALUE]" in payload:
|
||||
payload = getUnicode(payload).replace("[ORIGVALUE]", origValue if origValue.isdigit() else unescaper.escape("'%s'" % origValue))
|
||||
|
||||
if "[INFERENCE]" in payload:
|
||||
if Backend.getIdentifiedDbms() is not None:
|
||||
|
@ -480,7 +480,7 @@ class Agent(object):
|
|||
@rtype: C{str}
|
||||
"""
|
||||
|
||||
prefixRegex = r"(?:\s+(?:FIRST|SKIP)\s+\d+)*"
|
||||
prefixRegex = r"(?:\s+(?:FIRST|SKIP|LIMIT \d+)\s+\d+)*"
|
||||
fieldsSelectTop = re.search(r"\ASELECT\s+TOP\s+[\d]+\s+(.+?)\s+FROM", query, re.I)
|
||||
fieldsSelectRownum = re.search(r"\ASELECT\s+([^()]+?),\s*ROWNUM AS LIMIT FROM", query, re.I)
|
||||
fieldsSelectDistinct = re.search(r"\ASELECT%s\s+DISTINCT\((.+?)\)\s+FROM" % prefixRegex, query, re.I)
|
||||
|
@ -501,13 +501,17 @@ class Agent(object):
|
|||
elif fieldsMinMaxstr:
|
||||
fieldsToCastStr = fieldsMinMaxstr.groups()[0]
|
||||
elif fieldsExists:
|
||||
fieldsToCastStr = fieldsSelect.groups()[0]
|
||||
if fieldsSelect:
|
||||
fieldsToCastStr = fieldsSelect.groups()[0]
|
||||
elif fieldsSelectTop:
|
||||
fieldsToCastStr = fieldsSelectTop.groups()[0]
|
||||
elif fieldsSelectRownum:
|
||||
fieldsToCastStr = fieldsSelectRownum.groups()[0]
|
||||
elif fieldsSelectDistinct:
|
||||
fieldsToCastStr = fieldsSelectDistinct.groups()[0]
|
||||
if Backend.getDbms() in (DBMS.HSQLDB,):
|
||||
fieldsToCastStr = fieldsNoSelect
|
||||
else:
|
||||
fieldsToCastStr = fieldsSelectDistinct.groups()[0]
|
||||
elif fieldsSelectCase:
|
||||
fieldsToCastStr = fieldsSelectCase.groups()[0]
|
||||
elif fieldsSelectFrom:
|
||||
|
@ -584,7 +588,7 @@ class Agent(object):
|
|||
else:
|
||||
return query
|
||||
|
||||
if Backend.getIdentifiedDbms() in (DBMS.MYSQL,):
|
||||
if Backend.isDbms(DBMS.MYSQL):
|
||||
if fieldsExists:
|
||||
concatenatedQuery = concatenatedQuery.replace("SELECT ", "CONCAT('%s'," % kb.chars.start, 1)
|
||||
concatenatedQuery += ",'%s')" % kb.chars.stop
|
||||
|
@ -611,6 +615,7 @@ class Agent(object):
|
|||
concatenatedQuery = concatenatedQuery.replace("SELECT ", "'%s'||" % kb.chars.start, 1)
|
||||
_ = unArrayizeValue(zeroDepthSearch(concatenatedQuery, " FROM "))
|
||||
concatenatedQuery = "%s||'%s'%s" % (concatenatedQuery[:_], kb.chars.stop, concatenatedQuery[_:])
|
||||
concatenatedQuery = re.sub(r"('%s'\|\|)(.+)(%s)" % (kb.chars.start, re.escape(castedFields)), "\g<2>\g<1>\g<3>", concatenatedQuery)
|
||||
elif fieldsSelect:
|
||||
concatenatedQuery = concatenatedQuery.replace("SELECT ", "'%s'||" % kb.chars.start, 1)
|
||||
concatenatedQuery += "||'%s'" % kb.chars.stop
|
||||
|
@ -881,12 +886,30 @@ class Agent(object):
|
|||
fromIndex = limitedQuery.index(" FROM ")
|
||||
untilFrom = limitedQuery[:fromIndex]
|
||||
fromFrom = limitedQuery[fromIndex + 1:]
|
||||
orderBy = False
|
||||
orderBy = None
|
||||
|
||||
if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL, DBMS.SQLITE):
|
||||
limitStr = queries[Backend.getIdentifiedDbms()].limit.query % (num, 1)
|
||||
limitedQuery += " %s" % limitStr
|
||||
|
||||
elif Backend.isDbms(DBMS.HSQLDB):
|
||||
match = re.search(r"ORDER BY [^ ]+", limitedQuery)
|
||||
if match:
|
||||
limitedQuery = re.sub(r"\s*%s\s*" % match.group(0), " ", limitedQuery).strip()
|
||||
limitedQuery += " %s" % match.group(0)
|
||||
|
||||
if query.startswith("SELECT "):
|
||||
limitStr = queries[Backend.getIdentifiedDbms()].limit.query % (num, 1)
|
||||
limitedQuery = limitedQuery.replace("SELECT ", "SELECT %s " % limitStr, 1)
|
||||
else:
|
||||
limitStr = queries[Backend.getIdentifiedDbms()].limit.query2 % (1, num)
|
||||
limitedQuery += " %s" % limitStr
|
||||
|
||||
if not match:
|
||||
match = re.search(r"%s\s+(\w+)" % re.escape(limitStr), limitedQuery)
|
||||
if match:
|
||||
orderBy = " ORDER BY %s" % match.group(1)
|
||||
|
||||
elif Backend.isDbms(DBMS.FIREBIRD):
|
||||
limitStr = queries[Backend.getIdentifiedDbms()].limit.query % (num + 1, num + 1)
|
||||
limitedQuery += " %s" % limitStr
|
||||
|
|
|
@ -79,7 +79,7 @@ class BigArray(list):
|
|||
self.chunks[-1] = pickle.load(fp)
|
||||
except IOError, ex:
|
||||
errMsg = "exception occurred while retrieving data "
|
||||
errMsg += "from a temporary file ('%s')" % ex
|
||||
errMsg += "from a temporary file ('%s')" % ex.message
|
||||
raise SqlmapSystemException, errMsg
|
||||
return self.chunks[-1].pop()
|
||||
|
||||
|
@ -99,7 +99,7 @@ class BigArray(list):
|
|||
return filename
|
||||
except (OSError, IOError), ex:
|
||||
errMsg = "exception occurred while storing data "
|
||||
errMsg += "to a temporary file ('%s'). Please " % ex
|
||||
errMsg += "to a temporary file ('%s'). Please " % ex.message
|
||||
errMsg += "make sure that there is enough disk space left. If problem persists, "
|
||||
errMsg += "try to set environment variable 'TEMP' to a location "
|
||||
errMsg += "writeable by the current user"
|
||||
|
@ -115,7 +115,7 @@ class BigArray(list):
|
|||
self.cache = Cache(index, pickle.load(fp), False)
|
||||
except IOError, ex:
|
||||
errMsg = "exception occurred while retrieving data "
|
||||
errMsg += "from a temporary file ('%s')" % ex
|
||||
errMsg += "from a temporary file ('%s')" % ex.message
|
||||
raise SqlmapSystemException, errMsg
|
||||
|
||||
def __getstate__(self):
|
||||
|
|
|
@ -879,7 +879,7 @@ def dataToOutFile(filename, data):
|
|||
f.write(data)
|
||||
except IOError, ex:
|
||||
errMsg = "something went wrong while trying to write "
|
||||
errMsg += "to the output file ('%s')" % ex.message
|
||||
errMsg += "to the output file ('%s')" % getSafeExString(ex)
|
||||
raise SqlmapGenericException(errMsg)
|
||||
|
||||
return retVal
|
||||
|
@ -909,14 +909,15 @@ def readInput(message, default=None, checkBatch=True):
|
|||
answer = item.split('=')[1] if len(item.split('=')) > 1 else None
|
||||
if answer and question.lower() in message.lower():
|
||||
retVal = getUnicode(answer, UNICODE_ENCODING)
|
||||
elif answer is None and retVal:
|
||||
retVal = "%s,%s" % (retVal, getUnicode(item, UNICODE_ENCODING))
|
||||
|
||||
infoMsg = "%s%s" % (message, retVal)
|
||||
logger.info(infoMsg)
|
||||
if retVal:
|
||||
infoMsg = "%s%s" % (message, retVal)
|
||||
logger.info(infoMsg)
|
||||
|
||||
debugMsg = "used the given answer"
|
||||
logger.debug(debugMsg)
|
||||
|
||||
break
|
||||
debugMsg = "used the given answer"
|
||||
logger.debug(debugMsg)
|
||||
|
||||
if retVal is None:
|
||||
if checkBatch and conf.get("batch"):
|
||||
|
@ -1369,7 +1370,7 @@ def expandAsteriskForColumns(expression):
|
|||
|
||||
return expression
|
||||
|
||||
def getLimitRange(count, dump=False, plusOne=False):
|
||||
def getLimitRange(count, plusOne=False):
|
||||
"""
|
||||
Returns range of values used in limit/offset constructs
|
||||
|
||||
|
@ -1381,12 +1382,11 @@ def getLimitRange(count, dump=False, plusOne=False):
|
|||
count = int(count)
|
||||
limitStart, limitStop = 1, count
|
||||
|
||||
if dump:
|
||||
if isinstance(conf.limitStop, int) and conf.limitStop > 0 and conf.limitStop < limitStop:
|
||||
limitStop = conf.limitStop
|
||||
if isinstance(conf.limitStop, int) and conf.limitStop > 0 and conf.limitStop < limitStop:
|
||||
limitStop = conf.limitStop
|
||||
|
||||
if isinstance(conf.limitStart, int) and conf.limitStart > 0 and conf.limitStart <= limitStop:
|
||||
limitStart = conf.limitStart
|
||||
if isinstance(conf.limitStart, int) and conf.limitStart > 0 and conf.limitStart <= limitStop:
|
||||
limitStart = conf.limitStart
|
||||
|
||||
retVal = xrange(limitStart, limitStop + 1) if plusOne else xrange(limitStart - 1, limitStop)
|
||||
|
||||
|
@ -1622,6 +1622,15 @@ def safeStringFormat(format_, params):
|
|||
index = retVal.find("%s", start)
|
||||
retVal = retVal[:index] + getUnicode(param) + retVal[index + 2:]
|
||||
else:
|
||||
if any('%s' in _ for _ in conf.parameters.values()):
|
||||
parts = format_.split(' ')
|
||||
for i in xrange(len(parts)):
|
||||
if PAYLOAD_DELIMITER in parts[i]:
|
||||
parts[i] = parts[i].replace(PAYLOAD_DELIMITER, "")
|
||||
parts[i] = "%s%s" % (parts[i], PAYLOAD_DELIMITER)
|
||||
break
|
||||
format_ = ' '.join(parts)
|
||||
|
||||
count = 0
|
||||
while True:
|
||||
match = re.search(r"(\A|[^A-Za-z0-9])(%s)([^A-Za-z0-9]|\Z)", retVal)
|
||||
|
@ -1866,8 +1875,13 @@ def readCachedFileContent(filename, mode='rb'):
|
|||
with kb.locks.cache:
|
||||
if filename not in kb.cache.content:
|
||||
checkFile(filename)
|
||||
with openFile(filename, mode) as f:
|
||||
kb.cache.content[filename] = f.read()
|
||||
try:
|
||||
with openFile(filename, mode) as f:
|
||||
kb.cache.content[filename] = f.read()
|
||||
except (IOError, OSError, MemoryError), ex:
|
||||
errMsg = "something went wrong while trying "
|
||||
errMsg += "to read the content of file '%s' ('%s')" % (filename, ex)
|
||||
raise SqlmapSystemException(errMsg)
|
||||
|
||||
return kb.cache.content[filename]
|
||||
|
||||
|
@ -2489,7 +2503,10 @@ def extractTextTagContent(page):
|
|||
page = page or ""
|
||||
|
||||
if REFLECTED_VALUE_MARKER in page:
|
||||
page = re.sub(r"(?si)[^\s>]*%s[^\s<]*" % REFLECTED_VALUE_MARKER, "", page)
|
||||
try:
|
||||
page = re.sub(r"(?i)[^\s>]*%s[^\s<]*" % REFLECTED_VALUE_MARKER, "", page)
|
||||
except MemoryError:
|
||||
page = page.replace(REFLECTED_VALUE_MARKER, "")
|
||||
|
||||
return filter(None, (_.group('result').strip() for _ in re.finditer(TEXT_TAG_REGEX, page)))
|
||||
|
||||
|
@ -2681,7 +2698,7 @@ def parseSqliteTableSchema(value):
|
|||
table = {}
|
||||
columns = {}
|
||||
|
||||
for match in re.finditer(r"(\w+)\s+(INT|INTEGER|TINYINT|SMALLINT|MEDIUMINT|BIGINT|UNSIGNED BIG INT|INT2|INT8|INTEGER|CHARACTER|VARCHAR|VARYING CHARACTER|NCHAR|NATIVE CHARACTER|NVARCHAR|TEXT|CLOB|TEXT|BLOB|NONE|REAL|DOUBLE|DOUBLE PRECISION|FLOAT|REAL|NUMERIC|DECIMAL|BOOLEAN|DATE|DATETIME|NUMERIC)\b", value, re.I):
|
||||
for match in re.finditer(r"(\w+)[\"'`]?\s+(INT|INTEGER|TINYINT|SMALLINT|MEDIUMINT|BIGINT|UNSIGNED BIG INT|INT2|INT8|INTEGER|CHARACTER|VARCHAR|VARYING CHARACTER|NCHAR|NATIVE CHARACTER|NVARCHAR|TEXT|CLOB|TEXT|BLOB|NONE|REAL|DOUBLE|DOUBLE PRECISION|FLOAT|REAL|NUMERIC|DECIMAL|BOOLEAN|DATE|DATETIME|NUMERIC)\b", value, re.I):
|
||||
columns[match.group(1)] = match.group(2)
|
||||
|
||||
table[conf.tbl] = columns
|
||||
|
@ -2800,7 +2817,13 @@ def unArrayizeValue(value):
|
|||
"""
|
||||
|
||||
if isListLike(value):
|
||||
value = value[0] if len(value) > 0 else None
|
||||
if not value:
|
||||
value = None
|
||||
elif len(value) == 1 and not isListLike(value[0]):
|
||||
value = value[0]
|
||||
else:
|
||||
_ = filter(lambda _: _ is not None, (_ for _ in flattenValue(value)))
|
||||
value = _[0] if len(_) > 0 else None
|
||||
|
||||
return value
|
||||
|
||||
|
@ -3008,7 +3031,7 @@ def createGithubIssue(errMsg, excMsg):
|
|||
else:
|
||||
warnMsg = "something went wrong while creating a Github issue"
|
||||
if ex:
|
||||
warnMsg += " ('%s')" % ex
|
||||
warnMsg += " ('%s')" % getSafeExString(ex)
|
||||
if "Unauthorized" in warnMsg:
|
||||
warnMsg += ". Please update to the latest revision"
|
||||
logger.warn(warnMsg)
|
||||
|
@ -3020,7 +3043,7 @@ def maskSensitiveData(msg):
|
|||
|
||||
retVal = getUnicode(msg)
|
||||
|
||||
for item in filter(None, map(lambda x: conf.get(x), ("hostname", "googleDork", "authCred", "proxyCred", "tbl", "db", "col", "user", "cookie", "proxy", "rFile", "wFile", "dFile"))):
|
||||
for item in filter(None, map(lambda x: conf.get(x), ("hostname", "data", "googleDork", "authCred", "proxyCred", "tbl", "db", "col", "user", "cookie", "proxy", "rFile", "wFile", "dFile"))):
|
||||
regex = SENSITIVE_DATA_REGEX % re.sub("(\W)", r"\\\1", getUnicode(item))
|
||||
while extractRegexResult(regex, retVal):
|
||||
value = extractRegexResult(regex, retVal)
|
||||
|
@ -3567,7 +3590,7 @@ def findPageForms(content, url, raise_=False, addToTargets=False):
|
|||
request = form.click()
|
||||
except (ValueError, TypeError), ex:
|
||||
errMsg = "there has been a problem while "
|
||||
errMsg += "processing page forms ('%s')" % ex
|
||||
errMsg += "processing page forms ('%s')" % getSafeExString(ex)
|
||||
if raise_:
|
||||
raise SqlmapGenericException(errMsg)
|
||||
else:
|
||||
|
@ -3670,7 +3693,7 @@ def evaluateCode(code, variables=None):
|
|||
except KeyboardInterrupt:
|
||||
raise
|
||||
except Exception, ex:
|
||||
errMsg = "an error occurred while evaluating provided code ('%s') " % ex.message
|
||||
errMsg = "an error occurred while evaluating provided code ('%s') " % getSafeExString(ex)
|
||||
raise SqlmapGenericException(errMsg)
|
||||
|
||||
def serializeObject(object_):
|
||||
|
@ -3870,13 +3893,18 @@ def decloakToTemp(filename):
|
|||
"""
|
||||
|
||||
content = decloak(filename)
|
||||
_ = os.path.split(filename[:-1])[-1]
|
||||
|
||||
_ = utf8encode(os.path.split(filename[:-1])[-1])
|
||||
|
||||
prefix, suffix = os.path.splitext(_)
|
||||
prefix = prefix.split(os.extsep)[0]
|
||||
|
||||
handle, filename = tempfile.mkstemp(prefix=prefix, suffix=suffix)
|
||||
os.close(handle)
|
||||
|
||||
with open(filename, "w+b") as f:
|
||||
f.write(content)
|
||||
|
||||
return filename
|
||||
|
||||
def prioritySortColumns(columns):
|
||||
|
@ -3977,3 +4005,18 @@ def pollProcess(process, suppress_errors=False):
|
|||
dataToStdout(" quit unexpectedly with return code %d\n" % returncode)
|
||||
|
||||
break
|
||||
|
||||
def getSafeExString(ex, encoding=None):
    """
    Returns a unicode representation of the given exception that is safe
    to embed into (error) messages

    The first non-empty attribute among 'message' and 'msg' is preferred;
    when neither is available the exception object itself is converted

    (Note: constructs being avoided are 1) "%s" % Exception(u'\u0161') and
    2) "%s" % str(Exception(u'\u0161'))
    """

    # Attribute order matters: 'message' takes precedence over 'msg'
    for attribute in ("message", "msg"):
        if getattr(ex, attribute, None):
            return getUnicode(getattr(ex, attribute), encoding=encoding)

    # Neither attribute holds a usable value - fall back to the exception itself
    return getUnicode(ex, encoding=encoding)
|
||||
|
|
|
@ -223,6 +223,7 @@ DEPRECATED_OPTIONS = {
|
|||
"--replicate": "use '--dump-format=SQLITE' instead",
|
||||
"--no-unescape": "use '--no-escape' instead",
|
||||
"--binary": "use '--binary-fields' instead",
|
||||
"--auth-private": "use '--auth-file' instead",
|
||||
"--check-payload": None,
|
||||
"--check-waf": None,
|
||||
}
|
||||
|
|
|
@ -15,6 +15,7 @@ import threading
|
|||
from lib.core.common import Backend
|
||||
from lib.core.common import dataToDumpFile
|
||||
from lib.core.common import dataToStdout
|
||||
from lib.core.common import getSafeExString
|
||||
from lib.core.common import getUnicode
|
||||
from lib.core.common import isListLike
|
||||
from lib.core.common import normalizeUnicode
|
||||
|
@ -74,7 +75,7 @@ class Dump(object):
|
|||
try:
|
||||
self._outputFP.write(text)
|
||||
except IOError, ex:
|
||||
errMsg = "error occurred while writing to log file ('%s')" % ex.message
|
||||
errMsg = "error occurred while writing to log file ('%s')" % getSafeExString(ex)
|
||||
raise SqlmapGenericException(errMsg)
|
||||
|
||||
if kb.get("multiThreadMode"):
|
||||
|
@ -94,7 +95,7 @@ class Dump(object):
|
|||
try:
|
||||
self._outputFP = openFile(self._outputFile, "ab" if not conf.flushSession else "wb")
|
||||
except IOError, ex:
|
||||
errMsg = "error occurred while opening log file ('%s')" % ex.message
|
||||
errMsg = "error occurred while opening log file ('%s')" % getSafeExString(ex)
|
||||
raise SqlmapGenericException(errMsg)
|
||||
|
||||
def getOutputFile(self):
|
||||
|
@ -159,7 +160,7 @@ class Dump(object):
|
|||
def currentDb(self, data):
|
||||
if Backend.isDbms(DBMS.MAXDB):
|
||||
self.string("current database (no practical usage on %s)" % Backend.getIdentifiedDbms(), data, content_type=CONTENT_TYPE.CURRENT_DB)
|
||||
elif Backend.getIdentifiedDbms() in (DBMS.ORACLE, DBMS.PGSQL):
|
||||
elif Backend.getIdentifiedDbms() in (DBMS.ORACLE, DBMS.PGSQL, DBMS.HSQLDB):
|
||||
self.string("current schema (equivalent to database on %s)" % Backend.getIdentifiedDbms(), data, content_type=CONTENT_TYPE.CURRENT_DB)
|
||||
else:
|
||||
self.string("current database", data, content_type=CONTENT_TYPE.CURRENT_DB)
|
||||
|
@ -635,11 +636,11 @@ class Dump(object):
|
|||
|
||||
for column in dbColumnsDict.keys():
|
||||
if colConsider == "1":
|
||||
colConsiderStr = "s like '%s' were" % unsafeSQLIdentificatorNaming(column)
|
||||
colConsiderStr = "s LIKE '%s' were" % unsafeSQLIdentificatorNaming(column)
|
||||
else:
|
||||
colConsiderStr = " '%s' was" % unsafeSQLIdentificatorNaming(column)
|
||||
|
||||
msg = "Column%s found in the " % colConsiderStr
|
||||
msg = "column%s found in the " % colConsiderStr
|
||||
msg += "following databases:"
|
||||
self._write(msg)
|
||||
|
||||
|
|
|
@ -27,6 +27,7 @@ import lib.core.common
|
|||
import lib.core.threads
|
||||
import lib.core.convert
|
||||
import lib.request.connect
|
||||
import lib.utils.google
|
||||
|
||||
from lib.controller.checks import checkConnection
|
||||
from lib.core.common import Backend
|
||||
|
@ -34,6 +35,7 @@ from lib.core.common import boldifyMessage
|
|||
from lib.core.common import checkFile
|
||||
from lib.core.common import dataToStdout
|
||||
from lib.core.common import getPublicTypeMembers
|
||||
from lib.core.common import getSafeExString
|
||||
from lib.core.common import extractRegexResult
|
||||
from lib.core.common import filterStringValue
|
||||
from lib.core.common import findPageForms
|
||||
|
@ -90,6 +92,7 @@ from lib.core.exception import SqlmapInstallationException
|
|||
from lib.core.exception import SqlmapMissingDependence
|
||||
from lib.core.exception import SqlmapMissingMandatoryOptionException
|
||||
from lib.core.exception import SqlmapMissingPrivileges
|
||||
from lib.core.exception import SqlmapNoneDataException
|
||||
from lib.core.exception import SqlmapSilentQuitException
|
||||
from lib.core.exception import SqlmapSyntaxException
|
||||
from lib.core.exception import SqlmapSystemException
|
||||
|
@ -638,7 +641,7 @@ def _setBulkMultipleTargets():
|
|||
for line in getFileItems(conf.bulkFile):
|
||||
if re.match(r"[^ ]+\?(.+)", line, re.I) or CUSTOM_INJECTION_MARK_CHAR in line:
|
||||
found = True
|
||||
kb.targets.add((line.strip(), None, None, None, None))
|
||||
kb.targets.add((line.strip(), conf.method, conf.data, conf.cookie, None))
|
||||
|
||||
if not found and not conf.forms and not conf.crawlDepth:
|
||||
warnMsg = "no usable links found (with GET parameters)"
|
||||
|
@ -776,6 +779,7 @@ def _setMetasploit():
|
|||
kb.oldMsf = True
|
||||
else:
|
||||
msfEnvPathExists = False
|
||||
|
||||
conf.msfPath = path
|
||||
break
|
||||
|
||||
|
@ -806,7 +810,7 @@ def _setMetasploit():
|
|||
for envPath in envPaths:
|
||||
envPath = envPath.replace(";", "")
|
||||
|
||||
if all(os.path.exists(normalizePath(os.path.join(envPath, _))) for _ in ("", "msfcli", "msfconsole")):
|
||||
if any(os.path.exists(normalizePath(os.path.join(envPath, _))) for _ in ("msfcli", "msfconsole")):
|
||||
msfEnvPathExists = True
|
||||
if all(os.path.exists(normalizePath(os.path.join(envPath, _))) for _ in ("msfvenom",)):
|
||||
kb.oldMsf = False
|
||||
|
@ -1083,18 +1087,22 @@ def _setHTTPProxy():
|
|||
if hasattr(proxyHandler, "%s_open" % _):
|
||||
delattr(proxyHandler, "%s_open" % _)
|
||||
|
||||
if not conf.proxy:
|
||||
if conf.proxyList:
|
||||
conf.proxy = conf.proxyList[0]
|
||||
conf.proxyList = conf.proxyList[1:] + conf.proxyList[:1]
|
||||
if conf.proxyList is not None:
|
||||
if not conf.proxyList:
|
||||
errMsg = "list of usable proxies is exhausted"
|
||||
raise SqlmapNoneDataException(errMsg)
|
||||
|
||||
infoMsg = "loading proxy '%s' from a supplied proxy list file" % conf.proxy
|
||||
logger.info(infoMsg)
|
||||
else:
|
||||
if conf.hostname in ('localhost', '127.0.0.1') or conf.ignoreProxy:
|
||||
proxyHandler.proxies = {}
|
||||
conf.proxy = conf.proxyList[0]
|
||||
conf.proxyList = conf.proxyList[1:]
|
||||
|
||||
return
|
||||
infoMsg = "loading proxy '%s' from a supplied proxy list file" % conf.proxy
|
||||
logger.info(infoMsg)
|
||||
|
||||
elif not conf.proxy:
|
||||
if conf.hostname in ("localhost", "127.0.0.1") or conf.ignoreProxy:
|
||||
proxyHandler.proxies = {}
|
||||
|
||||
return
|
||||
|
||||
debugMsg = "setting the HTTP/SOCKS proxy for all HTTP requests"
|
||||
logger.debug(debugMsg)
|
||||
|
@ -1126,7 +1134,7 @@ def _setHTTPProxy():
|
|||
if conf.proxyCred:
|
||||
_ = re.search("^(.*?):(.*?)$", conf.proxyCred)
|
||||
if not _:
|
||||
errMsg = "Proxy authentication credentials "
|
||||
errMsg = "proxy authentication credentials "
|
||||
errMsg += "value must be in format username:password"
|
||||
raise SqlmapSyntaxException(errMsg)
|
||||
else:
|
||||
|
@ -1257,13 +1265,13 @@ def _setHTTPAuthentication():
|
|||
|
||||
global authHandler
|
||||
|
||||
if not conf.authType and not conf.authCred and not conf.authPrivate:
|
||||
if not conf.authType and not conf.authCred and not conf.authFile:
|
||||
return
|
||||
|
||||
if conf.authPrivate and not conf.authType:
|
||||
if conf.authFile and not conf.authType:
|
||||
conf.authType = AUTH_TYPE.PKI
|
||||
|
||||
elif conf.authType and not conf.authCred and not conf.authPrivate:
|
||||
elif conf.authType and not conf.authCred and not conf.authFile:
|
||||
errMsg = "you specified the HTTP authentication type, but "
|
||||
errMsg += "did not provide the credentials"
|
||||
raise SqlmapSyntaxException(errMsg)
|
||||
|
@ -1278,7 +1286,7 @@ def _setHTTPAuthentication():
|
|||
errMsg += "Basic, Digest, NTLM or PKI"
|
||||
raise SqlmapSyntaxException(errMsg)
|
||||
|
||||
if not conf.authPrivate:
|
||||
if not conf.authFile:
|
||||
debugMsg = "setting the HTTP authentication type and credentials"
|
||||
logger.debug(debugMsg)
|
||||
|
||||
|
@ -1329,7 +1337,7 @@ def _setHTTPAuthentication():
|
|||
debugMsg = "setting the HTTP(s) authentication PEM private key"
|
||||
logger.debug(debugMsg)
|
||||
|
||||
_ = safeExpandUser(conf.authPrivate)
|
||||
_ = safeExpandUser(conf.authFile)
|
||||
checkFile(_)
|
||||
authHandler = HTTPSPKIAuthHandler(_)
|
||||
|
||||
|
@ -1523,7 +1531,7 @@ def _createTemporaryDirectory():
|
|||
os.makedirs(tempfile.gettempdir())
|
||||
except IOError, ex:
|
||||
errMsg = "there has been a problem while accessing "
|
||||
errMsg += "system's temporary directory location(s) ('%s'). Please " % ex.message
|
||||
errMsg += "system's temporary directory location(s) ('%s'). Please " % getSafeExString(ex)
|
||||
errMsg += "make sure that there is enough disk space left. If problem persists, "
|
||||
errMsg += "try to set environment variable 'TEMP' to a location "
|
||||
errMsg += "writeable by the current user"
|
||||
|
@ -1627,6 +1635,10 @@ def _cleanupOptions():
|
|||
conf.testFilter = conf.testFilter.strip('*+')
|
||||
conf.testFilter = re.sub(r"([^.])([*+])", "\g<1>.\g<2>", conf.testFilter)
|
||||
|
||||
if conf.testSkip:
|
||||
conf.testSkip = conf.testSkip.strip('*+')
|
||||
conf.testSkip = re.sub(r"([^.])([*+])", "\g<1>.\g<2>", conf.testSkip)
|
||||
|
||||
if "timeSec" not in kb.explicitSettings:
|
||||
if conf.tor:
|
||||
conf.timeSec = 2 * conf.timeSec
|
||||
|
@ -1734,7 +1746,7 @@ def _setConfAttributes():
|
|||
conf.parameters = {}
|
||||
conf.path = None
|
||||
conf.port = None
|
||||
conf.proxyList = []
|
||||
conf.proxyList = None
|
||||
conf.resultsFilename = None
|
||||
conf.resultsFP = None
|
||||
conf.scheme = None
|
||||
|
@ -2071,7 +2083,7 @@ def _mergeOptions(inputOptions, overrideOptions):
|
|||
inputOptions = base64unpickle(inputOptions.pickledOptions)
|
||||
except Exception, ex:
|
||||
errMsg = "provided invalid value '%s' for option '--pickled-options'" % inputOptions.pickledOptions
|
||||
errMsg += " ('%s')" % ex.message if ex.message else ""
|
||||
errMsg += " ('%s')" % ex if ex.message else ""
|
||||
raise SqlmapSyntaxException(errMsg)
|
||||
|
||||
if inputOptions.configFile:
|
||||
|
@ -2243,7 +2255,11 @@ def _checkTor():
|
|||
infoMsg = "checking Tor connection"
|
||||
logger.info(infoMsg)
|
||||
|
||||
page, _, _ = Request.getPage(url="https://check.torproject.org/", raise404=False)
|
||||
try:
|
||||
page, _, _ = Request.getPage(url="https://check.torproject.org/", raise404=False)
|
||||
except SqlmapConnectionException:
|
||||
page = None
|
||||
|
||||
if not page or 'Congratulations' not in page:
|
||||
errMsg = "it seems that Tor is not properly set. Please try using options '--tor-type' and/or '--tor-port'"
|
||||
raise SqlmapConnectionException(errMsg)
|
||||
|
@ -2290,6 +2306,10 @@ def _basicOptionValidation():
|
|||
errMsg = "option '-d' is incompatible with option '-u' ('--url')"
|
||||
raise SqlmapSyntaxException(errMsg)
|
||||
|
||||
if conf.identifyWaf and conf.skipWaf:
|
||||
errMsg = "switch '--identify-waf' is incompatible with switch '--skip-waf'"
|
||||
raise SqlmapSyntaxException(errMsg)
|
||||
|
||||
if conf.titles and conf.nullConnection:
|
||||
errMsg = "switch '--titles' is incompatible with switch '--null-connection'"
|
||||
raise SqlmapSyntaxException(errMsg)
|
||||
|
@ -2404,6 +2424,10 @@ def _basicOptionValidation():
|
|||
errMsg = "switch '--tor' is incompatible with option '--proxy'"
|
||||
raise SqlmapSyntaxException(errMsg)
|
||||
|
||||
if conf.proxy and conf.proxyFile:
|
||||
errMsg = "switch '--proxy' is incompatible with option '--proxy-file'"
|
||||
raise SqlmapSyntaxException(errMsg)
|
||||
|
||||
if conf.checkTor and not any((conf.tor, conf.proxy)):
|
||||
errMsg = "switch '--check-tor' requires usage of switch '--tor' (or option '--proxy' with HTTP proxy address using Tor)"
|
||||
raise SqlmapSyntaxException(errMsg)
|
||||
|
@ -2471,6 +2495,7 @@ def _resolveCrossReferences():
|
|||
lib.core.common.getPageTemplate = getPageTemplate
|
||||
lib.core.convert.singleTimeWarnMessage = singleTimeWarnMessage
|
||||
lib.request.connect.setHTTPProxy = _setHTTPProxy
|
||||
lib.utils.google.setHTTPProxy = _setHTTPProxy
|
||||
lib.controller.checks.setVerbosity = setVerbosity
|
||||
|
||||
def initOptions(inputOptions=AttribDict(), overrideOptions=False):
|
||||
|
|
|
@ -37,7 +37,7 @@ optDict = {
|
|||
"headers": "string",
|
||||
"authType": "string",
|
||||
"authCred": "string",
|
||||
"authPrivate": "string",
|
||||
"authFile": "string",
|
||||
"proxy": "string",
|
||||
"proxyCred": "string",
|
||||
"proxyFile": "string",
|
||||
|
@ -205,6 +205,7 @@ optDict = {
|
|||
"saveConfig": "string",
|
||||
"scope": "string",
|
||||
"testFilter": "string",
|
||||
"testSkip": "string",
|
||||
"updateAll": "boolean",
|
||||
},
|
||||
|
||||
|
@ -231,6 +232,7 @@ optDict = {
|
|||
"cpuThrottle": "integer",
|
||||
"forceDns": "boolean",
|
||||
"identifyWaf": "boolean",
|
||||
"skipWaf": "boolean",
|
||||
"ignore401": "boolean",
|
||||
"smokeTest": "boolean",
|
||||
"liveTest": "boolean",
|
||||
|
|
|
@ -8,9 +8,11 @@ See the file 'doc/COPYING' for copying permission
|
|||
import sqlite3
|
||||
|
||||
from extra.safe2bin.safe2bin import safechardecode
|
||||
from lib.core.common import getSafeExString
|
||||
from lib.core.common import unsafeSQLIdentificatorNaming
|
||||
from lib.core.exception import SqlmapGenericException
|
||||
from lib.core.exception import SqlmapValueException
|
||||
from lib.core.settings import UNICODE_ENCODING
|
||||
|
||||
class Replication(object):
|
||||
"""
|
||||
|
@ -49,11 +51,16 @@ class Replication(object):
|
|||
self.name = unsafeSQLIdentificatorNaming(name)
|
||||
self.columns = columns
|
||||
if create:
|
||||
self.execute('DROP TABLE IF EXISTS "%s"' % self.name)
|
||||
if not typeless:
|
||||
self.execute('CREATE TABLE "%s" (%s)' % (self.name, ','.join('"%s" %s' % (unsafeSQLIdentificatorNaming(colname), coltype) for colname, coltype in self.columns)))
|
||||
else:
|
||||
self.execute('CREATE TABLE "%s" (%s)' % (self.name, ','.join('"%s"' % unsafeSQLIdentificatorNaming(colname) for colname in self.columns)))
|
||||
try:
|
||||
self.execute('DROP TABLE IF EXISTS "%s"' % self.name)
|
||||
if not typeless:
|
||||
self.execute('CREATE TABLE "%s" (%s)' % (self.name, ','.join('"%s" %s' % (unsafeSQLIdentificatorNaming(colname), coltype) for colname, coltype in self.columns)))
|
||||
else:
|
||||
self.execute('CREATE TABLE "%s" (%s)' % (self.name, ','.join('"%s"' % unsafeSQLIdentificatorNaming(colname) for colname in self.columns)))
|
||||
except Exception, ex:
|
||||
errMsg = "problem occurred ('%s') while initializing the sqlite database " % getSafeExString(ex, UNICODE_ENCODING)
|
||||
errMsg += "located at '%s'" % self.parent.dbpath
|
||||
raise SqlmapGenericException(errMsg)
|
||||
|
||||
def insert(self, values):
|
||||
"""
|
||||
|
@ -70,7 +77,7 @@ class Replication(object):
|
|||
try:
|
||||
self.parent.cursor.execute(sql, parameters)
|
||||
except sqlite3.OperationalError, ex:
|
||||
errMsg = "problem occurred ('%s') while accessing sqlite database " % unicode(ex)
|
||||
errMsg = "problem occurred ('%s') while accessing sqlite database " % getSafeExString(ex, UNICODE_ENCODING)
|
||||
errMsg += "located at '%s'. Please make sure that " % self.parent.dbpath
|
||||
errMsg += "it's not used by some other program"
|
||||
raise SqlmapGenericException(errMsg)
|
||||
|
|
|
@ -42,6 +42,9 @@ CONSTANT_RATIO = 0.9
|
|||
# Ratio used in heuristic check for WAF/IDS/IPS protected targets
|
||||
IDS_WAF_CHECK_RATIO = 0.5
|
||||
|
||||
# Timeout used in heuristic check for WAF/IDS/IPS protected targets
|
||||
IDS_WAF_CHECK_TIMEOUT = 10
|
||||
|
||||
# Lower and upper values for match ratio in case of stable page
|
||||
LOWER_RATIO_BOUND = 0.02
|
||||
UPPER_RATIO_BOUND = 0.98
|
||||
|
@ -219,6 +222,8 @@ USER_AGENT_ALIASES = ("ua", "useragent", "user-agent")
|
|||
REFERER_ALIASES = ("ref", "referer", "referrer")
|
||||
HOST_ALIASES = ("host",)
|
||||
|
||||
HSQLDB_DEFAULT_SCHEMA = "PUBLIC"
|
||||
|
||||
# Names that can't be used to name files on Windows OS
|
||||
WINDOWS_RESERVED_NAMES = ("CON", "PRN", "AUX", "NUL", "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9", "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9")
|
||||
|
||||
|
|
|
@ -39,7 +39,6 @@ from lib.core.enums import POST_HINT
|
|||
from lib.core.exception import SqlmapFilePathException
|
||||
from lib.core.exception import SqlmapGenericException
|
||||
from lib.core.exception import SqlmapMissingPrivileges
|
||||
from lib.core.exception import SqlmapSyntaxException
|
||||
from lib.core.exception import SqlmapSystemException
|
||||
from lib.core.exception import SqlmapUserQuitException
|
||||
from lib.core.option import _setDBMS
|
||||
|
|
|
@ -10,7 +10,7 @@ import threading
|
|||
import time
|
||||
import traceback
|
||||
|
||||
from thread import error as threadError
|
||||
from thread import error as ThreadError
|
||||
|
||||
from lib.core.data import conf
|
||||
from lib.core.data import kb
|
||||
|
@ -89,9 +89,9 @@ def exceptionHandledFunction(threadFunction):
|
|||
kb.threadContinue = False
|
||||
kb.threadException = True
|
||||
raise
|
||||
except Exception, errMsg:
|
||||
except Exception, ex:
|
||||
# thread is just going to be silently killed
|
||||
logger.error("thread %s: %s" % (threading.currentThread().getName(), errMsg))
|
||||
logger.error("thread %s: %s" % (threading.currentThread().getName(), ex.message))
|
||||
|
||||
def setDaemon(thread):
|
||||
# Reference: http://stackoverflow.com/questions/190010/daemon-threads-explanation
|
||||
|
@ -145,8 +145,8 @@ def runThreads(numThreads, threadFunction, cleanupFunction=None, forwardExceptio
|
|||
|
||||
try:
|
||||
thread.start()
|
||||
except threadError, errMsg:
|
||||
errMsg = "error occurred while starting new thread ('%s')" % errMsg
|
||||
except ThreadError, ex:
|
||||
errMsg = "error occurred while starting new thread ('%s')" % ex.message
|
||||
logger.critical(errMsg)
|
||||
break
|
||||
|
||||
|
@ -178,10 +178,10 @@ def runThreads(numThreads, threadFunction, cleanupFunction=None, forwardExceptio
|
|||
if forwardException:
|
||||
raise
|
||||
|
||||
except (SqlmapConnectionException, SqlmapValueException), errMsg:
|
||||
except (SqlmapConnectionException, SqlmapValueException), ex:
|
||||
print
|
||||
kb.threadException = True
|
||||
logger.error("thread %s: %s" % (threading.currentThread().getName(), errMsg))
|
||||
logger.error("thread %s: %s" % (threading.currentThread().getName(), ex.message))
|
||||
|
||||
except:
|
||||
from lib.core.common import unhandledExceptionMessage
|
||||
|
|
|
@ -30,7 +30,7 @@ def update():
|
|||
|
||||
if not os.path.exists(os.path.join(rootDir, ".git")):
|
||||
errMsg = "not a git repository. Please checkout the 'sqlmapproject/sqlmap' repository "
|
||||
errMsg += "from GitHub (e.g. git clone https://github.com/sqlmapproject/sqlmap.git sqlmap-dev)"
|
||||
errMsg += "from GitHub (e.g. 'git clone https://github.com/sqlmapproject/sqlmap.git sqlmap')"
|
||||
logger.error(errMsg)
|
||||
else:
|
||||
infoMsg = "updating sqlmap to the latest development version from the "
|
||||
|
@ -51,7 +51,12 @@ def update():
|
|||
_ = lib.core.settings.REVISION = getRevisionNumber()
|
||||
logger.info("%s the latest revision '%s'" % ("already at" if "Already" in stdout else "updated to", _))
|
||||
else:
|
||||
logger.error("update could not be completed ('%s')" % re.sub(r"\W+", " ", stderr).strip())
|
||||
if "Not a git repository" in stderr:
|
||||
errMsg = "not a valid git repository. Please checkout the 'sqlmapproject/sqlmap' repository "
|
||||
errMsg += "from GitHub (e.g. 'git clone https://github.com/sqlmapproject/sqlmap.git sqlmap')"
|
||||
logger.error(errMsg)
|
||||
else:
|
||||
logger.error("update could not be completed ('%s')" % re.sub(r"\W+", " ", stderr).strip())
|
||||
|
||||
if not success:
|
||||
if IS_WIN:
|
||||
|
|
|
@ -36,14 +36,17 @@ from lib.core.shell import clearHistory
|
|||
from lib.core.shell import loadHistory
|
||||
from lib.core.shell import saveHistory
|
||||
|
||||
def cmdLineParser():
|
||||
def cmdLineParser(argv=None):
|
||||
"""
|
||||
This function parses the command line parameters and arguments
|
||||
"""
|
||||
|
||||
if not argv:
|
||||
argv = sys.argv
|
||||
|
||||
checkSystemEncoding()
|
||||
|
||||
_ = getUnicode(os.path.basename(sys.argv[0]), encoding=sys.getfilesystemencoding())
|
||||
_ = getUnicode(os.path.basename(argv[0]), encoding=sys.getfilesystemencoding())
|
||||
|
||||
usage = "%s%s [options]" % ("python " if not IS_WIN else "", \
|
||||
"\"%s\"" % _ if " " in _ else _)
|
||||
|
@ -141,8 +144,8 @@ def cmdLineParser():
|
|||
help="HTTP authentication credentials "
|
||||
"(name:password)")
|
||||
|
||||
request.add_option("--auth-private", dest="authPrivate",
|
||||
help="HTTP authentication PEM private key file")
|
||||
request.add_option("--auth-file", dest="authFile",
|
||||
help="HTTP authentication PEM cert/private key file")
|
||||
|
||||
request.add_option("--ignore-401", dest="ignore401", action="store_true",
|
||||
help="Ignore HTTP Error 401 (Unauthorized)")
|
||||
|
@ -671,6 +674,9 @@ def cmdLineParser():
|
|||
general.add_option("--test-filter", dest="testFilter",
|
||||
help="Select tests by payloads and/or titles (e.g. ROW)")
|
||||
|
||||
general.add_option("--test-skip", dest="testSkip",
|
||||
help="Skip tests by payloads and/or titles (e.g. BENCHMARK)")
|
||||
|
||||
general.add_option("--update", dest="updateAll",
|
||||
action="store_true",
|
||||
help="Update sqlmap")
|
||||
|
@ -710,6 +716,10 @@ def cmdLineParser():
|
|||
action="store_true",
|
||||
help="Make a thorough testing for a WAF/IPS/IDS protection")
|
||||
|
||||
miscellaneous.add_option("--skip-waf", dest="skipWaf",
|
||||
action="store_true",
|
||||
help="Skip heuristic detection of WAF/IPS/IDS protection")
|
||||
|
||||
miscellaneous.add_option("--mobile", dest="mobile",
|
||||
action="store_true",
|
||||
help="Imitate smartphone through HTTP User-Agent header")
|
||||
|
@ -756,6 +766,9 @@ def cmdLineParser():
|
|||
parser.add_option("--force-dns", dest="forceDns", action="store_true",
|
||||
help=SUPPRESS_HELP)
|
||||
|
||||
parser.add_option("--force-threads", dest="forceThreads", action="store_true",
|
||||
help=SUPPRESS_HELP)
|
||||
|
||||
parser.add_option("--smoke-test", dest="smokeTest", action="store_true",
|
||||
help=SUPPRESS_HELP)
|
||||
|
||||
|
@ -767,6 +780,9 @@ def cmdLineParser():
|
|||
|
||||
parser.add_option("--run-case", dest="runCase", help=SUPPRESS_HELP)
|
||||
|
||||
parser.add_option("--nnc5ed", dest="nnc5ed", action="store_true",
|
||||
help=SUPPRESS_HELP) # temporary hidden switch :)
|
||||
|
||||
parser.add_option_group(target)
|
||||
parser.add_option_group(request)
|
||||
parser.add_option_group(optimization)
|
||||
|
@ -802,14 +818,15 @@ def cmdLineParser():
|
|||
option = parser.get_option("-h")
|
||||
option.help = option.help.capitalize().replace("this help", "basic help")
|
||||
|
||||
argv = []
|
||||
_ = []
|
||||
prompt = False
|
||||
advancedHelp = True
|
||||
extraHeaders = []
|
||||
|
||||
for arg in sys.argv:
|
||||
argv.append(getUnicode(arg, encoding=sys.getfilesystemencoding()))
|
||||
for arg in argv:
|
||||
_.append(getUnicode(arg, encoding=sys.getfilesystemencoding()))
|
||||
|
||||
argv = _
|
||||
checkDeprecatedOptions(argv)
|
||||
|
||||
prompt = "--sqlmap-shell" in argv
|
||||
|
|
|
@ -6,6 +6,7 @@ See the file 'doc/COPYING' for copying permission
|
|||
"""
|
||||
|
||||
from lib.core.common import checkFile
|
||||
from lib.core.common import getSafeExString
|
||||
from lib.core.common import getUnicode
|
||||
from lib.core.common import openFile
|
||||
from lib.core.common import unArrayizeValue
|
||||
|
@ -67,7 +68,7 @@ def configFileParser(configFile):
|
|||
config = UnicodeRawConfigParser()
|
||||
config.readfp(configFP)
|
||||
except Exception, ex:
|
||||
errMsg = "you have provided an invalid and/or unreadable configuration file ('%s')" % ex.message
|
||||
errMsg = "you have provided an invalid and/or unreadable configuration file ('%s')" % getSafeExString(ex)
|
||||
raise SqlmapSyntaxException(errMsg)
|
||||
|
||||
if not config.has_section("Target"):
|
||||
|
|
|
@ -128,9 +128,16 @@ def _comparison(page, headers, code, getRatioValue, pageLength):
|
|||
count += 1
|
||||
else:
|
||||
break
|
||||
|
||||
if count:
|
||||
seq1 = seq1[count:]
|
||||
seq2 = seq2[count:]
|
||||
try:
|
||||
_seq1 = seq1[count:]
|
||||
_seq2 = seq2[count:]
|
||||
except MemoryError:
|
||||
pass
|
||||
else:
|
||||
seq1 = _seq1
|
||||
seq2 = _seq2
|
||||
|
||||
while True:
|
||||
try:
|
||||
|
|
|
@ -5,6 +5,7 @@ Copyright (c) 2006-2015 sqlmap developers (http://sqlmap.org/)
|
|||
See the file 'doc/COPYING' for copying permission
|
||||
"""
|
||||
|
||||
import binascii
|
||||
import compiler
|
||||
import httplib
|
||||
import json
|
||||
|
@ -40,6 +41,7 @@ from lib.core.common import getCurrentThreadData
|
|||
from lib.core.common import getHeader
|
||||
from lib.core.common import getHostHeader
|
||||
from lib.core.common import getRequestHeader
|
||||
from lib.core.common import getSafeExString
|
||||
from lib.core.common import getUnicode
|
||||
from lib.core.common import logHTTPTraffic
|
||||
from lib.core.common import pushValue
|
||||
|
@ -142,6 +144,7 @@ class Connect(object):
|
|||
warnMsg += "(e.g. '--flush-session --technique=BEUS') or try to "
|
||||
warnMsg += "lower the value of option '--time-sec' (e.g. '--time-sec=2')"
|
||||
singleTimeWarnMessage(warnMsg)
|
||||
|
||||
elif kb.originalPage is None:
|
||||
if conf.tor:
|
||||
warnMsg = "please make sure that you have "
|
||||
|
@ -158,13 +161,12 @@ class Connect(object):
|
|||
warnMsg += "with the switch '--random-agent' turned on "
|
||||
warnMsg += "and/or proxy switches ('--ignore-proxy', '--proxy',...)"
|
||||
singleTimeWarnMessage(warnMsg)
|
||||
|
||||
elif conf.threads > 1:
|
||||
warnMsg = "if the problem persists please try to lower "
|
||||
warnMsg += "the number of used threads (option '--threads')"
|
||||
singleTimeWarnMessage(warnMsg)
|
||||
|
||||
time.sleep(1)
|
||||
|
||||
kwargs['retrying'] = True
|
||||
return Connect._getPageProxy(**kwargs)
|
||||
|
||||
|
@ -183,7 +185,11 @@ class Connect(object):
|
|||
kb.pageCompress = False
|
||||
else:
|
||||
while True:
|
||||
_ = conn.read(MAX_CONNECTION_CHUNK_SIZE)
|
||||
if not conn:
|
||||
break
|
||||
else:
|
||||
_ = conn.read(MAX_CONNECTION_CHUNK_SIZE)
|
||||
|
||||
if len(_) == MAX_CONNECTION_CHUNK_SIZE:
|
||||
warnMsg = "large response detected. This could take a while"
|
||||
singleTimeWarnMessage(warnMsg)
|
||||
|
@ -433,6 +439,11 @@ class Connect(object):
|
|||
|
||||
logger.log(CUSTOM_LOGGING.TRAFFIC_OUT, requestMsg)
|
||||
|
||||
if conf.cj:
|
||||
for cookie in conf.cj:
|
||||
if cookie.value is None:
|
||||
cookie.value = ""
|
||||
|
||||
conn = urllib2.urlopen(req)
|
||||
|
||||
if not kb.authHeader and getRequestHeader(req, HTTP_HEADER.AUTHORIZATION) and (conf.authType or "").lower() == AUTH_TYPE.BASIC.lower():
|
||||
|
@ -497,22 +508,22 @@ class Connect(object):
|
|||
if hasattr(conn.fp, '_sock'):
|
||||
conn.fp._sock.close()
|
||||
conn.close()
|
||||
except Exception, msg:
|
||||
warnMsg = "problem occurred during connection closing ('%s')" % msg
|
||||
except Exception, ex:
|
||||
warnMsg = "problem occurred during connection closing ('%s')" % getSafeExString(ex)
|
||||
logger.warn(warnMsg)
|
||||
|
||||
except urllib2.HTTPError, e:
|
||||
except urllib2.HTTPError, ex:
|
||||
page = None
|
||||
responseHeaders = None
|
||||
|
||||
try:
|
||||
page = e.read() if not skipRead else None
|
||||
responseHeaders = e.info()
|
||||
responseHeaders[URI_HTTP_HEADER] = e.geturl()
|
||||
page = ex.read() if not skipRead else None
|
||||
responseHeaders = ex.info()
|
||||
responseHeaders[URI_HTTP_HEADER] = ex.geturl()
|
||||
page = decodePage(page, responseHeaders.get(HTTP_HEADER.CONTENT_ENCODING), responseHeaders.get(HTTP_HEADER.CONTENT_TYPE))
|
||||
except socket.timeout:
|
||||
warnMsg = "connection timed out while trying "
|
||||
warnMsg += "to get error page information (%d)" % e.code
|
||||
warnMsg += "to get error page information (%d)" % ex.code
|
||||
logger.warn(warnMsg)
|
||||
return None, None, None
|
||||
except KeyboardInterrupt:
|
||||
|
@ -522,13 +533,13 @@ class Connect(object):
|
|||
finally:
|
||||
page = page if isinstance(page, unicode) else getUnicode(page)
|
||||
|
||||
code = e.code
|
||||
code = ex.code
|
||||
|
||||
kb.originalCode = kb.originalCode or code
|
||||
threadData.lastHTTPError = (threadData.lastRequestUID, code)
|
||||
kb.httpErrorCodes[code] = kb.httpErrorCodes.get(code, 0) + 1
|
||||
|
||||
status = getUnicode(e.msg)
|
||||
status = getUnicode(ex.msg)
|
||||
responseMsg += "[#%d] (%d %s):\n" % (threadData.lastRequestUID, code, status)
|
||||
|
||||
if responseHeaders:
|
||||
|
@ -545,11 +556,11 @@ class Connect(object):
|
|||
|
||||
logger.log(CUSTOM_LOGGING.TRAFFIC_IN, responseMsg)
|
||||
|
||||
if e.code == httplib.UNAUTHORIZED and not conf.ignore401:
|
||||
if ex.code == httplib.UNAUTHORIZED and not conf.ignore401:
|
||||
errMsg = "not authorized, try to provide right HTTP "
|
||||
errMsg += "authentication type and valid credentials (%d)" % code
|
||||
raise SqlmapConnectionException(errMsg)
|
||||
elif e.code == httplib.NOT_FOUND:
|
||||
elif ex.code == httplib.NOT_FOUND:
|
||||
if raise404:
|
||||
errMsg = "page not found (%d)" % code
|
||||
raise SqlmapConnectionException(errMsg)
|
||||
|
@ -557,11 +568,11 @@ class Connect(object):
|
|||
debugMsg = "page not found (%d)" % code
|
||||
singleTimeLogMessage(debugMsg, logging.DEBUG)
|
||||
processResponse(page, responseHeaders)
|
||||
elif e.code == httplib.GATEWAY_TIMEOUT:
|
||||
elif ex.code == httplib.GATEWAY_TIMEOUT:
|
||||
if ignoreTimeout:
|
||||
return None, None, None
|
||||
else:
|
||||
warnMsg = "unable to connect to the target URL (%d - %s)" % (e.code, httplib.responses[e.code])
|
||||
warnMsg = "unable to connect to the target URL (%d - %s)" % (ex.code, httplib.responses[ex.code])
|
||||
if threadData.retriesCount < conf.retries and not kb.threadException:
|
||||
warnMsg += ". sqlmap is going to retry the request"
|
||||
logger.critical(warnMsg)
|
||||
|
@ -575,7 +586,7 @@ class Connect(object):
|
|||
debugMsg = "got HTTP error code: %d (%s)" % (code, status)
|
||||
logger.debug(debugMsg)
|
||||
|
||||
except (urllib2.URLError, socket.error, socket.timeout, httplib.HTTPException, struct.error, ProxyError, SqlmapCompressionException, WebSocketException), e:
|
||||
except (urllib2.URLError, socket.error, socket.timeout, httplib.HTTPException, struct.error, binascii.Error, ProxyError, SqlmapCompressionException, WebSocketException):
|
||||
tbMsg = traceback.format_exc()
|
||||
|
||||
if "no host given" in tbMsg:
|
||||
|
@ -619,7 +630,11 @@ class Connect(object):
|
|||
return None, None, None
|
||||
elif threadData.retriesCount < conf.retries and not kb.threadException:
|
||||
warnMsg += ". sqlmap is going to retry the request"
|
||||
logger.critical(warnMsg)
|
||||
if not retrying:
|
||||
warnMsg += "(s)"
|
||||
logger.critical(warnMsg)
|
||||
else:
|
||||
logger.debug(warnMsg)
|
||||
return Connect._retryProxy(**kwargs)
|
||||
elif kb.testMode:
|
||||
logger.critical(warnMsg)
|
||||
|
@ -628,7 +643,7 @@ class Connect(object):
|
|||
raise SqlmapConnectionException(warnMsg)
|
||||
|
||||
finally:
|
||||
if not isinstance(page, unicode):
|
||||
if isinstance(page, basestring) and not isinstance(page, unicode):
|
||||
if HTTP_HEADER.CONTENT_TYPE in (responseHeaders or {}) and not re.search(TEXT_CONTENT_TYPE_REGEX, responseHeaders[HTTP_HEADER.CONTENT_TYPE]):
|
||||
page = unicode(page, errors="ignore")
|
||||
else:
|
||||
|
@ -718,7 +733,7 @@ class Connect(object):
|
|||
payload = function(payload=payload, headers=auxHeaders)
|
||||
except Exception, ex:
|
||||
errMsg = "error occurred while running tamper "
|
||||
errMsg += "function '%s' ('%s')" % (function.func_name, ex)
|
||||
errMsg += "function '%s' ('%s')" % (function.func_name, getSafeExString(ex))
|
||||
raise SqlmapGenericException(errMsg)
|
||||
|
||||
if not isinstance(payload, basestring):
|
||||
|
@ -834,7 +849,7 @@ class Connect(object):
|
|||
if headers and "text/plain" in headers.get(HTTP_HEADER.CONTENT_TYPE, ""):
|
||||
token = page
|
||||
|
||||
if not token and any(_.name == conf.csrfToken for _ in conf.cj):
|
||||
if not token and conf.cj and any(_.name == conf.csrfToken for _ in conf.cj):
|
||||
for _ in conf.cj:
|
||||
if _.name == conf.csrfToken:
|
||||
token = _.value
|
||||
|
@ -889,7 +904,7 @@ class Connect(object):
|
|||
|
||||
if conf.evalCode:
|
||||
delimiter = conf.paramDel or DEFAULT_GET_POST_DELIMITER
|
||||
variables = {"uri": uri, "lastPage": threadData.lastPage}
|
||||
variables = {"uri": uri, "lastPage": threadData.lastPage, "_locals": locals()}
|
||||
originals = {}
|
||||
keywords = keyword.kwlist
|
||||
|
||||
|
@ -1051,9 +1066,9 @@ class Connect(object):
|
|||
_, headers, code = Connect.getPage(url=uri, get=get, post=post, method=method, cookie=cookie, ua=ua, referer=referer, host=host, silent=silent, auxHeaders=auxHeaders, raise404=raise404, skipRead=(kb.nullConnection == NULLCONNECTION.SKIP_READ))
|
||||
|
||||
if headers:
|
||||
if kb.nullConnection in (NULLCONNECTION.HEAD, NULLCONNECTION.SKIP_READ) and HTTP_HEADER.CONTENT_LENGTH in headers:
|
||||
if kb.nullConnection in (NULLCONNECTION.HEAD, NULLCONNECTION.SKIP_READ) and headers.get(HTTP_HEADER.CONTENT_LENGTH):
|
||||
pageLength = int(headers[HTTP_HEADER.CONTENT_LENGTH])
|
||||
elif kb.nullConnection == NULLCONNECTION.RANGE and HTTP_HEADER.CONTENT_RANGE in headers:
|
||||
elif kb.nullConnection == NULLCONNECTION.RANGE and headers.get(HTTP_HEADER.CONTENT_RANGE):
|
||||
pageLength = int(headers[HTTP_HEADER.CONTENT_RANGE][headers[HTTP_HEADER.CONTENT_RANGE].find('/') + 1:])
|
||||
finally:
|
||||
kb.pageCompress = popValue()
|
||||
|
|
|
@ -9,6 +9,7 @@ import httplib
|
|||
import socket
|
||||
import urllib2
|
||||
|
||||
from lib.core.common import getSafeExString
|
||||
from lib.core.data import kb
|
||||
from lib.core.data import logger
|
||||
from lib.core.exception import SqlmapConnectionException
|
||||
|
@ -55,9 +56,9 @@ class HTTPSConnection(httplib.HTTPSConnection):
|
|||
break
|
||||
else:
|
||||
sock.close()
|
||||
except (ssl.SSLError, socket.error, httplib.BadStatusLine), errMsg:
|
||||
except (ssl.SSLError, socket.error, httplib.BadStatusLine), ex:
|
||||
self._tunnel_host = None
|
||||
logger.debug("SSL connection error occurred ('%s')" % errMsg)
|
||||
logger.debug("SSL connection error occurred ('%s')" % getSafeExString(ex))
|
||||
|
||||
# Reference(s): https://docs.python.org/2/library/ssl.html#ssl.SSLContext
|
||||
# https://www.mnot.net/blog/2014/12/27/python_2_and_tls_sni
|
||||
|
@ -75,9 +76,9 @@ class HTTPSConnection(httplib.HTTPSConnection):
|
|||
break
|
||||
else:
|
||||
sock.close()
|
||||
except (ssl.SSLError, socket.error, httplib.BadStatusLine), errMsg:
|
||||
except (ssl.SSLError, socket.error, httplib.BadStatusLine), ex:
|
||||
self._tunnel_host = None
|
||||
logger.debug("SSL connection error occurred ('%s')" % errMsg)
|
||||
logger.debug("SSL connection error occurred ('%s')" % getSafeExString(ex))
|
||||
|
||||
if not success:
|
||||
raise SqlmapConnectionException("can't establish SSL connection")
|
||||
|
|
|
@ -39,6 +39,7 @@ from lib.core.enums import DBMS
|
|||
from lib.core.enums import EXPECTED
|
||||
from lib.core.enums import PAYLOAD
|
||||
from lib.core.exception import SqlmapConnectionException
|
||||
from lib.core.exception import SqlmapDataException
|
||||
from lib.core.exception import SqlmapNotVulnerableException
|
||||
from lib.core.exception import SqlmapUserQuitException
|
||||
from lib.core.settings import MAX_TECHNIQUES_PER_VALUE
|
||||
|
@ -78,7 +79,7 @@ def _goInference(payload, expression, charsetType=None, firstChar=None, lastChar
|
|||
timeBasedCompare = (kb.technique in (PAYLOAD.TECHNIQUE.TIME, PAYLOAD.TECHNIQUE.STACKED))
|
||||
|
||||
if not (timeBasedCompare and kb.dnsTest):
|
||||
if (conf.eta or conf.threads > 1) and Backend.getIdentifiedDbms() and not re.search("(COUNT|LTRIM)\(", expression, re.I) and not timeBasedCompare:
|
||||
if (conf.eta or conf.threads > 1) and Backend.getIdentifiedDbms() and not re.search("(COUNT|LTRIM)\(", expression, re.I) and not (timeBasedCompare and not conf.forceThreads):
|
||||
|
||||
if field and re.search("\ASELECT\s+DISTINCT\((.+?)\)\s+FROM", expression, re.I):
|
||||
expression = "SELECT %s FROM (%s)" % (field, expression)
|
||||
|
@ -262,9 +263,14 @@ def _goInferenceProxy(expression, fromUser=False, batch=False, unpack=True, char
|
|||
return None
|
||||
|
||||
try:
|
||||
for num in xrange(startLimit, stopLimit):
|
||||
output = _goInferenceFields(expression, expressionFields, expressionFieldsList, payload, num=num, charsetType=charsetType, firstChar=firstChar, lastChar=lastChar, dump=dump)
|
||||
outputs.append(output)
|
||||
try:
|
||||
for num in xrange(startLimit, stopLimit):
|
||||
output = _goInferenceFields(expression, expressionFields, expressionFieldsList, payload, num=num, charsetType=charsetType, firstChar=firstChar, lastChar=lastChar, dump=dump)
|
||||
outputs.append(output)
|
||||
except OverflowError:
|
||||
errMsg = "boundary limits (%d,%d) are too large. Please rerun " % (startLimit, stopLimit)
|
||||
errMsg += "with switch '--fresh-queries'"
|
||||
raise SqlmapDataException(errMsg)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print
|
||||
|
|
|
@ -11,12 +11,13 @@ import urllib2
|
|||
from lib.core.data import conf
|
||||
|
||||
class HTTPSPKIAuthHandler(urllib2.HTTPSHandler):
|
||||
def __init__(self, key_file):
|
||||
def __init__(self, auth_file):
|
||||
urllib2.HTTPSHandler.__init__(self)
|
||||
self.key_file = key_file
|
||||
self.auth_file = auth_file
|
||||
|
||||
def https_open(self, req):
|
||||
return self.do_open(self.getConnection, req)
|
||||
|
||||
def getConnection(self, host, timeout=None):
|
||||
return httplib.HTTPSConnection(host, key_file=self.key_file, timeout=conf.timeout)
|
||||
# Reference: https://docs.python.org/2/library/ssl.html#ssl.SSLContext.load_cert_chain
|
||||
return httplib.HTTPSConnection(host, cert_file=self.auth_file, key_file=self.auth_file, timeout=conf.timeout)
|
||||
|
|
|
@ -30,6 +30,7 @@ from lib.core.settings import MAX_SINGLE_URL_REDIRECTIONS
|
|||
from lib.core.settings import MAX_TOTAL_REDIRECTIONS
|
||||
from lib.core.threads import getCurrentThreadData
|
||||
from lib.request.basic import decodePage
|
||||
from lib.request.basic import parseResponse
|
||||
|
||||
class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
|
||||
def _get_header_redirect(self, headers):
|
||||
|
@ -118,6 +119,8 @@ class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
|
|||
result = fp
|
||||
|
||||
if redurl and kb.redirectChoice == REDIRECTION.YES:
|
||||
parseResponse(content, headers)
|
||||
|
||||
req.headers[HTTP_HEADER.HOST] = getHostHeader(redurl)
|
||||
if headers and HTTP_HEADER.SET_COOKIE in headers:
|
||||
req.headers[HTTP_HEADER.COOKIE] = headers[HTTP_HEADER.SET_COOKIE].split(conf.cookieDel or DEFAULT_COOKIE_DELIMITER)[0]
|
||||
|
|
|
@ -18,6 +18,7 @@ from lib.core.common import readInput
|
|||
from lib.core.data import conf
|
||||
from lib.core.data import logger
|
||||
from lib.core.data import paths
|
||||
from lib.core.exception import SqlmapDataException
|
||||
|
||||
class ICMPsh:
|
||||
"""
|
||||
|
@ -41,6 +42,9 @@ class ICMPsh:
|
|||
while not address:
|
||||
address = readInput(message, default=self.remoteIP)
|
||||
|
||||
if conf.batch and not address:
|
||||
raise SqlmapDataException("remote host address is missing")
|
||||
|
||||
return address
|
||||
|
||||
def _selectLhost(self):
|
||||
|
@ -53,6 +57,9 @@ class ICMPsh:
|
|||
while not address:
|
||||
address = readInput(message, default=self.localIP)
|
||||
|
||||
if conf.batch and not address:
|
||||
raise SqlmapDataException("local host address is missing")
|
||||
|
||||
return address
|
||||
|
||||
def _prepareIngredients(self, encode=True):
|
||||
|
|
|
@ -258,7 +258,7 @@ class UDF:
|
|||
else:
|
||||
logger.warn("invalid value, only digits are allowed")
|
||||
|
||||
for x in range(0, udfCount):
|
||||
for x in xrange(0, udfCount):
|
||||
while True:
|
||||
msg = "what is the name of the UDF number %d? " % (x + 1)
|
||||
udfName = readInput(msg)
|
||||
|
@ -293,7 +293,7 @@ class UDF:
|
|||
else:
|
||||
logger.warn("invalid value, only digits >= 0 are allowed")
|
||||
|
||||
for y in range(0, parCount):
|
||||
for y in xrange(0, parCount):
|
||||
msg = "what is the data-type of input parameter "
|
||||
msg += "number %d? (default: %s) " % ((y + 1), defaultType)
|
||||
|
||||
|
|
|
@ -146,12 +146,12 @@ def bisection(payload, expression, length=None, charsetType=None, firstChar=None
|
|||
if showEta:
|
||||
progress = ProgressBar(maxValue=length)
|
||||
|
||||
if timeBasedCompare and conf.threads > 1:
|
||||
if timeBasedCompare and conf.threads > 1 and not conf.forceThreads:
|
||||
warnMsg = "multi-threading is considered unsafe in time-based data retrieval. Going to switch it off automatically"
|
||||
singleTimeWarnMessage(warnMsg)
|
||||
|
||||
if numThreads > 1:
|
||||
if not timeBasedCompare:
|
||||
if not timeBasedCompare or conf.forceThreads:
|
||||
debugMsg = "starting %d thread%s" % (numThreads, ("s" if numThreads > 1 else ""))
|
||||
logger.debug(debugMsg)
|
||||
else:
|
||||
|
@ -232,8 +232,10 @@ def bisection(payload, expression, length=None, charsetType=None, firstChar=None
|
|||
# Used for gradual expanding into unicode charspace
|
||||
shiftTable = [2, 2, 3, 3, 5, 4]
|
||||
|
||||
if CHAR_INFERENCE_MARK in payload and ord('\n') in charTbl:
|
||||
charTbl.remove(ord('\n'))
|
||||
if "'%s'" % CHAR_INFERENCE_MARK in payload:
|
||||
for char in ('\n', '\r'):
|
||||
if ord(char) in charTbl:
|
||||
charTbl.remove(ord(char))
|
||||
|
||||
if not charTbl:
|
||||
return None
|
||||
|
@ -597,8 +599,9 @@ def queryOutputLength(expression, payload):
|
|||
infoMsg = "retrieving the length of query output"
|
||||
logger.info(infoMsg)
|
||||
|
||||
lengthExprUnescaped = agent.forgeQueryOutputLength(expression)
|
||||
start = time.time()
|
||||
|
||||
lengthExprUnescaped = agent.forgeQueryOutputLength(expression)
|
||||
count, length = bisection(payload, lengthExprUnescaped, charsetType=CHARSET_TYPE.DIGITS)
|
||||
|
||||
debugMsg = "performed %d queries in %.2f seconds" % (count, calculateDeltaSeconds(start))
|
||||
|
|
|
@ -28,9 +28,9 @@ from lib.core.enums import HASHDB_KEYS
|
|||
from lib.core.enums import PAYLOAD
|
||||
from lib.core.exception import SqlmapDataException
|
||||
from lib.core.exception import SqlmapMissingMandatoryOptionException
|
||||
from lib.core.settings import METADB_SUFFIX
|
||||
from lib.core.settings import BRUTE_COLUMN_EXISTS_TEMPLATE
|
||||
from lib.core.settings import BRUTE_TABLE_EXISTS_TEMPLATE
|
||||
from lib.core.settings import METADB_SUFFIX
|
||||
from lib.core.threads import getCurrentThreadData
|
||||
from lib.core.threads import runThreads
|
||||
from lib.request import inject
|
||||
|
@ -102,7 +102,7 @@ def tableExists(tableFile, regex=None):
|
|||
break
|
||||
|
||||
if conf.db and METADB_SUFFIX not in conf.db and Backend.getIdentifiedDbms() not in (DBMS.SQLITE, DBMS.ACCESS, DBMS.FIREBIRD):
|
||||
fullTableName = "%s%s%s" % (conf.db, '..' if Backend.getIdentifiedDbms() in (DBMS.MSSQL, DBMS.SYBASE) else '.', table)
|
||||
fullTableName = "%s.%s" % (conf.db, table)
|
||||
else:
|
||||
fullTableName = table
|
||||
|
||||
|
|
|
@ -165,74 +165,78 @@ def _unionPosition(comment, place, parameter, prefix, suffix, count, where=PAYLO
|
|||
# Unbiased approach for searching appropriate usable column
|
||||
random.shuffle(positions)
|
||||
|
||||
# For each column of the table (# of NULL) perform a request using
|
||||
# the UNION ALL SELECT statement to test it the target URL is
|
||||
# affected by an exploitable union SQL injection vulnerability
|
||||
for position in positions:
|
||||
# Prepare expression with delimiters
|
||||
randQuery = randomStr(UNION_MIN_RESPONSE_CHARS)
|
||||
phrase = "%s%s%s".lower() % (kb.chars.start, randQuery, kb.chars.stop)
|
||||
randQueryProcessed = agent.concatQuery("\'%s\'" % randQuery)
|
||||
randQueryUnescaped = unescaper.escape(randQueryProcessed)
|
||||
for charCount in (UNION_MIN_RESPONSE_CHARS << 2, UNION_MIN_RESPONSE_CHARS):
|
||||
if vector:
|
||||
break
|
||||
|
||||
# Forge the union SQL injection request
|
||||
query = agent.forgeUnionQuery(randQueryUnescaped, position, count, comment, prefix, suffix, kb.uChar, where)
|
||||
payload = agent.payload(place=place, parameter=parameter, newValue=query, where=where)
|
||||
# For each column of the table (# of NULL) perform a request using
|
||||
# the UNION ALL SELECT statement to test it the target URL is
|
||||
# affected by an exploitable union SQL injection vulnerability
|
||||
for position in positions:
|
||||
# Prepare expression with delimiters
|
||||
randQuery = randomStr(charCount)
|
||||
phrase = "%s%s%s".lower() % (kb.chars.start, randQuery, kb.chars.stop)
|
||||
randQueryProcessed = agent.concatQuery("\'%s\'" % randQuery)
|
||||
randQueryUnescaped = unescaper.escape(randQueryProcessed)
|
||||
|
||||
# Perform the request
|
||||
page, headers = Request.queryPage(payload, place=place, content=True, raise404=False)
|
||||
content = "%s%s".lower() % (removeReflectiveValues(page, payload) or "", \
|
||||
removeReflectiveValues(listToStrValue(headers.headers if headers else None), \
|
||||
payload, True) or "")
|
||||
# Forge the union SQL injection request
|
||||
query = agent.forgeUnionQuery(randQueryUnescaped, position, count, comment, prefix, suffix, kb.uChar, where)
|
||||
payload = agent.payload(place=place, parameter=parameter, newValue=query, where=where)
|
||||
|
||||
if content and phrase in content:
|
||||
validPayload = payload
|
||||
kb.unionDuplicates = len(re.findall(phrase, content, re.I)) > 1
|
||||
vector = (position, count, comment, prefix, suffix, kb.uChar, where, kb.unionDuplicates, False)
|
||||
# Perform the request
|
||||
page, headers = Request.queryPage(payload, place=place, content=True, raise404=False)
|
||||
content = "%s%s".lower() % (removeReflectiveValues(page, payload) or "", \
|
||||
removeReflectiveValues(listToStrValue(headers.headers if headers else None), \
|
||||
payload, True) or "")
|
||||
|
||||
if where == PAYLOAD.WHERE.ORIGINAL:
|
||||
# Prepare expression with delimiters
|
||||
randQuery2 = randomStr(UNION_MIN_RESPONSE_CHARS)
|
||||
phrase2 = "%s%s%s".lower() % (kb.chars.start, randQuery2, kb.chars.stop)
|
||||
randQueryProcessed2 = agent.concatQuery("\'%s\'" % randQuery2)
|
||||
randQueryUnescaped2 = unescaper.escape(randQueryProcessed2)
|
||||
if content and phrase in content:
|
||||
validPayload = payload
|
||||
kb.unionDuplicates = len(re.findall(phrase, content, re.I)) > 1
|
||||
vector = (position, count, comment, prefix, suffix, kb.uChar, where, kb.unionDuplicates, False)
|
||||
|
||||
# Confirm that it is a full union SQL injection
|
||||
query = agent.forgeUnionQuery(randQueryUnescaped, position, count, comment, prefix, suffix, kb.uChar, where, multipleUnions=randQueryUnescaped2)
|
||||
payload = agent.payload(place=place, parameter=parameter, newValue=query, where=where)
|
||||
if where == PAYLOAD.WHERE.ORIGINAL:
|
||||
# Prepare expression with delimiters
|
||||
randQuery2 = randomStr(charCount)
|
||||
phrase2 = "%s%s%s".lower() % (kb.chars.start, randQuery2, kb.chars.stop)
|
||||
randQueryProcessed2 = agent.concatQuery("\'%s\'" % randQuery2)
|
||||
randQueryUnescaped2 = unescaper.escape(randQueryProcessed2)
|
||||
|
||||
# Perform the request
|
||||
page, headers = Request.queryPage(payload, place=place, content=True, raise404=False)
|
||||
content = "%s%s".lower() % (page or "", listToStrValue(headers.headers if headers else None) or "")
|
||||
|
||||
if not all(_ in content for _ in (phrase, phrase2)):
|
||||
vector = (position, count, comment, prefix, suffix, kb.uChar, where, kb.unionDuplicates, True)
|
||||
elif not kb.unionDuplicates:
|
||||
fromTable = " FROM (%s) AS %s" % (" UNION ".join("SELECT %d%s%s" % (_, FROM_DUMMY_TABLE.get(Backend.getIdentifiedDbms(), ""), " AS %s" % randomStr() if _ == 0 else "") for _ in xrange(LIMITED_ROWS_TEST_NUMBER)), randomStr())
|
||||
|
||||
# Check for limited row output
|
||||
query = agent.forgeUnionQuery(randQueryUnescaped, position, count, comment, prefix, suffix, kb.uChar, where, fromTable=fromTable)
|
||||
# Confirm that it is a full union SQL injection
|
||||
query = agent.forgeUnionQuery(randQueryUnescaped, position, count, comment, prefix, suffix, kb.uChar, where, multipleUnions=randQueryUnescaped2)
|
||||
payload = agent.payload(place=place, parameter=parameter, newValue=query, where=where)
|
||||
|
||||
# Perform the request
|
||||
page, headers = Request.queryPage(payload, place=place, content=True, raise404=False)
|
||||
content = "%s%s".lower() % (removeReflectiveValues(page, payload) or "", \
|
||||
removeReflectiveValues(listToStrValue(headers.headers if headers else None), \
|
||||
payload, True) or "")
|
||||
if content.count(phrase) > 0 and content.count(phrase) < LIMITED_ROWS_TEST_NUMBER:
|
||||
warnMsg = "output with limited number of rows detected. Switching to partial mode"
|
||||
logger.warn(warnMsg)
|
||||
vector = (position, count, comment, prefix, suffix, kb.uChar, PAYLOAD.WHERE.NEGATIVE, kb.unionDuplicates, False)
|
||||
content = "%s%s".lower() % (page or "", listToStrValue(headers.headers if headers else None) or "")
|
||||
|
||||
unionErrorCase = kb.errorIsNone and wasLastResponseDBMSError()
|
||||
if not all(_ in content for _ in (phrase, phrase2)):
|
||||
vector = (position, count, comment, prefix, suffix, kb.uChar, where, kb.unionDuplicates, True)
|
||||
elif not kb.unionDuplicates:
|
||||
fromTable = " FROM (%s) AS %s" % (" UNION ".join("SELECT %d%s%s" % (_, FROM_DUMMY_TABLE.get(Backend.getIdentifiedDbms(), ""), " AS %s" % randomStr() if _ == 0 else "") for _ in xrange(LIMITED_ROWS_TEST_NUMBER)), randomStr())
|
||||
|
||||
if unionErrorCase and count > 1:
|
||||
warnMsg = "combined UNION/error-based SQL injection case found on "
|
||||
warnMsg += "column %d. sqlmap will try to find another " % (position + 1)
|
||||
warnMsg += "column with better characteristics"
|
||||
logger.warn(warnMsg)
|
||||
else:
|
||||
break
|
||||
# Check for limited row output
|
||||
query = agent.forgeUnionQuery(randQueryUnescaped, position, count, comment, prefix, suffix, kb.uChar, where, fromTable=fromTable)
|
||||
payload = agent.payload(place=place, parameter=parameter, newValue=query, where=where)
|
||||
|
||||
# Perform the request
|
||||
page, headers = Request.queryPage(payload, place=place, content=True, raise404=False)
|
||||
content = "%s%s".lower() % (removeReflectiveValues(page, payload) or "", \
|
||||
removeReflectiveValues(listToStrValue(headers.headers if headers else None), \
|
||||
payload, True) or "")
|
||||
if content.count(phrase) > 0 and content.count(phrase) < LIMITED_ROWS_TEST_NUMBER:
|
||||
warnMsg = "output with limited number of rows detected. Switching to partial mode"
|
||||
logger.warn(warnMsg)
|
||||
vector = (position, count, comment, prefix, suffix, kb.uChar, PAYLOAD.WHERE.NEGATIVE, kb.unionDuplicates, False)
|
||||
|
||||
unionErrorCase = kb.errorIsNone and wasLastResponseDBMSError()
|
||||
|
||||
if unionErrorCase and count > 1:
|
||||
warnMsg = "combined UNION/error-based SQL injection case found on "
|
||||
warnMsg += "column %d. sqlmap will try to find another " % (position + 1)
|
||||
warnMsg += "column with better characteristics"
|
||||
logger.warn(warnMsg)
|
||||
else:
|
||||
break
|
||||
|
||||
return validPayload, vector
|
||||
|
||||
|
|
186
lib/utils/api.py
186
lib/utils/api.py
|
@ -8,11 +8,16 @@ See the file 'doc/COPYING' for copying permission
|
|||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shlex
|
||||
import sqlite3
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
import urllib2
|
||||
|
||||
from lib.core.common import dataToStdout
|
||||
from lib.core.common import getSafeExString
|
||||
from lib.core.common import unArrayizeValue
|
||||
from lib.core.convert import base64pickle
|
||||
from lib.core.convert import hexencode
|
||||
|
@ -31,6 +36,7 @@ from lib.core.log import LOGGER_HANDLER
|
|||
from lib.core.optiondict import optDict
|
||||
from lib.core.settings import IS_WIN
|
||||
from lib.core.subprocessng import Popen
|
||||
from lib.parse.cmdline import cmdLineParser
|
||||
from thirdparty.bottle.bottle import error as return_error
|
||||
from thirdparty.bottle.bottle import get
|
||||
from thirdparty.bottle.bottle import hook
|
||||
|
@ -82,7 +88,7 @@ class Database(object):
|
|||
else:
|
||||
self.cursor.execute(statement)
|
||||
except sqlite3.OperationalError, ex:
|
||||
if not "locked" in ex.message:
|
||||
if not "locked" in getSafeExString(ex):
|
||||
raise
|
||||
else:
|
||||
break
|
||||
|
@ -110,7 +116,8 @@ class Database(object):
|
|||
|
||||
|
||||
class Task(object):
|
||||
def __init__(self, taskid):
|
||||
def __init__(self, taskid, remote_addr):
|
||||
self.remote_addr = remote_addr
|
||||
self.process = None
|
||||
self.output_directory = None
|
||||
self.options = None
|
||||
|
@ -152,8 +159,10 @@ class Task(object):
|
|||
self.options = AttribDict(self._original_options)
|
||||
|
||||
def engine_start(self):
|
||||
self.process = Popen(["python", "sqlmap.py", "--pickled-options", base64pickle(self.options)],
|
||||
shell=False, close_fds=not IS_WIN)
|
||||
if os.path.exists("sqlmap.py"):
|
||||
self.process = Popen(["python", "sqlmap.py", "--pickled-options", base64pickle(self.options)], shell=False, close_fds=not IS_WIN)
|
||||
else:
|
||||
self.process = Popen(["sqlmap", "--pickled-options", base64pickle(self.options)], shell=False, close_fds=not IS_WIN)
|
||||
|
||||
def engine_stop(self):
|
||||
if self.process:
|
||||
|
@ -335,7 +344,9 @@ def task_new():
|
|||
Create new task ID
|
||||
"""
|
||||
taskid = hexencode(os.urandom(8))
|
||||
DataStore.tasks[taskid] = Task(taskid)
|
||||
remote_addr = request.remote_addr
|
||||
|
||||
DataStore.tasks[taskid] = Task(taskid, remote_addr)
|
||||
|
||||
logger.debug("Created new task: '%s'" % taskid)
|
||||
return jsonize({"success": True, "taskid": taskid})
|
||||
|
@ -361,18 +372,18 @@ def task_delete(taskid):
|
|||
|
||||
|
||||
@get("/admin/<taskid>/list")
|
||||
def task_list(taskid):
|
||||
def task_list(taskid=None):
|
||||
"""
|
||||
List task pull
|
||||
"""
|
||||
if is_admin(taskid):
|
||||
logger.debug("[%s] Listed task pool" % taskid)
|
||||
tasks = list(DataStore.tasks)
|
||||
return jsonize({"success": True, "tasks": tasks, "tasks_num": len(tasks)})
|
||||
else:
|
||||
logger.warning("[%s] Unauthorized call to task_list()" % taskid)
|
||||
return jsonize({"success": False, "message": "Unauthorized"})
|
||||
tasks = {}
|
||||
|
||||
for key in DataStore.tasks:
|
||||
if is_admin(taskid) or DataStore.tasks[key].remote_addr == request.remote_addr:
|
||||
tasks[key] = dejsonize(scan_status(key))["status"]
|
||||
|
||||
logger.debug("[%s] Listed task pool (%s)" % (taskid, "admin" if is_admin(taskid) else request.remote_addr))
|
||||
return jsonize({"success": True, "tasks": tasks, "tasks_num": len(tasks)})
|
||||
|
||||
@get("/admin/<taskid>/flush")
|
||||
def task_flush(taskid):
|
||||
|
@ -381,11 +392,13 @@ def task_flush(taskid):
|
|||
"""
|
||||
if is_admin(taskid):
|
||||
DataStore.tasks = dict()
|
||||
logger.debug("[%s] Flushed task pool" % taskid)
|
||||
return jsonize({"success": True})
|
||||
else:
|
||||
logger.warning("[%s] Unauthorized call to task_flush()" % taskid)
|
||||
return jsonize({"success": False, "message": "Unauthorized"})
|
||||
for key in list(DataStore.tasks):
|
||||
if DataStore.tasks[key].remote_addr == request.remote_addr:
|
||||
del DataStore.tasks[key]
|
||||
|
||||
logger.debug("[%s] Flushed task pool (%s)" % (taskid, "admin" if is_admin(taskid) else request.remote_addr))
|
||||
return jsonize({"success": True})
|
||||
|
||||
##################################
|
||||
# sqlmap core interact functions #
|
||||
|
@ -467,7 +480,9 @@ def scan_stop(taskid):
|
|||
"""
|
||||
Stop a scan
|
||||
"""
|
||||
if taskid not in DataStore.tasks:
|
||||
if (taskid not in DataStore.tasks or
|
||||
DataStore.tasks[taskid].engine_process() is None or
|
||||
DataStore.tasks[taskid].engine_has_terminated()):
|
||||
logger.warning("[%s] Invalid task ID provided to scan_stop()" % taskid)
|
||||
return jsonize({"success": False, "message": "Invalid task ID"})
|
||||
|
||||
|
@ -482,7 +497,9 @@ def scan_kill(taskid):
|
|||
"""
|
||||
Kill a scan
|
||||
"""
|
||||
if taskid not in DataStore.tasks:
|
||||
if (taskid not in DataStore.tasks or
|
||||
DataStore.tasks[taskid].engine_process() is None or
|
||||
DataStore.tasks[taskid].engine_has_terminated()):
|
||||
logger.warning("[%s] Invalid task ID provided to scan_kill()" % taskid)
|
||||
return jsonize({"success": False, "message": "Invalid task ID"})
|
||||
|
||||
|
@ -552,7 +569,7 @@ def scan_log_limited(taskid, start, end):
|
|||
json_log_messages = list()
|
||||
|
||||
if taskid not in DataStore.tasks:
|
||||
logger.warning("[%s] Invalid task ID provided to scan_log_limited()")
|
||||
logger.warning("[%s] Invalid task ID provided to scan_log_limited()" % taskid)
|
||||
return jsonize({"success": False, "message": "Invalid task ID"})
|
||||
|
||||
if not start.isdigit() or not end.isdigit() or end < start:
|
||||
|
@ -581,7 +598,7 @@ def scan_log(taskid):
|
|||
json_log_messages = list()
|
||||
|
||||
if taskid not in DataStore.tasks:
|
||||
logger.warning("[%s] Invalid task ID provided to scan_log()")
|
||||
logger.warning("[%s] Invalid task ID provided to scan_log()" % taskid)
|
||||
return jsonize({"success": False, "message": "Invalid task ID"})
|
||||
|
||||
# Read all log messages from the IPC database
|
||||
|
@ -640,6 +657,22 @@ def server(host="0.0.0.0", port=RESTAPI_SERVER_PORT):
|
|||
run(host=host, port=port, quiet=True, debug=False)
|
||||
|
||||
|
||||
def _client(url, options=None):
    """
    Send a request to the REST-JSON API server and return the raw response body

    When options is None the request carries no body; otherwise options is
    serialized with jsonize() and sent as the request body with a JSON
    content type (urllib2 issues a POST whenever a body is supplied).

    Any exception raised while preparing, sending or reading the request is
    propagated to the caller unchanged.
    """

    logger.debug("Calling %s" % url)

    try:
        data = None

        if options is not None:
            data = jsonize(options)

        req = urllib2.Request(url, data, {'Content-Type': 'application/json'})
        response = urllib2.urlopen(req)

        try:
            text = response.read()
        finally:
            # Always release the underlying connection, even if read() fails
            response.close()
    except:
        # Deliberately bare: the exception is always re-raised below, this
        # handler only adds a log entry for requests that carried options
        if options:
            logger.error("Failed to load and parse %s" % url)
        raise

    return text
|
||||
|
||||
|
||||
def client(host=RESTAPI_SERVER_HOST, port=RESTAPI_SERVER_PORT):
|
||||
"""
|
||||
REST-JSON API client
|
||||
|
@ -647,11 +680,106 @@ def client(host=RESTAPI_SERVER_HOST, port=RESTAPI_SERVER_PORT):
|
|||
addr = "http://%s:%d" % (host, port)
|
||||
logger.info("Starting REST-JSON API client to '%s'..." % addr)
|
||||
|
||||
# TODO: write a simple client with requests, for now use curl from command line
|
||||
logger.error("Not yet implemented, use curl from command line instead for now, for example:")
|
||||
print "\n\t$ taskid=$(curl http://%s:%d/task/new 2>1 | grep -o -I '[a-f0-9]\{16\}') && echo $taskid" % (host, port)
|
||||
print ("\t$ curl -H \"Content-Type: application/json\" "
|
||||
"-X POST -d '{\"url\": \"http://testphp.vulnweb.com/artists.php?artist=1\"}' "
|
||||
"http://%s:%d/scan/$taskid/start") % (host, port)
|
||||
print "\t$ curl http://%s:%d/scan/$taskid/data" % (host, port)
|
||||
print "\t$ curl http://%s:%d/scan/$taskid/log\n" % (host, port)
|
||||
try:
|
||||
_client(addr)
|
||||
except Exception, ex:
|
||||
if not isinstance(ex, urllib2.HTTPError):
|
||||
errMsg = "there has been a problem while connecting to the "
|
||||
errMsg += "REST-JSON API server at '%s' " % addr
|
||||
errMsg += "(%s)" % ex
|
||||
logger.critical(errMsg)
|
||||
return
|
||||
|
||||
taskid = None
|
||||
logger.info("Type 'help' or '?' for list of available commands")
|
||||
|
||||
while True:
|
||||
try:
|
||||
command = raw_input("api%s> " % (" (%s)" % taskid if taskid else "")).strip().lower()
|
||||
except (EOFError, KeyboardInterrupt):
|
||||
print
|
||||
break
|
||||
|
||||
if command in ("data", "log", "status", "stop", "kill"):
|
||||
if not taskid:
|
||||
logger.error("No task ID in use")
|
||||
continue
|
||||
raw = _client("%s/scan/%s/%s" % (addr, taskid, command))
|
||||
res = dejsonize(raw)
|
||||
if not res["success"]:
|
||||
logger.error("Failed to execute command %s" % command)
|
||||
dataToStdout("%s\n" % raw)
|
||||
|
||||
elif command.startswith("new"):
|
||||
if ' ' not in command:
|
||||
logger.error("Program arguments are missing")
|
||||
continue
|
||||
|
||||
argv = ["sqlmap.py"] + shlex.split(command)[1:]
|
||||
|
||||
try:
|
||||
cmdLineOptions = cmdLineParser(argv).__dict__
|
||||
except:
|
||||
taskid = None
|
||||
continue
|
||||
|
||||
for key in list(cmdLineOptions):
|
||||
if cmdLineOptions[key] is None:
|
||||
del cmdLineOptions[key]
|
||||
|
||||
raw = _client("%s/task/new" % addr)
|
||||
res = dejsonize(raw)
|
||||
if not res["success"]:
|
||||
logger.error("Failed to create new task")
|
||||
continue
|
||||
taskid = res["taskid"]
|
||||
logger.info("New task ID is '%s'" % taskid)
|
||||
|
||||
raw = _client("%s/scan/%s/start" % (addr, taskid), cmdLineOptions)
|
||||
res = dejsonize(raw)
|
||||
if not res["success"]:
|
||||
logger.error("Failed to start scan")
|
||||
continue
|
||||
logger.info("Scanning started")
|
||||
|
||||
elif command.startswith("use"):
|
||||
taskid = (command.split()[1] if ' ' in command else "").strip("'\"")
|
||||
if not taskid:
|
||||
logger.error("Task ID is missing")
|
||||
taskid = None
|
||||
continue
|
||||
elif not re.search(r"\A[0-9a-fA-F]{16}\Z", taskid):
|
||||
logger.error("Invalid task ID '%s'" % taskid)
|
||||
taskid = None
|
||||
continue
|
||||
logger.info("Switching to task ID '%s' " % taskid)
|
||||
|
||||
elif command in ("list", "flush"):
|
||||
raw = _client("%s/admin/%s/%s" % (addr, taskid or 0, command))
|
||||
res = dejsonize(raw)
|
||||
if not res["success"]:
|
||||
logger.error("Failed to execute command %s" % command)
|
||||
elif command == "flush":
|
||||
taskid = None
|
||||
dataToStdout("%s\n" % raw)
|
||||
|
||||
elif command in ("exit", "bye", "quit", 'q'):
|
||||
return
|
||||
|
||||
elif command in ("help", "?"):
|
||||
msg = "help Show this help message\n"
|
||||
msg += "new ARGS Start a new scan task with provided arguments (e.g. 'new -u \"http://testphp.vulnweb.com/artists.php?artist=1\"')\n"
|
||||
msg += "use TASKID Switch current context to different task (e.g. 'use c04d8c5c7582efb4')\n"
|
||||
msg += "data Retrieve and show data for current task\n"
|
||||
msg += "log Retrieve and show log for current task\n"
|
||||
msg += "status Retrieve and show status for current task\n"
|
||||
msg += "stop Stop current task\n"
|
||||
msg += "kill Kill current task\n"
|
||||
msg += "list Display all tasks\n"
|
||||
msg += "flush Flush tasks (delete all tasks)\n"
|
||||
msg += "exit Exit this client\n"
|
||||
|
||||
dataToStdout(msg)
|
||||
|
||||
elif command:
|
||||
logger.error("Unknown command '%s'" % command)
|
||||
|
|
|
@ -22,6 +22,7 @@ from lib.core.data import conf
|
|||
from lib.core.data import kb
|
||||
from lib.core.data import logger
|
||||
from lib.core.exception import SqlmapConnectionException
|
||||
from lib.core.exception import SqlmapSyntaxException
|
||||
from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS
|
||||
from lib.core.threads import getCurrentThreadData
|
||||
from lib.core.threads import runThreads
|
||||
|
@ -58,12 +59,15 @@ def crawl(target):
|
|||
try:
|
||||
if current:
|
||||
content = Request.getPage(url=current, crawling=True, raise404=False)[0]
|
||||
except SqlmapConnectionException, e:
|
||||
errMsg = "connection exception detected (%s). skipping " % e
|
||||
except SqlmapConnectionException, ex:
|
||||
errMsg = "connection exception detected (%s). skipping " % ex
|
||||
errMsg += "URL '%s'" % current
|
||||
logger.critical(errMsg)
|
||||
except httplib.InvalidURL, e:
|
||||
errMsg = "invalid URL detected (%s). skipping " % e
|
||||
except SqlmapSyntaxException:
|
||||
errMsg = "invalid URL detected. skipping '%s'" % current
|
||||
logger.critical(errMsg)
|
||||
except httplib.InvalidURL, ex:
|
||||
errMsg = "invalid URL detected (%s). skipping " % ex
|
||||
errMsg += "URL '%s'" % current
|
||||
logger.critical(errMsg)
|
||||
|
||||
|
|
|
@ -12,6 +12,7 @@ import socket
|
|||
import urllib
|
||||
import urllib2
|
||||
|
||||
from lib.core.common import getSafeExString
|
||||
from lib.core.common import getUnicode
|
||||
from lib.core.common import readInput
|
||||
from lib.core.common import urlencode
|
||||
|
@ -30,6 +31,8 @@ from lib.core.settings import HTTP_ACCEPT_ENCODING_HEADER_VALUE
|
|||
from lib.core.settings import UNICODE_ENCODING
|
||||
from lib.request.basic import decodePage
|
||||
from lib.request.httpshandler import HTTPSHandler
|
||||
from thirdparty.socks import socks
|
||||
|
||||
|
||||
class Google(object):
|
||||
"""
|
||||
|
@ -47,10 +50,10 @@ class Google(object):
|
|||
self.opener.addheaders = conf.httpHeaders
|
||||
|
||||
try:
|
||||
conn = self.opener.open("http://www.google.com/ncr")
|
||||
conn = self.opener.open("https://www.google.com/ncr")
|
||||
conn.info() # retrieve session cookie
|
||||
except Exception, ex:
|
||||
errMsg = "unable to connect to Google ('%s')" % ex
|
||||
errMsg = "unable to connect to Google ('%s')" % getSafeExString(ex)
|
||||
raise SqlmapConnectionException(errMsg)
|
||||
|
||||
def search(self, dork):
|
||||
|
@ -65,7 +68,7 @@ class Google(object):
|
|||
if not dork:
|
||||
return None
|
||||
|
||||
url = "http://www.google.com/search?"
|
||||
url = "https://www.google.com/search?"
|
||||
url += "q=%s&" % urlencode(dork, convall=True)
|
||||
url += "num=100&hl=en&complete=0&safe=off&filter=0&btnG=Search"
|
||||
url += "&start=%d" % ((gpage - 1) * 100)
|
||||
|
@ -94,12 +97,12 @@ class Google(object):
|
|||
except urllib2.HTTPError, e:
|
||||
try:
|
||||
page = e.read()
|
||||
except socket.timeout:
|
||||
warnMsg = "connection timed out while trying "
|
||||
warnMsg += "to get error page information (%d)" % e.code
|
||||
except Exception, ex:
|
||||
warnMsg = "problem occurred while trying to get "
|
||||
warnMsg += "an error page information (%s)" % getSafeExString(ex)
|
||||
logger.critical(warnMsg)
|
||||
return None
|
||||
except (urllib2.URLError, httplib.error, socket.error, socket.timeout):
|
||||
except (urllib2.URLError, httplib.error, socket.error, socket.timeout, socks.ProxyError):
|
||||
errMsg = "unable to connect to Google"
|
||||
raise SqlmapConnectionException(errMsg)
|
||||
|
||||
|
@ -175,3 +178,6 @@ class Google(object):
|
|||
retVal = [urllib.unquote(match.group(1)) for match in re.finditer(regex, page, re.I | re.S)]
|
||||
|
||||
return retVal
|
||||
|
||||
def setHTTPProxy():  # Cross-linked function
    """
    Placeholder that always raises NotImplementedError

    NOTE(review): the "cross-linked" marker suggests the real implementation
    is assigned to this name from another module at runtime - confirm
    """

    raise NotImplementedError()
|
||||
|
|
|
@ -44,6 +44,7 @@ from lib.core.common import clearConsoleLine
|
|||
from lib.core.common import dataToStdout
|
||||
from lib.core.common import getFileItems
|
||||
from lib.core.common import getPublicTypeMembers
|
||||
from lib.core.common import getSafeExString
|
||||
from lib.core.common import hashDBRetrieve
|
||||
from lib.core.common import hashDBWrite
|
||||
from lib.core.common import normalizeUnicode
|
||||
|
@ -326,8 +327,10 @@ def wordpress_passwd(password, salt, count, prefix, uppercase=False):
|
|||
|
||||
return output
|
||||
|
||||
password = password.encode(UNICODE_ENCODING)
|
||||
|
||||
cipher = md5(salt)
|
||||
cipher.update(password.encode(UNICODE_ENCODING))
|
||||
cipher.update(password)
|
||||
hash_ = cipher.digest()
|
||||
|
||||
for i in xrange(count):
|
||||
|
@ -706,14 +709,18 @@ def dictionaryAttack(attack_dict):
|
|||
item = [(user, hash_), {}]
|
||||
elif hash_regex in (HASH.ORACLE_OLD, HASH.POSTGRES):
|
||||
item = [(user, hash_), {'username': user}]
|
||||
elif hash_regex in (HASH.ORACLE):
|
||||
elif hash_regex in (HASH.ORACLE,):
|
||||
item = [(user, hash_), {'salt': hash_[-20:]}]
|
||||
elif hash_regex in (HASH.MSSQL, HASH.MSSQL_OLD, HASH.MSSQL_NEW):
|
||||
item = [(user, hash_), {'salt': hash_[6:14]}]
|
||||
elif hash_regex in (HASH.CRYPT_GENERIC):
|
||||
elif hash_regex in (HASH.CRYPT_GENERIC,):
|
||||
item = [(user, hash_), {'salt': hash_[0:2]}]
|
||||
elif hash_regex in (HASH.WORDPRESS):
|
||||
item = [(user, hash_), {'salt': hash_[4:12], 'count': 1 << ITOA64.index(hash_[3]), 'prefix': hash_[:12]}]
|
||||
elif hash_regex in (HASH.WORDPRESS,):
|
||||
if ITOA64.index(hash_[3]) < 32:
|
||||
item = [(user, hash_), {'salt': hash_[4:12], 'count': 1 << ITOA64.index(hash_[3]), 'prefix': hash_[:12]}]
|
||||
else:
|
||||
warnMsg = "invalid hash '%s'" % hash_
|
||||
logger.warn(warnMsg)
|
||||
|
||||
if item and hash_ not in keys:
|
||||
resumed = hashDBRetrieve(hash_)
|
||||
|
@ -771,7 +778,7 @@ def dictionaryAttack(attack_dict):
|
|||
|
||||
except Exception, ex:
|
||||
warnMsg = "there was a problem while loading dictionaries"
|
||||
warnMsg += " ('%s')" % ex.message
|
||||
warnMsg += " ('%s')" % getSafeExString(ex)
|
||||
logger.critical(warnMsg)
|
||||
|
||||
message = "do you want to use common password suffixes? (slow!) [y/N] "
|
||||
|
|
|
@ -11,6 +11,7 @@ import sqlite3
|
|||
import threading
|
||||
import time
|
||||
|
||||
from lib.core.common import getSafeExString
|
||||
from lib.core.common import getUnicode
|
||||
from lib.core.common import serializeObject
|
||||
from lib.core.common import unserializeObject
|
||||
|
@ -77,7 +78,7 @@ class HashDB(object):
|
|||
for row in self.cursor.execute("SELECT value FROM storage WHERE id=?", (hash_,)):
|
||||
retVal = row[0]
|
||||
except sqlite3.OperationalError, ex:
|
||||
if not "locked" in ex.message:
|
||||
if not "locked" in getSafeExString(ex):
|
||||
raise
|
||||
except sqlite3.DatabaseError, ex:
|
||||
errMsg = "error occurred while accessing session file '%s' ('%s'). " % (self.filepath, ex)
|
||||
|
@ -127,7 +128,7 @@ class HashDB(object):
|
|||
|
||||
if retries == 0:
|
||||
warnMsg = "there has been a problem while writing to "
|
||||
warnMsg += "the session file ('%s')" % ex.message
|
||||
warnMsg += "the session file ('%s')" % getSafeExString(ex)
|
||||
logger.warn(warnMsg)
|
||||
|
||||
if retries >= HASHDB_FLUSH_RETRIES:
|
||||
|
|
|
@ -12,6 +12,7 @@ from lib.core.data import logger
|
|||
from lib.core.data import queries
|
||||
from lib.core.common import Backend
|
||||
from lib.core.common import unArrayizeValue
|
||||
from lib.core.settings import HSQLDB_DEFAULT_SCHEMA
|
||||
from lib.request import inject
|
||||
|
||||
class Enumeration(GenericEnumeration):
|
||||
|
@ -40,3 +41,6 @@ class Enumeration(GenericEnumeration):
|
|||
def getHostname(self):
    """
    Warn that the hostname cannot be enumerated on HSQLDB

    Nothing is retrieved; the method only logs the warning and implicitly
    returns None.
    """

    logger.warn("on HSQLDB it is not possible to enumerate the hostname")
|
||||
|
||||
def getCurrentDb(self):
    """
    Return the current database name for HSQLDB

    The value is the HSQLDB_DEFAULT_SCHEMA setting; this method itself
    performs no lookup.
    """

    currentDb = HSQLDB_DEFAULT_SCHEMA
    return currentDb
|
||||
|
|
|
@ -152,7 +152,7 @@ class Enumeration(GenericEnumeration):
|
|||
warnMsg += "for database '%s'" % db
|
||||
logger.warn(warnMsg)
|
||||
|
||||
if not kb.data.cachedTables:
|
||||
if not kb.data.cachedTables and not conf.search:
|
||||
errMsg = "unable to retrieve the tables for any database"
|
||||
raise SqlmapNoneDataException(errMsg)
|
||||
else:
|
||||
|
@ -184,7 +184,7 @@ class Enumeration(GenericEnumeration):
|
|||
|
||||
infoMsg = "searching table"
|
||||
if tblConsider == "1":
|
||||
infoMsg += "s like"
|
||||
infoMsg += "s LIKE"
|
||||
infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(tbl)
|
||||
logger.info(infoMsg)
|
||||
|
||||
|
@ -217,7 +217,7 @@ class Enumeration(GenericEnumeration):
|
|||
else:
|
||||
infoMsg = "fetching number of table"
|
||||
if tblConsider == "1":
|
||||
infoMsg += "s like"
|
||||
infoMsg += "s LIKE"
|
||||
infoMsg += " '%s' in database '%s'" % (unsafeSQLIdentificatorNaming(tbl), unsafeSQLIdentificatorNaming(db))
|
||||
logger.info(infoMsg)
|
||||
|
||||
|
@ -229,7 +229,7 @@ class Enumeration(GenericEnumeration):
|
|||
if not isNumPosStrValue(count):
|
||||
warnMsg = "no table"
|
||||
if tblConsider == "1":
|
||||
warnMsg += "s like"
|
||||
warnMsg += "s LIKE"
|
||||
warnMsg += " '%s' " % unsafeSQLIdentificatorNaming(tbl)
|
||||
warnMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(db)
|
||||
logger.warn(warnMsg)
|
||||
|
@ -295,7 +295,7 @@ class Enumeration(GenericEnumeration):
|
|||
|
||||
infoMsg = "searching column"
|
||||
if colConsider == "1":
|
||||
infoMsg += "s like"
|
||||
infoMsg += "s LIKE"
|
||||
infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(column)
|
||||
|
||||
foundCols[column] = {}
|
||||
|
@ -336,7 +336,7 @@ class Enumeration(GenericEnumeration):
|
|||
values = [values]
|
||||
|
||||
for foundTbl in values:
|
||||
foundTbl = safeSQLIdentificatorNaming(foundTbl, True)
|
||||
foundTbl = safeSQLIdentificatorNaming(unArrayizeValue(foundTbl), True)
|
||||
|
||||
if foundTbl is None:
|
||||
continue
|
||||
|
@ -367,7 +367,7 @@ class Enumeration(GenericEnumeration):
|
|||
|
||||
infoMsg = "fetching number of tables containing column"
|
||||
if colConsider == "1":
|
||||
infoMsg += "s like"
|
||||
infoMsg += "s LIKE"
|
||||
infoMsg += " '%s' in database '%s'" % (column, db)
|
||||
logger.info("%s%s" % (infoMsg, infoMsgTbl))
|
||||
|
||||
|
@ -380,7 +380,7 @@ class Enumeration(GenericEnumeration):
|
|||
if not isNumPosStrValue(count):
|
||||
warnMsg = "no tables contain column"
|
||||
if colConsider == "1":
|
||||
warnMsg += "s like"
|
||||
warnMsg += "s LIKE"
|
||||
warnMsg += " '%s' " % column
|
||||
warnMsg += "in database '%s'" % db
|
||||
logger.warn(warnMsg)
|
||||
|
|
|
@ -169,7 +169,7 @@ class Fingerprint(GenericFingerprint):
|
|||
infoMsg = "confirming %s" % DBMS.MYSQL
|
||||
logger.info(infoMsg)
|
||||
|
||||
result = inject.checkBooleanExpression("USER() LIKE USER()")
|
||||
result = inject.checkBooleanExpression("SESSION_USER() LIKE USER()")
|
||||
|
||||
if not result:
|
||||
warnMsg = "the back-end DBMS is not %s" % DBMS.MYSQL
|
||||
|
|
|
@ -358,7 +358,7 @@ class Databases:
|
|||
if bruteForce is None:
|
||||
logger.error(errMsg)
|
||||
return self.getTables(bruteForce=True)
|
||||
else:
|
||||
elif not conf.search:
|
||||
raise SqlmapNoneDataException(errMsg)
|
||||
else:
|
||||
for db, tables in kb.data.cachedTables.items():
|
||||
|
@ -370,7 +370,7 @@ class Databases:
|
|||
|
||||
return kb.data.cachedTables
|
||||
|
||||
def getColumns(self, onlyColNames=False, colTuple=None, bruteForce=None):
|
||||
def getColumns(self, onlyColNames=False, colTuple=None, bruteForce=None, dumpMode=False):
|
||||
self.forceDbmsEnum()
|
||||
|
||||
if conf.db is None or conf.db == CURRENT_DB:
|
||||
|
@ -415,7 +415,7 @@ class Databases:
|
|||
colList = filter(None, colList)
|
||||
|
||||
if conf.tbl:
|
||||
if Backend.getIdentifiedDbms() in (DBMS.ORACLE, DBMS.DB2):
|
||||
if Backend.getIdentifiedDbms() in (DBMS.ORACLE, DBMS.DB2, DBMS.HSQLDB):
|
||||
conf.tbl = conf.tbl.upper()
|
||||
|
||||
tblList = conf.tbl.split(",")
|
||||
|
@ -432,10 +432,12 @@ class Databases:
|
|||
tblList = tblList[0]
|
||||
|
||||
tblList = list(tblList)
|
||||
else:
|
||||
elif not conf.search:
|
||||
errMsg = "unable to retrieve the tables "
|
||||
errMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db)
|
||||
raise SqlmapNoneDataException(errMsg)
|
||||
else:
|
||||
return kb.data.cachedColumns
|
||||
|
||||
tblList = filter(None, (safeSQLIdentificatorNaming(_, True) for _ in tblList))
|
||||
|
||||
|
@ -509,7 +511,7 @@ class Databases:
|
|||
if len(colList) > 0:
|
||||
if colTuple:
|
||||
_, colCondParam = colTuple
|
||||
infoMsg += "like '%s' " % ", ".join(unsafeSQLIdentificatorNaming(col) for col in sorted(colList))
|
||||
infoMsg += "LIKE '%s' " % ", ".join(unsafeSQLIdentificatorNaming(col) for col in sorted(colList))
|
||||
else:
|
||||
colCondParam = "='%s'"
|
||||
infoMsg += "'%s' " % ", ".join(unsafeSQLIdentificatorNaming(col) for col in sorted(colList))
|
||||
|
@ -517,10 +519,6 @@ class Databases:
|
|||
condQueryStr = "%%s%s" % colCondParam
|
||||
condQuery = " AND (%s)" % " OR ".join(condQueryStr % (condition, unsafeSQLIdentificatorNaming(col)) for col in sorted(colList))
|
||||
|
||||
infoMsg += "for table '%s' " % unsafeSQLIdentificatorNaming(tbl)
|
||||
infoMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db)
|
||||
logger.info(infoMsg)
|
||||
|
||||
if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL, DBMS.HSQLDB):
|
||||
query = rootQuery.inband.query % (unsafeSQLIdentificatorNaming(tbl), unsafeSQLIdentificatorNaming(conf.db))
|
||||
query += condQuery
|
||||
|
@ -534,7 +532,14 @@ class Databases:
|
|||
elif Backend.getIdentifiedDbms() in (DBMS.SQLITE, DBMS.FIREBIRD):
|
||||
query = rootQuery.inband.query % tbl
|
||||
|
||||
values = inject.getValue(query, blind=False, time=False)
|
||||
if dumpMode and colList:
|
||||
values = [(_,) for _ in colList]
|
||||
else:
|
||||
infoMsg += "for table '%s' " % unsafeSQLIdentificatorNaming(tbl)
|
||||
infoMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db)
|
||||
logger.info(infoMsg)
|
||||
|
||||
values = inject.getValue(query, blind=False, time=False)
|
||||
|
||||
if Backend.isDbms(DBMS.MSSQL) and isNoneValue(values):
|
||||
index, values = 1, []
|
||||
|
@ -604,7 +609,7 @@ class Databases:
|
|||
if len(colList) > 0:
|
||||
if colTuple:
|
||||
_, colCondParam = colTuple
|
||||
infoMsg += "like '%s' " % ", ".join(unsafeSQLIdentificatorNaming(col) for col in sorted(colList))
|
||||
infoMsg += "LIKE '%s' " % ", ".join(unsafeSQLIdentificatorNaming(col) for col in sorted(colList))
|
||||
else:
|
||||
colCondParam = "='%s'"
|
||||
infoMsg += "'%s' " % ", ".join(unsafeSQLIdentificatorNaming(col) for col in sorted(colList))
|
||||
|
@ -612,10 +617,6 @@ class Databases:
|
|||
condQueryStr = "%%s%s" % colCondParam
|
||||
condQuery = " AND (%s)" % " OR ".join(condQueryStr % (condition, unsafeSQLIdentificatorNaming(col)) for col in sorted(colList))
|
||||
|
||||
infoMsg += "for table '%s' " % unsafeSQLIdentificatorNaming(tbl)
|
||||
infoMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db)
|
||||
logger.info(infoMsg)
|
||||
|
||||
if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL, DBMS.HSQLDB):
|
||||
query = rootQuery.blind.count % (unsafeSQLIdentificatorNaming(tbl), unsafeSQLIdentificatorNaming(conf.db))
|
||||
query += condQuery
|
||||
|
@ -639,32 +640,41 @@ class Databases:
|
|||
parseSqliteTableSchema(value)
|
||||
return kb.data.cachedColumns
|
||||
|
||||
count = inject.getValue(query, union=False, error=False, expected=EXPECTED.INT, charsetType=CHARSET_TYPE.DIGITS)
|
||||
|
||||
table = {}
|
||||
columns = {}
|
||||
|
||||
if not isNumPosStrValue(count):
|
||||
if Backend.isDbms(DBMS.MSSQL):
|
||||
count, index, values = 0, 1, []
|
||||
while True:
|
||||
query = rootQuery.blind.query3 % (conf.db, tbl, index)
|
||||
value = unArrayizeValue(inject.getValue(query, union=False, error=False))
|
||||
if isNoneValue(value) or value == " ":
|
||||
break
|
||||
else:
|
||||
columns[safeSQLIdentificatorNaming(value)] = None
|
||||
index += 1
|
||||
if dumpMode and colList:
|
||||
count = 0
|
||||
for value in colList:
|
||||
columns[safeSQLIdentificatorNaming(value)] = None
|
||||
else:
|
||||
infoMsg += "for table '%s' " % unsafeSQLIdentificatorNaming(tbl)
|
||||
infoMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db)
|
||||
logger.info(infoMsg)
|
||||
|
||||
if not columns:
|
||||
errMsg = "unable to retrieve the %scolumns " % ("number of " if not Backend.isDbms(DBMS.MSSQL) else "")
|
||||
errMsg += "for table '%s' " % unsafeSQLIdentificatorNaming(tbl)
|
||||
errMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db)
|
||||
logger.error(errMsg)
|
||||
continue
|
||||
count = inject.getValue(query, union=False, error=False, expected=EXPECTED.INT, charsetType=CHARSET_TYPE.DIGITS)
|
||||
|
||||
if not isNumPosStrValue(count):
|
||||
if Backend.isDbms(DBMS.MSSQL):
|
||||
count, index, values = 0, 1, []
|
||||
while True:
|
||||
query = rootQuery.blind.query3 % (conf.db, tbl, index)
|
||||
value = unArrayizeValue(inject.getValue(query, union=False, error=False))
|
||||
if isNoneValue(value) or value == " ":
|
||||
break
|
||||
else:
|
||||
columns[safeSQLIdentificatorNaming(value)] = None
|
||||
index += 1
|
||||
|
||||
if not columns:
|
||||
errMsg = "unable to retrieve the %scolumns " % ("number of " if not Backend.isDbms(DBMS.MSSQL) else "")
|
||||
errMsg += "for table '%s' " % unsafeSQLIdentificatorNaming(tbl)
|
||||
errMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db)
|
||||
logger.error(errMsg)
|
||||
continue
|
||||
|
||||
for index in getLimitRange(count):
|
||||
if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL):
|
||||
if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL, DBMS.HSQLDB):
|
||||
query = rootQuery.blind.query % (unsafeSQLIdentificatorNaming(tbl), unsafeSQLIdentificatorNaming(conf.db))
|
||||
query += condQuery
|
||||
field = None
|
||||
|
@ -805,7 +815,7 @@ class Databases:
|
|||
|
||||
elif "." in conf.tbl:
|
||||
if not conf.db:
|
||||
conf.db, conf.tbl = conf.tbl.split(".")
|
||||
conf.db, conf.tbl = conf.tbl.split('.', 1)
|
||||
|
||||
if conf.tbl is not None and conf.db is None and Backend.getIdentifiedDbms() not in (DBMS.SQLITE, DBMS.ACCESS, DBMS.FIREBIRD):
|
||||
warnMsg = "missing database parameter. sqlmap is going to "
|
||||
|
|
|
@ -12,6 +12,7 @@ from lib.core.bigarray import BigArray
|
|||
from lib.core.common import Backend
|
||||
from lib.core.common import clearConsoleLine
|
||||
from lib.core.common import getLimitRange
|
||||
from lib.core.common import getSafeExString
|
||||
from lib.core.common import getUnicode
|
||||
from lib.core.common import isInferenceAvailable
|
||||
from lib.core.common import isListLike
|
||||
|
@ -88,10 +89,12 @@ class Entries:
|
|||
|
||||
if isinstance(tblList[0], (set, tuple, list)):
|
||||
tblList = tblList[0]
|
||||
else:
|
||||
elif not conf.search:
|
||||
errMsg = "unable to retrieve the tables "
|
||||
errMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(conf.db)
|
||||
raise SqlmapNoneDataException(errMsg)
|
||||
else:
|
||||
return
|
||||
|
||||
for tbl in tblList:
|
||||
tblList[tblList.index(tbl)] = safeSQLIdentificatorNaming(tbl, True)
|
||||
|
@ -102,7 +105,7 @@ class Entries:
|
|||
|
||||
if foundData is None:
|
||||
kb.data.cachedColumns = {}
|
||||
self.getColumns(onlyColNames=True)
|
||||
self.getColumns(onlyColNames=True, dumpMode=True)
|
||||
else:
|
||||
kb.data.cachedColumns = foundData
|
||||
|
||||
|
@ -272,7 +275,7 @@ class Entries:
|
|||
else:
|
||||
emptyColumns = []
|
||||
plusOne = Backend.getIdentifiedDbms() in (DBMS.ORACLE, DBMS.DB2)
|
||||
indexRange = getLimitRange(count, dump=True, plusOne=plusOne)
|
||||
indexRange = getLimitRange(count, plusOne=plusOne)
|
||||
|
||||
if len(colList) < len(indexRange) > CHECK_ZERO_COLUMNS_THRESHOLD:
|
||||
for column in colList:
|
||||
|
@ -293,7 +296,7 @@ class Entries:
|
|||
if column not in entries:
|
||||
entries[column] = BigArray()
|
||||
|
||||
if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL):
|
||||
if Backend.getIdentifiedDbms() in (DBMS.MYSQL, DBMS.PGSQL, DBMS.HSQLDB):
|
||||
query = rootQuery.blind.query % (agent.preprocessField(tbl, column), conf.db, conf.tbl, sorted(colList, key=len)[0], index)
|
||||
elif Backend.getIdentifiedDbms() in (DBMS.ORACLE, DBMS.DB2):
|
||||
query = rootQuery.blind.query % (agent.preprocessField(tbl, column),
|
||||
|
@ -341,13 +344,13 @@ class Entries:
|
|||
attackDumpedTable()
|
||||
except (IOError, OSError), ex:
|
||||
errMsg = "an error occurred while attacking "
|
||||
errMsg += "table dump ('%s')" % ex.message
|
||||
errMsg += "table dump ('%s')" % getSafeExString(ex)
|
||||
logger.critical(errMsg)
|
||||
conf.dumper.dbTableValues(kb.data.dumpedTable)
|
||||
|
||||
except SqlmapConnectionException, ex:
|
||||
errMsg = "connection exception detected in dumping phase "
|
||||
errMsg += "('%s')" % ex.message
|
||||
errMsg += "('%s')" % getSafeExString(ex)
|
||||
logger.critical(errMsg)
|
||||
|
||||
finally:
|
||||
|
|
|
@ -65,7 +65,7 @@ class Search:
|
|||
|
||||
infoMsg = "searching database"
|
||||
if dbConsider == "1":
|
||||
infoMsg += "s like"
|
||||
infoMsg += "s LIKE"
|
||||
infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(db)
|
||||
logger.info(infoMsg)
|
||||
|
||||
|
@ -98,7 +98,7 @@ class Search:
|
|||
if not values and isInferenceAvailable() and not conf.direct:
|
||||
infoMsg = "fetching number of database"
|
||||
if dbConsider == "1":
|
||||
infoMsg += "s like"
|
||||
infoMsg += "s LIKE"
|
||||
infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(db)
|
||||
logger.info(infoMsg)
|
||||
|
||||
|
@ -113,7 +113,7 @@ class Search:
|
|||
if not isNumPosStrValue(count):
|
||||
warnMsg = "no database"
|
||||
if dbConsider == "1":
|
||||
warnMsg += "s like"
|
||||
warnMsg += "s LIKE"
|
||||
warnMsg += " '%s' found" % unsafeSQLIdentificatorNaming(db)
|
||||
logger.warn(warnMsg)
|
||||
|
||||
|
@ -172,7 +172,7 @@ class Search:
|
|||
|
||||
infoMsg = "searching table"
|
||||
if tblConsider == "1":
|
||||
infoMsg += "s like"
|
||||
infoMsg += "s LIKE"
|
||||
infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(tbl)
|
||||
|
||||
if dbCond and conf.db and conf.db != CURRENT_DB:
|
||||
|
@ -225,7 +225,7 @@ class Search:
|
|||
if len(whereDbsQuery) == 0:
|
||||
infoMsg = "fetching number of databases with table"
|
||||
if tblConsider == "1":
|
||||
infoMsg += "s like"
|
||||
infoMsg += "s LIKE"
|
||||
infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(tbl)
|
||||
logger.info(infoMsg)
|
||||
|
||||
|
@ -236,7 +236,7 @@ class Search:
|
|||
if not isNumPosStrValue(count):
|
||||
warnMsg = "no databases have table"
|
||||
if tblConsider == "1":
|
||||
warnMsg += "s like"
|
||||
warnMsg += "s LIKE"
|
||||
warnMsg += " '%s'" % unsafeSQLIdentificatorNaming(tbl)
|
||||
logger.warn(warnMsg)
|
||||
|
||||
|
@ -274,7 +274,7 @@ class Search:
|
|||
|
||||
infoMsg = "fetching number of table"
|
||||
if tblConsider == "1":
|
||||
infoMsg += "s like"
|
||||
infoMsg += "s LIKE"
|
||||
infoMsg += " '%s' in database '%s'" % (unsafeSQLIdentificatorNaming(tbl), unsafeSQLIdentificatorNaming(db))
|
||||
logger.info(infoMsg)
|
||||
|
||||
|
@ -288,7 +288,7 @@ class Search:
|
|||
if not isNumPosStrValue(count):
|
||||
warnMsg = "no table"
|
||||
if tblConsider == "1":
|
||||
warnMsg += "s like"
|
||||
warnMsg += "s LIKE"
|
||||
warnMsg += " '%s' " % unsafeSQLIdentificatorNaming(tbl)
|
||||
warnMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(db)
|
||||
logger.warn(warnMsg)
|
||||
|
@ -390,7 +390,7 @@ class Search:
|
|||
|
||||
infoMsg = "searching column"
|
||||
if colConsider == "1":
|
||||
infoMsg += "s like"
|
||||
infoMsg += "s LIKE"
|
||||
infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(column)
|
||||
|
||||
foundCols[column] = {}
|
||||
|
@ -468,7 +468,7 @@ class Search:
|
|||
if not conf.db:
|
||||
infoMsg = "fetching number of databases with tables containing column"
|
||||
if colConsider == "1":
|
||||
infoMsg += "s like"
|
||||
infoMsg += "s LIKE"
|
||||
infoMsg += " '%s'" % unsafeSQLIdentificatorNaming(column)
|
||||
logger.info("%s%s%s" % (infoMsg, infoMsgTbl, infoMsgDb))
|
||||
|
||||
|
@ -479,7 +479,7 @@ class Search:
|
|||
if not isNumPosStrValue(count):
|
||||
warnMsg = "no databases have tables containing column"
|
||||
if colConsider == "1":
|
||||
warnMsg += "s like"
|
||||
warnMsg += "s LIKE"
|
||||
warnMsg += " '%s'" % unsafeSQLIdentificatorNaming(column)
|
||||
logger.warn("%s%s" % (warnMsg, infoMsgTbl))
|
||||
|
||||
|
@ -519,7 +519,7 @@ class Search:
|
|||
|
||||
infoMsg = "fetching number of tables containing column"
|
||||
if colConsider == "1":
|
||||
infoMsg += "s like"
|
||||
infoMsg += "s LIKE"
|
||||
infoMsg += " '%s' in database '%s'" % (unsafeSQLIdentificatorNaming(column), unsafeSQLIdentificatorNaming(db))
|
||||
logger.info(infoMsg)
|
||||
|
||||
|
@ -533,7 +533,7 @@ class Search:
|
|||
if not isNumPosStrValue(count):
|
||||
warnMsg = "no tables contain column"
|
||||
if colConsider == "1":
|
||||
warnMsg += "s like"
|
||||
warnMsg += "s LIKE"
|
||||
warnMsg += " '%s' " % unsafeSQLIdentificatorNaming(column)
|
||||
warnMsg += "in database '%s'" % unsafeSQLIdentificatorNaming(db)
|
||||
logger.warn(warnMsg)
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
DROP TABLE IF EXISTS %RANDSTR1%;
|
||||
# https://wiki.postgresql.org/wiki/CREATE_OR_REPLACE_LANGUAGE <- if "CREATE LANGUAGE plpgsql" is required
|
||||
CREATE TABLE %RANDSTR1%(%RANDSTR2% text);
|
||||
CREATE OR REPLACE FUNCTION %RANDSTR3%()
|
||||
RETURNS VOID AS $$
|
||||
|
|
11
sqlmap.conf
11
sqlmap.conf
|
@ -93,10 +93,10 @@ authType =
|
|||
# Syntax: username:password
|
||||
authCred =
|
||||
|
||||
# HTTP Authentication PEM private key. Useful only if the target URL requires
|
||||
# HTTP Authentication PEM private/cert key file. Useful only if the target URL requires
|
||||
# PKI authentication and you have such data.
|
||||
# Syntax: key_file
|
||||
authPrivate =
|
||||
authFile =
|
||||
|
||||
# Use a proxy to connect to the target URL.
|
||||
# Syntax: (http|https|socks4|socks5)://address:port
|
||||
|
@ -708,6 +708,9 @@ scope =
|
|||
# Select tests by payloads and/or titles (e.g. ROW)
|
||||
testFilter =
|
||||
|
||||
# Skip tests by payloads and/or titles (e.g. BENCHMARK)
|
||||
testSkip =
|
||||
|
||||
# Update sqlmap.
|
||||
# Valid: True or False
|
||||
updateAll = False
|
||||
|
@ -750,6 +753,10 @@ googlePage = 1
|
|||
# Valid: True or False
|
||||
identifyWaf = False
|
||||
|
||||
# Skip heuristic detection of WAF/IPS/IDS protection.
|
||||
# Valid: True or False
|
||||
skipWaf = False
|
||||
|
||||
# Imitate smartphone through HTTP User-Agent header.
|
||||
# Valid: True or False
|
||||
mobile = False
|
||||
|
|
12
sqlmap.py
12
sqlmap.py
|
@ -25,6 +25,7 @@ from lib.controller.controller import start
|
|||
from lib.core.common import banner
|
||||
from lib.core.common import createGithubIssue
|
||||
from lib.core.common import dataToStdout
|
||||
from lib.core.common import getSafeExString
|
||||
from lib.core.common import getUnicode
|
||||
from lib.core.common import maskSensitiveData
|
||||
from lib.core.common import setPaths
|
||||
|
@ -76,7 +77,7 @@ def main():
|
|||
errMsg = "your system does not properly handle non-ASCII paths. "
|
||||
errMsg += "Please move the sqlmap's directory to the other location"
|
||||
logger.error(errMsg)
|
||||
exit()
|
||||
raise SystemExit
|
||||
|
||||
setPaths()
|
||||
|
||||
|
@ -119,9 +120,9 @@ def main():
|
|||
cmdLineOptions.sqlmapShell = False
|
||||
|
||||
except SqlmapBaseException as ex:
|
||||
errMsg = getUnicode(ex.message)
|
||||
errMsg = getSafeExString(ex)
|
||||
logger.critical(errMsg)
|
||||
sys.exit(1)
|
||||
raise SystemExit
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print
|
||||
|
@ -141,6 +142,11 @@ def main():
|
|||
errMsg = unhandledExceptionMessage()
|
||||
excMsg = traceback.format_exc()
|
||||
|
||||
if "No space left" in excMsg:
|
||||
errMsg = "no space left on output device"
|
||||
logger.error(errMsg)
|
||||
raise SystemExit
|
||||
|
||||
for match in re.finditer(r'File "(.+?)", line', excMsg):
|
||||
file_ = match.group(1)
|
||||
file_ = os.path.relpath(file_, os.path.dirname(__file__))
|
||||
|
|
|
@ -35,15 +35,9 @@ def tamper(payload, **kwargs):
|
|||
'SELECT * FROM users WHERE id LIKE 1'
|
||||
"""
|
||||
|
||||
def process(match):
|
||||
word = match.group()
|
||||
word = "%sLIKE%s" % (" " if word[0] != " " else "", " " if word[-1] != " " else "")
|
||||
|
||||
return word
|
||||
|
||||
retVal = payload
|
||||
|
||||
if payload:
|
||||
retVal = re.sub(r"\s*=\s*", lambda match: process(match), retVal)
|
||||
retVal = re.sub(r"\s*=\s*", " LIKE ", retVal)
|
||||
|
||||
return retVal
|
||||
|
|
|
@ -19,7 +19,7 @@ def tamper(payload, **kwargs):
|
|||
Replaces AND and OR logical operators with their symbolic counterparts (&& and ||)
|
||||
|
||||
>>> tamper("1 AND '1'='1")
|
||||
'1 && '1'='1'
|
||||
"1 %26%26 '1'='1"
|
||||
"""
|
||||
|
||||
retVal = payload
|
||||
|
|
46
tamper/uppercase.py
Normal file
46
tamper/uppercase.py
Normal file
|
@ -0,0 +1,46 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
"""
|
||||
Copyright (c) 2006-2015 sqlmap developers (http://sqlmap.org/)
|
||||
See the file 'doc/COPYING' for copying permission
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from lib.core.data import kb
|
||||
from lib.core.enums import PRIORITY
|
||||
|
||||
__priority__ = PRIORITY.NORMAL
|
||||
|
||||
def dependencies():
    """
    Intentionally empty hook: performs no checks and returns None
    """

    return None
|
||||
|
||||
def tamper(payload, **kwargs):
    """
    Replaces each keyword character with upper case value

    Tested against:
        * Microsoft SQL Server 2005
        * MySQL 4, 5.0 and 5.5
        * Oracle 10g
        * PostgreSQL 8.3, 8.4, 9.0

    Notes:
        * Useful to bypass very weak and bespoke web application firewalls
          that have poorly written permissive regular expressions
        * This tamper script should work against all (?) databases
        * Only whole words found in kb.keywords are changed; the same
          letters occurring inside a longer token are left untouched

    >>> tamper('insert')
    'INSERT'
    """

    def _upper(match):
        # Decide per matched word, so that upper-casing a keyword (e.g. "or")
        # can never leak into other tokens that merely contain those letters
        # (e.g. "information_schema"), which a plain str.replace() would do
        word = match.group()
        upper = word.upper()
        return upper if upper in kb.keywords else word

    retVal = payload

    # Empty/None payloads are returned as they are
    if payload:
        retVal = re.sub(r"[A-Za-z_]+", _upper, payload)

    return retVal
|
14
thirdparty/chardet/__init__.py
vendored
14
thirdparty/chardet/__init__.py
vendored
|
@ -3,22 +3,28 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
__version__ = "2.0.1"
|
||||
__version__ = "2.3.0"
|
||||
from sys import version_info
|
||||
|
||||
|
||||
def detect(aBuf):
|
||||
import universaldetector
|
||||
if ((version_info < (3, 0) and isinstance(aBuf, unicode)) or
|
||||
(version_info >= (3, 0) and not isinstance(aBuf, bytes))):
|
||||
raise ValueError('Expected a bytes object, not a unicode object')
|
||||
|
||||
from . import universaldetector
|
||||
u = universaldetector.UniversalDetector()
|
||||
u.reset()
|
||||
u.feed(aBuf)
|
||||
|
|
20
thirdparty/chardet/big5freq.py
vendored
20
thirdparty/chardet/big5freq.py
vendored
|
@ -1,11 +1,11 @@
|
|||
######################## BEGIN LICENSE BLOCK ########################
|
||||
# The Original Code is Mozilla Communicator client code.
|
||||
#
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 1998
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
#
|
||||
# Contributor(s):
|
||||
# Mark Pilgrim - port to Python
|
||||
#
|
||||
|
@ -13,12 +13,12 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
|
@ -26,18 +26,18 @@
|
|||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
# Big5 frequency table
|
||||
# by Taiwan's Mandarin Promotion Council
|
||||
# by Taiwan's Mandarin Promotion Council
|
||||
# <http://www.edu.tw:81/mandr/>
|
||||
#
|
||||
#
|
||||
# 128 --> 0.42261
|
||||
# 256 --> 0.57851
|
||||
# 512 --> 0.74851
|
||||
# 1024 --> 0.89384
|
||||
# 2048 --> 0.97583
|
||||
#
|
||||
#
|
||||
# Ideal Distribution Ratio = 0.74851/(1-0.74851) =2.98
|
||||
# Random Distribution Ratio = 512/(5401-512)=0.105
|
||||
#
|
||||
#
|
||||
# Typical Distribution Ratio about 25% of Ideal one, still much higher than RDR
|
||||
|
||||
BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75
|
||||
|
@ -45,7 +45,7 @@ BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75
|
|||
#Char to FreqOrder table
|
||||
BIG5_TABLE_SIZE = 5376
|
||||
|
||||
Big5CharToFreqOrder = ( \
|
||||
Big5CharToFreqOrder = (
|
||||
1,1801,1506, 255,1431, 198, 9, 82, 6,5008, 177, 202,3681,1256,2821, 110, # 16
|
||||
3814, 33,3274, 261, 76, 44,2114, 16,2946,2187,1176, 659,3971, 26,3451,2653, # 32
|
||||
1198,3972,3350,4202, 410,2215, 302, 590, 361,1964, 8, 204, 58,4510,5009,1932, # 48
|
||||
|
@ -921,3 +921,5 @@ Big5CharToFreqOrder = ( \
|
|||
13936,13937,13938,13939,13940,13941,13942,13943,13944,13945,13946,13947,13948,13949,13950,13951, #13952
|
||||
13952,13953,13954,13955,13956,13957,13958,13959,13960,13961,13962,13963,13964,13965,13966,13967, #13968
|
||||
13968,13969,13970,13971,13972) #13973
|
||||
|
||||
# flake8: noqa
|
||||
|
|
17
thirdparty/chardet/big5prober.py
vendored
17
thirdparty/chardet/big5prober.py
vendored
|
@ -1,11 +1,11 @@
|
|||
######################## BEGIN LICENSE BLOCK ########################
|
||||
# The Original Code is Mozilla Communicator client code.
|
||||
#
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 1998
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
#
|
||||
# Contributor(s):
|
||||
# Mark Pilgrim - port to Python
|
||||
#
|
||||
|
@ -13,22 +13,23 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from mbcharsetprober import MultiByteCharSetProber
|
||||
from codingstatemachine import CodingStateMachine
|
||||
from chardistribution import Big5DistributionAnalysis
|
||||
from mbcssm import Big5SMModel
|
||||
from .mbcharsetprober import MultiByteCharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .chardistribution import Big5DistributionAnalysis
|
||||
from .mbcssm import Big5SMModel
|
||||
|
||||
|
||||
class Big5Prober(MultiByteCharSetProber):
|
||||
def __init__(self):
|
||||
|
|
80
thirdparty/chardet/chardetect.py
vendored
Normal file
80
thirdparty/chardet/chardetect.py
vendored
Normal file
|
@ -0,0 +1,80 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Script which takes one or more file paths and reports on their detected
|
||||
encodings
|
||||
|
||||
Example::
|
||||
|
||||
% chardetect somefile someotherfile
|
||||
somefile: windows-1252 with confidence 0.5
|
||||
someotherfile: ascii with confidence 1.0
|
||||
|
||||
If no paths are provided, it takes its input from stdin.
|
||||
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import, print_function, unicode_literals
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from io import open
|
||||
|
||||
from chardet import __version__
|
||||
from chardet.universaldetector import UniversalDetector
|
||||
|
||||
|
||||
def description_of(lines, name='stdin'):
    """
    Build a one-line, human readable report of the encoding detected for
    a file or list of strings.

    :param lines: The lines to get the encoding of.
    :type lines: Iterable of bytes
    :param name: Name of file or collection of lines
    :type name: str
    """
    detector = UniversalDetector()
    for chunk in lines:
        detector.feed(chunk)
    detector.close()

    outcome = detector.result
    encoding = outcome['encoding']

    # Guard clause: nothing was detected, so there is no confidence to report
    if not encoding:
        return '{0}: no result'.format(name)

    return '{0}: {1} with confidence {2}'.format(name, encoding,
                                                 outcome['confidence'])
|
||||
|
||||
|
||||
def main(argv=None):
    '''
    Handles command line arguments and gets things started.

    Every input file is passed to ``description_of`` and the resulting
    report line is printed to stdout.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    '''
    # Named inputs are opened in 'rb' mode, so the stdin fallback has to be a
    # byte stream too. Where sys.stdin exposes an underlying binary ``buffer``
    # (Python 3) use it; otherwise keep sys.stdin itself.
    stdin = getattr(sys.stdin, 'buffer', sys.stdin)

    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Takes one or more file paths and reports their detected "
                    "encodings",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        conflict_handler='resolve')
    parser.add_argument('input',
                        help='File whose encoding we would like to determine.',
                        type=argparse.FileType('rb'), nargs='*',
                        default=[stdin])
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    for f in args.input:
        # Reading from a terminal: tell the user how to finish the input
        if f.isatty():
            print("You are running chardetect interactively. Press " +
                  "CTRL-D twice at the start of a blank line to signal the " +
                  "end of your input. If you want help, run chardetect " +
                  "--help\n", file=sys.stderr)
        print(description_of(f, f.name))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
153
thirdparty/chardet/chardistribution.py
vendored
153
thirdparty/chardet/chardistribution.py
vendored
|
@ -1,11 +1,11 @@
|
|||
######################## BEGIN LICENSE BLOCK ########################
|
||||
# The Original Code is Mozilla Communicator client code.
|
||||
#
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 1998
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
#
|
||||
# Contributor(s):
|
||||
# Mark Pilgrim - port to Python
|
||||
#
|
||||
|
@ -13,47 +13,63 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants
|
||||
from euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO
|
||||
from euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO
|
||||
from gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO
|
||||
from big5freq import Big5CharToFreqOrder, BIG5_TABLE_SIZE, BIG5_TYPICAL_DISTRIBUTION_RATIO
|
||||
from jisfreq import JISCharToFreqOrder, JIS_TABLE_SIZE, JIS_TYPICAL_DISTRIBUTION_RATIO
|
||||
from .euctwfreq import (EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE,
|
||||
EUCTW_TYPICAL_DISTRIBUTION_RATIO)
|
||||
from .euckrfreq import (EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE,
|
||||
EUCKR_TYPICAL_DISTRIBUTION_RATIO)
|
||||
from .gb2312freq import (GB2312CharToFreqOrder, GB2312_TABLE_SIZE,
|
||||
GB2312_TYPICAL_DISTRIBUTION_RATIO)
|
||||
from .big5freq import (Big5CharToFreqOrder, BIG5_TABLE_SIZE,
|
||||
BIG5_TYPICAL_DISTRIBUTION_RATIO)
|
||||
from .jisfreq import (JISCharToFreqOrder, JIS_TABLE_SIZE,
|
||||
JIS_TYPICAL_DISTRIBUTION_RATIO)
|
||||
from .compat import wrap_ord
|
||||
|
||||
ENOUGH_DATA_THRESHOLD = 1024
|
||||
SURE_YES = 0.99
|
||||
SURE_NO = 0.01
|
||||
MINIMUM_DATA_THRESHOLD = 3
|
||||
|
||||
|
||||
class CharDistributionAnalysis:
|
||||
def __init__(self):
|
||||
self._mCharToFreqOrder = None # Mapping table to get frequency order from char order (get from GetOrder())
|
||||
self._mTableSize = None # Size of above table
|
||||
self._mTypicalDistributionRatio = None # This is a constant value which varies from language to language, used in calculating confidence. See http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html for further detail.
|
||||
# Mapping table to get frequency order from char order (get from
|
||||
# GetOrder())
|
||||
self._mCharToFreqOrder = None
|
||||
self._mTableSize = None # Size of above table
|
||||
# This is a constant value which varies from language to language,
|
||||
# used in calculating confidence. See
|
||||
# http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
|
||||
# for further detail.
|
||||
self._mTypicalDistributionRatio = None
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
"""reset analyser, clear any state"""
|
||||
self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made
|
||||
self._mTotalChars = 0 # Total characters encountered
|
||||
self._mFreqChars = 0 # The number of characters whose frequency order is less than 512
|
||||
# If this flag is set to True, detection is done and conclusion has
|
||||
# been made
|
||||
self._mDone = False
|
||||
self._mTotalChars = 0 # Total characters encountered
|
||||
# The number of characters whose frequency order is less than 512
|
||||
self._mFreqChars = 0
|
||||
|
||||
def feed(self, aStr, aCharLen):
|
||||
def feed(self, aBuf, aCharLen):
|
||||
"""feed a character with known length"""
|
||||
if aCharLen == 2:
|
||||
# we only care about 2-bytes character in our distribution analysis
|
||||
order = self.get_order(aStr)
|
||||
order = self.get_order(aBuf)
|
||||
else:
|
||||
order = -1
|
||||
if order >= 0:
|
||||
|
@ -65,12 +81,14 @@ class CharDistributionAnalysis:
|
|||
|
||||
def get_confidence(self):
|
||||
"""return confidence based on existing data"""
|
||||
# if we didn't receive any character in our consideration range, return negative answer
|
||||
if self._mTotalChars <= 0:
|
||||
# if we didn't receive any character in our consideration range,
|
||||
# return negative answer
|
||||
if self._mTotalChars <= 0 or self._mFreqChars <= MINIMUM_DATA_THRESHOLD:
|
||||
return SURE_NO
|
||||
|
||||
if self._mTotalChars != self._mFreqChars:
|
||||
r = self._mFreqChars / ((self._mTotalChars - self._mFreqChars) * self._mTypicalDistributionRatio)
|
||||
r = (self._mFreqChars / ((self._mTotalChars - self._mFreqChars)
|
||||
* self._mTypicalDistributionRatio))
|
||||
if r < SURE_YES:
|
||||
return r
|
||||
|
||||
|
@ -78,16 +96,18 @@ class CharDistributionAnalysis:
|
|||
return SURE_YES
|
||||
|
||||
def got_enough_data(self):
|
||||
# It is not necessary to receive all data to draw conclusion. For charset detection,
|
||||
# certain amount of data is enough
|
||||
# It is not necessary to receive all data to draw conclusion.
|
||||
# For charset detection, certain amount of data is enough
|
||||
return self._mTotalChars > ENOUGH_DATA_THRESHOLD
|
||||
|
||||
def get_order(self, aStr):
|
||||
# We do not handle characters based on the original encoding string, but
|
||||
# convert this encoding string to a number, here called order.
|
||||
# This allows multiple encodings of a language to share one frequency table.
|
||||
def get_order(self, aBuf):
|
||||
# We do not handle characters based on the original encoding string,
|
||||
# but convert this encoding string to a number, here called order.
|
||||
# This allows multiple encodings of a language to share one frequency
|
||||
# table.
|
||||
return -1
|
||||
|
||||
|
||||
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
|
||||
def __init__(self):
|
||||
CharDistributionAnalysis.__init__(self)
|
||||
|
@ -95,16 +115,18 @@ class EUCTWDistributionAnalysis(CharDistributionAnalysis):
|
|||
self._mTableSize = EUCTW_TABLE_SIZE
|
||||
self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
|
||||
|
||||
def get_order(self, aStr):
|
||||
# for euc-TW encoding, we are interested
|
||||
def get_order(self, aBuf):
|
||||
# for euc-TW encoding, we are interested
|
||||
# first byte range: 0xc4 -- 0xfe
|
||||
# second byte range: 0xa1 -- 0xfe
|
||||
# no validation needed here. State machine has done that
|
||||
if aStr[0] >= '\xC4':
|
||||
return 94 * (ord(aStr[0]) - 0xC4) + ord(aStr[1]) - 0xA1
|
||||
first_char = wrap_ord(aBuf[0])
|
||||
if first_char >= 0xC4:
|
||||
return 94 * (first_char - 0xC4) + wrap_ord(aBuf[1]) - 0xA1
|
||||
else:
|
||||
return -1
|
||||
|
||||
|
||||
class EUCKRDistributionAnalysis(CharDistributionAnalysis):
|
||||
def __init__(self):
|
||||
CharDistributionAnalysis.__init__(self)
|
||||
|
@ -112,15 +134,17 @@ class EUCKRDistributionAnalysis(CharDistributionAnalysis):
|
|||
self._mTableSize = EUCKR_TABLE_SIZE
|
||||
self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
|
||||
|
||||
def get_order(self, aStr):
|
||||
# for euc-KR encoding, we are interested
|
||||
def get_order(self, aBuf):
|
||||
# for euc-KR encoding, we are interested
|
||||
# first byte range: 0xb0 -- 0xfe
|
||||
# second byte range: 0xa1 -- 0xfe
|
||||
# no validation needed here. State machine has done that
|
||||
if aStr[0] >= '\xB0':
|
||||
return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1
|
||||
first_char = wrap_ord(aBuf[0])
|
||||
if first_char >= 0xB0:
|
||||
return 94 * (first_char - 0xB0) + wrap_ord(aBuf[1]) - 0xA1
|
||||
else:
|
||||
return -1;
|
||||
return -1
|
||||
|
||||
|
||||
class GB2312DistributionAnalysis(CharDistributionAnalysis):
|
||||
def __init__(self):
|
||||
|
@ -129,15 +153,17 @@ class GB2312DistributionAnalysis(CharDistributionAnalysis):
|
|||
self._mTableSize = GB2312_TABLE_SIZE
|
||||
self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
|
||||
|
||||
def get_order(self, aStr):
|
||||
# for GB2312 encoding, we are interested
|
||||
def get_order(self, aBuf):
|
||||
# for GB2312 encoding, we are interested
|
||||
# first byte range: 0xb0 -- 0xfe
|
||||
# second byte range: 0xa1 -- 0xfe
|
||||
# no validation needed here. State machine has done that
|
||||
if (aStr[0] >= '\xB0') and (aStr[1] >= '\xA1'):
|
||||
return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1
|
||||
first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
|
||||
if (first_char >= 0xB0) and (second_char >= 0xA1):
|
||||
return 94 * (first_char - 0xB0) + second_char - 0xA1
|
||||
else:
|
||||
return -1;
|
||||
return -1
|
||||
|
||||
|
||||
class Big5DistributionAnalysis(CharDistributionAnalysis):
|
||||
def __init__(self):
|
||||
|
@ -146,19 +172,21 @@ class Big5DistributionAnalysis(CharDistributionAnalysis):
|
|||
self._mTableSize = BIG5_TABLE_SIZE
|
||||
self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
|
||||
|
||||
def get_order(self, aStr):
|
||||
# for big5 encoding, we are interested
|
||||
def get_order(self, aBuf):
|
||||
# for big5 encoding, we are interested
|
||||
# first byte range: 0xa4 -- 0xfe
|
||||
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
|
||||
# no validation needed here. State machine has done that
|
||||
if aStr[0] >= '\xA4':
|
||||
if aStr[1] >= '\xA1':
|
||||
return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0xA1 + 63
|
||||
first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
|
||||
if first_char >= 0xA4:
|
||||
if second_char >= 0xA1:
|
||||
return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63
|
||||
else:
|
||||
return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0x40
|
||||
return 157 * (first_char - 0xA4) + second_char - 0x40
|
||||
else:
|
||||
return -1
|
||||
|
||||
|
||||
class SJISDistributionAnalysis(CharDistributionAnalysis):
|
||||
def __init__(self):
|
||||
CharDistributionAnalysis.__init__(self)
|
||||
|
@ -166,22 +194,24 @@ class SJISDistributionAnalysis(CharDistributionAnalysis):
|
|||
self._mTableSize = JIS_TABLE_SIZE
|
||||
self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
|
||||
|
||||
def get_order(self, aStr):
|
||||
# for sjis encoding, we are interested
|
||||
def get_order(self, aBuf):
|
||||
# for sjis encoding, we are interested
|
||||
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
|
||||
# second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
|
||||
# no validation needed here. State machine has done that
|
||||
if (aStr[0] >= '\x81') and (aStr[0] <= '\x9F'):
|
||||
order = 188 * (ord(aStr[0]) - 0x81)
|
||||
elif (aStr[0] >= '\xE0') and (aStr[0] <= '\xEF'):
|
||||
order = 188 * (ord(aStr[0]) - 0xE0 + 31)
|
||||
first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
|
||||
if (first_char >= 0x81) and (first_char <= 0x9F):
|
||||
order = 188 * (first_char - 0x81)
|
||||
elif (first_char >= 0xE0) and (first_char <= 0xEF):
|
||||
order = 188 * (first_char - 0xE0 + 31)
|
||||
else:
|
||||
return -1;
|
||||
order = order + ord(aStr[1]) - 0x40
|
||||
if aStr[1] > '\x7F':
|
||||
order =- 1
|
||||
return -1
|
||||
order = order + second_char - 0x40
|
||||
if second_char > 0x7F:
|
||||
order = -1
|
||||
return order
|
||||
|
||||
|
||||
class EUCJPDistributionAnalysis(CharDistributionAnalysis):
|
||||
def __init__(self):
|
||||
CharDistributionAnalysis.__init__(self)
|
||||
|
@ -189,12 +219,13 @@ class EUCJPDistributionAnalysis(CharDistributionAnalysis):
|
|||
self._mTableSize = JIS_TABLE_SIZE
|
||||
self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
|
||||
|
||||
def get_order(self, aStr):
|
||||
# for euc-JP encoding, we are interested
|
||||
def get_order(self, aBuf):
|
||||
# for euc-JP encoding, we are interested
|
||||
# first byte range: 0xa0 -- 0xfe
|
||||
# second byte range: 0xa1 -- 0xfe
|
||||
# no validation needed here. State machine has done that
|
||||
if aStr[0] >= '\xA0':
|
||||
return 94 * (ord(aStr[0]) - 0xA1) + ord(aStr[1]) - 0xa1
|
||||
char = wrap_ord(aBuf[0])
|
||||
if char >= 0xA0:
|
||||
return 94 * (char - 0xA1) + wrap_ord(aBuf[1]) - 0xa1
|
||||
else:
|
||||
return -1
|
||||
|
|
34
thirdparty/chardet/charsetgroupprober.py
vendored
34
thirdparty/chardet/charsetgroupprober.py
vendored
|
@ -25,8 +25,10 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants, sys
|
||||
from charsetprober import CharSetProber
|
||||
from . import constants
|
||||
import sys
|
||||
from .charsetprober import CharSetProber
|
||||
|
||||
|
||||
class CharSetGroupProber(CharSetProber):
|
||||
def __init__(self):
|
||||
|
@ -41,28 +43,32 @@ class CharSetGroupProber(CharSetProber):
|
|||
for prober in self._mProbers:
|
||||
if prober:
|
||||
prober.reset()
|
||||
prober.active = constants.True
|
||||
prober.active = True
|
||||
self._mActiveNum += 1
|
||||
self._mBestGuessProber = None
|
||||
|
||||
def get_charset_name(self):
|
||||
if not self._mBestGuessProber:
|
||||
self.get_confidence()
|
||||
if not self._mBestGuessProber: return None
|
||||
if not self._mBestGuessProber:
|
||||
return None
|
||||
# self._mBestGuessProber = self._mProbers[0]
|
||||
return self._mBestGuessProber.get_charset_name()
|
||||
|
||||
def feed(self, aBuf):
|
||||
for prober in self._mProbers:
|
||||
if not prober: continue
|
||||
if not prober.active: continue
|
||||
if not prober:
|
||||
continue
|
||||
if not prober.active:
|
||||
continue
|
||||
st = prober.feed(aBuf)
|
||||
if not st: continue
|
||||
if not st:
|
||||
continue
|
||||
if st == constants.eFoundIt:
|
||||
self._mBestGuessProber = prober
|
||||
return self.get_state()
|
||||
elif st == constants.eNotMe:
|
||||
prober.active = constants.False
|
||||
prober.active = False
|
||||
self._mActiveNum -= 1
|
||||
if self._mActiveNum <= 0:
|
||||
self._mState = constants.eNotMe
|
||||
|
@ -78,18 +84,22 @@ class CharSetGroupProber(CharSetProber):
|
|||
bestConf = 0.0
|
||||
self._mBestGuessProber = None
|
||||
for prober in self._mProbers:
|
||||
if not prober: continue
|
||||
if not prober:
|
||||
continue
|
||||
if not prober.active:
|
||||
if constants._debug:
|
||||
sys.stderr.write(prober.get_charset_name() + ' not active\n')
|
||||
sys.stderr.write(prober.get_charset_name()
|
||||
+ ' not active\n')
|
||||
continue
|
||||
cf = prober.get_confidence()
|
||||
if constants._debug:
|
||||
sys.stderr.write('%s confidence = %s\n' % (prober.get_charset_name(), cf))
|
||||
sys.stderr.write('%s confidence = %s\n' %
|
||||
(prober.get_charset_name(), cf))
|
||||
if bestConf < cf:
|
||||
bestConf = cf
|
||||
self._mBestGuessProber = prober
|
||||
if not self._mBestGuessProber: return 0.0
|
||||
if not self._mBestGuessProber:
|
||||
return 0.0
|
||||
return bestConf
|
||||
# else:
|
||||
# self._mBestGuessProber = self._mProbers[0]
|
||||
|
|
16
thirdparty/chardet/charsetprober.py
vendored
16
thirdparty/chardet/charsetprober.py
vendored
|
@ -1,11 +1,11 @@
|
|||
######################## BEGIN LICENSE BLOCK ########################
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
#
|
||||
# Contributor(s):
|
||||
# Mark Pilgrim - port to Python
|
||||
# Shy Shalom - original C code
|
||||
|
@ -14,19 +14,21 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants, re
|
||||
from . import constants
|
||||
import re
|
||||
|
||||
|
||||
class CharSetProber:
|
||||
def __init__(self):
|
||||
|
@ -48,11 +50,11 @@ class CharSetProber:
|
|||
return 0.0
|
||||
|
||||
def filter_high_bit_only(self, aBuf):
|
||||
aBuf = re.sub(r'([\x00-\x7F])+', ' ', aBuf)
|
||||
aBuf = re.sub(b'([\x00-\x7F])+', b' ', aBuf)
|
||||
return aBuf
|
||||
|
||||
def filter_without_english_letters(self, aBuf):
|
||||
aBuf = re.sub(r'([A-Za-z])+', ' ', aBuf)
|
||||
aBuf = re.sub(b'([A-Za-z])+', b' ', aBuf)
|
||||
return aBuf
|
||||
|
||||
def filter_with_english_letters(self, aBuf):
|
||||
|
|
15
thirdparty/chardet/codingstatemachine.py
vendored
15
thirdparty/chardet/codingstatemachine.py
vendored
|
@ -13,19 +13,21 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from constants import eStart, eError, eItsMe
|
||||
from .constants import eStart
|
||||
from .compat import wrap_ord
|
||||
|
||||
|
||||
class CodingStateMachine:
|
||||
def __init__(self, sm):
|
||||
|
@ -40,12 +42,15 @@ class CodingStateMachine:
|
|||
def next_state(self, c):
|
||||
# for each byte we get its class
|
||||
# if it is first byte, we also get byte length
|
||||
byteCls = self._mModel['classTable'][ord(c)]
|
||||
# PY3K: aBuf is a byte stream, so c is an int, not a byte
|
||||
byteCls = self._mModel['classTable'][wrap_ord(c)]
|
||||
if self._mCurrentState == eStart:
|
||||
self._mCurrentBytePos = 0
|
||||
self._mCurrentCharLen = self._mModel['charLenTable'][byteCls]
|
||||
# from byte's class and stateTable, we get its next state
|
||||
self._mCurrentState = self._mModel['stateTable'][self._mCurrentState * self._mModel['classFactor'] + byteCls]
|
||||
curr_state = (self._mCurrentState * self._mModel['classFactor']
|
||||
+ byteCls)
|
||||
self._mCurrentState = self._mModel['stateTable'][curr_state]
|
||||
self._mCurrentBytePos += 1
|
||||
return self._mCurrentState
|
||||
|
||||
|
|
34
thirdparty/chardet/compat.py
vendored
Normal file
34
thirdparty/chardet/compat.py
vendored
Normal file
|
@ -0,0 +1,34 @@
|
|||
######################## BEGIN LICENSE BLOCK ########################
|
||||
# Contributor(s):
|
||||
# Ian Cordasco - port to Python
|
||||
#
|
||||
# This library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import sys
|
||||
|
||||
|
||||
if sys.version_info < (3, 0):
|
||||
base_str = (str, unicode)
|
||||
else:
|
||||
base_str = (bytes, str)
|
||||
|
||||
|
||||
def wrap_ord(a):
|
||||
if sys.version_info < (3, 0) and isinstance(a, base_str):
|
||||
return ord(a)
|
||||
else:
|
||||
return a
|
8
thirdparty/chardet/constants.py
vendored
8
thirdparty/chardet/constants.py
vendored
|
@ -37,11 +37,3 @@ eError = 1
|
|||
eItsMe = 2
|
||||
|
||||
SHORTCUT_THRESHOLD = 0.95
|
||||
|
||||
import __builtin__
|
||||
if not hasattr(__builtin__, 'False'):
|
||||
False = 0
|
||||
True = 1
|
||||
else:
|
||||
False = __builtin__.False
|
||||
True = __builtin__.True
|
||||
|
|
44
thirdparty/chardet/cp949prober.py
vendored
Normal file
44
thirdparty/chardet/cp949prober.py
vendored
Normal file
|
@ -0,0 +1,44 @@
|
|||
######################## BEGIN LICENSE BLOCK ########################
|
||||
# The Original Code is mozilla.org code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 1998
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Mark Pilgrim - port to Python
|
||||
#
|
||||
# This library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from .mbcharsetprober import MultiByteCharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .chardistribution import EUCKRDistributionAnalysis
|
||||
from .mbcssm import CP949SMModel
|
||||
|
||||
|
||||
class CP949Prober(MultiByteCharSetProber):
|
||||
def __init__(self):
|
||||
MultiByteCharSetProber.__init__(self)
|
||||
self._mCodingSM = CodingStateMachine(CP949SMModel)
|
||||
# NOTE: CP949 is a superset of EUC-KR, so the distribution should be
|
||||
# not different.
|
||||
self._mDistributionAnalyzer = EUCKRDistributionAnalysis()
|
||||
self.reset()
|
||||
|
||||
def get_charset_name(self):
|
||||
return "CP949"
|
37
thirdparty/chardet/escprober.py
vendored
37
thirdparty/chardet/escprober.py
vendored
|
@ -13,39 +13,43 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants, sys
|
||||
from escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel
|
||||
from charsetprober import CharSetProber
|
||||
from codingstatemachine import CodingStateMachine
|
||||
from . import constants
|
||||
from .escsm import (HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel,
|
||||
ISO2022KRSMModel)
|
||||
from .charsetprober import CharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .compat import wrap_ord
|
||||
|
||||
|
||||
class EscCharSetProber(CharSetProber):
|
||||
def __init__(self):
|
||||
CharSetProber.__init__(self)
|
||||
self._mCodingSM = [ \
|
||||
self._mCodingSM = [
|
||||
CodingStateMachine(HZSMModel),
|
||||
CodingStateMachine(ISO2022CNSMModel),
|
||||
CodingStateMachine(ISO2022JPSMModel),
|
||||
CodingStateMachine(ISO2022KRSMModel)
|
||||
]
|
||||
]
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
CharSetProber.reset(self)
|
||||
for codingSM in self._mCodingSM:
|
||||
if not codingSM: continue
|
||||
codingSM.active = constants.True
|
||||
if not codingSM:
|
||||
continue
|
||||
codingSM.active = True
|
||||
codingSM.reset()
|
||||
self._mActiveSM = len(self._mCodingSM)
|
||||
self._mDetectedCharset = None
|
||||
|
@ -61,19 +65,22 @@ class EscCharSetProber(CharSetProber):
|
|||
|
||||
def feed(self, aBuf):
|
||||
for c in aBuf:
|
||||
# PY3K: aBuf is a byte array, so c is an int, not a byte
|
||||
for codingSM in self._mCodingSM:
|
||||
if not codingSM: continue
|
||||
if not codingSM.active: continue
|
||||
codingState = codingSM.next_state(c)
|
||||
if not codingSM:
|
||||
continue
|
||||
if not codingSM.active:
|
||||
continue
|
||||
codingState = codingSM.next_state(wrap_ord(c))
|
||||
if codingState == constants.eError:
|
||||
codingSM.active = constants.False
|
||||
codingSM.active = False
|
||||
self._mActiveSM -= 1
|
||||
if self._mActiveSM <= 0:
|
||||
self._mState = constants.eNotMe
|
||||
return self.get_state()
|
||||
elif codingState == constants.eItsMe:
|
||||
self._mState = constants.eFoundIt
|
||||
self._mDetectedCharset = codingSM.get_coding_state_machine()
|
||||
self._mDetectedCharset = codingSM.get_coding_state_machine() # nopep8
|
||||
return self.get_state()
|
||||
|
||||
return self.get_state()
|
||||
|
|
336
thirdparty/chardet/escsm.py
vendored
336
thirdparty/chardet/escsm.py
vendored
|
@ -13,62 +13,62 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from constants import eStart, eError, eItsMe
|
||||
from .constants import eStart, eError, eItsMe
|
||||
|
||||
HZ_cls = ( \
|
||||
1,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,0,0,0,0,0,0, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
0,0,0,1,0,0,0,0, # 18 - 1f
|
||||
0,0,0,0,0,0,0,0, # 20 - 27
|
||||
0,0,0,0,0,0,0,0, # 28 - 2f
|
||||
0,0,0,0,0,0,0,0, # 30 - 37
|
||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
||||
0,0,0,0,0,0,0,0, # 40 - 47
|
||||
0,0,0,0,0,0,0,0, # 48 - 4f
|
||||
0,0,0,0,0,0,0,0, # 50 - 57
|
||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
||||
0,0,0,0,0,0,0,0, # 60 - 67
|
||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
||||
0,0,0,0,0,0,0,0, # 70 - 77
|
||||
0,0,0,4,0,5,2,0, # 78 - 7f
|
||||
1,1,1,1,1,1,1,1, # 80 - 87
|
||||
1,1,1,1,1,1,1,1, # 88 - 8f
|
||||
1,1,1,1,1,1,1,1, # 90 - 97
|
||||
1,1,1,1,1,1,1,1, # 98 - 9f
|
||||
1,1,1,1,1,1,1,1, # a0 - a7
|
||||
1,1,1,1,1,1,1,1, # a8 - af
|
||||
1,1,1,1,1,1,1,1, # b0 - b7
|
||||
1,1,1,1,1,1,1,1, # b8 - bf
|
||||
1,1,1,1,1,1,1,1, # c0 - c7
|
||||
1,1,1,1,1,1,1,1, # c8 - cf
|
||||
1,1,1,1,1,1,1,1, # d0 - d7
|
||||
1,1,1,1,1,1,1,1, # d8 - df
|
||||
1,1,1,1,1,1,1,1, # e0 - e7
|
||||
1,1,1,1,1,1,1,1, # e8 - ef
|
||||
1,1,1,1,1,1,1,1, # f0 - f7
|
||||
1,1,1,1,1,1,1,1, # f8 - ff
|
||||
HZ_cls = (
|
||||
1,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,0,0,0,0,0,0, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
0,0,0,1,0,0,0,0, # 18 - 1f
|
||||
0,0,0,0,0,0,0,0, # 20 - 27
|
||||
0,0,0,0,0,0,0,0, # 28 - 2f
|
||||
0,0,0,0,0,0,0,0, # 30 - 37
|
||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
||||
0,0,0,0,0,0,0,0, # 40 - 47
|
||||
0,0,0,0,0,0,0,0, # 48 - 4f
|
||||
0,0,0,0,0,0,0,0, # 50 - 57
|
||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
||||
0,0,0,0,0,0,0,0, # 60 - 67
|
||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
||||
0,0,0,0,0,0,0,0, # 70 - 77
|
||||
0,0,0,4,0,5,2,0, # 78 - 7f
|
||||
1,1,1,1,1,1,1,1, # 80 - 87
|
||||
1,1,1,1,1,1,1,1, # 88 - 8f
|
||||
1,1,1,1,1,1,1,1, # 90 - 97
|
||||
1,1,1,1,1,1,1,1, # 98 - 9f
|
||||
1,1,1,1,1,1,1,1, # a0 - a7
|
||||
1,1,1,1,1,1,1,1, # a8 - af
|
||||
1,1,1,1,1,1,1,1, # b0 - b7
|
||||
1,1,1,1,1,1,1,1, # b8 - bf
|
||||
1,1,1,1,1,1,1,1, # c0 - c7
|
||||
1,1,1,1,1,1,1,1, # c8 - cf
|
||||
1,1,1,1,1,1,1,1, # d0 - d7
|
||||
1,1,1,1,1,1,1,1, # d8 - df
|
||||
1,1,1,1,1,1,1,1, # e0 - e7
|
||||
1,1,1,1,1,1,1,1, # e8 - ef
|
||||
1,1,1,1,1,1,1,1, # f0 - f7
|
||||
1,1,1,1,1,1,1,1, # f8 - ff
|
||||
)
|
||||
|
||||
HZ_st = ( \
|
||||
eStart,eError, 3,eStart,eStart,eStart,eError,eError,# 00-07
|
||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f
|
||||
eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError,# 10-17
|
||||
5,eError, 6,eError, 5, 5, 4,eError,# 18-1f
|
||||
4,eError, 4, 4, 4,eError, 4,eError,# 20-27
|
||||
4,eItsMe,eStart,eStart,eStart,eStart,eStart,eStart,# 28-2f
|
||||
HZ_st = (
|
||||
eStart,eError, 3,eStart,eStart,eStart,eError,eError,# 00-07
|
||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f
|
||||
eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError,# 10-17
|
||||
5,eError, 6,eError, 5, 5, 4,eError,# 18-1f
|
||||
4,eError, 4, 4, 4,eError, 4,eError,# 20-27
|
||||
4,eItsMe,eStart,eStart,eStart,eStart,eStart,eStart,# 28-2f
|
||||
)
|
||||
|
||||
HZCharLenTable = (0, 0, 0, 0, 0, 0)
|
||||
|
@ -79,50 +79,50 @@ HZSMModel = {'classTable': HZ_cls,
|
|||
'charLenTable': HZCharLenTable,
|
||||
'name': "HZ-GB-2312"}
|
||||
|
||||
ISO2022CN_cls = ( \
|
||||
2,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,0,0,0,0,0,0, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
0,0,0,1,0,0,0,0, # 18 - 1f
|
||||
0,0,0,0,0,0,0,0, # 20 - 27
|
||||
0,3,0,0,0,0,0,0, # 28 - 2f
|
||||
0,0,0,0,0,0,0,0, # 30 - 37
|
||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
||||
0,0,0,4,0,0,0,0, # 40 - 47
|
||||
0,0,0,0,0,0,0,0, # 48 - 4f
|
||||
0,0,0,0,0,0,0,0, # 50 - 57
|
||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
||||
0,0,0,0,0,0,0,0, # 60 - 67
|
||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
||||
0,0,0,0,0,0,0,0, # 70 - 77
|
||||
0,0,0,0,0,0,0,0, # 78 - 7f
|
||||
2,2,2,2,2,2,2,2, # 80 - 87
|
||||
2,2,2,2,2,2,2,2, # 88 - 8f
|
||||
2,2,2,2,2,2,2,2, # 90 - 97
|
||||
2,2,2,2,2,2,2,2, # 98 - 9f
|
||||
2,2,2,2,2,2,2,2, # a0 - a7
|
||||
2,2,2,2,2,2,2,2, # a8 - af
|
||||
2,2,2,2,2,2,2,2, # b0 - b7
|
||||
2,2,2,2,2,2,2,2, # b8 - bf
|
||||
2,2,2,2,2,2,2,2, # c0 - c7
|
||||
2,2,2,2,2,2,2,2, # c8 - cf
|
||||
2,2,2,2,2,2,2,2, # d0 - d7
|
||||
2,2,2,2,2,2,2,2, # d8 - df
|
||||
2,2,2,2,2,2,2,2, # e0 - e7
|
||||
2,2,2,2,2,2,2,2, # e8 - ef
|
||||
2,2,2,2,2,2,2,2, # f0 - f7
|
||||
2,2,2,2,2,2,2,2, # f8 - ff
|
||||
ISO2022CN_cls = (
|
||||
2,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,0,0,0,0,0,0, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
0,0,0,1,0,0,0,0, # 18 - 1f
|
||||
0,0,0,0,0,0,0,0, # 20 - 27
|
||||
0,3,0,0,0,0,0,0, # 28 - 2f
|
||||
0,0,0,0,0,0,0,0, # 30 - 37
|
||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
||||
0,0,0,4,0,0,0,0, # 40 - 47
|
||||
0,0,0,0,0,0,0,0, # 48 - 4f
|
||||
0,0,0,0,0,0,0,0, # 50 - 57
|
||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
||||
0,0,0,0,0,0,0,0, # 60 - 67
|
||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
||||
0,0,0,0,0,0,0,0, # 70 - 77
|
||||
0,0,0,0,0,0,0,0, # 78 - 7f
|
||||
2,2,2,2,2,2,2,2, # 80 - 87
|
||||
2,2,2,2,2,2,2,2, # 88 - 8f
|
||||
2,2,2,2,2,2,2,2, # 90 - 97
|
||||
2,2,2,2,2,2,2,2, # 98 - 9f
|
||||
2,2,2,2,2,2,2,2, # a0 - a7
|
||||
2,2,2,2,2,2,2,2, # a8 - af
|
||||
2,2,2,2,2,2,2,2, # b0 - b7
|
||||
2,2,2,2,2,2,2,2, # b8 - bf
|
||||
2,2,2,2,2,2,2,2, # c0 - c7
|
||||
2,2,2,2,2,2,2,2, # c8 - cf
|
||||
2,2,2,2,2,2,2,2, # d0 - d7
|
||||
2,2,2,2,2,2,2,2, # d8 - df
|
||||
2,2,2,2,2,2,2,2, # e0 - e7
|
||||
2,2,2,2,2,2,2,2, # e8 - ef
|
||||
2,2,2,2,2,2,2,2, # f0 - f7
|
||||
2,2,2,2,2,2,2,2, # f8 - ff
|
||||
)
|
||||
|
||||
ISO2022CN_st = ( \
|
||||
eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07
|
||||
eStart,eError,eError,eError,eError,eError,eError,eError,# 08-0f
|
||||
eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17
|
||||
eItsMe,eItsMe,eItsMe,eError,eError,eError, 4,eError,# 18-1f
|
||||
eError,eError,eError,eItsMe,eError,eError,eError,eError,# 20-27
|
||||
5, 6,eError,eError,eError,eError,eError,eError,# 28-2f
|
||||
eError,eError,eError,eItsMe,eError,eError,eError,eError,# 30-37
|
||||
eError,eError,eError,eError,eError,eItsMe,eError,eStart,# 38-3f
|
||||
ISO2022CN_st = (
|
||||
eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07
|
||||
eStart,eError,eError,eError,eError,eError,eError,eError,# 08-0f
|
||||
eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17
|
||||
eItsMe,eItsMe,eItsMe,eError,eError,eError, 4,eError,# 18-1f
|
||||
eError,eError,eError,eItsMe,eError,eError,eError,eError,# 20-27
|
||||
5, 6,eError,eError,eError,eError,eError,eError,# 28-2f
|
||||
eError,eError,eError,eItsMe,eError,eError,eError,eError,# 30-37
|
||||
eError,eError,eError,eError,eError,eItsMe,eError,eStart,# 38-3f
|
||||
)
|
||||
|
||||
ISO2022CNCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0, 0)
|
||||
|
@ -133,51 +133,51 @@ ISO2022CNSMModel = {'classTable': ISO2022CN_cls,
|
|||
'charLenTable': ISO2022CNCharLenTable,
|
||||
'name': "ISO-2022-CN"}
|
||||
|
||||
ISO2022JP_cls = ( \
|
||||
2,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,0,0,0,0,2,2, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
0,0,0,1,0,0,0,0, # 18 - 1f
|
||||
0,0,0,0,7,0,0,0, # 20 - 27
|
||||
3,0,0,0,0,0,0,0, # 28 - 2f
|
||||
0,0,0,0,0,0,0,0, # 30 - 37
|
||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
||||
6,0,4,0,8,0,0,0, # 40 - 47
|
||||
0,9,5,0,0,0,0,0, # 48 - 4f
|
||||
0,0,0,0,0,0,0,0, # 50 - 57
|
||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
||||
0,0,0,0,0,0,0,0, # 60 - 67
|
||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
||||
0,0,0,0,0,0,0,0, # 70 - 77
|
||||
0,0,0,0,0,0,0,0, # 78 - 7f
|
||||
2,2,2,2,2,2,2,2, # 80 - 87
|
||||
2,2,2,2,2,2,2,2, # 88 - 8f
|
||||
2,2,2,2,2,2,2,2, # 90 - 97
|
||||
2,2,2,2,2,2,2,2, # 98 - 9f
|
||||
2,2,2,2,2,2,2,2, # a0 - a7
|
||||
2,2,2,2,2,2,2,2, # a8 - af
|
||||
2,2,2,2,2,2,2,2, # b0 - b7
|
||||
2,2,2,2,2,2,2,2, # b8 - bf
|
||||
2,2,2,2,2,2,2,2, # c0 - c7
|
||||
2,2,2,2,2,2,2,2, # c8 - cf
|
||||
2,2,2,2,2,2,2,2, # d0 - d7
|
||||
2,2,2,2,2,2,2,2, # d8 - df
|
||||
2,2,2,2,2,2,2,2, # e0 - e7
|
||||
2,2,2,2,2,2,2,2, # e8 - ef
|
||||
2,2,2,2,2,2,2,2, # f0 - f7
|
||||
2,2,2,2,2,2,2,2, # f8 - ff
|
||||
ISO2022JP_cls = (
|
||||
2,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,0,0,0,0,2,2, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
0,0,0,1,0,0,0,0, # 18 - 1f
|
||||
0,0,0,0,7,0,0,0, # 20 - 27
|
||||
3,0,0,0,0,0,0,0, # 28 - 2f
|
||||
0,0,0,0,0,0,0,0, # 30 - 37
|
||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
||||
6,0,4,0,8,0,0,0, # 40 - 47
|
||||
0,9,5,0,0,0,0,0, # 48 - 4f
|
||||
0,0,0,0,0,0,0,0, # 50 - 57
|
||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
||||
0,0,0,0,0,0,0,0, # 60 - 67
|
||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
||||
0,0,0,0,0,0,0,0, # 70 - 77
|
||||
0,0,0,0,0,0,0,0, # 78 - 7f
|
||||
2,2,2,2,2,2,2,2, # 80 - 87
|
||||
2,2,2,2,2,2,2,2, # 88 - 8f
|
||||
2,2,2,2,2,2,2,2, # 90 - 97
|
||||
2,2,2,2,2,2,2,2, # 98 - 9f
|
||||
2,2,2,2,2,2,2,2, # a0 - a7
|
||||
2,2,2,2,2,2,2,2, # a8 - af
|
||||
2,2,2,2,2,2,2,2, # b0 - b7
|
||||
2,2,2,2,2,2,2,2, # b8 - bf
|
||||
2,2,2,2,2,2,2,2, # c0 - c7
|
||||
2,2,2,2,2,2,2,2, # c8 - cf
|
||||
2,2,2,2,2,2,2,2, # d0 - d7
|
||||
2,2,2,2,2,2,2,2, # d8 - df
|
||||
2,2,2,2,2,2,2,2, # e0 - e7
|
||||
2,2,2,2,2,2,2,2, # e8 - ef
|
||||
2,2,2,2,2,2,2,2, # f0 - f7
|
||||
2,2,2,2,2,2,2,2, # f8 - ff
|
||||
)
|
||||
|
||||
ISO2022JP_st = ( \
|
||||
eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07
|
||||
eStart,eStart,eError,eError,eError,eError,eError,eError,# 08-0f
|
||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17
|
||||
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,# 18-1f
|
||||
eError, 5,eError,eError,eError, 4,eError,eError,# 20-27
|
||||
eError,eError,eError, 6,eItsMe,eError,eItsMe,eError,# 28-2f
|
||||
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,# 30-37
|
||||
eError,eError,eError,eItsMe,eError,eError,eError,eError,# 38-3f
|
||||
eError,eError,eError,eError,eItsMe,eError,eStart,eStart,# 40-47
|
||||
ISO2022JP_st = (
|
||||
eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07
|
||||
eStart,eStart,eError,eError,eError,eError,eError,eError,# 08-0f
|
||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17
|
||||
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,# 18-1f
|
||||
eError, 5,eError,eError,eError, 4,eError,eError,# 20-27
|
||||
eError,eError,eError, 6,eItsMe,eError,eItsMe,eError,# 28-2f
|
||||
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,# 30-37
|
||||
eError,eError,eError,eItsMe,eError,eError,eError,eError,# 38-3f
|
||||
eError,eError,eError,eError,eItsMe,eError,eStart,eStart,# 40-47
|
||||
)
|
||||
|
||||
ISO2022JPCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
|
||||
|
@ -188,47 +188,47 @@ ISO2022JPSMModel = {'classTable': ISO2022JP_cls,
|
|||
'charLenTable': ISO2022JPCharLenTable,
|
||||
'name': "ISO-2022-JP"}
|
||||
|
||||
ISO2022KR_cls = ( \
|
||||
2,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,0,0,0,0,0,0, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
0,0,0,1,0,0,0,0, # 18 - 1f
|
||||
0,0,0,0,3,0,0,0, # 20 - 27
|
||||
0,4,0,0,0,0,0,0, # 28 - 2f
|
||||
0,0,0,0,0,0,0,0, # 30 - 37
|
||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
||||
0,0,0,5,0,0,0,0, # 40 - 47
|
||||
0,0,0,0,0,0,0,0, # 48 - 4f
|
||||
0,0,0,0,0,0,0,0, # 50 - 57
|
||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
||||
0,0,0,0,0,0,0,0, # 60 - 67
|
||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
||||
0,0,0,0,0,0,0,0, # 70 - 77
|
||||
0,0,0,0,0,0,0,0, # 78 - 7f
|
||||
2,2,2,2,2,2,2,2, # 80 - 87
|
||||
2,2,2,2,2,2,2,2, # 88 - 8f
|
||||
2,2,2,2,2,2,2,2, # 90 - 97
|
||||
2,2,2,2,2,2,2,2, # 98 - 9f
|
||||
2,2,2,2,2,2,2,2, # a0 - a7
|
||||
2,2,2,2,2,2,2,2, # a8 - af
|
||||
2,2,2,2,2,2,2,2, # b0 - b7
|
||||
2,2,2,2,2,2,2,2, # b8 - bf
|
||||
2,2,2,2,2,2,2,2, # c0 - c7
|
||||
2,2,2,2,2,2,2,2, # c8 - cf
|
||||
2,2,2,2,2,2,2,2, # d0 - d7
|
||||
2,2,2,2,2,2,2,2, # d8 - df
|
||||
2,2,2,2,2,2,2,2, # e0 - e7
|
||||
2,2,2,2,2,2,2,2, # e8 - ef
|
||||
2,2,2,2,2,2,2,2, # f0 - f7
|
||||
2,2,2,2,2,2,2,2, # f8 - ff
|
||||
ISO2022KR_cls = (
|
||||
2,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,0,0,0,0,0,0, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
0,0,0,1,0,0,0,0, # 18 - 1f
|
||||
0,0,0,0,3,0,0,0, # 20 - 27
|
||||
0,4,0,0,0,0,0,0, # 28 - 2f
|
||||
0,0,0,0,0,0,0,0, # 30 - 37
|
||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
||||
0,0,0,5,0,0,0,0, # 40 - 47
|
||||
0,0,0,0,0,0,0,0, # 48 - 4f
|
||||
0,0,0,0,0,0,0,0, # 50 - 57
|
||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
||||
0,0,0,0,0,0,0,0, # 60 - 67
|
||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
||||
0,0,0,0,0,0,0,0, # 70 - 77
|
||||
0,0,0,0,0,0,0,0, # 78 - 7f
|
||||
2,2,2,2,2,2,2,2, # 80 - 87
|
||||
2,2,2,2,2,2,2,2, # 88 - 8f
|
||||
2,2,2,2,2,2,2,2, # 90 - 97
|
||||
2,2,2,2,2,2,2,2, # 98 - 9f
|
||||
2,2,2,2,2,2,2,2, # a0 - a7
|
||||
2,2,2,2,2,2,2,2, # a8 - af
|
||||
2,2,2,2,2,2,2,2, # b0 - b7
|
||||
2,2,2,2,2,2,2,2, # b8 - bf
|
||||
2,2,2,2,2,2,2,2, # c0 - c7
|
||||
2,2,2,2,2,2,2,2, # c8 - cf
|
||||
2,2,2,2,2,2,2,2, # d0 - d7
|
||||
2,2,2,2,2,2,2,2, # d8 - df
|
||||
2,2,2,2,2,2,2,2, # e0 - e7
|
||||
2,2,2,2,2,2,2,2, # e8 - ef
|
||||
2,2,2,2,2,2,2,2, # f0 - f7
|
||||
2,2,2,2,2,2,2,2, # f8 - ff
|
||||
)
|
||||
|
||||
ISO2022KR_st = ( \
|
||||
eStart, 3,eError,eStart,eStart,eStart,eError,eError,# 00-07
|
||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f
|
||||
eItsMe,eItsMe,eError,eError,eError, 4,eError,eError,# 10-17
|
||||
eError,eError,eError,eError, 5,eError,eError,eError,# 18-1f
|
||||
eError,eError,eError,eItsMe,eStart,eStart,eStart,eStart,# 20-27
|
||||
ISO2022KR_st = (
|
||||
eStart, 3,eError,eStart,eStart,eStart,eError,eError,# 00-07
|
||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f
|
||||
eItsMe,eItsMe,eError,eError,eError, 4,eError,eError,# 10-17
|
||||
eError,eError,eError,eError, 5,eError,eError,eError,# 18-1f
|
||||
eError,eError,eError,eItsMe,eStart,eStart,eStart,eStart,# 20-27
|
||||
)
|
||||
|
||||
ISO2022KRCharLenTable = (0, 0, 0, 0, 0, 0)
|
||||
|
@ -238,3 +238,5 @@ ISO2022KRSMModel = {'classTable': ISO2022KR_cls,
|
|||
'stateTable': ISO2022KR_st,
|
||||
'charLenTable': ISO2022KRCharLenTable,
|
||||
'name': "ISO-2022-KR"}
|
||||
|
||||
# flake8: noqa
|
||||
|
|
39
thirdparty/chardet/eucjpprober.py
vendored
39
thirdparty/chardet/eucjpprober.py
vendored
|
@ -13,25 +13,26 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants, sys
|
||||
from constants import eStart, eError, eItsMe
|
||||
from mbcharsetprober import MultiByteCharSetProber
|
||||
from codingstatemachine import CodingStateMachine
|
||||
from chardistribution import EUCJPDistributionAnalysis
|
||||
from jpcntx import EUCJPContextAnalysis
|
||||
from mbcssm import EUCJPSMModel
|
||||
import sys
|
||||
from . import constants
|
||||
from .mbcharsetprober import MultiByteCharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .chardistribution import EUCJPDistributionAnalysis
|
||||
from .jpcntx import EUCJPContextAnalysis
|
||||
from .mbcssm import EUCJPSMModel
|
||||
|
||||
|
||||
class EUCJPProber(MultiByteCharSetProber):
|
||||
def __init__(self):
|
||||
|
@ -51,30 +52,34 @@ class EUCJPProber(MultiByteCharSetProber):
|
|||
def feed(self, aBuf):
|
||||
aLen = len(aBuf)
|
||||
for i in xrange(0, aLen):
|
||||
# PY3K: aBuf is a byte array, so aBuf[i] is an int, not a byte
|
||||
codingState = self._mCodingSM.next_state(aBuf[i])
|
||||
if codingState == eError:
|
||||
if codingState == constants.eError:
|
||||
if constants._debug:
|
||||
sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n')
|
||||
sys.stderr.write(self.get_charset_name()
|
||||
+ ' prober hit error at byte ' + str(i)
|
||||
+ '\n')
|
||||
self._mState = constants.eNotMe
|
||||
break
|
||||
elif codingState == eItsMe:
|
||||
elif codingState == constants.eItsMe:
|
||||
self._mState = constants.eFoundIt
|
||||
break
|
||||
elif codingState == eStart:
|
||||
elif codingState == constants.eStart:
|
||||
charLen = self._mCodingSM.get_current_charlen()
|
||||
if i == 0:
|
||||
self._mLastChar[1] = aBuf[0]
|
||||
self._mContextAnalyzer.feed(self._mLastChar, charLen)
|
||||
self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
|
||||
else:
|
||||
self._mContextAnalyzer.feed(aBuf[i-1:i+1], charLen)
|
||||
self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen)
|
||||
self._mContextAnalyzer.feed(aBuf[i - 1:i + 1], charLen)
|
||||
self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
|
||||
charLen)
|
||||
|
||||
self._mLastChar[0] = aBuf[aLen - 1]
|
||||
|
||||
if self.get_state() == constants.eDetecting:
|
||||
if self._mContextAnalyzer.got_enough_data() and \
|
||||
(self.get_confidence() > constants.SHORTCUT_THRESHOLD):
|
||||
if (self._mContextAnalyzer.got_enough_data() and
|
||||
(self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
|
||||
self._mState = constants.eFoundIt
|
||||
|
||||
return self.get_state()
|
||||
|
|
2
thirdparty/chardet/euckrfreq.py
vendored
2
thirdparty/chardet/euckrfreq.py
vendored
|
@ -592,3 +592,5 @@ EUCKRCharToFreqOrder = ( \
|
|||
8704,8705,8706,8707,8708,8709,8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,
|
||||
8720,8721,8722,8723,8724,8725,8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,
|
||||
8736,8737,8738,8739,8740,8741)
|
||||
|
||||
# flake8: noqa
|
||||
|
|
13
thirdparty/chardet/euckrprober.py
vendored
13
thirdparty/chardet/euckrprober.py
vendored
|
@ -13,22 +13,23 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from mbcharsetprober import MultiByteCharSetProber
|
||||
from codingstatemachine import CodingStateMachine
|
||||
from chardistribution import EUCKRDistributionAnalysis
|
||||
from mbcssm import EUCKRSMModel
|
||||
from .mbcharsetprober import MultiByteCharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .chardistribution import EUCKRDistributionAnalysis
|
||||
from .mbcssm import EUCKRSMModel
|
||||
|
||||
|
||||
class EUCKRProber(MultiByteCharSetProber):
|
||||
def __init__(self):
|
||||
|
|
16
thirdparty/chardet/euctwfreq.py
vendored
16
thirdparty/chardet/euctwfreq.py
vendored
|
@ -13,12 +13,12 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
|
@ -26,8 +26,8 @@
|
|||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
# EUCTW frequency table
|
||||
# Converted from big5 work
|
||||
# by Taiwan's Mandarin Promotion Council
|
||||
# Converted from big5 work
|
||||
# by Taiwan's Mandarin Promotion Council
|
||||
# <http://www.edu.tw:81/mandr/>
|
||||
|
||||
# 128 --> 0.42261
|
||||
|
@ -38,15 +38,15 @@
|
|||
#
|
||||
# Ideal Distribution Ratio = 0.74851/(1-0.74851) =2.98
|
||||
# Random Distribution Ratio = 512/(5401-512)=0.105
|
||||
#
|
||||
#
|
||||
# Typical Distribution Ratio about 25% of Ideal one, still much higher than RDR
|
||||
|
||||
EUCTW_TYPICAL_DISTRIBUTION_RATIO = 0.75
|
||||
|
||||
# Char to FreqOrder table ,
|
||||
# Char to FreqOrder table ,
|
||||
EUCTW_TABLE_SIZE = 8102
|
||||
|
||||
EUCTWCharToFreqOrder = ( \
|
||||
EUCTWCharToFreqOrder = (
|
||||
1,1800,1506, 255,1431, 198, 9, 82, 6,7310, 177, 202,3615,1256,2808, 110, # 2742
|
||||
3735, 33,3241, 261, 76, 44,2113, 16,2931,2184,1176, 659,3868, 26,3404,2643, # 2758
|
||||
1198,3869,3313,4060, 410,2211, 302, 590, 361,1963, 8, 204, 58,4296,7311,1931, # 2774
|
||||
|
@ -424,3 +424,5 @@ EUCTWCharToFreqOrder = ( \
|
|||
8694,8695,8696,8697,8698,8699,8700,8701,8702,8703,8704,8705,8706,8707,8708,8709, # 8710
|
||||
8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,8720,8721,8722,8723,8724,8725, # 8726
|
||||
8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,8736,8737,8738,8739,8740,8741) # 8742
|
||||
|
||||
# flake8: noqa
|
||||
|
|
8
thirdparty/chardet/euctwprober.py
vendored
8
thirdparty/chardet/euctwprober.py
vendored
|
@ -25,10 +25,10 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from mbcharsetprober import MultiByteCharSetProber
|
||||
from codingstatemachine import CodingStateMachine
|
||||
from chardistribution import EUCTWDistributionAnalysis
|
||||
from mbcssm import EUCTWSMModel
|
||||
from .mbcharsetprober import MultiByteCharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .chardistribution import EUCTWDistributionAnalysis
|
||||
from .mbcssm import EUCTWSMModel
|
||||
|
||||
class EUCTWProber(MultiByteCharSetProber):
|
||||
def __init__(self):
|
||||
|
|
9
thirdparty/chardet/gb2312freq.py
vendored
9
thirdparty/chardet/gb2312freq.py
vendored
|
@ -13,12 +13,12 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
|
@ -36,14 +36,14 @@
|
|||
#
|
||||
# Ideal Distribution Ratio = 0.79135/(1-0.79135) = 3.79
|
||||
# Random Distribution Ratio = 512 / (3755 - 512) = 0.157
|
||||
#
|
||||
#
|
||||
# Typical Distribution Ratio about 25% of Ideal one, still much higher than RDR
|
||||
|
||||
GB2312_TYPICAL_DISTRIBUTION_RATIO = 0.9
|
||||
|
||||
GB2312_TABLE_SIZE = 3760
|
||||
|
||||
GB2312CharToFreqOrder = ( \
|
||||
GB2312CharToFreqOrder = (
|
||||
1671, 749,1443,2364,3924,3807,2330,3921,1704,3463,2691,1511,1515, 572,3191,2205,
|
||||
2361, 224,2558, 479,1711, 963,3162, 440,4060,1905,2966,2947,3580,2647,3961,3842,
|
||||
2204, 869,4207, 970,2678,5626,2944,2956,1479,4048, 514,3595, 588,1346,2820,3409,
|
||||
|
@ -469,3 +469,4 @@ GB2312CharToFreqOrder = ( \
|
|||
5867,5507,6273,4206,6274,4789,6098,6764,3619,3646,3833,3804,2394,3788,4936,3978,
|
||||
4866,4899,6099,6100,5559,6478,6765,3599,5868,6101,5869,5870,6275,6766,4527,6767)
|
||||
|
||||
# flake8: noqa
|
||||
|
|
8
thirdparty/chardet/gb2312prober.py
vendored
8
thirdparty/chardet/gb2312prober.py
vendored
|
@ -25,10 +25,10 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from mbcharsetprober import MultiByteCharSetProber
|
||||
from codingstatemachine import CodingStateMachine
|
||||
from chardistribution import GB2312DistributionAnalysis
|
||||
from mbcssm import GB2312SMModel
|
||||
from .mbcharsetprober import MultiByteCharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .chardistribution import GB2312DistributionAnalysis
|
||||
from .mbcssm import GB2312SMModel
|
||||
|
||||
class GB2312Prober(MultiByteCharSetProber):
|
||||
def __init__(self):
|
||||
|
|
178
thirdparty/chardet/hebrewprober.py
vendored
178
thirdparty/chardet/hebrewprober.py
vendored
|
@ -13,20 +13,21 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from charsetprober import CharSetProber
|
||||
import constants
|
||||
from .charsetprober import CharSetProber
|
||||
from .constants import eNotMe, eDetecting
|
||||
from .compat import wrap_ord
|
||||
|
||||
# This prober doesn't actually recognize a language or a charset.
|
||||
# It is a helper prober for the use of the Hebrew model probers
|
||||
|
@ -35,40 +36,40 @@ import constants
|
|||
#
|
||||
# Four main charsets exist in Hebrew:
|
||||
# "ISO-8859-8" - Visual Hebrew
|
||||
# "windows-1255" - Logical Hebrew
|
||||
# "windows-1255" - Logical Hebrew
|
||||
# "ISO-8859-8-I" - Logical Hebrew
|
||||
# "x-mac-hebrew" - ?? Logical Hebrew ??
|
||||
#
|
||||
# Both "ISO" charsets use a completely identical set of code points, whereas
|
||||
# "windows-1255" and "x-mac-hebrew" are two different proper supersets of
|
||||
# "windows-1255" and "x-mac-hebrew" are two different proper supersets of
|
||||
# these code points. windows-1255 defines additional characters in the range
|
||||
# 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific
|
||||
# 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific
|
||||
# diacritics and additional 'Yiddish' ligature letters in the range 0xc0-0xd6.
|
||||
# x-mac-hebrew defines similar additional code points but with a different
|
||||
# x-mac-hebrew defines similar additional code points but with a different
|
||||
# mapping.
|
||||
#
|
||||
# As far as an average Hebrew text with no diacritics is concerned, all four
|
||||
# charsets are identical with respect to code points. Meaning that for the
|
||||
# main Hebrew alphabet, all four map the same values to all 27 Hebrew letters
|
||||
# As far as an average Hebrew text with no diacritics is concerned, all four
|
||||
# charsets are identical with respect to code points. Meaning that for the
|
||||
# main Hebrew alphabet, all four map the same values to all 27 Hebrew letters
|
||||
# (including final letters).
|
||||
#
|
||||
# The dominant difference between these charsets is their directionality.
|
||||
# "Visual" directionality means that the text is ordered as if the renderer is
|
||||
# not aware of a BIDI rendering algorithm. The renderer sees the text and
|
||||
# draws it from left to right. The text itself when ordered naturally is read
|
||||
# not aware of a BIDI rendering algorithm. The renderer sees the text and
|
||||
# draws it from left to right. The text itself when ordered naturally is read
|
||||
# backwards. A buffer of Visual Hebrew generally looks like so:
|
||||
# "[last word of first line spelled backwards] [whole line ordered backwards
|
||||
# and spelled backwards] [first word of first line spelled backwards]
|
||||
# and spelled backwards] [first word of first line spelled backwards]
|
||||
# [end of line] [last word of second line] ... etc' "
|
||||
# adding punctuation marks, numbers and English text to visual text is
|
||||
# naturally also "visual" and from left to right.
|
||||
#
|
||||
#
|
||||
# "Logical" directionality means the text is ordered "naturally" according to
|
||||
# the order it is read. It is the responsibility of the renderer to display
|
||||
# the text from right to left. A BIDI algorithm is used to place general
|
||||
# the order it is read. It is the responsibility of the renderer to display
|
||||
# the text from right to left. A BIDI algorithm is used to place general
|
||||
# punctuation marks, numbers and English text in the text.
|
||||
#
|
||||
# Texts in x-mac-hebrew are almost impossible to find on the Internet. From
|
||||
# Texts in x-mac-hebrew are almost impossible to find on the Internet. From
|
||||
# what little evidence I could find, it seems that its general directionality
|
||||
# is Logical.
|
||||
#
|
||||
|
@ -76,17 +77,17 @@ import constants
|
|||
# charsets:
|
||||
# Visual Hebrew - "ISO-8859-8" - backwards text - Words and sentences are
|
||||
# backwards while line order is natural. For charset recognition purposes
|
||||
# the line order is unimportant (In fact, for this implementation, even
|
||||
# the line order is unimportant (In fact, for this implementation, even
|
||||
# word order is unimportant).
|
||||
# Logical Hebrew - "windows-1255" - normal, naturally ordered text.
|
||||
#
|
||||
# "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be
|
||||
# "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be
|
||||
# specifically identified.
|
||||
# "x-mac-hebrew" is also identified as windows-1255. A text in x-mac-hebrew
|
||||
# that contains special punctuation marks or diacritics is displayed with
|
||||
# some unconverted characters showing as question marks. This problem might
|
||||
# be corrected using another model prober for x-mac-hebrew. Due to the fact
|
||||
# that x-mac-hebrew texts are so rare, writing another model prober isn't
|
||||
# that x-mac-hebrew texts are so rare, writing another model prober isn't
|
||||
# worth the effort and performance hit.
|
||||
#
|
||||
#### The Prober ####
|
||||
|
@ -126,28 +127,31 @@ import constants
|
|||
# charset identified, either "windows-1255" or "ISO-8859-8".
|
||||
|
||||
# windows-1255 / ISO-8859-8 code points of interest
|
||||
FINAL_KAF = '\xea'
|
||||
NORMAL_KAF = '\xeb'
|
||||
FINAL_MEM = '\xed'
|
||||
NORMAL_MEM = '\xee'
|
||||
FINAL_NUN = '\xef'
|
||||
NORMAL_NUN = '\xf0'
|
||||
FINAL_PE = '\xf3'
|
||||
NORMAL_PE = '\xf4'
|
||||
FINAL_TSADI = '\xf5'
|
||||
NORMAL_TSADI = '\xf6'
|
||||
FINAL_KAF = 0xea
|
||||
NORMAL_KAF = 0xeb
|
||||
FINAL_MEM = 0xed
|
||||
NORMAL_MEM = 0xee
|
||||
FINAL_NUN = 0xef
|
||||
NORMAL_NUN = 0xf0
|
||||
FINAL_PE = 0xf3
|
||||
NORMAL_PE = 0xf4
|
||||
FINAL_TSADI = 0xf5
|
||||
NORMAL_TSADI = 0xf6
|
||||
|
||||
# Minimum Visual vs Logical final letter score difference.
|
||||
# If the difference is below this, don't rely solely on the final letter score distance.
|
||||
# If the difference is below this, don't rely solely on the final letter score
|
||||
# distance.
|
||||
MIN_FINAL_CHAR_DISTANCE = 5
|
||||
|
||||
# Minimum Visual vs Logical model score difference.
|
||||
# If the difference is below this, don't rely at all on the model score distance.
|
||||
# If the difference is below this, don't rely at all on the model score
|
||||
# distance.
|
||||
MIN_MODEL_DISTANCE = 0.01
|
||||
|
||||
VISUAL_HEBREW_NAME = "ISO-8859-8"
|
||||
LOGICAL_HEBREW_NAME = "windows-1255"
|
||||
|
||||
|
||||
class HebrewProber(CharSetProber):
|
||||
def __init__(self):
|
||||
CharSetProber.__init__(self)
|
||||
|
@ -159,8 +163,8 @@ class HebrewProber(CharSetProber):
|
|||
self._mFinalCharLogicalScore = 0
|
||||
self._mFinalCharVisualScore = 0
|
||||
# The two last characters seen in the previous buffer,
|
||||
# mPrev and mBeforePrev are initialized to space in order to simulate a word
|
||||
# delimiter at the beginning of the data
|
||||
# mPrev and mBeforePrev are initialized to space in order to simulate
|
||||
# a word delimiter at the beginning of the data
|
||||
self._mPrev = ' '
|
||||
self._mBeforePrev = ' '
|
||||
# These probers are owned by the group prober.
|
||||
|
@ -170,49 +174,52 @@ class HebrewProber(CharSetProber):
|
|||
self._mVisualProber = visualProber
|
||||
|
||||
def is_final(self, c):
|
||||
return c in [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI]
|
||||
return wrap_ord(c) in [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE,
|
||||
FINAL_TSADI]
|
||||
|
||||
def is_non_final(self, c):
|
||||
# The normal Tsadi is not a good Non-Final letter due to words like
|
||||
# 'lechotet' (to chat) containing an apostrophe after the tsadi. This
|
||||
# apostrophe is converted to a space in FilterWithoutEnglishLetters causing
|
||||
# the Non-Final tsadi to appear at an end of a word even though this is not
|
||||
# the case in the original text.
|
||||
# The letters Pe and Kaf rarely display a related behavior of not being a
|
||||
# good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for
|
||||
# example legally end with a Non-Final Pe or Kaf. However, the benefit of
|
||||
# these letters as Non-Final letters outweighs the damage since these words
|
||||
# are quite rare.
|
||||
return c in [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
|
||||
# The normal Tsadi is not a good Non-Final letter due to words like
|
||||
# 'lechotet' (to chat) containing an apostrophe after the tsadi. This
|
||||
# apostrophe is converted to a space in FilterWithoutEnglishLetters
|
||||
# causing the Non-Final tsadi to appear at an end of a word even
|
||||
# though this is not the case in the original text.
|
||||
# The letters Pe and Kaf rarely display a related behavior of not being
|
||||
# a good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak'
|
||||
# for example legally end with a Non-Final Pe or Kaf. However, the
|
||||
# benefit of these letters as Non-Final letters outweighs the damage
|
||||
# since these words are quite rare.
|
||||
return wrap_ord(c) in [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
|
||||
|
||||
def feed(self, aBuf):
|
||||
# Final letter analysis for logical-visual decision.
|
||||
# Look for evidence that the received buffer is either logical Hebrew or
|
||||
# visual Hebrew.
|
||||
# Look for evidence that the received buffer is either logical Hebrew
|
||||
# or visual Hebrew.
|
||||
# The following cases are checked:
|
||||
# 1) A word longer than 1 letter, ending with a final letter. This is an
|
||||
# indication that the text is laid out "naturally" since the final letter
|
||||
# really appears at the end. +1 for logical score.
|
||||
# 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
|
||||
# Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with
|
||||
# the Non-Final form of that letter. Exceptions to this rule are mentioned
|
||||
# above in isNonFinal(). This is an indication that the text is laid out
|
||||
# backwards. +1 for visual score
|
||||
# 3) A word longer than 1 letter, starting with a final letter. Final letters
|
||||
# should not appear at the beginning of a word. This is an indication that
|
||||
# the text is laid out backwards. +1 for visual score.
|
||||
#
|
||||
# The visual score and logical score are accumulated throughout the text and
|
||||
# are finally checked against each other in GetCharSetName().
|
||||
# No checking for final letters in the middle of words is done since that case
|
||||
# is not an indication for either Logical or Visual text.
|
||||
#
|
||||
# We automatically filter out all 7-bit characters (replace them with spaces)
|
||||
# so the word boundary detection works properly. [MAP]
|
||||
# 1) A word longer than 1 letter, ending with a final letter. This is
|
||||
# an indication that the text is laid out "naturally" since the
|
||||
# final letter really appears at the end. +1 for logical score.
|
||||
# 2) A word longer than 1 letter, ending with a Non-Final letter. In
|
||||
# normal Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi,
|
||||
# should not end with the Non-Final form of that letter. Exceptions
|
||||
# to this rule are mentioned above in is_non_final(). This is an
|
||||
# indication that the text is laid out backwards. +1 for visual
|
||||
# score
|
||||
# 3) A word longer than 1 letter, starting with a final letter. Final
|
||||
# letters should not appear at the beginning of a word. This is an
|
||||
# indication that the text is laid out backwards. +1 for visual
|
||||
# score.
|
||||
#
|
||||
# The visual score and logical score are accumulated throughout the
|
||||
# text and are finally checked against each other in get_charset_name().
|
||||
# No checking for final letters in the middle of words is done since
|
||||
# that case is not an indication for either Logical or Visual text.
|
||||
#
|
||||
# We automatically filter out all 7-bit characters (replace them with
|
||||
# spaces) so the word boundary detection works properly. [MAP]
|
||||
|
||||
if self.get_state() == constants.eNotMe:
|
||||
if self.get_state() == eNotMe:
|
||||
# Both model probers say it's not them. No reason to continue.
|
||||
return constants.eNotMe
|
||||
return eNotMe
|
||||
|
||||
aBuf = self.filter_high_bit_only(aBuf)
|
||||
|
||||
|
@ -220,23 +227,27 @@ class HebrewProber(CharSetProber):
|
|||
if cur == ' ':
|
||||
# We stand on a space - a word just ended
|
||||
if self._mBeforePrev != ' ':
|
||||
# next-to-last char was not a space so self._mPrev is not a 1 letter word
|
||||
# next-to-last char was not a space so self._mPrev is not a
|
||||
# 1 letter word
|
||||
if self.is_final(self._mPrev):
|
||||
# case (1) [-2:not space][-1:final letter][cur:space]
|
||||
self._mFinalCharLogicalScore += 1
|
||||
elif self.is_non_final(self._mPrev):
|
||||
# case (2) [-2:not space][-1:Non-Final letter][cur:space]
|
||||
# case (2) [-2:not space][-1:Non-Final letter][
|
||||
# cur:space]
|
||||
self._mFinalCharVisualScore += 1
|
||||
else:
|
||||
# Not standing on a space
|
||||
if (self._mBeforePrev == ' ') and (self.is_final(self._mPrev)) and (cur != ' '):
|
||||
if ((self._mBeforePrev == ' ') and
|
||||
(self.is_final(self._mPrev)) and (cur != ' ')):
|
||||
# case (3) [-2:space][-1:final letter][cur:not space]
|
||||
self._mFinalCharVisualScore += 1
|
||||
self._mBeforePrev = self._mPrev
|
||||
self._mPrev = cur
|
||||
|
||||
# Forever detecting, till the end or until both model probers return eNotMe (handled above)
|
||||
return constants.eDetecting
|
||||
# Forever detecting, till the end or until both model probers return
|
||||
# eNotMe (handled above)
|
||||
return eDetecting
|
||||
|
||||
def get_charset_name(self):
|
||||
# Make the decision: is it Logical or Visual?
|
||||
|
@ -248,22 +259,25 @@ class HebrewProber(CharSetProber):
|
|||
return VISUAL_HEBREW_NAME
|
||||
|
||||
# It's not dominant enough, try to rely on the model scores instead.
|
||||
modelsub = self._mLogicalProber.get_confidence() - self._mVisualProber.get_confidence()
|
||||
modelsub = (self._mLogicalProber.get_confidence()
|
||||
- self._mVisualProber.get_confidence())
|
||||
if modelsub > MIN_MODEL_DISTANCE:
|
||||
return LOGICAL_HEBREW_NAME
|
||||
if modelsub < -MIN_MODEL_DISTANCE:
|
||||
return VISUAL_HEBREW_NAME
|
||||
|
||||
# Still no good, back to final letter distance, maybe it'll save the day.
|
||||
# Still no good, back to final letter distance, maybe it'll save the
|
||||
# day.
|
||||
if finalsub < 0.0:
|
||||
return VISUAL_HEBREW_NAME
|
||||
|
||||
# (finalsub > 0 - Logical) or (don't know what to do) default to Logical.
|
||||
# (finalsub > 0 - Logical) or (don't know what to do) default to
|
||||
# Logical.
|
||||
return LOGICAL_HEBREW_NAME
|
||||
|
||||
def get_state(self):
|
||||
# Remain active as long as any of the model probers are active.
|
||||
if (self._mLogicalProber.get_state() == constants.eNotMe) and \
|
||||
(self._mVisualProber.get_state() == constants.eNotMe):
|
||||
return constants.eNotMe
|
||||
return constants.eDetecting
|
||||
if (self._mLogicalProber.get_state() == eNotMe) and \
|
||||
(self._mVisualProber.get_state() == eNotMe):
|
||||
return eNotMe
|
||||
return eDetecting
|
||||
|
|
16
thirdparty/chardet/jisfreq.py
vendored
16
thirdparty/chardet/jisfreq.py
vendored
|
@ -13,12 +13,12 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
|
@ -28,7 +28,7 @@
|
|||
# Sampling from about 20M text materials include literature and computer technology
|
||||
#
|
||||
# Japanese frequency table, applied to both S-JIS and EUC-JP
|
||||
# They are sorted in order.
|
||||
# They are sorted in order.
|
||||
|
||||
# 128 --> 0.77094
|
||||
# 256 --> 0.85710
|
||||
|
@ -38,15 +38,15 @@
|
|||
#
|
||||
# Ideal Distribution Ratio = 0.92635 / (1-0.92635) = 12.58
|
||||
# Random Distribution Ratio = 512 / (2965+62+83+86-512) = 0.191
|
||||
#
|
||||
# Typical Distribution Ratio, 25% of IDR
|
||||
#
|
||||
# Typical Distribution Ratio, 25% of IDR
|
||||
|
||||
JIS_TYPICAL_DISTRIBUTION_RATIO = 3.0
|
||||
|
||||
# Char to FreqOrder table ,
|
||||
# Char to FreqOrder table ,
|
||||
JIS_TABLE_SIZE = 4368
|
||||
|
||||
JISCharToFreqOrder = ( \
|
||||
JISCharToFreqOrder = (
|
||||
40, 1, 6, 182, 152, 180, 295,2127, 285, 381,3295,4304,3068,4606,3165,3510, # 16
|
||||
3511,1822,2785,4607,1193,2226,5070,4608, 171,2996,1247, 18, 179,5071, 856,1661, # 32
|
||||
1262,5072, 619, 127,3431,3512,3230,1899,1700, 232, 228,1294,1298, 284, 283,2041, # 48
|
||||
|
@ -565,3 +565,5 @@ JISCharToFreqOrder = ( \
|
|||
8224,8225,8226,8227,8228,8229,8230,8231,8232,8233,8234,8235,8236,8237,8238,8239, # 8240
|
||||
8240,8241,8242,8243,8244,8245,8246,8247,8248,8249,8250,8251,8252,8253,8254,8255, # 8256
|
||||
8256,8257,8258,8259,8260,8261,8262,8263,8264,8265,8266,8267,8268,8269,8270,8271) # 8272
|
||||
|
||||
# flake8: noqa
|
||||
|
|
89
thirdparty/chardet/jpcntx.py
vendored
89
thirdparty/chardet/jpcntx.py
vendored
|
@ -13,19 +13,19 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants
|
||||
from .compat import wrap_ord
|
||||
|
||||
NUM_OF_CATEGORY = 6
|
||||
DONT_KNOW = -1
|
||||
|
@ -34,7 +34,7 @@ MAX_REL_THRESHOLD = 1000
|
|||
MINIMUM_DATA_THRESHOLD = 4
|
||||
|
||||
# This is hiragana 2-char sequence table, the number in each cell represents its frequency category
|
||||
jp2CharContext = ( \
|
||||
jp2CharContext = (
|
||||
(0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1),
|
||||
(2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4),
|
||||
(0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2),
|
||||
|
@ -125,24 +125,31 @@ class JapaneseContextAnalysis:
|
|||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
self._mTotalRel = 0 # total sequence received
|
||||
self._mRelSample = [0] * NUM_OF_CATEGORY # category counters, each interger counts sequence in its category
|
||||
self._mNeedToSkipCharNum = 0 # if last byte in current buffer is not the last byte of a character, we need to know how many bytes to skip in next buffer
|
||||
self._mLastCharOrder = -1 # The order of previous char
|
||||
self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made
|
||||
self._mTotalRel = 0 # total sequence received
|
||||
# category counters, each integer counts sequence in its category
|
||||
self._mRelSample = [0] * NUM_OF_CATEGORY
|
||||
# if last byte in current buffer is not the last byte of a character,
|
||||
# we need to know how many bytes to skip in next buffer
|
||||
self._mNeedToSkipCharNum = 0
|
||||
self._mLastCharOrder = -1 # The order of previous char
|
||||
# If this flag is set to True, detection is done and conclusion has
|
||||
# been made
|
||||
self._mDone = False
|
||||
|
||||
def feed(self, aBuf, aLen):
|
||||
if self._mDone: return
|
||||
if self._mDone:
|
||||
return
|
||||
|
||||
# The buffer we got is byte oriented, and a character may span in more than one
|
||||
# buffers. In case the last one or two byte in last buffer is not complete, we
|
||||
# record how many byte needed to complete that character and skip these bytes here.
|
||||
# We can choose to record those bytes as well and analyse the character once it
|
||||
# is complete, but since a character will not make much difference, by simply skipping
|
||||
# buffers. In case the last one or two byte in last buffer is not
|
||||
# complete, we record how many byte needed to complete that character
|
||||
# and skip these bytes here. We can choose to record those bytes as
|
||||
# well and analyse the character once it is complete, but since a
|
||||
# character will not make much difference, by simply skipping
|
||||
# this character will simplify our logic and improve performance.
|
||||
i = self._mNeedToSkipCharNum
|
||||
while i < aLen:
|
||||
order, charLen = self.get_order(aBuf[i:i+2])
|
||||
order, charLen = self.get_order(aBuf[i:i + 2])
|
||||
i += charLen
|
||||
if i > aLen:
|
||||
self._mNeedToSkipCharNum = i - aLen
|
||||
|
@ -151,7 +158,7 @@ class JapaneseContextAnalysis:
|
|||
if (order != -1) and (self._mLastCharOrder != -1):
|
||||
self._mTotalRel += 1
|
||||
if self._mTotalRel > MAX_REL_THRESHOLD:
|
||||
self._mDone = constants.True
|
||||
self._mDone = True
|
||||
break
|
||||
self._mRelSample[jp2CharContext[self._mLastCharOrder][order]] += 1
|
||||
self._mLastCharOrder = order
|
||||
|
@ -166,45 +173,55 @@ class JapaneseContextAnalysis:
|
|||
else:
|
||||
return DONT_KNOW
|
||||
|
||||
def get_order(self, aStr):
|
||||
def get_order(self, aBuf):
|
||||
return -1, 1
|
||||
|
||||
class SJISContextAnalysis(JapaneseContextAnalysis):
|
||||
def get_order(self, aStr):
|
||||
if not aStr: return -1, 1
|
||||
def __init__(self):
|
||||
self.charset_name = "SHIFT_JIS"
|
||||
|
||||
def get_charset_name(self):
|
||||
return self.charset_name
|
||||
|
||||
def get_order(self, aBuf):
|
||||
if not aBuf:
|
||||
return -1, 1
|
||||
# find out current char's byte length
|
||||
if ((aStr[0] >= '\x81') and (aStr[0] <= '\x9F')) or \
|
||||
((aStr[0] >= '\xE0') and (aStr[0] <= '\xFC')):
|
||||
first_char = wrap_ord(aBuf[0])
|
||||
if ((0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC)):
|
||||
charLen = 2
|
||||
if (first_char == 0x87) or (0xFA <= first_char <= 0xFC):
|
||||
self.charset_name = "CP932"
|
||||
else:
|
||||
charLen = 1
|
||||
|
||||
# return its order if it is hiragana
|
||||
if len(aStr) > 1:
|
||||
if (aStr[0] == '\202') and \
|
||||
(aStr[1] >= '\x9F') and \
|
||||
(aStr[1] <= '\xF1'):
|
||||
return ord(aStr[1]) - 0x9F, charLen
|
||||
if len(aBuf) > 1:
|
||||
second_char = wrap_ord(aBuf[1])
|
||||
if (first_char == 0x82) and (0x9F <= second_char <= 0xF1):
|
||||
return second_char - 0x9F, charLen
|
||||
|
||||
return -1, charLen
|
||||
|
||||
class EUCJPContextAnalysis(JapaneseContextAnalysis):
|
||||
def get_order(self, aStr):
|
||||
if not aStr: return -1, 1
|
||||
def get_order(self, aBuf):
|
||||
if not aBuf:
|
||||
return -1, 1
|
||||
# find out current char's byte length
|
||||
if (aStr[0] == '\x8E') or \
|
||||
((aStr[0] >= '\xA1') and (aStr[0] <= '\xFE')):
|
||||
first_char = wrap_ord(aBuf[0])
|
||||
if (first_char == 0x8E) or (0xA1 <= first_char <= 0xFE):
|
||||
charLen = 2
|
||||
elif aStr[0] == '\x8F':
|
||||
elif first_char == 0x8F:
|
||||
charLen = 3
|
||||
else:
|
||||
charLen = 1
|
||||
|
||||
# return its order if it is hiragana
|
||||
if len(aStr) > 1:
|
||||
if (aStr[0] == '\xA4') and \
|
||||
(aStr[1] >= '\xA1') and \
|
||||
(aStr[1] <= '\xF3'):
|
||||
return ord(aStr[1]) - 0xA1, charLen
|
||||
if len(aBuf) > 1:
|
||||
second_char = wrap_ord(aBuf[1])
|
||||
if (first_char == 0xA4) and (0xA1 <= second_char <= 0xF3):
|
||||
return second_char - 0xA1, charLen
|
||||
|
||||
return -1, charLen
|
||||
|
||||
# flake8: noqa
|
||||
|
|
29
thirdparty/chardet/langbulgarianmodel.py
vendored
29
thirdparty/chardet/langbulgarianmodel.py
vendored
|
@ -13,30 +13,28 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants
|
||||
|
||||
# 255: Control characters that usually does not exist in any text
|
||||
# 254: Carriage/Return
|
||||
# 253: symbol (punctuation) that does not belong to word
|
||||
# 252: 0 - 9
|
||||
|
||||
# Character Mapping Table:
|
||||
# this table is modified base on win1251BulgarianCharToOrderMap, so
|
||||
# this table is modified based on win1251BulgarianCharToOrderMap, so
|
||||
# only number <64 is sure valid
|
||||
|
||||
Latin5_BulgarianCharToOrderMap = ( \
|
||||
Latin5_BulgarianCharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -55,7 +53,7 @@ Latin5_BulgarianCharToOrderMap = ( \
|
|||
62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, # f0
|
||||
)
|
||||
|
||||
win1251BulgarianCharToOrderMap = ( \
|
||||
win1251BulgarianCharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -74,13 +72,13 @@ win1251BulgarianCharToOrderMap = ( \
|
|||
7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,253, 42, 16, # f0
|
||||
)
|
||||
|
||||
# Model Table:
|
||||
# Model Table:
|
||||
# total sequences: 100%
|
||||
# first 512 sequences: 96.9392%
|
||||
# first 1024 sequences:3.0618%
|
||||
# rest sequences: 0.2992%
|
||||
# negative sequences: 0.0020%
|
||||
BulgarianLangModel = ( \
|
||||
# negative sequences: 0.0020%
|
||||
BulgarianLangModel = (
|
||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2,
|
||||
3,1,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,0,1,
|
||||
|
@ -211,18 +209,21 @@ BulgarianLangModel = ( \
|
|||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
)
|
||||
|
||||
Latin5BulgarianModel = { \
|
||||
Latin5BulgarianModel = {
|
||||
'charToOrderMap': Latin5_BulgarianCharToOrderMap,
|
||||
'precedenceMatrix': BulgarianLangModel,
|
||||
'mTypicalPositiveRatio': 0.969392,
|
||||
'keepEnglishLetter': constants.False,
|
||||
'keepEnglishLetter': False,
|
||||
'charsetName': "ISO-8859-5"
|
||||
}
|
||||
|
||||
Win1251BulgarianModel = { \
|
||||
Win1251BulgarianModel = {
|
||||
'charToOrderMap': win1251BulgarianCharToOrderMap,
|
||||
'precedenceMatrix': BulgarianLangModel,
|
||||
'mTypicalPositiveRatio': 0.969392,
|
||||
'keepEnglishLetter': constants.False,
|
||||
'keepEnglishLetter': False,
|
||||
'charsetName': "windows-1251"
|
||||
}
|
||||
|
||||
|
||||
# flake8: noqa
|
||||
|
|
50
thirdparty/chardet/langcyrillicmodel.py
vendored
50
thirdparty/chardet/langcyrillicmodel.py
vendored
|
@ -13,23 +13,21 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants
|
||||
|
||||
# KOI8-R language model
|
||||
# Character Mapping Table:
|
||||
KOI8R_CharToOrderMap = ( \
|
||||
KOI8R_CharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -48,7 +46,7 @@ KOI8R_CharToOrderMap = ( \
|
|||
35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, # f0
|
||||
)
|
||||
|
||||
win1251_CharToOrderMap = ( \
|
||||
win1251_CharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -67,7 +65,7 @@ win1251_CharToOrderMap = ( \
|
|||
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
|
||||
)
|
||||
|
||||
latin5_CharToOrderMap = ( \
|
||||
latin5_CharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -86,7 +84,7 @@ latin5_CharToOrderMap = ( \
|
|||
239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255,
|
||||
)
|
||||
|
||||
macCyrillic_CharToOrderMap = ( \
|
||||
macCyrillic_CharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -105,7 +103,7 @@ macCyrillic_CharToOrderMap = ( \
|
|||
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255,
|
||||
)
|
||||
|
||||
IBM855_CharToOrderMap = ( \
|
||||
IBM855_CharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -124,7 +122,7 @@ IBM855_CharToOrderMap = ( \
|
|||
250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255,
|
||||
)
|
||||
|
||||
IBM866_CharToOrderMap = ( \
|
||||
IBM866_CharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -143,13 +141,13 @@ IBM866_CharToOrderMap = ( \
|
|||
239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255,
|
||||
)
|
||||
|
||||
# Model Table:
|
||||
# Model Table:
|
||||
# total sequences: 100%
|
||||
# first 512 sequences: 97.6601%
|
||||
# first 1024 sequences: 2.3389%
|
||||
# rest sequences: 0.1237%
|
||||
# negative sequences: 0.0009%
|
||||
RussianLangModel = ( \
|
||||
# negative sequences: 0.0009%
|
||||
RussianLangModel = (
|
||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2,
|
||||
3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,3,3,3,3,2,3,2,0,
|
||||
|
@ -280,50 +278,52 @@ RussianLangModel = ( \
|
|||
0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
|
||||
)
|
||||
|
||||
Koi8rModel = { \
|
||||
Koi8rModel = {
|
||||
'charToOrderMap': KOI8R_CharToOrderMap,
|
||||
'precedenceMatrix': RussianLangModel,
|
||||
'mTypicalPositiveRatio': 0.976601,
|
||||
'keepEnglishLetter': constants.False,
|
||||
'keepEnglishLetter': False,
|
||||
'charsetName': "KOI8-R"
|
||||
}
|
||||
|
||||
Win1251CyrillicModel = { \
|
||||
Win1251CyrillicModel = {
|
||||
'charToOrderMap': win1251_CharToOrderMap,
|
||||
'precedenceMatrix': RussianLangModel,
|
||||
'mTypicalPositiveRatio': 0.976601,
|
||||
'keepEnglishLetter': constants.False,
|
||||
'keepEnglishLetter': False,
|
||||
'charsetName': "windows-1251"
|
||||
}
|
||||
|
||||
Latin5CyrillicModel = { \
|
||||
Latin5CyrillicModel = {
|
||||
'charToOrderMap': latin5_CharToOrderMap,
|
||||
'precedenceMatrix': RussianLangModel,
|
||||
'mTypicalPositiveRatio': 0.976601,
|
||||
'keepEnglishLetter': constants.False,
|
||||
'keepEnglishLetter': False,
|
||||
'charsetName': "ISO-8859-5"
|
||||
}
|
||||
|
||||
MacCyrillicModel = { \
|
||||
MacCyrillicModel = {
|
||||
'charToOrderMap': macCyrillic_CharToOrderMap,
|
||||
'precedenceMatrix': RussianLangModel,
|
||||
'mTypicalPositiveRatio': 0.976601,
|
||||
'keepEnglishLetter': constants.False,
|
||||
'keepEnglishLetter': False,
|
||||
'charsetName': "MacCyrillic"
|
||||
};
|
||||
|
||||
Ibm866Model = { \
|
||||
Ibm866Model = {
|
||||
'charToOrderMap': IBM866_CharToOrderMap,
|
||||
'precedenceMatrix': RussianLangModel,
|
||||
'mTypicalPositiveRatio': 0.976601,
|
||||
'keepEnglishLetter': constants.False,
|
||||
'keepEnglishLetter': False,
|
||||
'charsetName': "IBM866"
|
||||
}
|
||||
|
||||
Ibm855Model = { \
|
||||
Ibm855Model = {
|
||||
'charToOrderMap': IBM855_CharToOrderMap,
|
||||
'precedenceMatrix': RussianLangModel,
|
||||
'mTypicalPositiveRatio': 0.976601,
|
||||
'keepEnglishLetter': constants.False,
|
||||
'keepEnglishLetter': False,
|
||||
'charsetName': "IBM855"
|
||||
}
|
||||
|
||||
# flake8: noqa
|
||||
|
|
26
thirdparty/chardet/langgreekmodel.py
vendored
26
thirdparty/chardet/langgreekmodel.py
vendored
|
@ -13,27 +13,25 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants
|
||||
|
||||
# 255: Control characters that usually does not exist in any text
|
||||
# 254: Carriage/Return
|
||||
# 253: symbol (punctuation) that does not belong to word
|
||||
# 252: 0 - 9
|
||||
|
||||
# Character Mapping Table:
|
||||
Latin7_CharToOrderMap = ( \
|
||||
Latin7_CharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -52,7 +50,7 @@ Latin7_CharToOrderMap = ( \
|
|||
9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, # f0
|
||||
)
|
||||
|
||||
win1253_CharToOrderMap = ( \
|
||||
win1253_CharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -71,13 +69,13 @@ win1253_CharToOrderMap = ( \
|
|||
9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, # f0
|
||||
)
|
||||
|
||||
# Model Table:
|
||||
# Model Table:
|
||||
# total sequences: 100%
|
||||
# first 512 sequences: 98.2851%
|
||||
# first 1024 sequences:1.7001%
|
||||
# rest sequences: 0.0359%
|
||||
# negative sequences: 0.0148%
|
||||
GreekLangModel = ( \
|
||||
# negative sequences: 0.0148%
|
||||
GreekLangModel = (
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,3,2,2,3,3,3,3,3,3,3,3,1,3,3,3,0,2,2,3,3,0,3,0,3,2,0,3,3,3,0,
|
||||
|
@ -208,18 +206,20 @@ GreekLangModel = ( \
|
|||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
)
|
||||
|
||||
Latin7GreekModel = { \
|
||||
Latin7GreekModel = {
|
||||
'charToOrderMap': Latin7_CharToOrderMap,
|
||||
'precedenceMatrix': GreekLangModel,
|
||||
'mTypicalPositiveRatio': 0.982851,
|
||||
'keepEnglishLetter': constants.False,
|
||||
'keepEnglishLetter': False,
|
||||
'charsetName': "ISO-8859-7"
|
||||
}
|
||||
|
||||
Win1253GreekModel = { \
|
||||
Win1253GreekModel = {
|
||||
'charToOrderMap': win1253_CharToOrderMap,
|
||||
'precedenceMatrix': GreekLangModel,
|
||||
'mTypicalPositiveRatio': 0.982851,
|
||||
'keepEnglishLetter': constants.False,
|
||||
'keepEnglishLetter': False,
|
||||
'charsetName': "windows-1253"
|
||||
}
|
||||
|
||||
# flake8: noqa
|
||||
|
|
20
thirdparty/chardet/langhebrewmodel.py
vendored
20
thirdparty/chardet/langhebrewmodel.py
vendored
|
@ -15,20 +15,18 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants
|
||||
|
||||
# 255: Control characters that usually does not exist in any text
|
||||
# 254: Carriage/Return
|
||||
# 253: symbol (punctuation) that does not belong to word
|
||||
|
@ -36,7 +34,7 @@ import constants
|
|||
|
||||
# Windows-1255 language model
|
||||
# Character Mapping Table:
|
||||
win1255_CharToOrderMap = ( \
|
||||
win1255_CharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -55,13 +53,13 @@ win1255_CharToOrderMap = ( \
|
|||
12, 19, 13, 26, 18, 27, 21, 17, 7, 10, 5,251,252,128, 96,253,
|
||||
)
|
||||
|
||||
# Model Table:
|
||||
# Model Table:
|
||||
# total sequences: 100%
|
||||
# first 512 sequences: 98.4004%
|
||||
# first 1024 sequences: 1.5981%
|
||||
# rest sequences: 0.087%
|
||||
# negative sequences: 0.0015%
|
||||
HebrewLangModel = ( \
|
||||
# negative sequences: 0.0015%
|
||||
HebrewLangModel = (
|
||||
0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0,
|
||||
3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,
|
||||
|
@ -192,10 +190,12 @@ HebrewLangModel = ( \
|
|||
0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0,
|
||||
)
|
||||
|
||||
Win1255HebrewModel = { \
|
||||
Win1255HebrewModel = {
|
||||
'charToOrderMap': win1255_CharToOrderMap,
|
||||
'precedenceMatrix': HebrewLangModel,
|
||||
'mTypicalPositiveRatio': 0.984004,
|
||||
'keepEnglishLetter': constants.False,
|
||||
'keepEnglishLetter': False,
|
||||
'charsetName': "windows-1255"
|
||||
}
|
||||
|
||||
# flake8: noqa
|
||||
|
|
26
thirdparty/chardet/langhungarianmodel.py
vendored
26
thirdparty/chardet/langhungarianmodel.py
vendored
|
@ -13,27 +13,25 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants
|
||||
|
||||
# 255: Control characters that usually does not exist in any text
|
||||
# 254: Carriage/Return
|
||||
# 253: symbol (punctuation) that does not belong to word
|
||||
# 252: 0 - 9
|
||||
|
||||
# Character Mapping Table:
|
||||
Latin2_HungarianCharToOrderMap = ( \
|
||||
Latin2_HungarianCharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -52,7 +50,7 @@ Latin2_HungarianCharToOrderMap = ( \
|
|||
245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253,
|
||||
)
|
||||
|
||||
win1250HungarianCharToOrderMap = ( \
|
||||
win1250HungarianCharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -71,13 +69,13 @@ win1250HungarianCharToOrderMap = ( \
|
|||
245,246,247, 25, 74, 42, 24,248,249,250, 31, 56, 29,251,252,253,
|
||||
)
|
||||
|
||||
# Model Table:
|
||||
# Model Table:
|
||||
# total sequences: 100%
|
||||
# first 512 sequences: 94.7368%
|
||||
# first 1024 sequences:5.2623%
|
||||
# rest sequences: 0.8894%
|
||||
# negative sequences: 0.0009%
|
||||
HungarianLangModel = ( \
|
||||
# negative sequences: 0.0009%
|
||||
HungarianLangModel = (
|
||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
|
||||
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2,
|
||||
3,2,2,3,3,3,3,3,2,3,3,3,3,3,3,1,2,3,3,3,3,2,3,3,1,1,3,3,0,1,1,1,
|
||||
|
@ -208,18 +206,20 @@ HungarianLangModel = ( \
|
|||
0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
|
||||
)
|
||||
|
||||
Latin2HungarianModel = { \
|
||||
Latin2HungarianModel = {
|
||||
'charToOrderMap': Latin2_HungarianCharToOrderMap,
|
||||
'precedenceMatrix': HungarianLangModel,
|
||||
'mTypicalPositiveRatio': 0.947368,
|
||||
'keepEnglishLetter': constants.True,
|
||||
'keepEnglishLetter': True,
|
||||
'charsetName': "ISO-8859-2"
|
||||
}
|
||||
|
||||
Win1250HungarianModel = { \
|
||||
Win1250HungarianModel = {
|
||||
'charToOrderMap': win1250HungarianCharToOrderMap,
|
||||
'precedenceMatrix': HungarianLangModel,
|
||||
'mTypicalPositiveRatio': 0.947368,
|
||||
'keepEnglishLetter': constants.True,
|
||||
'keepEnglishLetter': True,
|
||||
'charsetName': "windows-1250"
|
||||
}
|
||||
|
||||
# flake8: noqa
|
||||
|
|
22
thirdparty/chardet/langthaimodel.py
vendored
22
thirdparty/chardet/langthaimodel.py
vendored
|
@ -13,29 +13,27 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants
|
||||
|
||||
# 255: Control characters that usually does not exist in any text
|
||||
# 254: Carriage/Return
|
||||
# 253: symbol (punctuation) that does not belong to word
|
||||
# 252: 0 - 9
|
||||
|
||||
# The following result for thai was collected from a limited sample (1M).
|
||||
# The following result for thai was collected from a limited sample (1M).
|
||||
|
||||
# Character Mapping Table:
|
||||
TIS620CharToOrderMap = ( \
|
||||
TIS620CharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -54,13 +52,13 @@ TIS620CharToOrderMap = ( \
|
|||
68, 56, 59, 65, 69, 60, 70, 80, 71, 87,248,249,250,251,252,253,
|
||||
)
|
||||
|
||||
# Model Table:
|
||||
# Model Table:
|
||||
# total sequences: 100%
|
||||
# first 512 sequences: 92.6386%
|
||||
# first 1024 sequences:7.3177%
|
||||
# rest sequences: 1.0230%
|
||||
# negative sequences: 0.0436%
|
||||
ThaiLangModel = ( \
|
||||
# negative sequences: 0.0436%
|
||||
ThaiLangModel = (
|
||||
0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3,
|
||||
0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2,
|
||||
3,0,3,3,2,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,3,0,3,2,3,0,2,2,2,3,
|
||||
|
@ -191,10 +189,12 @@ ThaiLangModel = ( \
|
|||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
)
|
||||
|
||||
TIS620ThaiModel = { \
|
||||
TIS620ThaiModel = {
|
||||
'charToOrderMap': TIS620CharToOrderMap,
|
||||
'precedenceMatrix': ThaiLangModel,
|
||||
'mTypicalPositiveRatio': 0.926386,
|
||||
'keepEnglishLetter': constants.False,
|
||||
'keepEnglishLetter': False,
|
||||
'charsetName': "TIS-620"
|
||||
}
|
||||
|
||||
# flake8: noqa
|
||||
|
|
141
thirdparty/chardet/latin1prober.py
vendored
141
thirdparty/chardet/latin1prober.py
vendored
|
@ -14,85 +14,86 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from charsetprober import CharSetProber
|
||||
import constants
|
||||
import operator
|
||||
from .charsetprober import CharSetProber
|
||||
from .constants import eNotMe
|
||||
from .compat import wrap_ord
|
||||
|
||||
FREQ_CAT_NUM = 4
|
||||
|
||||
UDF = 0 # undefined
|
||||
OTH = 1 # other
|
||||
ASC = 2 # ascii capital letter
|
||||
ASS = 3 # ascii small letter
|
||||
ACV = 4 # accent capital vowel
|
||||
ACO = 5 # accent capital other
|
||||
ASV = 6 # accent small vowel
|
||||
ASO = 7 # accent small other
|
||||
CLASS_NUM = 8 # total classes
|
||||
UDF = 0 # undefined
|
||||
OTH = 1 # other
|
||||
ASC = 2 # ascii capital letter
|
||||
ASS = 3 # ascii small letter
|
||||
ACV = 4 # accent capital vowel
|
||||
ACO = 5 # accent capital other
|
||||
ASV = 6 # accent small vowel
|
||||
ASO = 7 # accent small other
|
||||
CLASS_NUM = 8 # total classes
|
||||
|
||||
Latin1_CharToClass = ( \
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F
|
||||
OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47
|
||||
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F
|
||||
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57
|
||||
ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F
|
||||
OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67
|
||||
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F
|
||||
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77
|
||||
ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F
|
||||
OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, # 80 - 87
|
||||
OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, # 88 - 8F
|
||||
UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 90 - 97
|
||||
OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, # 98 - 9F
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A0 - A7
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A8 - AF
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B8 - BF
|
||||
ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, # C0 - C7
|
||||
ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # C8 - CF
|
||||
ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, # D0 - D7
|
||||
ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, # D8 - DF
|
||||
ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, # E0 - E7
|
||||
ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # E8 - EF
|
||||
ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, # F0 - F7
|
||||
ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, # F8 - FF
|
||||
Latin1_CharToClass = (
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F
|
||||
OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47
|
||||
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F
|
||||
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57
|
||||
ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F
|
||||
OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67
|
||||
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F
|
||||
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77
|
||||
ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F
|
||||
OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, # 80 - 87
|
||||
OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, # 88 - 8F
|
||||
UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 90 - 97
|
||||
OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, # 98 - 9F
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A0 - A7
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A8 - AF
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B8 - BF
|
||||
ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, # C0 - C7
|
||||
ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # C8 - CF
|
||||
ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, # D0 - D7
|
||||
ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, # D8 - DF
|
||||
ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, # E0 - E7
|
||||
ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # E8 - EF
|
||||
ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, # F0 - F7
|
||||
ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, # F8 - FF
|
||||
)
|
||||
|
||||
# 0 : illegal
|
||||
# 1 : very unlikely
|
||||
# 2 : normal
|
||||
# 0 : illegal
|
||||
# 1 : very unlikely
|
||||
# 2 : normal
|
||||
# 3 : very likely
|
||||
Latin1ClassModel = ( \
|
||||
# UDF OTH ASC ASS ACV ACO ASV ASO
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # UDF
|
||||
0, 3, 3, 3, 3, 3, 3, 3, # OTH
|
||||
0, 3, 3, 3, 3, 3, 3, 3, # ASC
|
||||
0, 3, 3, 3, 1, 1, 3, 3, # ASS
|
||||
0, 3, 3, 3, 1, 2, 1, 2, # ACV
|
||||
0, 3, 3, 3, 3, 3, 3, 3, # ACO
|
||||
0, 3, 1, 3, 1, 1, 1, 3, # ASV
|
||||
0, 3, 1, 3, 1, 1, 3, 3, # ASO
|
||||
Latin1ClassModel = (
|
||||
# UDF OTH ASC ASS ACV ACO ASV ASO
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # UDF
|
||||
0, 3, 3, 3, 3, 3, 3, 3, # OTH
|
||||
0, 3, 3, 3, 3, 3, 3, 3, # ASC
|
||||
0, 3, 3, 3, 1, 1, 3, 3, # ASS
|
||||
0, 3, 3, 3, 1, 2, 1, 2, # ACV
|
||||
0, 3, 3, 3, 3, 3, 3, 3, # ACO
|
||||
0, 3, 1, 3, 1, 1, 1, 3, # ASV
|
||||
0, 3, 1, 3, 1, 1, 3, 3, # ASO
|
||||
)
|
||||
|
||||
|
||||
class Latin1Prober(CharSetProber):
|
||||
def __init__(self):
|
||||
CharSetProber.__init__(self)
|
||||
|
@ -109,10 +110,11 @@ class Latin1Prober(CharSetProber):
|
|||
def feed(self, aBuf):
|
||||
aBuf = self.filter_with_english_letters(aBuf)
|
||||
for c in aBuf:
|
||||
charClass = Latin1_CharToClass[ord(c)]
|
||||
freq = Latin1ClassModel[(self._mLastCharClass * CLASS_NUM) + charClass]
|
||||
charClass = Latin1_CharToClass[wrap_ord(c)]
|
||||
freq = Latin1ClassModel[(self._mLastCharClass * CLASS_NUM)
|
||||
+ charClass]
|
||||
if freq == 0:
|
||||
self._mState = constants.eNotMe
|
||||
self._mState = eNotMe
|
||||
break
|
||||
self._mFreqCounter[freq] += 1
|
||||
self._mLastCharClass = charClass
|
||||
|
@ -120,17 +122,18 @@ class Latin1Prober(CharSetProber):
|
|||
return self.get_state()
|
||||
|
||||
def get_confidence(self):
|
||||
if self.get_state() == constants.eNotMe:
|
||||
if self.get_state() == eNotMe:
|
||||
return 0.01
|
||||
|
||||
total = reduce(operator.add, self._mFreqCounter)
|
||||
total = sum(self._mFreqCounter)
|
||||
if total < 0.01:
|
||||
confidence = 0.0
|
||||
else:
|
||||
confidence = (self._mFreqCounter[3] / total) - (self._mFreqCounter[1] * 20.0 / total)
|
||||
confidence = ((self._mFreqCounter[3] - self._mFreqCounter[1] * 20.0)
|
||||
/ total)
|
||||
if confidence < 0.0:
|
||||
confidence = 0.0
|
||||
# lower the confidence of latin1 so that other more accurate detector
|
||||
# can take priority.
|
||||
confidence = confidence * 0.5
|
||||
# lower the confidence of latin1 so that other more accurate
|
||||
# detector can take priority.
|
||||
confidence = confidence * 0.73
|
||||
return confidence
|
||||
|
|
32
thirdparty/chardet/mbcharsetprober.py
vendored
32
thirdparty/chardet/mbcharsetprober.py
vendored
|
@ -15,28 +15,29 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants, sys
|
||||
from constants import eStart, eError, eItsMe
|
||||
from charsetprober import CharSetProber
|
||||
import sys
|
||||
from . import constants
|
||||
from .charsetprober import CharSetProber
|
||||
|
||||
|
||||
class MultiByteCharSetProber(CharSetProber):
|
||||
def __init__(self):
|
||||
CharSetProber.__init__(self)
|
||||
self._mDistributionAnalyzer = None
|
||||
self._mCodingSM = None
|
||||
self._mLastChar = ['\x00', '\x00']
|
||||
self._mLastChar = [0, 0]
|
||||
|
||||
def reset(self):
|
||||
CharSetProber.reset(self)
|
||||
|
@ -44,7 +45,7 @@ class MultiByteCharSetProber(CharSetProber):
|
|||
self._mCodingSM.reset()
|
||||
if self._mDistributionAnalyzer:
|
||||
self._mDistributionAnalyzer.reset()
|
||||
self._mLastChar = ['\x00', '\x00']
|
||||
self._mLastChar = [0, 0]
|
||||
|
||||
def get_charset_name(self):
|
||||
pass
|
||||
|
@ -53,27 +54,30 @@ class MultiByteCharSetProber(CharSetProber):
|
|||
aLen = len(aBuf)
|
||||
for i in xrange(0, aLen):
|
||||
codingState = self._mCodingSM.next_state(aBuf[i])
|
||||
if codingState == eError:
|
||||
if codingState == constants.eError:
|
||||
if constants._debug:
|
||||
sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n')
|
||||
sys.stderr.write(self.get_charset_name()
|
||||
+ ' prober hit error at byte ' + str(i)
|
||||
+ '\n')
|
||||
self._mState = constants.eNotMe
|
||||
break
|
||||
elif codingState == eItsMe:
|
||||
elif codingState == constants.eItsMe:
|
||||
self._mState = constants.eFoundIt
|
||||
break
|
||||
elif codingState == eStart:
|
||||
elif codingState == constants.eStart:
|
||||
charLen = self._mCodingSM.get_current_charlen()
|
||||
if i == 0:
|
||||
self._mLastChar[1] = aBuf[0]
|
||||
self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
|
||||
else:
|
||||
self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen)
|
||||
self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
|
||||
charLen)
|
||||
|
||||
self._mLastChar[0] = aBuf[aLen - 1]
|
||||
|
||||
if self.get_state() == constants.eDetecting:
|
||||
if self._mDistributionAnalyzer.got_enough_data() and \
|
||||
(self.get_confidence() > constants.SHORTCUT_THRESHOLD):
|
||||
if (self._mDistributionAnalyzer.got_enough_data() and
|
||||
(self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
|
||||
self._mState = constants.eFoundIt
|
||||
|
||||
return self.get_state()
|
||||
|
|
28
thirdparty/chardet/mbcsgroupprober.py
vendored
28
thirdparty/chardet/mbcsgroupprober.py
vendored
|
@ -15,36 +15,40 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from charsetgroupprober import CharSetGroupProber
|
||||
from utf8prober import UTF8Prober
|
||||
from sjisprober import SJISProber
|
||||
from eucjpprober import EUCJPProber
|
||||
from gb2312prober import GB2312Prober
|
||||
from euckrprober import EUCKRProber
|
||||
from big5prober import Big5Prober
|
||||
from euctwprober import EUCTWProber
|
||||
from .charsetgroupprober import CharSetGroupProber
|
||||
from .utf8prober import UTF8Prober
|
||||
from .sjisprober import SJISProber
|
||||
from .eucjpprober import EUCJPProber
|
||||
from .gb2312prober import GB2312Prober
|
||||
from .euckrprober import EUCKRProber
|
||||
from .cp949prober import CP949Prober
|
||||
from .big5prober import Big5Prober
|
||||
from .euctwprober import EUCTWProber
|
||||
|
||||
|
||||
class MBCSGroupProber(CharSetGroupProber):
|
||||
def __init__(self):
|
||||
CharSetGroupProber.__init__(self)
|
||||
self._mProbers = [ \
|
||||
self._mProbers = [
|
||||
UTF8Prober(),
|
||||
SJISProber(),
|
||||
EUCJPProber(),
|
||||
GB2312Prober(),
|
||||
EUCKRProber(),
|
||||
CP949Prober(),
|
||||
Big5Prober(),
|
||||
EUCTWProber()]
|
||||
EUCTWProber()
|
||||
]
|
||||
self.reset()
|
||||
|
|
814
thirdparty/chardet/mbcssm.py
vendored
814
thirdparty/chardet/mbcssm.py
vendored
|
@ -13,60 +13,62 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from constants import eStart, eError, eItsMe
|
||||
from .constants import eStart, eError, eItsMe
|
||||
|
||||
# BIG5
|
||||
# BIG5
|
||||
|
||||
BIG5_cls = ( \
|
||||
BIG5_cls = (
|
||||
1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as legal value
|
||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||
1,1,1,1,1,1,1,1, # 10 - 17
|
||||
1,1,1,0,1,1,1,1, # 18 - 1f
|
||||
1,1,1,1,1,1,1,1, # 20 - 27
|
||||
1,1,1,1,1,1,1,1, # 28 - 2f
|
||||
1,1,1,1,1,1,1,1, # 30 - 37
|
||||
1,1,1,1,1,1,1,1, # 38 - 3f
|
||||
2,2,2,2,2,2,2,2, # 40 - 47
|
||||
2,2,2,2,2,2,2,2, # 48 - 4f
|
||||
2,2,2,2,2,2,2,2, # 50 - 57
|
||||
2,2,2,2,2,2,2,2, # 58 - 5f
|
||||
2,2,2,2,2,2,2,2, # 60 - 67
|
||||
2,2,2,2,2,2,2,2, # 68 - 6f
|
||||
2,2,2,2,2,2,2,2, # 70 - 77
|
||||
2,2,2,2,2,2,2,1, # 78 - 7f
|
||||
4,4,4,4,4,4,4,4, # 80 - 87
|
||||
4,4,4,4,4,4,4,4, # 88 - 8f
|
||||
4,4,4,4,4,4,4,4, # 90 - 97
|
||||
4,4,4,4,4,4,4,4, # 98 - 9f
|
||||
4,3,3,3,3,3,3,3, # a0 - a7
|
||||
3,3,3,3,3,3,3,3, # a8 - af
|
||||
3,3,3,3,3,3,3,3, # b0 - b7
|
||||
3,3,3,3,3,3,3,3, # b8 - bf
|
||||
3,3,3,3,3,3,3,3, # c0 - c7
|
||||
3,3,3,3,3,3,3,3, # c8 - cf
|
||||
3,3,3,3,3,3,3,3, # d0 - d7
|
||||
3,3,3,3,3,3,3,3, # d8 - df
|
||||
3,3,3,3,3,3,3,3, # e0 - e7
|
||||
3,3,3,3,3,3,3,3, # e8 - ef
|
||||
3,3,3,3,3,3,3,3, # f0 - f7
|
||||
3,3,3,3,3,3,3,0) # f8 - ff
|
||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||
1,1,1,1,1,1,1,1, # 10 - 17
|
||||
1,1,1,0,1,1,1,1, # 18 - 1f
|
||||
1,1,1,1,1,1,1,1, # 20 - 27
|
||||
1,1,1,1,1,1,1,1, # 28 - 2f
|
||||
1,1,1,1,1,1,1,1, # 30 - 37
|
||||
1,1,1,1,1,1,1,1, # 38 - 3f
|
||||
2,2,2,2,2,2,2,2, # 40 - 47
|
||||
2,2,2,2,2,2,2,2, # 48 - 4f
|
||||
2,2,2,2,2,2,2,2, # 50 - 57
|
||||
2,2,2,2,2,2,2,2, # 58 - 5f
|
||||
2,2,2,2,2,2,2,2, # 60 - 67
|
||||
2,2,2,2,2,2,2,2, # 68 - 6f
|
||||
2,2,2,2,2,2,2,2, # 70 - 77
|
||||
2,2,2,2,2,2,2,1, # 78 - 7f
|
||||
4,4,4,4,4,4,4,4, # 80 - 87
|
||||
4,4,4,4,4,4,4,4, # 88 - 8f
|
||||
4,4,4,4,4,4,4,4, # 90 - 97
|
||||
4,4,4,4,4,4,4,4, # 98 - 9f
|
||||
4,3,3,3,3,3,3,3, # a0 - a7
|
||||
3,3,3,3,3,3,3,3, # a8 - af
|
||||
3,3,3,3,3,3,3,3, # b0 - b7
|
||||
3,3,3,3,3,3,3,3, # b8 - bf
|
||||
3,3,3,3,3,3,3,3, # c0 - c7
|
||||
3,3,3,3,3,3,3,3, # c8 - cf
|
||||
3,3,3,3,3,3,3,3, # d0 - d7
|
||||
3,3,3,3,3,3,3,3, # d8 - df
|
||||
3,3,3,3,3,3,3,3, # e0 - e7
|
||||
3,3,3,3,3,3,3,3, # e8 - ef
|
||||
3,3,3,3,3,3,3,3, # f0 - f7
|
||||
3,3,3,3,3,3,3,0 # f8 - ff
|
||||
)
|
||||
|
||||
BIG5_st = ( \
|
||||
eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07
|
||||
eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,#08-0f
|
||||
eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart)#10-17
|
||||
BIG5_st = (
|
||||
eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07
|
||||
eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,#08-0f
|
||||
eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart#10-17
|
||||
)
|
||||
|
||||
Big5CharLenTable = (0, 1, 1, 2, 0)
|
||||
|
||||
|
@ -76,48 +78,90 @@ Big5SMModel = {'classTable': BIG5_cls,
|
|||
'charLenTable': Big5CharLenTable,
|
||||
'name': 'Big5'}
|
||||
|
||||
# CP949
|
||||
|
||||
CP949_cls = (
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,0,0, # 00 - 0f
|
||||
1,1,1,1,1,1,1,1, 1,1,1,0,1,1,1,1, # 10 - 1f
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 20 - 2f
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 30 - 3f
|
||||
1,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, # 40 - 4f
|
||||
4,4,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 50 - 5f
|
||||
1,5,5,5,5,5,5,5, 5,5,5,5,5,5,5,5, # 60 - 6f
|
||||
5,5,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 70 - 7f
|
||||
0,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 80 - 8f
|
||||
6,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 90 - 9f
|
||||
6,7,7,7,7,7,7,7, 7,7,7,7,7,8,8,8, # a0 - af
|
||||
7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7, # b0 - bf
|
||||
7,7,7,7,7,7,9,2, 2,3,2,2,2,2,2,2, # c0 - cf
|
||||
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # d0 - df
|
||||
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # e0 - ef
|
||||
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,0, # f0 - ff
|
||||
)
|
||||
|
||||
CP949_st = (
|
||||
#cls= 0 1 2 3 4 5 6 7 8 9 # previous state =
|
||||
eError,eStart, 3,eError,eStart,eStart, 4, 5,eError, 6, # eStart
|
||||
eError,eError,eError,eError,eError,eError,eError,eError,eError,eError, # eError
|
||||
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe, # eItsMe
|
||||
eError,eError,eStart,eStart,eError,eError,eError,eStart,eStart,eStart, # 3
|
||||
eError,eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart, # 4
|
||||
eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart, # 5
|
||||
eError,eStart,eStart,eStart,eStart,eError,eError,eStart,eStart,eStart, # 6
|
||||
)
|
||||
|
||||
CP949CharLenTable = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2)
|
||||
|
||||
CP949SMModel = {'classTable': CP949_cls,
|
||||
'classFactor': 10,
|
||||
'stateTable': CP949_st,
|
||||
'charLenTable': CP949CharLenTable,
|
||||
'name': 'CP949'}
|
||||
|
||||
# EUC-JP
|
||||
|
||||
EUCJP_cls = ( \
|
||||
4,4,4,4,4,4,4,4, # 00 - 07
|
||||
4,4,4,4,4,4,5,5, # 08 - 0f
|
||||
4,4,4,4,4,4,4,4, # 10 - 17
|
||||
4,4,4,5,4,4,4,4, # 18 - 1f
|
||||
4,4,4,4,4,4,4,4, # 20 - 27
|
||||
4,4,4,4,4,4,4,4, # 28 - 2f
|
||||
4,4,4,4,4,4,4,4, # 30 - 37
|
||||
4,4,4,4,4,4,4,4, # 38 - 3f
|
||||
4,4,4,4,4,4,4,4, # 40 - 47
|
||||
4,4,4,4,4,4,4,4, # 48 - 4f
|
||||
4,4,4,4,4,4,4,4, # 50 - 57
|
||||
4,4,4,4,4,4,4,4, # 58 - 5f
|
||||
4,4,4,4,4,4,4,4, # 60 - 67
|
||||
4,4,4,4,4,4,4,4, # 68 - 6f
|
||||
4,4,4,4,4,4,4,4, # 70 - 77
|
||||
4,4,4,4,4,4,4,4, # 78 - 7f
|
||||
5,5,5,5,5,5,5,5, # 80 - 87
|
||||
5,5,5,5,5,5,1,3, # 88 - 8f
|
||||
5,5,5,5,5,5,5,5, # 90 - 97
|
||||
5,5,5,5,5,5,5,5, # 98 - 9f
|
||||
5,2,2,2,2,2,2,2, # a0 - a7
|
||||
2,2,2,2,2,2,2,2, # a8 - af
|
||||
2,2,2,2,2,2,2,2, # b0 - b7
|
||||
2,2,2,2,2,2,2,2, # b8 - bf
|
||||
2,2,2,2,2,2,2,2, # c0 - c7
|
||||
2,2,2,2,2,2,2,2, # c8 - cf
|
||||
2,2,2,2,2,2,2,2, # d0 - d7
|
||||
2,2,2,2,2,2,2,2, # d8 - df
|
||||
0,0,0,0,0,0,0,0, # e0 - e7
|
||||
0,0,0,0,0,0,0,0, # e8 - ef
|
||||
0,0,0,0,0,0,0,0, # f0 - f7
|
||||
0,0,0,0,0,0,0,5) # f8 - ff
|
||||
EUCJP_cls = (
|
||||
4,4,4,4,4,4,4,4, # 00 - 07
|
||||
4,4,4,4,4,4,5,5, # 08 - 0f
|
||||
4,4,4,4,4,4,4,4, # 10 - 17
|
||||
4,4,4,5,4,4,4,4, # 18 - 1f
|
||||
4,4,4,4,4,4,4,4, # 20 - 27
|
||||
4,4,4,4,4,4,4,4, # 28 - 2f
|
||||
4,4,4,4,4,4,4,4, # 30 - 37
|
||||
4,4,4,4,4,4,4,4, # 38 - 3f
|
||||
4,4,4,4,4,4,4,4, # 40 - 47
|
||||
4,4,4,4,4,4,4,4, # 48 - 4f
|
||||
4,4,4,4,4,4,4,4, # 50 - 57
|
||||
4,4,4,4,4,4,4,4, # 58 - 5f
|
||||
4,4,4,4,4,4,4,4, # 60 - 67
|
||||
4,4,4,4,4,4,4,4, # 68 - 6f
|
||||
4,4,4,4,4,4,4,4, # 70 - 77
|
||||
4,4,4,4,4,4,4,4, # 78 - 7f
|
||||
5,5,5,5,5,5,5,5, # 80 - 87
|
||||
5,5,5,5,5,5,1,3, # 88 - 8f
|
||||
5,5,5,5,5,5,5,5, # 90 - 97
|
||||
5,5,5,5,5,5,5,5, # 98 - 9f
|
||||
5,2,2,2,2,2,2,2, # a0 - a7
|
||||
2,2,2,2,2,2,2,2, # a8 - af
|
||||
2,2,2,2,2,2,2,2, # b0 - b7
|
||||
2,2,2,2,2,2,2,2, # b8 - bf
|
||||
2,2,2,2,2,2,2,2, # c0 - c7
|
||||
2,2,2,2,2,2,2,2, # c8 - cf
|
||||
2,2,2,2,2,2,2,2, # d0 - d7
|
||||
2,2,2,2,2,2,2,2, # d8 - df
|
||||
0,0,0,0,0,0,0,0, # e0 - e7
|
||||
0,0,0,0,0,0,0,0, # e8 - ef
|
||||
0,0,0,0,0,0,0,0, # f0 - f7
|
||||
0,0,0,0,0,0,0,5 # f8 - ff
|
||||
)
|
||||
|
||||
EUCJP_st = ( \
|
||||
3, 4, 3, 5,eStart,eError,eError,eError,#00-07
|
||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
|
||||
eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError,#10-17
|
||||
eError,eError,eStart,eError,eError,eError, 3,eError,#18-1f
|
||||
3,eError,eError,eError,eStart,eStart,eStart,eStart)#20-27
|
||||
EUCJP_st = (
|
||||
3, 4, 3, 5,eStart,eError,eError,eError,#00-07
|
||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
|
||||
eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError,#10-17
|
||||
eError,eError,eStart,eError,eError,eError, 3,eError,#18-1f
|
||||
3,eError,eError,eError,eStart,eStart,eStart,eStart#20-27
|
||||
)
|
||||
|
||||
EUCJPCharLenTable = (2, 2, 2, 3, 1, 0)
|
||||
|
||||
|
@ -129,43 +173,45 @@ EUCJPSMModel = {'classTable': EUCJP_cls,
|
|||
|
||||
# EUC-KR
|
||||
|
||||
EUCKR_cls = ( \
|
||||
1,1,1,1,1,1,1,1, # 00 - 07
|
||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||
1,1,1,1,1,1,1,1, # 10 - 17
|
||||
1,1,1,0,1,1,1,1, # 18 - 1f
|
||||
1,1,1,1,1,1,1,1, # 20 - 27
|
||||
1,1,1,1,1,1,1,1, # 28 - 2f
|
||||
1,1,1,1,1,1,1,1, # 30 - 37
|
||||
1,1,1,1,1,1,1,1, # 38 - 3f
|
||||
1,1,1,1,1,1,1,1, # 40 - 47
|
||||
1,1,1,1,1,1,1,1, # 48 - 4f
|
||||
1,1,1,1,1,1,1,1, # 50 - 57
|
||||
1,1,1,1,1,1,1,1, # 58 - 5f
|
||||
1,1,1,1,1,1,1,1, # 60 - 67
|
||||
1,1,1,1,1,1,1,1, # 68 - 6f
|
||||
1,1,1,1,1,1,1,1, # 70 - 77
|
||||
1,1,1,1,1,1,1,1, # 78 - 7f
|
||||
0,0,0,0,0,0,0,0, # 80 - 87
|
||||
0,0,0,0,0,0,0,0, # 88 - 8f
|
||||
0,0,0,0,0,0,0,0, # 90 - 97
|
||||
0,0,0,0,0,0,0,0, # 98 - 9f
|
||||
0,2,2,2,2,2,2,2, # a0 - a7
|
||||
2,2,2,2,2,3,3,3, # a8 - af
|
||||
2,2,2,2,2,2,2,2, # b0 - b7
|
||||
2,2,2,2,2,2,2,2, # b8 - bf
|
||||
2,2,2,2,2,2,2,2, # c0 - c7
|
||||
2,3,2,2,2,2,2,2, # c8 - cf
|
||||
2,2,2,2,2,2,2,2, # d0 - d7
|
||||
2,2,2,2,2,2,2,2, # d8 - df
|
||||
2,2,2,2,2,2,2,2, # e0 - e7
|
||||
2,2,2,2,2,2,2,2, # e8 - ef
|
||||
2,2,2,2,2,2,2,2, # f0 - f7
|
||||
2,2,2,2,2,2,2,0) # f8 - ff
|
||||
EUCKR_cls = (
|
||||
1,1,1,1,1,1,1,1, # 00 - 07
|
||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||
1,1,1,1,1,1,1,1, # 10 - 17
|
||||
1,1,1,0,1,1,1,1, # 18 - 1f
|
||||
1,1,1,1,1,1,1,1, # 20 - 27
|
||||
1,1,1,1,1,1,1,1, # 28 - 2f
|
||||
1,1,1,1,1,1,1,1, # 30 - 37
|
||||
1,1,1,1,1,1,1,1, # 38 - 3f
|
||||
1,1,1,1,1,1,1,1, # 40 - 47
|
||||
1,1,1,1,1,1,1,1, # 48 - 4f
|
||||
1,1,1,1,1,1,1,1, # 50 - 57
|
||||
1,1,1,1,1,1,1,1, # 58 - 5f
|
||||
1,1,1,1,1,1,1,1, # 60 - 67
|
||||
1,1,1,1,1,1,1,1, # 68 - 6f
|
||||
1,1,1,1,1,1,1,1, # 70 - 77
|
||||
1,1,1,1,1,1,1,1, # 78 - 7f
|
||||
0,0,0,0,0,0,0,0, # 80 - 87
|
||||
0,0,0,0,0,0,0,0, # 88 - 8f
|
||||
0,0,0,0,0,0,0,0, # 90 - 97
|
||||
0,0,0,0,0,0,0,0, # 98 - 9f
|
||||
0,2,2,2,2,2,2,2, # a0 - a7
|
||||
2,2,2,2,2,3,3,3, # a8 - af
|
||||
2,2,2,2,2,2,2,2, # b0 - b7
|
||||
2,2,2,2,2,2,2,2, # b8 - bf
|
||||
2,2,2,2,2,2,2,2, # c0 - c7
|
||||
2,3,2,2,2,2,2,2, # c8 - cf
|
||||
2,2,2,2,2,2,2,2, # d0 - d7
|
||||
2,2,2,2,2,2,2,2, # d8 - df
|
||||
2,2,2,2,2,2,2,2, # e0 - e7
|
||||
2,2,2,2,2,2,2,2, # e8 - ef
|
||||
2,2,2,2,2,2,2,2, # f0 - f7
|
||||
2,2,2,2,2,2,2,0 # f8 - ff
|
||||
)
|
||||
|
||||
EUCKR_st = (
|
||||
eError,eStart, 3,eError,eError,eError,eError,eError,#00-07
|
||||
eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart)#08-0f
|
||||
eError,eStart, 3,eError,eError,eError,eError,eError,#00-07
|
||||
eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart #08-0f
|
||||
)
|
||||
|
||||
EUCKRCharLenTable = (0, 1, 2, 0)
|
||||
|
||||
|
@ -177,47 +223,49 @@ EUCKRSMModel = {'classTable': EUCKR_cls,
|
|||
|
||||
# EUC-TW
|
||||
|
||||
EUCTW_cls = ( \
|
||||
2,2,2,2,2,2,2,2, # 00 - 07
|
||||
2,2,2,2,2,2,0,0, # 08 - 0f
|
||||
2,2,2,2,2,2,2,2, # 10 - 17
|
||||
2,2,2,0,2,2,2,2, # 18 - 1f
|
||||
2,2,2,2,2,2,2,2, # 20 - 27
|
||||
2,2,2,2,2,2,2,2, # 28 - 2f
|
||||
2,2,2,2,2,2,2,2, # 30 - 37
|
||||
2,2,2,2,2,2,2,2, # 38 - 3f
|
||||
2,2,2,2,2,2,2,2, # 40 - 47
|
||||
2,2,2,2,2,2,2,2, # 48 - 4f
|
||||
2,2,2,2,2,2,2,2, # 50 - 57
|
||||
2,2,2,2,2,2,2,2, # 58 - 5f
|
||||
2,2,2,2,2,2,2,2, # 60 - 67
|
||||
2,2,2,2,2,2,2,2, # 68 - 6f
|
||||
2,2,2,2,2,2,2,2, # 70 - 77
|
||||
2,2,2,2,2,2,2,2, # 78 - 7f
|
||||
0,0,0,0,0,0,0,0, # 80 - 87
|
||||
0,0,0,0,0,0,6,0, # 88 - 8f
|
||||
0,0,0,0,0,0,0,0, # 90 - 97
|
||||
0,0,0,0,0,0,0,0, # 98 - 9f
|
||||
0,3,4,4,4,4,4,4, # a0 - a7
|
||||
5,5,1,1,1,1,1,1, # a8 - af
|
||||
1,1,1,1,1,1,1,1, # b0 - b7
|
||||
1,1,1,1,1,1,1,1, # b8 - bf
|
||||
1,1,3,1,3,3,3,3, # c0 - c7
|
||||
3,3,3,3,3,3,3,3, # c8 - cf
|
||||
3,3,3,3,3,3,3,3, # d0 - d7
|
||||
3,3,3,3,3,3,3,3, # d8 - df
|
||||
3,3,3,3,3,3,3,3, # e0 - e7
|
||||
3,3,3,3,3,3,3,3, # e8 - ef
|
||||
3,3,3,3,3,3,3,3, # f0 - f7
|
||||
3,3,3,3,3,3,3,0) # f8 - ff
|
||||
EUCTW_cls = (
|
||||
2,2,2,2,2,2,2,2, # 00 - 07
|
||||
2,2,2,2,2,2,0,0, # 08 - 0f
|
||||
2,2,2,2,2,2,2,2, # 10 - 17
|
||||
2,2,2,0,2,2,2,2, # 18 - 1f
|
||||
2,2,2,2,2,2,2,2, # 20 - 27
|
||||
2,2,2,2,2,2,2,2, # 28 - 2f
|
||||
2,2,2,2,2,2,2,2, # 30 - 37
|
||||
2,2,2,2,2,2,2,2, # 38 - 3f
|
||||
2,2,2,2,2,2,2,2, # 40 - 47
|
||||
2,2,2,2,2,2,2,2, # 48 - 4f
|
||||
2,2,2,2,2,2,2,2, # 50 - 57
|
||||
2,2,2,2,2,2,2,2, # 58 - 5f
|
||||
2,2,2,2,2,2,2,2, # 60 - 67
|
||||
2,2,2,2,2,2,2,2, # 68 - 6f
|
||||
2,2,2,2,2,2,2,2, # 70 - 77
|
||||
2,2,2,2,2,2,2,2, # 78 - 7f
|
||||
0,0,0,0,0,0,0,0, # 80 - 87
|
||||
0,0,0,0,0,0,6,0, # 88 - 8f
|
||||
0,0,0,0,0,0,0,0, # 90 - 97
|
||||
0,0,0,0,0,0,0,0, # 98 - 9f
|
||||
0,3,4,4,4,4,4,4, # a0 - a7
|
||||
5,5,1,1,1,1,1,1, # a8 - af
|
||||
1,1,1,1,1,1,1,1, # b0 - b7
|
||||
1,1,1,1,1,1,1,1, # b8 - bf
|
||||
1,1,3,1,3,3,3,3, # c0 - c7
|
||||
3,3,3,3,3,3,3,3, # c8 - cf
|
||||
3,3,3,3,3,3,3,3, # d0 - d7
|
||||
3,3,3,3,3,3,3,3, # d8 - df
|
||||
3,3,3,3,3,3,3,3, # e0 - e7
|
||||
3,3,3,3,3,3,3,3, # e8 - ef
|
||||
3,3,3,3,3,3,3,3, # f0 - f7
|
||||
3,3,3,3,3,3,3,0 # f8 - ff
|
||||
)
|
||||
|
||||
EUCTW_st = ( \
|
||||
eError,eError,eStart, 3, 3, 3, 4,eError,#00-07
|
||||
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f
|
||||
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError,#10-17
|
||||
eStart,eStart,eStart,eError,eError,eError,eError,eError,#18-1f
|
||||
5,eError,eError,eError,eStart,eError,eStart,eStart,#20-27
|
||||
eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart)#28-2f
|
||||
EUCTW_st = (
|
||||
eError,eError,eStart, 3, 3, 3, 4,eError,#00-07
|
||||
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f
|
||||
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError,#10-17
|
||||
eStart,eStart,eStart,eError,eError,eError,eError,eError,#18-1f
|
||||
5,eError,eError,eError,eStart,eError,eStart,eStart,#20-27
|
||||
eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart #28-2f
|
||||
)
|
||||
|
||||
EUCTWCharLenTable = (0, 0, 1, 2, 2, 2, 3)
|
||||
|
||||
|
@ -229,53 +277,55 @@ EUCTWSMModel = {'classTable': EUCTW_cls,
|
|||
|
||||
# GB2312
|
||||
|
||||
GB2312_cls = ( \
|
||||
1,1,1,1,1,1,1,1, # 00 - 07
|
||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||
1,1,1,1,1,1,1,1, # 10 - 17
|
||||
1,1,1,0,1,1,1,1, # 18 - 1f
|
||||
1,1,1,1,1,1,1,1, # 20 - 27
|
||||
1,1,1,1,1,1,1,1, # 28 - 2f
|
||||
3,3,3,3,3,3,3,3, # 30 - 37
|
||||
3,3,1,1,1,1,1,1, # 38 - 3f
|
||||
2,2,2,2,2,2,2,2, # 40 - 47
|
||||
2,2,2,2,2,2,2,2, # 48 - 4f
|
||||
2,2,2,2,2,2,2,2, # 50 - 57
|
||||
2,2,2,2,2,2,2,2, # 58 - 5f
|
||||
2,2,2,2,2,2,2,2, # 60 - 67
|
||||
2,2,2,2,2,2,2,2, # 68 - 6f
|
||||
2,2,2,2,2,2,2,2, # 70 - 77
|
||||
2,2,2,2,2,2,2,4, # 78 - 7f
|
||||
5,6,6,6,6,6,6,6, # 80 - 87
|
||||
6,6,6,6,6,6,6,6, # 88 - 8f
|
||||
6,6,6,6,6,6,6,6, # 90 - 97
|
||||
6,6,6,6,6,6,6,6, # 98 - 9f
|
||||
6,6,6,6,6,6,6,6, # a0 - a7
|
||||
6,6,6,6,6,6,6,6, # a8 - af
|
||||
6,6,6,6,6,6,6,6, # b0 - b7
|
||||
6,6,6,6,6,6,6,6, # b8 - bf
|
||||
6,6,6,6,6,6,6,6, # c0 - c7
|
||||
6,6,6,6,6,6,6,6, # c8 - cf
|
||||
6,6,6,6,6,6,6,6, # d0 - d7
|
||||
6,6,6,6,6,6,6,6, # d8 - df
|
||||
6,6,6,6,6,6,6,6, # e0 - e7
|
||||
6,6,6,6,6,6,6,6, # e8 - ef
|
||||
6,6,6,6,6,6,6,6, # f0 - f7
|
||||
6,6,6,6,6,6,6,0) # f8 - ff
|
||||
GB2312_cls = (
|
||||
1,1,1,1,1,1,1,1, # 00 - 07
|
||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||
1,1,1,1,1,1,1,1, # 10 - 17
|
||||
1,1,1,0,1,1,1,1, # 18 - 1f
|
||||
1,1,1,1,1,1,1,1, # 20 - 27
|
||||
1,1,1,1,1,1,1,1, # 28 - 2f
|
||||
3,3,3,3,3,3,3,3, # 30 - 37
|
||||
3,3,1,1,1,1,1,1, # 38 - 3f
|
||||
2,2,2,2,2,2,2,2, # 40 - 47
|
||||
2,2,2,2,2,2,2,2, # 48 - 4f
|
||||
2,2,2,2,2,2,2,2, # 50 - 57
|
||||
2,2,2,2,2,2,2,2, # 58 - 5f
|
||||
2,2,2,2,2,2,2,2, # 60 - 67
|
||||
2,2,2,2,2,2,2,2, # 68 - 6f
|
||||
2,2,2,2,2,2,2,2, # 70 - 77
|
||||
2,2,2,2,2,2,2,4, # 78 - 7f
|
||||
5,6,6,6,6,6,6,6, # 80 - 87
|
||||
6,6,6,6,6,6,6,6, # 88 - 8f
|
||||
6,6,6,6,6,6,6,6, # 90 - 97
|
||||
6,6,6,6,6,6,6,6, # 98 - 9f
|
||||
6,6,6,6,6,6,6,6, # a0 - a7
|
||||
6,6,6,6,6,6,6,6, # a8 - af
|
||||
6,6,6,6,6,6,6,6, # b0 - b7
|
||||
6,6,6,6,6,6,6,6, # b8 - bf
|
||||
6,6,6,6,6,6,6,6, # c0 - c7
|
||||
6,6,6,6,6,6,6,6, # c8 - cf
|
||||
6,6,6,6,6,6,6,6, # d0 - d7
|
||||
6,6,6,6,6,6,6,6, # d8 - df
|
||||
6,6,6,6,6,6,6,6, # e0 - e7
|
||||
6,6,6,6,6,6,6,6, # e8 - ef
|
||||
6,6,6,6,6,6,6,6, # f0 - f7
|
||||
6,6,6,6,6,6,6,0 # f8 - ff
|
||||
)
|
||||
|
||||
GB2312_st = ( \
|
||||
eError,eStart,eStart,eStart,eStart,eStart, 3,eError,#00-07
|
||||
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f
|
||||
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,#10-17
|
||||
4,eError,eStart,eStart,eError,eError,eError,eError,#18-1f
|
||||
eError,eError, 5,eError,eError,eError,eItsMe,eError,#20-27
|
||||
eError,eError,eStart,eStart,eStart,eStart,eStart,eStart)#28-2f
|
||||
GB2312_st = (
|
||||
eError,eStart,eStart,eStart,eStart,eStart, 3,eError,#00-07
|
||||
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f
|
||||
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,#10-17
|
||||
4,eError,eStart,eStart,eError,eError,eError,eError,#18-1f
|
||||
eError,eError, 5,eError,eError,eError,eItsMe,eError,#20-27
|
||||
eError,eError,eStart,eStart,eStart,eStart,eStart,eStart #28-2f
|
||||
)
|
||||
|
||||
# To be accurate, the length of class 6 can be either 2 or 4.
|
||||
# But it is not necessary to discriminate between the two since
|
||||
# it is used for frequency analysis only, and we are validating
|
||||
# each code range there as well. So it is safe to set it to be
|
||||
# 2 here.
|
||||
# To be accurate, the length of class 6 can be either 2 or 4.
|
||||
# But it is not necessary to discriminate between the two since
|
||||
# it is used for frequency analysis only, and we are validating
|
||||
# each code range there as well. So it is safe to set it to be
|
||||
# 2 here.
|
||||
GB2312CharLenTable = (0, 1, 1, 1, 1, 1, 2)
|
||||
|
||||
GB2312SMModel = {'classTable': GB2312_cls,
|
||||
|
@ -286,46 +336,48 @@ GB2312SMModel = {'classTable': GB2312_cls,
|
|||
|
||||
# Shift_JIS
|
||||
|
||||
SJIS_cls = ( \
|
||||
1,1,1,1,1,1,1,1, # 00 - 07
|
||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||
1,1,1,1,1,1,1,1, # 10 - 17
|
||||
1,1,1,0,1,1,1,1, # 18 - 1f
|
||||
1,1,1,1,1,1,1,1, # 20 - 27
|
||||
1,1,1,1,1,1,1,1, # 28 - 2f
|
||||
1,1,1,1,1,1,1,1, # 30 - 37
|
||||
1,1,1,1,1,1,1,1, # 38 - 3f
|
||||
2,2,2,2,2,2,2,2, # 40 - 47
|
||||
2,2,2,2,2,2,2,2, # 48 - 4f
|
||||
2,2,2,2,2,2,2,2, # 50 - 57
|
||||
2,2,2,2,2,2,2,2, # 58 - 5f
|
||||
2,2,2,2,2,2,2,2, # 60 - 67
|
||||
2,2,2,2,2,2,2,2, # 68 - 6f
|
||||
2,2,2,2,2,2,2,2, # 70 - 77
|
||||
2,2,2,2,2,2,2,1, # 78 - 7f
|
||||
3,3,3,3,3,3,3,3, # 80 - 87
|
||||
3,3,3,3,3,3,3,3, # 88 - 8f
|
||||
3,3,3,3,3,3,3,3, # 90 - 97
|
||||
3,3,3,3,3,3,3,3, # 98 - 9f
|
||||
#0xa0 is illegal in sjis encoding, but some pages do
|
||||
SJIS_cls = (
|
||||
1,1,1,1,1,1,1,1, # 00 - 07
|
||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||
1,1,1,1,1,1,1,1, # 10 - 17
|
||||
1,1,1,0,1,1,1,1, # 18 - 1f
|
||||
1,1,1,1,1,1,1,1, # 20 - 27
|
||||
1,1,1,1,1,1,1,1, # 28 - 2f
|
||||
1,1,1,1,1,1,1,1, # 30 - 37
|
||||
1,1,1,1,1,1,1,1, # 38 - 3f
|
||||
2,2,2,2,2,2,2,2, # 40 - 47
|
||||
2,2,2,2,2,2,2,2, # 48 - 4f
|
||||
2,2,2,2,2,2,2,2, # 50 - 57
|
||||
2,2,2,2,2,2,2,2, # 58 - 5f
|
||||
2,2,2,2,2,2,2,2, # 60 - 67
|
||||
2,2,2,2,2,2,2,2, # 68 - 6f
|
||||
2,2,2,2,2,2,2,2, # 70 - 77
|
||||
2,2,2,2,2,2,2,1, # 78 - 7f
|
||||
3,3,3,3,3,2,2,3, # 80 - 87
|
||||
3,3,3,3,3,3,3,3, # 88 - 8f
|
||||
3,3,3,3,3,3,3,3, # 90 - 97
|
||||
3,3,3,3,3,3,3,3, # 98 - 9f
|
||||
#0xa0 is illegal in sjis encoding, but some pages do
|
||||
#contain such bytes. We need to be more error forgiving.
|
||||
2,2,2,2,2,2,2,2, # a0 - a7
|
||||
2,2,2,2,2,2,2,2, # a8 - af
|
||||
2,2,2,2,2,2,2,2, # b0 - b7
|
||||
2,2,2,2,2,2,2,2, # b8 - bf
|
||||
2,2,2,2,2,2,2,2, # c0 - c7
|
||||
2,2,2,2,2,2,2,2, # c8 - cf
|
||||
2,2,2,2,2,2,2,2, # d0 - d7
|
||||
2,2,2,2,2,2,2,2, # d8 - df
|
||||
3,3,3,3,3,3,3,3, # e0 - e7
|
||||
3,3,3,3,3,4,4,4, # e8 - ef
|
||||
4,4,4,4,4,4,4,4, # f0 - f7
|
||||
4,4,4,4,4,0,0,0) # f8 - ff
|
||||
2,2,2,2,2,2,2,2, # a0 - a7
|
||||
2,2,2,2,2,2,2,2, # a8 - af
|
||||
2,2,2,2,2,2,2,2, # b0 - b7
|
||||
2,2,2,2,2,2,2,2, # b8 - bf
|
||||
2,2,2,2,2,2,2,2, # c0 - c7
|
||||
2,2,2,2,2,2,2,2, # c8 - cf
|
||||
2,2,2,2,2,2,2,2, # d0 - d7
|
||||
2,2,2,2,2,2,2,2, # d8 - df
|
||||
3,3,3,3,3,3,3,3, # e0 - e7
|
||||
3,3,3,3,3,4,4,4, # e8 - ef
|
||||
3,3,3,3,3,3,3,3, # f0 - f7
|
||||
3,3,3,3,3,0,0,0) # f8 - ff
|
||||
|
||||
SJIS_st = ( \
|
||||
eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07
|
||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
|
||||
eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart)#10-17
|
||||
|
||||
SJIS_st = (
|
||||
eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07
|
||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
|
||||
eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart #10-17
|
||||
)
|
||||
|
||||
SJISCharLenTable = (0, 1, 1, 2, 0, 0)
|
||||
|
||||
|
@ -337,48 +389,50 @@ SJISSMModel = {'classTable': SJIS_cls,
|
|||
|
||||
# UCS2-BE
|
||||
|
||||
UCS2BE_cls = ( \
|
||||
0,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,1,0,0,2,0,0, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
0,0,0,3,0,0,0,0, # 18 - 1f
|
||||
0,0,0,0,0,0,0,0, # 20 - 27
|
||||
0,3,3,3,3,3,0,0, # 28 - 2f
|
||||
0,0,0,0,0,0,0,0, # 30 - 37
|
||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
||||
0,0,0,0,0,0,0,0, # 40 - 47
|
||||
0,0,0,0,0,0,0,0, # 48 - 4f
|
||||
0,0,0,0,0,0,0,0, # 50 - 57
|
||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
||||
0,0,0,0,0,0,0,0, # 60 - 67
|
||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
||||
0,0,0,0,0,0,0,0, # 70 - 77
|
||||
0,0,0,0,0,0,0,0, # 78 - 7f
|
||||
0,0,0,0,0,0,0,0, # 80 - 87
|
||||
0,0,0,0,0,0,0,0, # 88 - 8f
|
||||
0,0,0,0,0,0,0,0, # 90 - 97
|
||||
0,0,0,0,0,0,0,0, # 98 - 9f
|
||||
0,0,0,0,0,0,0,0, # a0 - a7
|
||||
0,0,0,0,0,0,0,0, # a8 - af
|
||||
0,0,0,0,0,0,0,0, # b0 - b7
|
||||
0,0,0,0,0,0,0,0, # b8 - bf
|
||||
0,0,0,0,0,0,0,0, # c0 - c7
|
||||
0,0,0,0,0,0,0,0, # c8 - cf
|
||||
0,0,0,0,0,0,0,0, # d0 - d7
|
||||
0,0,0,0,0,0,0,0, # d8 - df
|
||||
0,0,0,0,0,0,0,0, # e0 - e7
|
||||
0,0,0,0,0,0,0,0, # e8 - ef
|
||||
0,0,0,0,0,0,0,0, # f0 - f7
|
||||
0,0,0,0,0,0,4,5) # f8 - ff
|
||||
UCS2BE_cls = (
|
||||
0,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,1,0,0,2,0,0, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
0,0,0,3,0,0,0,0, # 18 - 1f
|
||||
0,0,0,0,0,0,0,0, # 20 - 27
|
||||
0,3,3,3,3,3,0,0, # 28 - 2f
|
||||
0,0,0,0,0,0,0,0, # 30 - 37
|
||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
||||
0,0,0,0,0,0,0,0, # 40 - 47
|
||||
0,0,0,0,0,0,0,0, # 48 - 4f
|
||||
0,0,0,0,0,0,0,0, # 50 - 57
|
||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
||||
0,0,0,0,0,0,0,0, # 60 - 67
|
||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
||||
0,0,0,0,0,0,0,0, # 70 - 77
|
||||
0,0,0,0,0,0,0,0, # 78 - 7f
|
||||
0,0,0,0,0,0,0,0, # 80 - 87
|
||||
0,0,0,0,0,0,0,0, # 88 - 8f
|
||||
0,0,0,0,0,0,0,0, # 90 - 97
|
||||
0,0,0,0,0,0,0,0, # 98 - 9f
|
||||
0,0,0,0,0,0,0,0, # a0 - a7
|
||||
0,0,0,0,0,0,0,0, # a8 - af
|
||||
0,0,0,0,0,0,0,0, # b0 - b7
|
||||
0,0,0,0,0,0,0,0, # b8 - bf
|
||||
0,0,0,0,0,0,0,0, # c0 - c7
|
||||
0,0,0,0,0,0,0,0, # c8 - cf
|
||||
0,0,0,0,0,0,0,0, # d0 - d7
|
||||
0,0,0,0,0,0,0,0, # d8 - df
|
||||
0,0,0,0,0,0,0,0, # e0 - e7
|
||||
0,0,0,0,0,0,0,0, # e8 - ef
|
||||
0,0,0,0,0,0,0,0, # f0 - f7
|
||||
0,0,0,0,0,0,4,5 # f8 - ff
|
||||
)
|
||||
|
||||
UCS2BE_st = ( \
|
||||
5, 7, 7,eError, 4, 3,eError,eError,#00-07
|
||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
|
||||
eItsMe,eItsMe, 6, 6, 6, 6,eError,eError,#10-17
|
||||
6, 6, 6, 6, 6,eItsMe, 6, 6,#18-1f
|
||||
6, 6, 6, 6, 5, 7, 7,eError,#20-27
|
||||
5, 8, 6, 6,eError, 6, 6, 6,#28-2f
|
||||
6, 6, 6, 6,eError,eError,eStart,eStart)#30-37
|
||||
UCS2BE_st = (
|
||||
5, 7, 7,eError, 4, 3,eError,eError,#00-07
|
||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
|
||||
eItsMe,eItsMe, 6, 6, 6, 6,eError,eError,#10-17
|
||||
6, 6, 6, 6, 6,eItsMe, 6, 6,#18-1f
|
||||
6, 6, 6, 6, 5, 7, 7,eError,#20-27
|
||||
5, 8, 6, 6,eError, 6, 6, 6,#28-2f
|
||||
6, 6, 6, 6,eError,eError,eStart,eStart #30-37
|
||||
)
|
||||
|
||||
UCS2BECharLenTable = (2, 2, 2, 0, 2, 2)
|
||||
|
||||
|
@ -390,48 +444,50 @@ UCS2BESMModel = {'classTable': UCS2BE_cls,
|
|||
|
||||
# UCS2-LE
|
||||
|
||||
UCS2LE_cls = ( \
|
||||
0,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,1,0,0,2,0,0, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
0,0,0,3,0,0,0,0, # 18 - 1f
|
||||
0,0,0,0,0,0,0,0, # 20 - 27
|
||||
0,3,3,3,3,3,0,0, # 28 - 2f
|
||||
0,0,0,0,0,0,0,0, # 30 - 37
|
||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
||||
0,0,0,0,0,0,0,0, # 40 - 47
|
||||
0,0,0,0,0,0,0,0, # 48 - 4f
|
||||
0,0,0,0,0,0,0,0, # 50 - 57
|
||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
||||
0,0,0,0,0,0,0,0, # 60 - 67
|
||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
||||
0,0,0,0,0,0,0,0, # 70 - 77
|
||||
0,0,0,0,0,0,0,0, # 78 - 7f
|
||||
0,0,0,0,0,0,0,0, # 80 - 87
|
||||
0,0,0,0,0,0,0,0, # 88 - 8f
|
||||
0,0,0,0,0,0,0,0, # 90 - 97
|
||||
0,0,0,0,0,0,0,0, # 98 - 9f
|
||||
0,0,0,0,0,0,0,0, # a0 - a7
|
||||
0,0,0,0,0,0,0,0, # a8 - af
|
||||
0,0,0,0,0,0,0,0, # b0 - b7
|
||||
0,0,0,0,0,0,0,0, # b8 - bf
|
||||
0,0,0,0,0,0,0,0, # c0 - c7
|
||||
0,0,0,0,0,0,0,0, # c8 - cf
|
||||
0,0,0,0,0,0,0,0, # d0 - d7
|
||||
0,0,0,0,0,0,0,0, # d8 - df
|
||||
0,0,0,0,0,0,0,0, # e0 - e7
|
||||
0,0,0,0,0,0,0,0, # e8 - ef
|
||||
0,0,0,0,0,0,0,0, # f0 - f7
|
||||
0,0,0,0,0,0,4,5) # f8 - ff
|
||||
UCS2LE_cls = (
|
||||
0,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,1,0,0,2,0,0, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
0,0,0,3,0,0,0,0, # 18 - 1f
|
||||
0,0,0,0,0,0,0,0, # 20 - 27
|
||||
0,3,3,3,3,3,0,0, # 28 - 2f
|
||||
0,0,0,0,0,0,0,0, # 30 - 37
|
||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
||||
0,0,0,0,0,0,0,0, # 40 - 47
|
||||
0,0,0,0,0,0,0,0, # 48 - 4f
|
||||
0,0,0,0,0,0,0,0, # 50 - 57
|
||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
||||
0,0,0,0,0,0,0,0, # 60 - 67
|
||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
||||
0,0,0,0,0,0,0,0, # 70 - 77
|
||||
0,0,0,0,0,0,0,0, # 78 - 7f
|
||||
0,0,0,0,0,0,0,0, # 80 - 87
|
||||
0,0,0,0,0,0,0,0, # 88 - 8f
|
||||
0,0,0,0,0,0,0,0, # 90 - 97
|
||||
0,0,0,0,0,0,0,0, # 98 - 9f
|
||||
0,0,0,0,0,0,0,0, # a0 - a7
|
||||
0,0,0,0,0,0,0,0, # a8 - af
|
||||
0,0,0,0,0,0,0,0, # b0 - b7
|
||||
0,0,0,0,0,0,0,0, # b8 - bf
|
||||
0,0,0,0,0,0,0,0, # c0 - c7
|
||||
0,0,0,0,0,0,0,0, # c8 - cf
|
||||
0,0,0,0,0,0,0,0, # d0 - d7
|
||||
0,0,0,0,0,0,0,0, # d8 - df
|
||||
0,0,0,0,0,0,0,0, # e0 - e7
|
||||
0,0,0,0,0,0,0,0, # e8 - ef
|
||||
0,0,0,0,0,0,0,0, # f0 - f7
|
||||
0,0,0,0,0,0,4,5 # f8 - ff
|
||||
)
|
||||
|
||||
UCS2LE_st = ( \
|
||||
6, 6, 7, 6, 4, 3,eError,eError,#00-07
|
||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
|
||||
eItsMe,eItsMe, 5, 5, 5,eError,eItsMe,eError,#10-17
|
||||
5, 5, 5,eError, 5,eError, 6, 6,#18-1f
|
||||
7, 6, 8, 8, 5, 5, 5,eError,#20-27
|
||||
5, 5, 5,eError,eError,eError, 5, 5,#28-2f
|
||||
5, 5, 5,eError, 5,eError,eStart,eStart)#30-37
|
||||
UCS2LE_st = (
|
||||
6, 6, 7, 6, 4, 3,eError,eError,#00-07
|
||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
|
||||
eItsMe,eItsMe, 5, 5, 5,eError,eItsMe,eError,#10-17
|
||||
5, 5, 5,eError, 5,eError, 6, 6,#18-1f
|
||||
7, 6, 8, 8, 5, 5, 5,eError,#20-27
|
||||
5, 5, 5,eError,eError,eError, 5, 5,#28-2f
|
||||
5, 5, 5,eError, 5,eError,eStart,eStart #30-37
|
||||
)
|
||||
|
||||
UCS2LECharLenTable = (2, 2, 2, 2, 2, 2)
|
||||
|
||||
|
@ -443,67 +499,69 @@ UCS2LESMModel = {'classTable': UCS2LE_cls,
|
|||
|
||||
# UTF-8
|
||||
|
||||
UTF8_cls = ( \
|
||||
UTF8_cls = (
|
||||
1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as a legal value
|
||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||
1,1,1,1,1,1,1,1, # 10 - 17
|
||||
1,1,1,0,1,1,1,1, # 18 - 1f
|
||||
1,1,1,1,1,1,1,1, # 20 - 27
|
||||
1,1,1,1,1,1,1,1, # 28 - 2f
|
||||
1,1,1,1,1,1,1,1, # 30 - 37
|
||||
1,1,1,1,1,1,1,1, # 38 - 3f
|
||||
1,1,1,1,1,1,1,1, # 40 - 47
|
||||
1,1,1,1,1,1,1,1, # 48 - 4f
|
||||
1,1,1,1,1,1,1,1, # 50 - 57
|
||||
1,1,1,1,1,1,1,1, # 58 - 5f
|
||||
1,1,1,1,1,1,1,1, # 60 - 67
|
||||
1,1,1,1,1,1,1,1, # 68 - 6f
|
||||
1,1,1,1,1,1,1,1, # 70 - 77
|
||||
1,1,1,1,1,1,1,1, # 78 - 7f
|
||||
2,2,2,2,3,3,3,3, # 80 - 87
|
||||
4,4,4,4,4,4,4,4, # 88 - 8f
|
||||
4,4,4,4,4,4,4,4, # 90 - 97
|
||||
4,4,4,4,4,4,4,4, # 98 - 9f
|
||||
5,5,5,5,5,5,5,5, # a0 - a7
|
||||
5,5,5,5,5,5,5,5, # a8 - af
|
||||
5,5,5,5,5,5,5,5, # b0 - b7
|
||||
5,5,5,5,5,5,5,5, # b8 - bf
|
||||
0,0,6,6,6,6,6,6, # c0 - c7
|
||||
6,6,6,6,6,6,6,6, # c8 - cf
|
||||
6,6,6,6,6,6,6,6, # d0 - d7
|
||||
6,6,6,6,6,6,6,6, # d8 - df
|
||||
7,8,8,8,8,8,8,8, # e0 - e7
|
||||
8,8,8,8,8,9,8,8, # e8 - ef
|
||||
10,11,11,11,11,11,11,11, # f0 - f7
|
||||
12,13,13,13,14,15,0,0) # f8 - ff
|
||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||
1,1,1,1,1,1,1,1, # 10 - 17
|
||||
1,1,1,0,1,1,1,1, # 18 - 1f
|
||||
1,1,1,1,1,1,1,1, # 20 - 27
|
||||
1,1,1,1,1,1,1,1, # 28 - 2f
|
||||
1,1,1,1,1,1,1,1, # 30 - 37
|
||||
1,1,1,1,1,1,1,1, # 38 - 3f
|
||||
1,1,1,1,1,1,1,1, # 40 - 47
|
||||
1,1,1,1,1,1,1,1, # 48 - 4f
|
||||
1,1,1,1,1,1,1,1, # 50 - 57
|
||||
1,1,1,1,1,1,1,1, # 58 - 5f
|
||||
1,1,1,1,1,1,1,1, # 60 - 67
|
||||
1,1,1,1,1,1,1,1, # 68 - 6f
|
||||
1,1,1,1,1,1,1,1, # 70 - 77
|
||||
1,1,1,1,1,1,1,1, # 78 - 7f
|
||||
2,2,2,2,3,3,3,3, # 80 - 87
|
||||
4,4,4,4,4,4,4,4, # 88 - 8f
|
||||
4,4,4,4,4,4,4,4, # 90 - 97
|
||||
4,4,4,4,4,4,4,4, # 98 - 9f
|
||||
5,5,5,5,5,5,5,5, # a0 - a7
|
||||
5,5,5,5,5,5,5,5, # a8 - af
|
||||
5,5,5,5,5,5,5,5, # b0 - b7
|
||||
5,5,5,5,5,5,5,5, # b8 - bf
|
||||
0,0,6,6,6,6,6,6, # c0 - c7
|
||||
6,6,6,6,6,6,6,6, # c8 - cf
|
||||
6,6,6,6,6,6,6,6, # d0 - d7
|
||||
6,6,6,6,6,6,6,6, # d8 - df
|
||||
7,8,8,8,8,8,8,8, # e0 - e7
|
||||
8,8,8,8,8,9,8,8, # e8 - ef
|
||||
10,11,11,11,11,11,11,11, # f0 - f7
|
||||
12,13,13,13,14,15,0,0 # f8 - ff
|
||||
)
|
||||
|
||||
UTF8_st = ( \
|
||||
eError,eStart,eError,eError,eError,eError, 12, 10,#00-07
|
||||
9, 11, 8, 7, 6, 5, 4, 3,#08-0f
|
||||
eError,eError,eError,eError,eError,eError,eError,eError,#10-17
|
||||
eError,eError,eError,eError,eError,eError,eError,eError,#18-1f
|
||||
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,#20-27
|
||||
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,#28-2f
|
||||
eError,eError, 5, 5, 5, 5,eError,eError,#30-37
|
||||
eError,eError,eError,eError,eError,eError,eError,eError,#38-3f
|
||||
eError,eError,eError, 5, 5, 5,eError,eError,#40-47
|
||||
eError,eError,eError,eError,eError,eError,eError,eError,#48-4f
|
||||
eError,eError, 7, 7, 7, 7,eError,eError,#50-57
|
||||
eError,eError,eError,eError,eError,eError,eError,eError,#58-5f
|
||||
eError,eError,eError,eError, 7, 7,eError,eError,#60-67
|
||||
eError,eError,eError,eError,eError,eError,eError,eError,#68-6f
|
||||
eError,eError, 9, 9, 9, 9,eError,eError,#70-77
|
||||
eError,eError,eError,eError,eError,eError,eError,eError,#78-7f
|
||||
eError,eError,eError,eError,eError, 9,eError,eError,#80-87
|
||||
eError,eError,eError,eError,eError,eError,eError,eError,#88-8f
|
||||
eError,eError, 12, 12, 12, 12,eError,eError,#90-97
|
||||
eError,eError,eError,eError,eError,eError,eError,eError,#98-9f
|
||||
eError,eError,eError,eError,eError, 12,eError,eError,#a0-a7
|
||||
eError,eError,eError,eError,eError,eError,eError,eError,#a8-af
|
||||
eError,eError, 12, 12, 12,eError,eError,eError,#b0-b7
|
||||
eError,eError,eError,eError,eError,eError,eError,eError,#b8-bf
|
||||
eError,eError,eStart,eStart,eStart,eStart,eError,eError,#c0-c7
|
||||
eError,eError,eError,eError,eError,eError,eError,eError)#c8-cf
|
||||
UTF8_st = (
|
||||
eError,eStart,eError,eError,eError,eError, 12, 10,#00-07
|
||||
9, 11, 8, 7, 6, 5, 4, 3,#08-0f
|
||||
eError,eError,eError,eError,eError,eError,eError,eError,#10-17
|
||||
eError,eError,eError,eError,eError,eError,eError,eError,#18-1f
|
||||
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,#20-27
|
||||
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,#28-2f
|
||||
eError,eError, 5, 5, 5, 5,eError,eError,#30-37
|
||||
eError,eError,eError,eError,eError,eError,eError,eError,#38-3f
|
||||
eError,eError,eError, 5, 5, 5,eError,eError,#40-47
|
||||
eError,eError,eError,eError,eError,eError,eError,eError,#48-4f
|
||||
eError,eError, 7, 7, 7, 7,eError,eError,#50-57
|
||||
eError,eError,eError,eError,eError,eError,eError,eError,#58-5f
|
||||
eError,eError,eError,eError, 7, 7,eError,eError,#60-67
|
||||
eError,eError,eError,eError,eError,eError,eError,eError,#68-6f
|
||||
eError,eError, 9, 9, 9, 9,eError,eError,#70-77
|
||||
eError,eError,eError,eError,eError,eError,eError,eError,#78-7f
|
||||
eError,eError,eError,eError,eError, 9,eError,eError,#80-87
|
||||
eError,eError,eError,eError,eError,eError,eError,eError,#88-8f
|
||||
eError,eError, 12, 12, 12, 12,eError,eError,#90-97
|
||||
eError,eError,eError,eError,eError,eError,eError,eError,#98-9f
|
||||
eError,eError,eError,eError,eError, 12,eError,eError,#a0-a7
|
||||
eError,eError,eError,eError,eError,eError,eError,eError,#a8-af
|
||||
eError,eError, 12, 12, 12,eError,eError,eError,#b0-b7
|
||||
eError,eError,eError,eError,eError,eError,eError,eError,#b8-bf
|
||||
eError,eError,eStart,eStart,eStart,eStart,eError,eError,#c0-c7
|
||||
eError,eError,eError,eError,eError,eError,eError,eError #c8-cf
|
||||
)
|
||||
|
||||
UTF8CharLenTable = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6)
|
||||
|
||||
|
|
50
thirdparty/chardet/sbcharsetprober.py
vendored
50
thirdparty/chardet/sbcharsetprober.py
vendored
|
@ -14,20 +14,22 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants, sys
|
||||
from charsetprober import CharSetProber
|
||||
import sys
|
||||
from . import constants
|
||||
from .charsetprober import CharSetProber
|
||||
from .compat import wrap_ord
|
||||
|
||||
SAMPLE_SIZE = 64
|
||||
SB_ENOUGH_REL_THRESHOLD = 1024
|
||||
|
@ -38,21 +40,26 @@ NUMBER_OF_SEQ_CAT = 4
|
|||
POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1
|
||||
#NEGATIVE_CAT = 0
|
||||
|
||||
|
||||
class SingleByteCharSetProber(CharSetProber):
|
||||
def __init__(self, model, reversed=constants.False, nameProber=None):
|
||||
def __init__(self, model, reversed=False, nameProber=None):
|
||||
CharSetProber.__init__(self)
|
||||
self._mModel = model
|
||||
self._mReversed = reversed # TRUE if we need to reverse every pair in the model lookup
|
||||
self._mNameProber = nameProber # Optional auxiliary prober for name decision
|
||||
# TRUE if we need to reverse every pair in the model lookup
|
||||
self._mReversed = reversed
|
||||
# Optional auxiliary prober for name decision
|
||||
self._mNameProber = nameProber
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
CharSetProber.reset(self)
|
||||
self._mLastOrder = 255 # char order of last character
|
||||
# char order of last character
|
||||
self._mLastOrder = 255
|
||||
self._mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
|
||||
self._mTotalSeqs = 0
|
||||
self._mTotalChar = 0
|
||||
self._mFreqChar = 0 # characters that fall in our sampling range
|
||||
# characters that fall in our sampling range
|
||||
self._mFreqChar = 0
|
||||
|
||||
def get_charset_name(self):
|
||||
if self._mNameProber:
|
||||
|
@ -67,7 +74,7 @@ class SingleByteCharSetProber(CharSetProber):
|
|||
if not aLen:
|
||||
return self.get_state()
|
||||
for c in aBuf:
|
||||
order = self._mModel['charToOrderMap'][ord(c)]
|
||||
order = self._mModel['charToOrderMap'][wrap_ord(c)]
|
||||
if order < SYMBOL_CAT_ORDER:
|
||||
self._mTotalChar += 1
|
||||
if order < SAMPLE_SIZE:
|
||||
|
@ -75,9 +82,12 @@ class SingleByteCharSetProber(CharSetProber):
|
|||
if self._mLastOrder < SAMPLE_SIZE:
|
||||
self._mTotalSeqs += 1
|
||||
if not self._mReversed:
|
||||
self._mSeqCounters[self._mModel['precedenceMatrix'][(self._mLastOrder * SAMPLE_SIZE) + order]] += 1
|
||||
else: # reverse the order of the letters in the lookup
|
||||
self._mSeqCounters[self._mModel['precedenceMatrix'][(order * SAMPLE_SIZE) + self._mLastOrder]] += 1
|
||||
i = (self._mLastOrder * SAMPLE_SIZE) + order
|
||||
model = self._mModel['precedenceMatrix'][i]
|
||||
else: # reverse the order of the letters in the lookup
|
||||
i = (order * SAMPLE_SIZE) + self._mLastOrder
|
||||
model = self._mModel['precedenceMatrix'][i]
|
||||
self._mSeqCounters[model] += 1
|
||||
self._mLastOrder = order
|
||||
|
||||
if self.get_state() == constants.eDetecting:
|
||||
|
@ -85,11 +95,16 @@ class SingleByteCharSetProber(CharSetProber):
|
|||
cf = self.get_confidence()
|
||||
if cf > POSITIVE_SHORTCUT_THRESHOLD:
|
||||
if constants._debug:
|
||||
sys.stderr.write('%s confidence = %s, we have a winner\n' % (self._mModel['charsetName'], cf))
|
||||
sys.stderr.write('%s confidence = %s, we have a'
|
||||
'winner\n' %
|
||||
(self._mModel['charsetName'], cf))
|
||||
self._mState = constants.eFoundIt
|
||||
elif cf < NEGATIVE_SHORTCUT_THRESHOLD:
|
||||
if constants._debug:
|
||||
sys.stderr.write('%s confidence = %s, below negative shortcut threshhold %s\n' % (self._mModel['charsetName'], cf, NEGATIVE_SHORTCUT_THRESHOLD))
|
||||
sys.stderr.write('%s confidence = %s, below negative'
|
||||
'shortcut threshhold %s\n' %
|
||||
(self._mModel['charsetName'], cf,
|
||||
NEGATIVE_SHORTCUT_THRESHOLD))
|
||||
self._mState = constants.eNotMe
|
||||
|
||||
return self.get_state()
|
||||
|
@ -97,9 +112,8 @@ class SingleByteCharSetProber(CharSetProber):
|
|||
def get_confidence(self):
|
||||
r = 0.01
|
||||
if self._mTotalSeqs > 0:
|
||||
# print self._mSeqCounters[POSITIVE_CAT], self._mTotalSeqs, self._mModel['mTypicalPositiveRatio']
|
||||
r = (1.0 * self._mSeqCounters[POSITIVE_CAT]) / self._mTotalSeqs / self._mModel['mTypicalPositiveRatio']
|
||||
# print r, self._mFreqChar, self._mTotalChar
|
||||
r = ((1.0 * self._mSeqCounters[POSITIVE_CAT]) / self._mTotalSeqs
|
||||
/ self._mModel['mTypicalPositiveRatio'])
|
||||
r = r * self._mFreqChar / self._mTotalChar
|
||||
if r >= 1.0:
|
||||
r = 0.99
|
||||
|
|
39
thirdparty/chardet/sbcsgroupprober.py
vendored
39
thirdparty/chardet/sbcsgroupprober.py
vendored
|
@ -14,33 +14,35 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants, sys
|
||||
from charsetgroupprober import CharSetGroupProber
|
||||
from sbcharsetprober import SingleByteCharSetProber
|
||||
from langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model
|
||||
from langgreekmodel import Latin7GreekModel, Win1253GreekModel
|
||||
from langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
|
||||
from langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
|
||||
from langthaimodel import TIS620ThaiModel
|
||||
from langhebrewmodel import Win1255HebrewModel
|
||||
from hebrewprober import HebrewProber
|
||||
from .charsetgroupprober import CharSetGroupProber
|
||||
from .sbcharsetprober import SingleByteCharSetProber
|
||||
from .langcyrillicmodel import (Win1251CyrillicModel, Koi8rModel,
|
||||
Latin5CyrillicModel, MacCyrillicModel,
|
||||
Ibm866Model, Ibm855Model)
|
||||
from .langgreekmodel import Latin7GreekModel, Win1253GreekModel
|
||||
from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
|
||||
from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
|
||||
from .langthaimodel import TIS620ThaiModel
|
||||
from .langhebrewmodel import Win1255HebrewModel
|
||||
from .hebrewprober import HebrewProber
|
||||
|
||||
|
||||
class SBCSGroupProber(CharSetGroupProber):
|
||||
def __init__(self):
|
||||
CharSetGroupProber.__init__(self)
|
||||
self._mProbers = [ \
|
||||
self._mProbers = [
|
||||
SingleByteCharSetProber(Win1251CyrillicModel),
|
||||
SingleByteCharSetProber(Koi8rModel),
|
||||
SingleByteCharSetProber(Latin5CyrillicModel),
|
||||
|
@ -54,11 +56,14 @@ class SBCSGroupProber(CharSetGroupProber):
|
|||
SingleByteCharSetProber(Latin2HungarianModel),
|
||||
SingleByteCharSetProber(Win1250HungarianModel),
|
||||
SingleByteCharSetProber(TIS620ThaiModel),
|
||||
]
|
||||
]
|
||||
hebrewProber = HebrewProber()
|
||||
logicalHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, constants.False, hebrewProber)
|
||||
visualHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, constants.True, hebrewProber)
|
||||
logicalHebrewProber = SingleByteCharSetProber(Win1255HebrewModel,
|
||||
False, hebrewProber)
|
||||
visualHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, True,
|
||||
hebrewProber)
|
||||
hebrewProber.set_model_probers(logicalHebrewProber, visualHebrewProber)
|
||||
self._mProbers.extend([hebrewProber, logicalHebrewProber, visualHebrewProber])
|
||||
self._mProbers.extend([hebrewProber, logicalHebrewProber,
|
||||
visualHebrewProber])
|
||||
|
||||
self.reset()
|
||||
|
|
44
thirdparty/chardet/sjisprober.py
vendored
44
thirdparty/chardet/sjisprober.py
vendored
|
@ -13,25 +13,26 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from mbcharsetprober import MultiByteCharSetProber
|
||||
from codingstatemachine import CodingStateMachine
|
||||
from chardistribution import SJISDistributionAnalysis
|
||||
from jpcntx import SJISContextAnalysis
|
||||
from mbcssm import SJISSMModel
|
||||
import constants, sys
|
||||
from constants import eStart, eError, eItsMe
|
||||
import sys
|
||||
from .mbcharsetprober import MultiByteCharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .chardistribution import SJISDistributionAnalysis
|
||||
from .jpcntx import SJISContextAnalysis
|
||||
from .mbcssm import SJISSMModel
|
||||
from . import constants
|
||||
|
||||
|
||||
class SJISProber(MultiByteCharSetProber):
|
||||
def __init__(self):
|
||||
|
@ -46,35 +47,40 @@ class SJISProber(MultiByteCharSetProber):
|
|||
self._mContextAnalyzer.reset()
|
||||
|
||||
def get_charset_name(self):
|
||||
return "SHIFT_JIS"
|
||||
return self._mContextAnalyzer.get_charset_name()
|
||||
|
||||
def feed(self, aBuf):
|
||||
aLen = len(aBuf)
|
||||
for i in xrange(0, aLen):
|
||||
codingState = self._mCodingSM.next_state(aBuf[i])
|
||||
if codingState == eError:
|
||||
if codingState == constants.eError:
|
||||
if constants._debug:
|
||||
sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n')
|
||||
sys.stderr.write(self.get_charset_name()
|
||||
+ ' prober hit error at byte ' + str(i)
|
||||
+ '\n')
|
||||
self._mState = constants.eNotMe
|
||||
break
|
||||
elif codingState == eItsMe:
|
||||
elif codingState == constants.eItsMe:
|
||||
self._mState = constants.eFoundIt
|
||||
break
|
||||
elif codingState == eStart:
|
||||
elif codingState == constants.eStart:
|
||||
charLen = self._mCodingSM.get_current_charlen()
|
||||
if i == 0:
|
||||
self._mLastChar[1] = aBuf[0]
|
||||
self._mContextAnalyzer.feed(self._mLastChar[2 - charLen :], charLen)
|
||||
self._mContextAnalyzer.feed(self._mLastChar[2 - charLen:],
|
||||
charLen)
|
||||
self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
|
||||
else:
|
||||
self._mContextAnalyzer.feed(aBuf[i + 1 - charLen : i + 3 - charLen], charLen)
|
||||
self._mDistributionAnalyzer.feed(aBuf[i - 1 : i + 1], charLen)
|
||||
self._mContextAnalyzer.feed(aBuf[i + 1 - charLen:i + 3
|
||||
- charLen], charLen)
|
||||
self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
|
||||
charLen)
|
||||
|
||||
self._mLastChar[0] = aBuf[aLen - 1]
|
||||
|
||||
if self.get_state() == constants.eDetecting:
|
||||
if self._mContextAnalyzer.got_enough_data() and \
|
||||
(self.get_confidence() > constants.SHORTCUT_THRESHOLD):
|
||||
if (self._mContextAnalyzer.got_enough_data() and
|
||||
(self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
|
||||
self._mState = constants.eFoundIt
|
||||
|
||||
return self.get_state()
|
||||
|
|
20
thirdparty/chardet/test.py
vendored
20
thirdparty/chardet/test.py
vendored
|
@ -1,20 +0,0 @@
|
|||
import sys, glob
|
||||
sys.path.insert(0, '..')
|
||||
from chardet.universaldetector import UniversalDetector
|
||||
|
||||
count = 0
|
||||
u = UniversalDetector()
|
||||
for f in glob.glob(sys.argv[1]):
|
||||
print f.ljust(60),
|
||||
u.reset()
|
||||
for line in file(f, 'rb'):
|
||||
u.feed(line)
|
||||
if u.done: break
|
||||
u.close()
|
||||
result = u.result
|
||||
if result['encoding']:
|
||||
print result['encoding'], 'with confidence', result['confidence']
|
||||
else:
|
||||
print '******** no result'
|
||||
count += 1
|
||||
print count, 'tests'
|
92
thirdparty/chardet/universaldetector.py
vendored
92
thirdparty/chardet/universaldetector.py
vendored
|
@ -14,23 +14,25 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants, sys
|
||||
from latin1prober import Latin1Prober # windows-1252
|
||||
from mbcsgroupprober import MBCSGroupProber # multi-byte character sets
|
||||
from sbcsgroupprober import SBCSGroupProber # single-byte character sets
|
||||
from escprober import EscCharSetProber # ISO-2122, etc.
|
||||
from . import constants
|
||||
import sys
|
||||
import codecs
|
||||
from .latin1prober import Latin1Prober # windows-1252
|
||||
from .mbcsgroupprober import MBCSGroupProber # multi-byte character sets
|
||||
from .sbcsgroupprober import SBCSGroupProber # single-byte character sets
|
||||
from .escprober import EscCharSetProber # ISO-2122, etc.
|
||||
import re
|
||||
|
||||
MINIMUM_THRESHOLD = 0.20
|
||||
|
@ -38,68 +40,78 @@ ePureAscii = 0
|
|||
eEscAscii = 1
|
||||
eHighbyte = 2
|
||||
|
||||
|
||||
class UniversalDetector:
|
||||
def __init__(self):
|
||||
self._highBitDetector = re.compile(r'[\x80-\xFF]')
|
||||
self._escDetector = re.compile(r'(\033|~{)')
|
||||
self._highBitDetector = re.compile(b'[\x80-\xFF]')
|
||||
self._escDetector = re.compile(b'(\033|~{)')
|
||||
self._mEscCharSetProber = None
|
||||
self._mCharSetProbers = []
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
self.result = {'encoding': None, 'confidence': 0.0}
|
||||
self.done = constants.False
|
||||
self._mStart = constants.True
|
||||
self._mGotData = constants.False
|
||||
self.done = False
|
||||
self._mStart = True
|
||||
self._mGotData = False
|
||||
self._mInputState = ePureAscii
|
||||
self._mLastChar = ''
|
||||
self._mLastChar = b''
|
||||
if self._mEscCharSetProber:
|
||||
self._mEscCharSetProber.reset()
|
||||
for prober in self._mCharSetProbers:
|
||||
prober.reset()
|
||||
|
||||
def feed(self, aBuf):
|
||||
if self.done: return
|
||||
if self.done:
|
||||
return
|
||||
|
||||
aLen = len(aBuf)
|
||||
if not aLen: return
|
||||
if not aLen:
|
||||
return
|
||||
|
||||
if not self._mGotData:
|
||||
# If the data starts with BOM, we know it is UTF
|
||||
if aBuf[:3] == '\xEF\xBB\xBF':
|
||||
if aBuf[:3] == codecs.BOM_UTF8:
|
||||
# EF BB BF UTF-8 with BOM
|
||||
self.result = {'encoding': "UTF-8", 'confidence': 1.0}
|
||||
elif aBuf[:4] == '\xFF\xFE\x00\x00':
|
||||
self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0}
|
||||
elif aBuf[:4] == codecs.BOM_UTF32_LE:
|
||||
# FF FE 00 00 UTF-32, little-endian BOM
|
||||
self.result = {'encoding': "UTF-32LE", 'confidence': 1.0}
|
||||
elif aBuf[:4] == '\x00\x00\xFE\xFF':
|
||||
elif aBuf[:4] == codecs.BOM_UTF32_BE:
|
||||
# 00 00 FE FF UTF-32, big-endian BOM
|
||||
self.result = {'encoding': "UTF-32BE", 'confidence': 1.0}
|
||||
elif aBuf[:4] == '\xFE\xFF\x00\x00':
|
||||
elif aBuf[:4] == b'\xFE\xFF\x00\x00':
|
||||
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
|
||||
self.result = {'encoding': "X-ISO-10646-UCS-4-3412", 'confidence': 1.0}
|
||||
elif aBuf[:4] == '\x00\x00\xFF\xFE':
|
||||
self.result = {
|
||||
'encoding': "X-ISO-10646-UCS-4-3412",
|
||||
'confidence': 1.0
|
||||
}
|
||||
elif aBuf[:4] == b'\x00\x00\xFF\xFE':
|
||||
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
||||
self.result = {'encoding': "X-ISO-10646-UCS-4-2143", 'confidence': 1.0}
|
||||
elif aBuf[:2] == '\xFF\xFE':
|
||||
self.result = {
|
||||
'encoding': "X-ISO-10646-UCS-4-2143",
|
||||
'confidence': 1.0
|
||||
}
|
||||
elif aBuf[:2] == codecs.BOM_LE:
|
||||
# FF FE UTF-16, little endian BOM
|
||||
self.result = {'encoding': "UTF-16LE", 'confidence': 1.0}
|
||||
elif aBuf[:2] == '\xFE\xFF':
|
||||
elif aBuf[:2] == codecs.BOM_BE:
|
||||
# FE FF UTF-16, big endian BOM
|
||||
self.result = {'encoding': "UTF-16BE", 'confidence': 1.0}
|
||||
|
||||
self._mGotData = constants.True
|
||||
self._mGotData = True
|
||||
if self.result['encoding'] and (self.result['confidence'] > 0.0):
|
||||
self.done = constants.True
|
||||
self.done = True
|
||||
return
|
||||
|
||||
if self._mInputState == ePureAscii:
|
||||
if self._highBitDetector.search(aBuf):
|
||||
self._mInputState = eHighbyte
|
||||
elif (self._mInputState == ePureAscii) and self._escDetector.search(self._mLastChar + aBuf):
|
||||
elif ((self._mInputState == ePureAscii) and
|
||||
self._escDetector.search(self._mLastChar + aBuf)):
|
||||
self._mInputState = eEscAscii
|
||||
|
||||
self._mLastChar = aBuf[-1]
|
||||
self._mLastChar = aBuf[-1:]
|
||||
|
||||
if self._mInputState == eEscAscii:
|
||||
if not self._mEscCharSetProber:
|
||||
|
@ -107,24 +119,26 @@ class UniversalDetector:
|
|||
if self._mEscCharSetProber.feed(aBuf) == constants.eFoundIt:
|
||||
self.result = {'encoding': self._mEscCharSetProber.get_charset_name(),
|
||||
'confidence': self._mEscCharSetProber.get_confidence()}
|
||||
self.done = constants.True
|
||||
self.done = True
|
||||
elif self._mInputState == eHighbyte:
|
||||
if not self._mCharSetProbers:
|
||||
self._mCharSetProbers = [MBCSGroupProber(), SBCSGroupProber(), Latin1Prober()]
|
||||
self._mCharSetProbers = [MBCSGroupProber(), SBCSGroupProber(),
|
||||
Latin1Prober()]
|
||||
for prober in self._mCharSetProbers:
|
||||
if prober.feed(aBuf) == constants.eFoundIt:
|
||||
self.result = {'encoding': prober.get_charset_name(),
|
||||
'confidence': prober.get_confidence()}
|
||||
self.done = constants.True
|
||||
self.done = True
|
||||
break
|
||||
|
||||
def close(self):
|
||||
if self.done: return
|
||||
if self.done:
|
||||
return
|
||||
if not self._mGotData:
|
||||
if constants._debug:
|
||||
sys.stderr.write('no data received!\n')
|
||||
return
|
||||
self.done = constants.True
|
||||
self.done = True
|
||||
|
||||
if self._mInputState == ePureAscii:
|
||||
self.result = {'encoding': 'ascii', 'confidence': 1.0}
|
||||
|
@ -135,7 +149,8 @@ class UniversalDetector:
|
|||
maxProberConfidence = 0.0
|
||||
maxProber = None
|
||||
for prober in self._mCharSetProbers:
|
||||
if not prober: continue
|
||||
if not prober:
|
||||
continue
|
||||
proberConfidence = prober.get_confidence()
|
||||
if proberConfidence > maxProberConfidence:
|
||||
maxProberConfidence = proberConfidence
|
||||
|
@ -148,7 +163,8 @@ class UniversalDetector:
|
|||
if constants._debug:
|
||||
sys.stderr.write('no probers hit minimum threshhold\n')
|
||||
for prober in self._mCharSetProbers[0].mProbers:
|
||||
if not prober: continue
|
||||
sys.stderr.write('%s confidence = %s\n' % \
|
||||
(prober.get_charset_name(), \
|
||||
if not prober:
|
||||
continue
|
||||
sys.stderr.write('%s confidence = %s\n' %
|
||||
(prober.get_charset_name(),
|
||||
prober.get_confidence()))
|
||||
|
|
20
thirdparty/chardet/utf8prober.py
vendored
20
thirdparty/chardet/utf8prober.py
vendored
|
@ -13,26 +13,26 @@
|
|||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants, sys
|
||||
from constants import eStart, eError, eItsMe
|
||||
from charsetprober import CharSetProber
|
||||
from codingstatemachine import CodingStateMachine
|
||||
from mbcssm import UTF8SMModel
|
||||
from . import constants
|
||||
from .charsetprober import CharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .mbcssm import UTF8SMModel
|
||||
|
||||
ONE_CHAR_PROB = 0.5
|
||||
|
||||
|
||||
class UTF8Prober(CharSetProber):
|
||||
def __init__(self):
|
||||
CharSetProber.__init__(self)
|
||||
|
@ -50,13 +50,13 @@ class UTF8Prober(CharSetProber):
|
|||
def feed(self, aBuf):
|
||||
for c in aBuf:
|
||||
codingState = self._mCodingSM.next_state(c)
|
||||
if codingState == eError:
|
||||
if codingState == constants.eError:
|
||||
self._mState = constants.eNotMe
|
||||
break
|
||||
elif codingState == eItsMe:
|
||||
elif codingState == constants.eItsMe:
|
||||
self._mState = constants.eFoundIt
|
||||
break
|
||||
elif codingState == eStart:
|
||||
elif codingState == constants.eStart:
|
||||
if self._mCodingSM.get_current_charlen() >= 2:
|
||||
self._mNumOfMBChar += 1
|
||||
|
||||
|
|
4
thirdparty/multipart/multipartpost.py
vendored
4
thirdparty/multipart/multipartpost.py
vendored
|
@ -73,7 +73,7 @@ class MultipartPostHandler(urllib2.BaseHandler):
|
|||
request.add_data(data)
|
||||
return request
|
||||
|
||||
def multipart_encode(vars, files, boundary = None, buf = None):
|
||||
def multipart_encode(vars, files, boundary=None, buf=None):
|
||||
if boundary is None:
|
||||
boundary = mimetools.choose_boundary()
|
||||
|
||||
|
@ -100,7 +100,7 @@ class MultipartPostHandler(urllib2.BaseHandler):
|
|||
# buf += 'Content-Length: %s\r\n' % file_size
|
||||
fd.seek(0)
|
||||
|
||||
buf = str(buf)
|
||||
buf = str(buf) if not isinstance(buf, unicode) else buf.encode("utf8")
|
||||
buf += '\r\n%s\r\n' % fd.read()
|
||||
|
||||
buf += '--%s--\r\n\r\n' % boundary
|
||||
|
|
|
@ -2596,3 +2596,7 @@ tmp_lahir
|
|||
universitas
|
||||
urut
|
||||
waktu
|
||||
|
||||
# WebGoat
|
||||
cookie
|
||||
login_count
|
||||
|
|
|
@ -3366,3 +3366,6 @@ tuser
|
|||
tusers
|
||||
userstbl
|
||||
usertbl
|
||||
|
||||
# WebGoat
|
||||
user_data
|
||||
|
|
|
@ -104,6 +104,8 @@
|
|||
<!-- HSQLDB -->
|
||||
<dbms value="HSQLDB">
|
||||
<error regexp="org\.hsqldb\.jdbc"/>
|
||||
<error regexp="Unexpected end of command in statement \["/>
|
||||
<error regexp="Unexpected token.*in statement \["/>
|
||||
</dbms>
|
||||
|
||||
</root>
|
||||
|
|
|
@ -651,8 +651,8 @@
|
|||
<cast query="CAST(%s AS LONGVARCHAR)"/>
|
||||
<length query="CHAR_LENGTH(%s)"/>
|
||||
<isnull query="IFNULL(%s,' ')"/>
|
||||
<delimiter query=","/>
|
||||
<limit query="LIMIT %d %d"/>
|
||||
<delimiter query="||"/>
|
||||
<limit query="LIMIT %d %d" query2="LIMIT %d OFFSET %d"/>
|
||||
<limitregexp query="\s+LIMIT\s+([\d]+)\s*\,\s*([\d]+)" query2="\s+LIMIT\s+([\d]+)"/>
|
||||
<limitgroupstart query="1"/>
|
||||
<limitgroupstop query="2"/>
|
||||
|
@ -675,30 +675,30 @@
|
|||
<check_udf/>
|
||||
<users>
|
||||
<!-- LIMIT is needed at start for v1.7 this gets mangled unless no-cast is used -->
|
||||
<blind query="SELECT LIMIT %d 1 DISTINCT(user) FROM INFORMATION_SCHEMA.SYSTEM_USERS" count="SELECT COUNT(DISTINCT(user)) FROM INFORMATION_SCHEMA.SYSTEM_USERS"/>
|
||||
<inband query="SELECT user FROM INFORMATION_SCHEMA.SYSTEM_USERS"/>
|
||||
<blind query="SELECT LIMIT %d 1 DISTINCT(user) FROM INFORMATION_SCHEMA.SYSTEM_USERS ORDER BY user" count="SELECT COUNT(DISTINCT(user)) FROM INFORMATION_SCHEMA.SYSTEM_USERS"/>
|
||||
<inband query="SELECT user FROM INFORMATION_SCHEMA.SYSTEM_USERS ORDER BY user"/>
|
||||
</users>
|
||||
<passwords>
|
||||
<!-- Passwords only shown in later versions >=2.0 -->
|
||||
<blind query="SELECT LIMIT %d 1 DISTINCT(password_digest) FROM INFORMATION_SCHEMA.SYSTEM_USERS WHERE user_name='%s'" count="SELECT COUNT(DISTINCT(password_digest)) FROM INFORMATION_SCHEMA.SYSTEM_USERS WHERE user_name='%s'"/>
|
||||
<inband query="SELECT user_name,password_digest FROM INFORMATION_SCHEMA.SYSTEM_USERS" condition="user_name"/>
|
||||
<blind query="SELECT LIMIT %d 1 DISTINCT(password_digest) FROM INFORMATION_SCHEMA.SYSTEM_USERS WHERE user_name='%s' ORDER BY password_digest" count="SELECT COUNT(DISTINCT(password_digest)) FROM INFORMATION_SCHEMA.SYSTEM_USERS WHERE user_name='%s'"/>
|
||||
<inband query="SELECT user_name,password_digest FROM INFORMATION_SCHEMA.SYSTEM_USERS ORDER BY user_name" condition="user_name"/>
|
||||
</passwords>
|
||||
<privileges/>
|
||||
<roles/>
|
||||
<dbs>
|
||||
<blind query="SELECT LIMIT %d 1 DISTINCT(table_schem) FROM INFORMATION_SCHEMA.SYSTEM_SCHEMAS" count="SELECT COUNT(table_schem) FROM INFORMATION_SCHEMA.SYSTEM_SCHEMAS"/>
|
||||
<inband query="SELECT table_schem FROM INFORMATION_SCHEMA.SYSTEM_SCHEMAS" />
|
||||
<blind query="SELECT LIMIT %d 1 DISTINCT(table_schem) FROM INFORMATION_SCHEMA.SYSTEM_SCHEMAS ORDER BY table_schem" count="SELECT COUNT(table_schem) FROM INFORMATION_SCHEMA.SYSTEM_SCHEMAS"/>
|
||||
<inband query="SELECT table_schem FROM INFORMATION_SCHEMA.SYSTEM_SCHEMAS ORDER BY table_schem" />
|
||||
</dbs>
|
||||
<tables>
|
||||
<blind query="SELECT LIMIT %d 1 table_name FROM INFORMATION_SCHEMA.SYSTEM_TABLES WHERE table_schem='%s' " count="SELECT COUNT(table_name) FROM INFORMATION_SCHEMA.SYSTEM_TABLES WHERE table_schem='%s'"/>
|
||||
<inband query="SELECT table_schem,table_name FROM INFORMATION_SCHEMA.SYSTEM_TABLES" condition="table_schem"/>
|
||||
<blind query="SELECT LIMIT %d 1 table_name FROM INFORMATION_SCHEMA.SYSTEM_TABLES WHERE table_schem='%s' ORDER BY table_name" count="SELECT COUNT(table_name) FROM INFORMATION_SCHEMA.SYSTEM_TABLES WHERE table_schem='%s'"/>
|
||||
<inband query="SELECT table_schem,table_name FROM INFORMATION_SCHEMA.SYSTEM_TABLES ORDER BY table_schem" condition="table_schem"/>
|
||||
</tables>
|
||||
<columns>
|
||||
<blind query="SELECT column_name FROM INFORMATION_SCHEMA.SYSTEM_COLUMNS WHERE table_name='%s' AND table_schem='%s'" query2="SELECT column_type FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name='%s' AND column_name='%s' AND table_schema='%s'" count="SELECT COUNT(column_name) FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name='%s' AND table_schema='%s'" condition="column_name"/>
|
||||
<inband query="SELECT column_name,type_name FROM INFORMATION_SCHEMA.SYSTEM_COLUMNS WHERE table_name='%s' AND table_schem='%s'" condition="column_name"/>
|
||||
<blind query="SELECT column_name FROM INFORMATION_SCHEMA.SYSTEM_COLUMNS WHERE table_name='%s' AND table_schem='%s' ORDER BY column_name" query2="SELECT column_type FROM INFORMATION_SCHEMA.SYSTEM_COLUMNS WHERE table_name='%s' AND column_name='%s' AND table_schem='%s'" count="SELECT COUNT(column_name) FROM INFORMATION_SCHEMA.SYSTEM_COLUMNS WHERE table_name='%s' AND table_schem='%s'" condition="column_name"/>
|
||||
<inband query="SELECT column_name,type_name FROM INFORMATION_SCHEMA.SYSTEM_COLUMNS WHERE table_name='%s' AND table_schem='%s' ORDER BY column_name" condition="column_name"/>
|
||||
</columns>
|
||||
<dump_table>
|
||||
<blind query="SELECT LIMIT %d 1 %s FROM %s.%s ORDER BY %s " count="SELECT COUNT(*) FROM %s.%s"/>
|
||||
<blind query="SELECT %s FROM %s.%s ORDER BY %s LIMIT 1 OFFSET %d" count="SELECT COUNT(*) FROM %s.%s"/>
|
||||
<inband query="SELECT %s FROM %s.%s ORDER BY %s"/>
|
||||
</dump_table>
|
||||
<search_db>
|
||||
|
|
Loading…
Reference in New Issue
Block a user