mirror of
https://github.com/sqlmapproject/sqlmap.git
synced 2024-11-25 02:53:46 +03:00
Adding new version of chardet
This commit is contained in:
parent
d424d4cdc7
commit
439d003753
10
thirdparty/chardet/__init__.py
vendored
10
thirdparty/chardet/__init__.py
vendored
|
@ -15,10 +15,16 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
__version__ = "2.0.1"
|
__version__ = "2.3.0"
|
||||||
|
from sys import version_info
|
||||||
|
|
||||||
|
|
||||||
def detect(aBuf):
|
def detect(aBuf):
|
||||||
import universaldetector
|
if ((version_info < (3, 0) and isinstance(aBuf, unicode)) or
|
||||||
|
(version_info >= (3, 0) and not isinstance(aBuf, bytes))):
|
||||||
|
raise ValueError('Expected a bytes object, not a unicode object')
|
||||||
|
|
||||||
|
from . import universaldetector
|
||||||
u = universaldetector.UniversalDetector()
|
u = universaldetector.UniversalDetector()
|
||||||
u.reset()
|
u.reset()
|
||||||
u.feed(aBuf)
|
u.feed(aBuf)
|
||||||
|
|
4
thirdparty/chardet/big5freq.py
vendored
4
thirdparty/chardet/big5freq.py
vendored
|
@ -45,7 +45,7 @@ BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75
|
||||||
#Char to FreqOrder table
|
#Char to FreqOrder table
|
||||||
BIG5_TABLE_SIZE = 5376
|
BIG5_TABLE_SIZE = 5376
|
||||||
|
|
||||||
Big5CharToFreqOrder = ( \
|
Big5CharToFreqOrder = (
|
||||||
1,1801,1506, 255,1431, 198, 9, 82, 6,5008, 177, 202,3681,1256,2821, 110, # 16
|
1,1801,1506, 255,1431, 198, 9, 82, 6,5008, 177, 202,3681,1256,2821, 110, # 16
|
||||||
3814, 33,3274, 261, 76, 44,2114, 16,2946,2187,1176, 659,3971, 26,3451,2653, # 32
|
3814, 33,3274, 261, 76, 44,2114, 16,2946,2187,1176, 659,3971, 26,3451,2653, # 32
|
||||||
1198,3972,3350,4202, 410,2215, 302, 590, 361,1964, 8, 204, 58,4510,5009,1932, # 48
|
1198,3972,3350,4202, 410,2215, 302, 590, 361,1964, 8, 204, 58,4510,5009,1932, # 48
|
||||||
|
@ -921,3 +921,5 @@ Big5CharToFreqOrder = ( \
|
||||||
13936,13937,13938,13939,13940,13941,13942,13943,13944,13945,13946,13947,13948,13949,13950,13951, #13952
|
13936,13937,13938,13939,13940,13941,13942,13943,13944,13945,13946,13947,13948,13949,13950,13951, #13952
|
||||||
13952,13953,13954,13955,13956,13957,13958,13959,13960,13961,13962,13963,13964,13965,13966,13967, #13968
|
13952,13953,13954,13955,13956,13957,13958,13959,13960,13961,13962,13963,13964,13965,13966,13967, #13968
|
||||||
13968,13969,13970,13971,13972) #13973
|
13968,13969,13970,13971,13972) #13973
|
||||||
|
|
||||||
|
# flake8: noqa
|
||||||
|
|
9
thirdparty/chardet/big5prober.py
vendored
9
thirdparty/chardet/big5prober.py
vendored
|
@ -25,10 +25,11 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
from mbcharsetprober import MultiByteCharSetProber
|
from .mbcharsetprober import MultiByteCharSetProber
|
||||||
from codingstatemachine import CodingStateMachine
|
from .codingstatemachine import CodingStateMachine
|
||||||
from chardistribution import Big5DistributionAnalysis
|
from .chardistribution import Big5DistributionAnalysis
|
||||||
from mbcssm import Big5SMModel
|
from .mbcssm import Big5SMModel
|
||||||
|
|
||||||
|
|
||||||
class Big5Prober(MultiByteCharSetProber):
|
class Big5Prober(MultiByteCharSetProber):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
|
80
thirdparty/chardet/chardetect.py
vendored
Normal file
80
thirdparty/chardet/chardetect.py
vendored
Normal file
|
@ -0,0 +1,80 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
"""
|
||||||
|
Script which takes one or more file paths and reports on their detected
|
||||||
|
encodings
|
||||||
|
|
||||||
|
Example::
|
||||||
|
|
||||||
|
% chardetect somefile someotherfile
|
||||||
|
somefile: windows-1252 with confidence 0.5
|
||||||
|
someotherfile: ascii with confidence 1.0
|
||||||
|
|
||||||
|
If no paths are provided, it takes its input from stdin.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import absolute_import, print_function, unicode_literals
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
from io import open
|
||||||
|
|
||||||
|
from chardet import __version__
|
||||||
|
from chardet.universaldetector import UniversalDetector
|
||||||
|
|
||||||
|
|
||||||
|
def description_of(lines, name='stdin'):
|
||||||
|
"""
|
||||||
|
Return a string describing the probable encoding of a file or
|
||||||
|
list of strings.
|
||||||
|
|
||||||
|
:param lines: The lines to get the encoding of.
|
||||||
|
:type lines: Iterable of bytes
|
||||||
|
:param name: Name of file or collection of lines
|
||||||
|
:type name: str
|
||||||
|
"""
|
||||||
|
u = UniversalDetector()
|
||||||
|
for line in lines:
|
||||||
|
u.feed(line)
|
||||||
|
u.close()
|
||||||
|
result = u.result
|
||||||
|
if result['encoding']:
|
||||||
|
return '{0}: {1} with confidence {2}'.format(name, result['encoding'],
|
||||||
|
result['confidence'])
|
||||||
|
else:
|
||||||
|
return '{0}: no result'.format(name)
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv=None):
|
||||||
|
'''
|
||||||
|
Handles command line arguments and gets things started.
|
||||||
|
|
||||||
|
:param argv: List of arguments, as if specified on the command-line.
|
||||||
|
If None, ``sys.argv[1:]`` is used instead.
|
||||||
|
:type argv: list of str
|
||||||
|
'''
|
||||||
|
# Get command line arguments
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="Takes one or more file paths and reports their detected \
|
||||||
|
encodings",
|
||||||
|
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
||||||
|
conflict_handler='resolve')
|
||||||
|
parser.add_argument('input',
|
||||||
|
help='File whose encoding we would like to determine.',
|
||||||
|
type=argparse.FileType('rb'), nargs='*',
|
||||||
|
default=[sys.stdin])
|
||||||
|
parser.add_argument('--version', action='version',
|
||||||
|
version='%(prog)s {0}'.format(__version__))
|
||||||
|
args = parser.parse_args(argv)
|
||||||
|
|
||||||
|
for f in args.input:
|
||||||
|
if f.isatty():
|
||||||
|
print("You are running chardetect interactively. Press " +
|
||||||
|
"CTRL-D twice at the start of a blank line to signal the " +
|
||||||
|
"end of your input. If you want help, run chardetect " +
|
||||||
|
"--help\n", file=sys.stderr)
|
||||||
|
print(description_of(f, f.name))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
133
thirdparty/chardet/chardistribution.py
vendored
133
thirdparty/chardet/chardistribution.py
vendored
|
@ -25,35 +25,51 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
import constants
|
from .euctwfreq import (EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE,
|
||||||
from euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO
|
EUCTW_TYPICAL_DISTRIBUTION_RATIO)
|
||||||
from euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO
|
from .euckrfreq import (EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE,
|
||||||
from gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO
|
EUCKR_TYPICAL_DISTRIBUTION_RATIO)
|
||||||
from big5freq import Big5CharToFreqOrder, BIG5_TABLE_SIZE, BIG5_TYPICAL_DISTRIBUTION_RATIO
|
from .gb2312freq import (GB2312CharToFreqOrder, GB2312_TABLE_SIZE,
|
||||||
from jisfreq import JISCharToFreqOrder, JIS_TABLE_SIZE, JIS_TYPICAL_DISTRIBUTION_RATIO
|
GB2312_TYPICAL_DISTRIBUTION_RATIO)
|
||||||
|
from .big5freq import (Big5CharToFreqOrder, BIG5_TABLE_SIZE,
|
||||||
|
BIG5_TYPICAL_DISTRIBUTION_RATIO)
|
||||||
|
from .jisfreq import (JISCharToFreqOrder, JIS_TABLE_SIZE,
|
||||||
|
JIS_TYPICAL_DISTRIBUTION_RATIO)
|
||||||
|
from .compat import wrap_ord
|
||||||
|
|
||||||
ENOUGH_DATA_THRESHOLD = 1024
|
ENOUGH_DATA_THRESHOLD = 1024
|
||||||
SURE_YES = 0.99
|
SURE_YES = 0.99
|
||||||
SURE_NO = 0.01
|
SURE_NO = 0.01
|
||||||
|
MINIMUM_DATA_THRESHOLD = 3
|
||||||
|
|
||||||
|
|
||||||
class CharDistributionAnalysis:
|
class CharDistributionAnalysis:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self._mCharToFreqOrder = None # Mapping table to get frequency order from char order (get from GetOrder())
|
# Mapping table to get frequency order from char order (get from
|
||||||
self._mTableSize = None # Size of above table
|
# GetOrder())
|
||||||
self._mTypicalDistributionRatio = None # This is a constant value which varies from language to language, used in calculating confidence. See http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html for further detail.
|
self._mCharToFreqOrder = None
|
||||||
|
self._mTableSize = None # Size of above table
|
||||||
|
# This is a constant value which varies from language to language,
|
||||||
|
# used in calculating confidence. See
|
||||||
|
# http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
|
||||||
|
# for further detail.
|
||||||
|
self._mTypicalDistributionRatio = None
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
"""reset analyser, clear any state"""
|
"""reset analyser, clear any state"""
|
||||||
self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made
|
# If this flag is set to True, detection is done and conclusion has
|
||||||
self._mTotalChars = 0 # Total characters encountered
|
# been made
|
||||||
self._mFreqChars = 0 # The number of characters whose frequency order is less than 512
|
self._mDone = False
|
||||||
|
self._mTotalChars = 0 # Total characters encountered
|
||||||
|
# The number of characters whose frequency order is less than 512
|
||||||
|
self._mFreqChars = 0
|
||||||
|
|
||||||
def feed(self, aStr, aCharLen):
|
def feed(self, aBuf, aCharLen):
|
||||||
"""feed a character with known length"""
|
"""feed a character with known length"""
|
||||||
if aCharLen == 2:
|
if aCharLen == 2:
|
||||||
# we only care about 2-byte characters in our distribution analysis
|
# we only care about 2-byte characters in our distribution analysis
|
||||||
order = self.get_order(aStr)
|
order = self.get_order(aBuf)
|
||||||
else:
|
else:
|
||||||
order = -1
|
order = -1
|
||||||
if order >= 0:
|
if order >= 0:
|
||||||
|
@ -65,12 +81,14 @@ class CharDistributionAnalysis:
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self):
|
||||||
"""return confidence based on existing data"""
|
"""return confidence based on existing data"""
|
||||||
# if we didn't receive any character in our consideration range, return negative answer
|
# if we didn't receive any character in our consideration range,
|
||||||
if self._mTotalChars <= 0:
|
# return negative answer
|
||||||
|
if self._mTotalChars <= 0 or self._mFreqChars <= MINIMUM_DATA_THRESHOLD:
|
||||||
return SURE_NO
|
return SURE_NO
|
||||||
|
|
||||||
if self._mTotalChars != self._mFreqChars:
|
if self._mTotalChars != self._mFreqChars:
|
||||||
r = self._mFreqChars / ((self._mTotalChars - self._mFreqChars) * self._mTypicalDistributionRatio)
|
r = (self._mFreqChars / ((self._mTotalChars - self._mFreqChars)
|
||||||
|
* self._mTypicalDistributionRatio))
|
||||||
if r < SURE_YES:
|
if r < SURE_YES:
|
||||||
return r
|
return r
|
||||||
|
|
||||||
|
@ -78,16 +96,18 @@ class CharDistributionAnalysis:
|
||||||
return SURE_YES
|
return SURE_YES
|
||||||
|
|
||||||
def got_enough_data(self):
|
def got_enough_data(self):
|
||||||
# It is not necessary to receive all data to draw a conclusion. For charset detection,
|
# It is not necessary to receive all data to draw a conclusion.
|
||||||
# a certain amount of data is enough
|
# For charset detection, a certain amount of data is enough
|
||||||
return self._mTotalChars > ENOUGH_DATA_THRESHOLD
|
return self._mTotalChars > ENOUGH_DATA_THRESHOLD
|
||||||
|
|
||||||
def get_order(self, aStr):
|
def get_order(self, aBuf):
|
||||||
# We do not handle characters based on the original encoding string, but
|
# We do not handle characters based on the original encoding string,
|
||||||
# convert this encoding string to a number, here called order.
|
# but convert this encoding string to a number, here called order.
|
||||||
# This allows multiple encodings of a language to share one frequency table.
|
# This allows multiple encodings of a language to share one frequency
|
||||||
|
# table.
|
||||||
return -1
|
return -1
|
||||||
|
|
||||||
|
|
||||||
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
|
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
CharDistributionAnalysis.__init__(self)
|
CharDistributionAnalysis.__init__(self)
|
||||||
|
@ -95,16 +115,18 @@ class EUCTWDistributionAnalysis(CharDistributionAnalysis):
|
||||||
self._mTableSize = EUCTW_TABLE_SIZE
|
self._mTableSize = EUCTW_TABLE_SIZE
|
||||||
self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
|
self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
|
||||||
|
|
||||||
def get_order(self, aStr):
|
def get_order(self, aBuf):
|
||||||
# for euc-TW encoding, we are interested
|
# for euc-TW encoding, we are interested
|
||||||
# first byte range: 0xc4 -- 0xfe
|
# first byte range: 0xc4 -- 0xfe
|
||||||
# second byte range: 0xa1 -- 0xfe
|
# second byte range: 0xa1 -- 0xfe
|
||||||
# no validation needed here. State machine has done that
|
# no validation needed here. State machine has done that
|
||||||
if aStr[0] >= '\xC4':
|
first_char = wrap_ord(aBuf[0])
|
||||||
return 94 * (ord(aStr[0]) - 0xC4) + ord(aStr[1]) - 0xA1
|
if first_char >= 0xC4:
|
||||||
|
return 94 * (first_char - 0xC4) + wrap_ord(aBuf[1]) - 0xA1
|
||||||
else:
|
else:
|
||||||
return -1
|
return -1
|
||||||
|
|
||||||
|
|
||||||
class EUCKRDistributionAnalysis(CharDistributionAnalysis):
|
class EUCKRDistributionAnalysis(CharDistributionAnalysis):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
CharDistributionAnalysis.__init__(self)
|
CharDistributionAnalysis.__init__(self)
|
||||||
|
@ -112,15 +134,17 @@ class EUCKRDistributionAnalysis(CharDistributionAnalysis):
|
||||||
self._mTableSize = EUCKR_TABLE_SIZE
|
self._mTableSize = EUCKR_TABLE_SIZE
|
||||||
self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
|
self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
|
||||||
|
|
||||||
def get_order(self, aStr):
|
def get_order(self, aBuf):
|
||||||
# for euc-KR encoding, we are interested
|
# for euc-KR encoding, we are interested
|
||||||
# first byte range: 0xb0 -- 0xfe
|
# first byte range: 0xb0 -- 0xfe
|
||||||
# second byte range: 0xa1 -- 0xfe
|
# second byte range: 0xa1 -- 0xfe
|
||||||
# no validation needed here. State machine has done that
|
# no validation needed here. State machine has done that
|
||||||
if aStr[0] >= '\xB0':
|
first_char = wrap_ord(aBuf[0])
|
||||||
return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1
|
if first_char >= 0xB0:
|
||||||
|
return 94 * (first_char - 0xB0) + wrap_ord(aBuf[1]) - 0xA1
|
||||||
else:
|
else:
|
||||||
return -1;
|
return -1
|
||||||
|
|
||||||
|
|
||||||
class GB2312DistributionAnalysis(CharDistributionAnalysis):
|
class GB2312DistributionAnalysis(CharDistributionAnalysis):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
@ -129,15 +153,17 @@ class GB2312DistributionAnalysis(CharDistributionAnalysis):
|
||||||
self._mTableSize = GB2312_TABLE_SIZE
|
self._mTableSize = GB2312_TABLE_SIZE
|
||||||
self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
|
self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
|
||||||
|
|
||||||
def get_order(self, aStr):
|
def get_order(self, aBuf):
|
||||||
# for GB2312 encoding, we are interested
|
# for GB2312 encoding, we are interested
|
||||||
# first byte range: 0xb0 -- 0xfe
|
# first byte range: 0xb0 -- 0xfe
|
||||||
# second byte range: 0xa1 -- 0xfe
|
# second byte range: 0xa1 -- 0xfe
|
||||||
# no validation needed here. State machine has done that
|
# no validation needed here. State machine has done that
|
||||||
if (aStr[0] >= '\xB0') and (aStr[1] >= '\xA1'):
|
first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
|
||||||
return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1
|
if (first_char >= 0xB0) and (second_char >= 0xA1):
|
||||||
|
return 94 * (first_char - 0xB0) + second_char - 0xA1
|
||||||
else:
|
else:
|
||||||
return -1;
|
return -1
|
||||||
|
|
||||||
|
|
||||||
class Big5DistributionAnalysis(CharDistributionAnalysis):
|
class Big5DistributionAnalysis(CharDistributionAnalysis):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
@ -146,19 +172,21 @@ class Big5DistributionAnalysis(CharDistributionAnalysis):
|
||||||
self._mTableSize = BIG5_TABLE_SIZE
|
self._mTableSize = BIG5_TABLE_SIZE
|
||||||
self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
|
self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
|
||||||
|
|
||||||
def get_order(self, aStr):
|
def get_order(self, aBuf):
|
||||||
# for big5 encoding, we are interested
|
# for big5 encoding, we are interested
|
||||||
# first byte range: 0xa4 -- 0xfe
|
# first byte range: 0xa4 -- 0xfe
|
||||||
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
|
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
|
||||||
# no validation needed here. State machine has done that
|
# no validation needed here. State machine has done that
|
||||||
if aStr[0] >= '\xA4':
|
first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
|
||||||
if aStr[1] >= '\xA1':
|
if first_char >= 0xA4:
|
||||||
return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0xA1 + 63
|
if second_char >= 0xA1:
|
||||||
|
return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63
|
||||||
else:
|
else:
|
||||||
return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0x40
|
return 157 * (first_char - 0xA4) + second_char - 0x40
|
||||||
else:
|
else:
|
||||||
return -1
|
return -1
|
||||||
|
|
||||||
|
|
||||||
class SJISDistributionAnalysis(CharDistributionAnalysis):
|
class SJISDistributionAnalysis(CharDistributionAnalysis):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
CharDistributionAnalysis.__init__(self)
|
CharDistributionAnalysis.__init__(self)
|
||||||
|
@ -166,22 +194,24 @@ class SJISDistributionAnalysis(CharDistributionAnalysis):
|
||||||
self._mTableSize = JIS_TABLE_SIZE
|
self._mTableSize = JIS_TABLE_SIZE
|
||||||
self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
|
self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
|
||||||
|
|
||||||
def get_order(self, aStr):
|
def get_order(self, aBuf):
|
||||||
# for sjis encoding, we are interested
|
# for sjis encoding, we are interested
|
||||||
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
|
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
|
||||||
# second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
|
# second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
|
||||||
# no validation needed here. State machine has done that
|
# no validation needed here. State machine has done that
|
||||||
if (aStr[0] >= '\x81') and (aStr[0] <= '\x9F'):
|
first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
|
||||||
order = 188 * (ord(aStr[0]) - 0x81)
|
if (first_char >= 0x81) and (first_char <= 0x9F):
|
||||||
elif (aStr[0] >= '\xE0') and (aStr[0] <= '\xEF'):
|
order = 188 * (first_char - 0x81)
|
||||||
order = 188 * (ord(aStr[0]) - 0xE0 + 31)
|
elif (first_char >= 0xE0) and (first_char <= 0xEF):
|
||||||
|
order = 188 * (first_char - 0xE0 + 31)
|
||||||
else:
|
else:
|
||||||
return -1;
|
return -1
|
||||||
order = order + ord(aStr[1]) - 0x40
|
order = order + second_char - 0x40
|
||||||
if aStr[1] > '\x7F':
|
if second_char > 0x7F:
|
||||||
order =- 1
|
order = -1
|
||||||
return order
|
return order
|
||||||
|
|
||||||
|
|
||||||
class EUCJPDistributionAnalysis(CharDistributionAnalysis):
|
class EUCJPDistributionAnalysis(CharDistributionAnalysis):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
CharDistributionAnalysis.__init__(self)
|
CharDistributionAnalysis.__init__(self)
|
||||||
|
@ -189,12 +219,13 @@ class EUCJPDistributionAnalysis(CharDistributionAnalysis):
|
||||||
self._mTableSize = JIS_TABLE_SIZE
|
self._mTableSize = JIS_TABLE_SIZE
|
||||||
self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
|
self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
|
||||||
|
|
||||||
def get_order(self, aStr):
|
def get_order(self, aBuf):
|
||||||
# for euc-JP encoding, we are interested
|
# for euc-JP encoding, we are interested
|
||||||
# first byte range: 0xa0 -- 0xfe
|
# first byte range: 0xa0 -- 0xfe
|
||||||
# second byte range: 0xa1 -- 0xfe
|
# second byte range: 0xa1 -- 0xfe
|
||||||
# no validation needed here. State machine has done that
|
# no validation needed here. State machine has done that
|
||||||
if aStr[0] >= '\xA0':
|
char = wrap_ord(aBuf[0])
|
||||||
return 94 * (ord(aStr[0]) - 0xA1) + ord(aStr[1]) - 0xa1
|
if char >= 0xA0:
|
||||||
|
return 94 * (char - 0xA1) + wrap_ord(aBuf[1]) - 0xa1
|
||||||
else:
|
else:
|
||||||
return -1
|
return -1
|
||||||
|
|
34
thirdparty/chardet/charsetgroupprober.py
vendored
34
thirdparty/chardet/charsetgroupprober.py
vendored
|
@ -25,8 +25,10 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
import constants, sys
|
from . import constants
|
||||||
from charsetprober import CharSetProber
|
import sys
|
||||||
|
from .charsetprober import CharSetProber
|
||||||
|
|
||||||
|
|
||||||
class CharSetGroupProber(CharSetProber):
|
class CharSetGroupProber(CharSetProber):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
@ -41,28 +43,32 @@ class CharSetGroupProber(CharSetProber):
|
||||||
for prober in self._mProbers:
|
for prober in self._mProbers:
|
||||||
if prober:
|
if prober:
|
||||||
prober.reset()
|
prober.reset()
|
||||||
prober.active = constants.True
|
prober.active = True
|
||||||
self._mActiveNum += 1
|
self._mActiveNum += 1
|
||||||
self._mBestGuessProber = None
|
self._mBestGuessProber = None
|
||||||
|
|
||||||
def get_charset_name(self):
|
def get_charset_name(self):
|
||||||
if not self._mBestGuessProber:
|
if not self._mBestGuessProber:
|
||||||
self.get_confidence()
|
self.get_confidence()
|
||||||
if not self._mBestGuessProber: return None
|
if not self._mBestGuessProber:
|
||||||
|
return None
|
||||||
# self._mBestGuessProber = self._mProbers[0]
|
# self._mBestGuessProber = self._mProbers[0]
|
||||||
return self._mBestGuessProber.get_charset_name()
|
return self._mBestGuessProber.get_charset_name()
|
||||||
|
|
||||||
def feed(self, aBuf):
|
def feed(self, aBuf):
|
||||||
for prober in self._mProbers:
|
for prober in self._mProbers:
|
||||||
if not prober: continue
|
if not prober:
|
||||||
if not prober.active: continue
|
continue
|
||||||
|
if not prober.active:
|
||||||
|
continue
|
||||||
st = prober.feed(aBuf)
|
st = prober.feed(aBuf)
|
||||||
if not st: continue
|
if not st:
|
||||||
|
continue
|
||||||
if st == constants.eFoundIt:
|
if st == constants.eFoundIt:
|
||||||
self._mBestGuessProber = prober
|
self._mBestGuessProber = prober
|
||||||
return self.get_state()
|
return self.get_state()
|
||||||
elif st == constants.eNotMe:
|
elif st == constants.eNotMe:
|
||||||
prober.active = constants.False
|
prober.active = False
|
||||||
self._mActiveNum -= 1
|
self._mActiveNum -= 1
|
||||||
if self._mActiveNum <= 0:
|
if self._mActiveNum <= 0:
|
||||||
self._mState = constants.eNotMe
|
self._mState = constants.eNotMe
|
||||||
|
@ -78,18 +84,22 @@ class CharSetGroupProber(CharSetProber):
|
||||||
bestConf = 0.0
|
bestConf = 0.0
|
||||||
self._mBestGuessProber = None
|
self._mBestGuessProber = None
|
||||||
for prober in self._mProbers:
|
for prober in self._mProbers:
|
||||||
if not prober: continue
|
if not prober:
|
||||||
|
continue
|
||||||
if not prober.active:
|
if not prober.active:
|
||||||
if constants._debug:
|
if constants._debug:
|
||||||
sys.stderr.write(prober.get_charset_name() + ' not active\n')
|
sys.stderr.write(prober.get_charset_name()
|
||||||
|
+ ' not active\n')
|
||||||
continue
|
continue
|
||||||
cf = prober.get_confidence()
|
cf = prober.get_confidence()
|
||||||
if constants._debug:
|
if constants._debug:
|
||||||
sys.stderr.write('%s confidence = %s\n' % (prober.get_charset_name(), cf))
|
sys.stderr.write('%s confidence = %s\n' %
|
||||||
|
(prober.get_charset_name(), cf))
|
||||||
if bestConf < cf:
|
if bestConf < cf:
|
||||||
bestConf = cf
|
bestConf = cf
|
||||||
self._mBestGuessProber = prober
|
self._mBestGuessProber = prober
|
||||||
if not self._mBestGuessProber: return 0.0
|
if not self._mBestGuessProber:
|
||||||
|
return 0.0
|
||||||
return bestConf
|
return bestConf
|
||||||
# else:
|
# else:
|
||||||
# self._mBestGuessProber = self._mProbers[0]
|
# self._mBestGuessProber = self._mProbers[0]
|
||||||
|
|
8
thirdparty/chardet/charsetprober.py
vendored
8
thirdparty/chardet/charsetprober.py
vendored
|
@ -26,7 +26,9 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
import constants, re
|
from . import constants
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
class CharSetProber:
|
class CharSetProber:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
@ -48,11 +50,11 @@ class CharSetProber:
|
||||||
return 0.0
|
return 0.0
|
||||||
|
|
||||||
def filter_high_bit_only(self, aBuf):
|
def filter_high_bit_only(self, aBuf):
|
||||||
aBuf = re.sub(r'([\x00-\x7F])+', ' ', aBuf)
|
aBuf = re.sub(b'([\x00-\x7F])+', b' ', aBuf)
|
||||||
return aBuf
|
return aBuf
|
||||||
|
|
||||||
def filter_without_english_letters(self, aBuf):
|
def filter_without_english_letters(self, aBuf):
|
||||||
aBuf = re.sub(r'([A-Za-z])+', ' ', aBuf)
|
aBuf = re.sub(b'([A-Za-z])+', b' ', aBuf)
|
||||||
return aBuf
|
return aBuf
|
||||||
|
|
||||||
def filter_with_english_letters(self, aBuf):
|
def filter_with_english_letters(self, aBuf):
|
||||||
|
|
11
thirdparty/chardet/codingstatemachine.py
vendored
11
thirdparty/chardet/codingstatemachine.py
vendored
|
@ -25,7 +25,9 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
from constants import eStart, eError, eItsMe
|
from .constants import eStart
|
||||||
|
from .compat import wrap_ord
|
||||||
|
|
||||||
|
|
||||||
class CodingStateMachine:
|
class CodingStateMachine:
|
||||||
def __init__(self, sm):
|
def __init__(self, sm):
|
||||||
|
@ -40,12 +42,15 @@ class CodingStateMachine:
|
||||||
def next_state(self, c):
|
def next_state(self, c):
|
||||||
# for each byte we get its class
|
# for each byte we get its class
|
||||||
# if it is first byte, we also get byte length
|
# if it is first byte, we also get byte length
|
||||||
byteCls = self._mModel['classTable'][ord(c)]
|
# PY3K: aBuf is a byte stream, so c is an int, not a byte
|
||||||
|
byteCls = self._mModel['classTable'][wrap_ord(c)]
|
||||||
if self._mCurrentState == eStart:
|
if self._mCurrentState == eStart:
|
||||||
self._mCurrentBytePos = 0
|
self._mCurrentBytePos = 0
|
||||||
self._mCurrentCharLen = self._mModel['charLenTable'][byteCls]
|
self._mCurrentCharLen = self._mModel['charLenTable'][byteCls]
|
||||||
# from byte's class and stateTable, we get its next state
|
# from byte's class and stateTable, we get its next state
|
||||||
self._mCurrentState = self._mModel['stateTable'][self._mCurrentState * self._mModel['classFactor'] + byteCls]
|
curr_state = (self._mCurrentState * self._mModel['classFactor']
|
||||||
|
+ byteCls)
|
||||||
|
self._mCurrentState = self._mModel['stateTable'][curr_state]
|
||||||
self._mCurrentBytePos += 1
|
self._mCurrentBytePos += 1
|
||||||
return self._mCurrentState
|
return self._mCurrentState
|
||||||
|
|
||||||
|
|
34
thirdparty/chardet/compat.py
vendored
Normal file
34
thirdparty/chardet/compat.py
vendored
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
######################## BEGIN LICENSE BLOCK ########################
|
||||||
|
# Contributor(s):
|
||||||
|
# Ian Cordasco - port to Python
|
||||||
|
#
|
||||||
|
# This library is free software; you can redistribute it and/or
|
||||||
|
# modify it under the terms of the GNU Lesser General Public
|
||||||
|
# License as published by the Free Software Foundation; either
|
||||||
|
# version 2.1 of the License, or (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This library is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
# Lesser General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Lesser General Public
|
||||||
|
# License along with this library; if not, write to the Free Software
|
||||||
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||||
|
# 02110-1301 USA
|
||||||
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
if sys.version_info < (3, 0):
|
||||||
|
base_str = (str, unicode)
|
||||||
|
else:
|
||||||
|
base_str = (bytes, str)
|
||||||
|
|
||||||
|
|
||||||
|
def wrap_ord(a):
|
||||||
|
if sys.version_info < (3, 0) and isinstance(a, base_str):
|
||||||
|
return ord(a)
|
||||||
|
else:
|
||||||
|
return a
|
8
thirdparty/chardet/constants.py
vendored
8
thirdparty/chardet/constants.py
vendored
|
@ -37,11 +37,3 @@ eError = 1
|
||||||
eItsMe = 2
|
eItsMe = 2
|
||||||
|
|
||||||
SHORTCUT_THRESHOLD = 0.95
|
SHORTCUT_THRESHOLD = 0.95
|
||||||
|
|
||||||
import __builtin__
|
|
||||||
if not hasattr(__builtin__, 'False'):
|
|
||||||
False = 0
|
|
||||||
True = 1
|
|
||||||
else:
|
|
||||||
False = __builtin__.False
|
|
||||||
True = __builtin__.True
|
|
||||||
|
|
44
thirdparty/chardet/cp949prober.py
vendored
Normal file
44
thirdparty/chardet/cp949prober.py
vendored
Normal file
|
@ -0,0 +1,44 @@
|
||||||
|
######################## BEGIN LICENSE BLOCK ########################
|
||||||
|
# The Original Code is mozilla.org code.
|
||||||
|
#
|
||||||
|
# The Initial Developer of the Original Code is
|
||||||
|
# Netscape Communications Corporation.
|
||||||
|
# Portions created by the Initial Developer are Copyright (C) 1998
|
||||||
|
# the Initial Developer. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Contributor(s):
|
||||||
|
# Mark Pilgrim - port to Python
|
||||||
|
#
|
||||||
|
# This library is free software; you can redistribute it and/or
|
||||||
|
# modify it under the terms of the GNU Lesser General Public
|
||||||
|
# License as published by the Free Software Foundation; either
|
||||||
|
# version 2.1 of the License, or (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This library is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
# Lesser General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Lesser General Public
|
||||||
|
# License along with this library; if not, write to the Free Software
|
||||||
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||||
|
# 02110-1301 USA
|
||||||
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
from .mbcharsetprober import MultiByteCharSetProber
|
||||||
|
from .codingstatemachine import CodingStateMachine
|
||||||
|
from .chardistribution import EUCKRDistributionAnalysis
|
||||||
|
from .mbcssm import CP949SMModel
|
||||||
|
|
||||||
|
|
||||||
|
class CP949Prober(MultiByteCharSetProber):
|
||||||
|
def __init__(self):
|
||||||
|
MultiByteCharSetProber.__init__(self)
|
||||||
|
self._mCodingSM = CodingStateMachine(CP949SMModel)
|
||||||
|
# NOTE: CP949 is a superset of EUC-KR, so the distribution should be
|
||||||
|
# not different.
|
||||||
|
self._mDistributionAnalyzer = EUCKRDistributionAnalysis()
|
||||||
|
self.reset()
|
||||||
|
|
||||||
|
def get_charset_name(self):
|
||||||
|
return "CP949"
|
33
thirdparty/chardet/escprober.py
vendored
33
thirdparty/chardet/escprober.py
vendored
|
@ -25,27 +25,31 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
import constants, sys
|
from . import constants
|
||||||
from escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel
|
from .escsm import (HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel,
|
||||||
from charsetprober import CharSetProber
|
ISO2022KRSMModel)
|
||||||
from codingstatemachine import CodingStateMachine
|
from .charsetprober import CharSetProber
|
||||||
|
from .codingstatemachine import CodingStateMachine
|
||||||
|
from .compat import wrap_ord
|
||||||
|
|
||||||
|
|
||||||
class EscCharSetProber(CharSetProber):
|
class EscCharSetProber(CharSetProber):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
CharSetProber.__init__(self)
|
CharSetProber.__init__(self)
|
||||||
self._mCodingSM = [ \
|
self._mCodingSM = [
|
||||||
CodingStateMachine(HZSMModel),
|
CodingStateMachine(HZSMModel),
|
||||||
CodingStateMachine(ISO2022CNSMModel),
|
CodingStateMachine(ISO2022CNSMModel),
|
||||||
CodingStateMachine(ISO2022JPSMModel),
|
CodingStateMachine(ISO2022JPSMModel),
|
||||||
CodingStateMachine(ISO2022KRSMModel)
|
CodingStateMachine(ISO2022KRSMModel)
|
||||||
]
|
]
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
CharSetProber.reset(self)
|
CharSetProber.reset(self)
|
||||||
for codingSM in self._mCodingSM:
|
for codingSM in self._mCodingSM:
|
||||||
if not codingSM: continue
|
if not codingSM:
|
||||||
codingSM.active = constants.True
|
continue
|
||||||
|
codingSM.active = True
|
||||||
codingSM.reset()
|
codingSM.reset()
|
||||||
self._mActiveSM = len(self._mCodingSM)
|
self._mActiveSM = len(self._mCodingSM)
|
||||||
self._mDetectedCharset = None
|
self._mDetectedCharset = None
|
||||||
|
@ -61,19 +65,22 @@ class EscCharSetProber(CharSetProber):
|
||||||
|
|
||||||
def feed(self, aBuf):
|
def feed(self, aBuf):
|
||||||
for c in aBuf:
|
for c in aBuf:
|
||||||
|
# PY3K: aBuf is a byte array, so c is an int, not a byte
|
||||||
for codingSM in self._mCodingSM:
|
for codingSM in self._mCodingSM:
|
||||||
if not codingSM: continue
|
if not codingSM:
|
||||||
if not codingSM.active: continue
|
continue
|
||||||
codingState = codingSM.next_state(c)
|
if not codingSM.active:
|
||||||
|
continue
|
||||||
|
codingState = codingSM.next_state(wrap_ord(c))
|
||||||
if codingState == constants.eError:
|
if codingState == constants.eError:
|
||||||
codingSM.active = constants.False
|
codingSM.active = False
|
||||||
self._mActiveSM -= 1
|
self._mActiveSM -= 1
|
||||||
if self._mActiveSM <= 0:
|
if self._mActiveSM <= 0:
|
||||||
self._mState = constants.eNotMe
|
self._mState = constants.eNotMe
|
||||||
return self.get_state()
|
return self.get_state()
|
||||||
elif codingState == constants.eItsMe:
|
elif codingState == constants.eItsMe:
|
||||||
self._mState = constants.eFoundIt
|
self._mState = constants.eFoundIt
|
||||||
self._mDetectedCharset = codingSM.get_coding_state_machine()
|
self._mDetectedCharset = codingSM.get_coding_state_machine() # nopep8
|
||||||
return self.get_state()
|
return self.get_state()
|
||||||
|
|
||||||
return self.get_state()
|
return self.get_state()
|
||||||
|
|
20
thirdparty/chardet/escsm.py
vendored
20
thirdparty/chardet/escsm.py
vendored
|
@ -25,9 +25,9 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
from constants import eStart, eError, eItsMe
|
from .constants import eStart, eError, eItsMe
|
||||||
|
|
||||||
HZ_cls = ( \
|
HZ_cls = (
|
||||||
1,0,0,0,0,0,0,0, # 00 - 07
|
1,0,0,0,0,0,0,0, # 00 - 07
|
||||||
0,0,0,0,0,0,0,0, # 08 - 0f
|
0,0,0,0,0,0,0,0, # 08 - 0f
|
||||||
0,0,0,0,0,0,0,0, # 10 - 17
|
0,0,0,0,0,0,0,0, # 10 - 17
|
||||||
|
@ -62,7 +62,7 @@ HZ_cls = ( \
|
||||||
1,1,1,1,1,1,1,1, # f8 - ff
|
1,1,1,1,1,1,1,1, # f8 - ff
|
||||||
)
|
)
|
||||||
|
|
||||||
HZ_st = ( \
|
HZ_st = (
|
||||||
eStart,eError, 3,eStart,eStart,eStart,eError,eError,# 00-07
|
eStart,eError, 3,eStart,eStart,eStart,eError,eError,# 00-07
|
||||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f
|
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f
|
||||||
eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError,# 10-17
|
eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError,# 10-17
|
||||||
|
@ -79,7 +79,7 @@ HZSMModel = {'classTable': HZ_cls,
|
||||||
'charLenTable': HZCharLenTable,
|
'charLenTable': HZCharLenTable,
|
||||||
'name': "HZ-GB-2312"}
|
'name': "HZ-GB-2312"}
|
||||||
|
|
||||||
ISO2022CN_cls = ( \
|
ISO2022CN_cls = (
|
||||||
2,0,0,0,0,0,0,0, # 00 - 07
|
2,0,0,0,0,0,0,0, # 00 - 07
|
||||||
0,0,0,0,0,0,0,0, # 08 - 0f
|
0,0,0,0,0,0,0,0, # 08 - 0f
|
||||||
0,0,0,0,0,0,0,0, # 10 - 17
|
0,0,0,0,0,0,0,0, # 10 - 17
|
||||||
|
@ -114,7 +114,7 @@ ISO2022CN_cls = ( \
|
||||||
2,2,2,2,2,2,2,2, # f8 - ff
|
2,2,2,2,2,2,2,2, # f8 - ff
|
||||||
)
|
)
|
||||||
|
|
||||||
ISO2022CN_st = ( \
|
ISO2022CN_st = (
|
||||||
eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07
|
eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07
|
||||||
eStart,eError,eError,eError,eError,eError,eError,eError,# 08-0f
|
eStart,eError,eError,eError,eError,eError,eError,eError,# 08-0f
|
||||||
eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17
|
eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17
|
||||||
|
@ -133,7 +133,7 @@ ISO2022CNSMModel = {'classTable': ISO2022CN_cls,
|
||||||
'charLenTable': ISO2022CNCharLenTable,
|
'charLenTable': ISO2022CNCharLenTable,
|
||||||
'name': "ISO-2022-CN"}
|
'name': "ISO-2022-CN"}
|
||||||
|
|
||||||
ISO2022JP_cls = ( \
|
ISO2022JP_cls = (
|
||||||
2,0,0,0,0,0,0,0, # 00 - 07
|
2,0,0,0,0,0,0,0, # 00 - 07
|
||||||
0,0,0,0,0,0,2,2, # 08 - 0f
|
0,0,0,0,0,0,2,2, # 08 - 0f
|
||||||
0,0,0,0,0,0,0,0, # 10 - 17
|
0,0,0,0,0,0,0,0, # 10 - 17
|
||||||
|
@ -168,7 +168,7 @@ ISO2022JP_cls = ( \
|
||||||
2,2,2,2,2,2,2,2, # f8 - ff
|
2,2,2,2,2,2,2,2, # f8 - ff
|
||||||
)
|
)
|
||||||
|
|
||||||
ISO2022JP_st = ( \
|
ISO2022JP_st = (
|
||||||
eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07
|
eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07
|
||||||
eStart,eStart,eError,eError,eError,eError,eError,eError,# 08-0f
|
eStart,eStart,eError,eError,eError,eError,eError,eError,# 08-0f
|
||||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17
|
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17
|
||||||
|
@ -188,7 +188,7 @@ ISO2022JPSMModel = {'classTable': ISO2022JP_cls,
|
||||||
'charLenTable': ISO2022JPCharLenTable,
|
'charLenTable': ISO2022JPCharLenTable,
|
||||||
'name': "ISO-2022-JP"}
|
'name': "ISO-2022-JP"}
|
||||||
|
|
||||||
ISO2022KR_cls = ( \
|
ISO2022KR_cls = (
|
||||||
2,0,0,0,0,0,0,0, # 00 - 07
|
2,0,0,0,0,0,0,0, # 00 - 07
|
||||||
0,0,0,0,0,0,0,0, # 08 - 0f
|
0,0,0,0,0,0,0,0, # 08 - 0f
|
||||||
0,0,0,0,0,0,0,0, # 10 - 17
|
0,0,0,0,0,0,0,0, # 10 - 17
|
||||||
|
@ -223,7 +223,7 @@ ISO2022KR_cls = ( \
|
||||||
2,2,2,2,2,2,2,2, # f8 - ff
|
2,2,2,2,2,2,2,2, # f8 - ff
|
||||||
)
|
)
|
||||||
|
|
||||||
ISO2022KR_st = ( \
|
ISO2022KR_st = (
|
||||||
eStart, 3,eError,eStart,eStart,eStart,eError,eError,# 00-07
|
eStart, 3,eError,eStart,eStart,eStart,eError,eError,# 00-07
|
||||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f
|
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f
|
||||||
eItsMe,eItsMe,eError,eError,eError, 4,eError,eError,# 10-17
|
eItsMe,eItsMe,eError,eError,eError, 4,eError,eError,# 10-17
|
||||||
|
@ -238,3 +238,5 @@ ISO2022KRSMModel = {'classTable': ISO2022KR_cls,
|
||||||
'stateTable': ISO2022KR_st,
|
'stateTable': ISO2022KR_st,
|
||||||
'charLenTable': ISO2022KRCharLenTable,
|
'charLenTable': ISO2022KRCharLenTable,
|
||||||
'name': "ISO-2022-KR"}
|
'name': "ISO-2022-KR"}
|
||||||
|
|
||||||
|
# flake8: noqa
|
||||||
|
|
37
thirdparty/chardet/eucjpprober.py
vendored
37
thirdparty/chardet/eucjpprober.py
vendored
|
@ -25,13 +25,14 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
import constants, sys
|
import sys
|
||||||
from constants import eStart, eError, eItsMe
|
from . import constants
|
||||||
from mbcharsetprober import MultiByteCharSetProber
|
from .mbcharsetprober import MultiByteCharSetProber
|
||||||
from codingstatemachine import CodingStateMachine
|
from .codingstatemachine import CodingStateMachine
|
||||||
from chardistribution import EUCJPDistributionAnalysis
|
from .chardistribution import EUCJPDistributionAnalysis
|
||||||
from jpcntx import EUCJPContextAnalysis
|
from .jpcntx import EUCJPContextAnalysis
|
||||||
from mbcssm import EUCJPSMModel
|
from .mbcssm import EUCJPSMModel
|
||||||
|
|
||||||
|
|
||||||
class EUCJPProber(MultiByteCharSetProber):
|
class EUCJPProber(MultiByteCharSetProber):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
@ -50,31 +51,35 @@ class EUCJPProber(MultiByteCharSetProber):
|
||||||
|
|
||||||
def feed(self, aBuf):
|
def feed(self, aBuf):
|
||||||
aLen = len(aBuf)
|
aLen = len(aBuf)
|
||||||
for i in xrange(0, aLen):
|
for i in range(0, aLen):
|
||||||
|
# PY3K: aBuf is a byte array, so aBuf[i] is an int, not a byte
|
||||||
codingState = self._mCodingSM.next_state(aBuf[i])
|
codingState = self._mCodingSM.next_state(aBuf[i])
|
||||||
if codingState == eError:
|
if codingState == constants.eError:
|
||||||
if constants._debug:
|
if constants._debug:
|
||||||
sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n')
|
sys.stderr.write(self.get_charset_name()
|
||||||
|
+ ' prober hit error at byte ' + str(i)
|
||||||
|
+ '\n')
|
||||||
self._mState = constants.eNotMe
|
self._mState = constants.eNotMe
|
||||||
break
|
break
|
||||||
elif codingState == eItsMe:
|
elif codingState == constants.eItsMe:
|
||||||
self._mState = constants.eFoundIt
|
self._mState = constants.eFoundIt
|
||||||
break
|
break
|
||||||
elif codingState == eStart:
|
elif codingState == constants.eStart:
|
||||||
charLen = self._mCodingSM.get_current_charlen()
|
charLen = self._mCodingSM.get_current_charlen()
|
||||||
if i == 0:
|
if i == 0:
|
||||||
self._mLastChar[1] = aBuf[0]
|
self._mLastChar[1] = aBuf[0]
|
||||||
self._mContextAnalyzer.feed(self._mLastChar, charLen)
|
self._mContextAnalyzer.feed(self._mLastChar, charLen)
|
||||||
self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
|
self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
|
||||||
else:
|
else:
|
||||||
self._mContextAnalyzer.feed(aBuf[i-1:i+1], charLen)
|
self._mContextAnalyzer.feed(aBuf[i - 1:i + 1], charLen)
|
||||||
self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen)
|
self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
|
||||||
|
charLen)
|
||||||
|
|
||||||
self._mLastChar[0] = aBuf[aLen - 1]
|
self._mLastChar[0] = aBuf[aLen - 1]
|
||||||
|
|
||||||
if self.get_state() == constants.eDetecting:
|
if self.get_state() == constants.eDetecting:
|
||||||
if self._mContextAnalyzer.got_enough_data() and \
|
if (self._mContextAnalyzer.got_enough_data() and
|
||||||
(self.get_confidence() > constants.SHORTCUT_THRESHOLD):
|
(self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
|
||||||
self._mState = constants.eFoundIt
|
self._mState = constants.eFoundIt
|
||||||
|
|
||||||
return self.get_state()
|
return self.get_state()
|
||||||
|
|
2
thirdparty/chardet/euckrfreq.py
vendored
2
thirdparty/chardet/euckrfreq.py
vendored
|
@ -592,3 +592,5 @@ EUCKRCharToFreqOrder = ( \
|
||||||
8704,8705,8706,8707,8708,8709,8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,
|
8704,8705,8706,8707,8708,8709,8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,
|
||||||
8720,8721,8722,8723,8724,8725,8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,
|
8720,8721,8722,8723,8724,8725,8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,
|
||||||
8736,8737,8738,8739,8740,8741)
|
8736,8737,8738,8739,8740,8741)
|
||||||
|
|
||||||
|
# flake8: noqa
|
||||||
|
|
9
thirdparty/chardet/euckrprober.py
vendored
9
thirdparty/chardet/euckrprober.py
vendored
|
@ -25,10 +25,11 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
from mbcharsetprober import MultiByteCharSetProber
|
from .mbcharsetprober import MultiByteCharSetProber
|
||||||
from codingstatemachine import CodingStateMachine
|
from .codingstatemachine import CodingStateMachine
|
||||||
from chardistribution import EUCKRDistributionAnalysis
|
from .chardistribution import EUCKRDistributionAnalysis
|
||||||
from mbcssm import EUCKRSMModel
|
from .mbcssm import EUCKRSMModel
|
||||||
|
|
||||||
|
|
||||||
class EUCKRProber(MultiByteCharSetProber):
|
class EUCKRProber(MultiByteCharSetProber):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
|
4
thirdparty/chardet/euctwfreq.py
vendored
4
thirdparty/chardet/euctwfreq.py
vendored
|
@ -46,7 +46,7 @@ EUCTW_TYPICAL_DISTRIBUTION_RATIO = 0.75
|
||||||
# Char to FreqOrder table ,
|
# Char to FreqOrder table ,
|
||||||
EUCTW_TABLE_SIZE = 8102
|
EUCTW_TABLE_SIZE = 8102
|
||||||
|
|
||||||
EUCTWCharToFreqOrder = ( \
|
EUCTWCharToFreqOrder = (
|
||||||
1,1800,1506, 255,1431, 198, 9, 82, 6,7310, 177, 202,3615,1256,2808, 110, # 2742
|
1,1800,1506, 255,1431, 198, 9, 82, 6,7310, 177, 202,3615,1256,2808, 110, # 2742
|
||||||
3735, 33,3241, 261, 76, 44,2113, 16,2931,2184,1176, 659,3868, 26,3404,2643, # 2758
|
3735, 33,3241, 261, 76, 44,2113, 16,2931,2184,1176, 659,3868, 26,3404,2643, # 2758
|
||||||
1198,3869,3313,4060, 410,2211, 302, 590, 361,1963, 8, 204, 58,4296,7311,1931, # 2774
|
1198,3869,3313,4060, 410,2211, 302, 590, 361,1963, 8, 204, 58,4296,7311,1931, # 2774
|
||||||
|
@ -424,3 +424,5 @@ EUCTWCharToFreqOrder = ( \
|
||||||
8694,8695,8696,8697,8698,8699,8700,8701,8702,8703,8704,8705,8706,8707,8708,8709, # 8710
|
8694,8695,8696,8697,8698,8699,8700,8701,8702,8703,8704,8705,8706,8707,8708,8709, # 8710
|
||||||
8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,8720,8721,8722,8723,8724,8725, # 8726
|
8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,8720,8721,8722,8723,8724,8725, # 8726
|
||||||
8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,8736,8737,8738,8739,8740,8741) # 8742
|
8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,8736,8737,8738,8739,8740,8741) # 8742
|
||||||
|
|
||||||
|
# flake8: noqa
|
||||||
|
|
8
thirdparty/chardet/euctwprober.py
vendored
8
thirdparty/chardet/euctwprober.py
vendored
|
@ -25,10 +25,10 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
from mbcharsetprober import MultiByteCharSetProber
|
from .mbcharsetprober import MultiByteCharSetProber
|
||||||
from codingstatemachine import CodingStateMachine
|
from .codingstatemachine import CodingStateMachine
|
||||||
from chardistribution import EUCTWDistributionAnalysis
|
from .chardistribution import EUCTWDistributionAnalysis
|
||||||
from mbcssm import EUCTWSMModel
|
from .mbcssm import EUCTWSMModel
|
||||||
|
|
||||||
class EUCTWProber(MultiByteCharSetProber):
|
class EUCTWProber(MultiByteCharSetProber):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
|
3
thirdparty/chardet/gb2312freq.py
vendored
3
thirdparty/chardet/gb2312freq.py
vendored
|
@ -43,7 +43,7 @@ GB2312_TYPICAL_DISTRIBUTION_RATIO = 0.9
|
||||||
|
|
||||||
GB2312_TABLE_SIZE = 3760
|
GB2312_TABLE_SIZE = 3760
|
||||||
|
|
||||||
GB2312CharToFreqOrder = ( \
|
GB2312CharToFreqOrder = (
|
||||||
1671, 749,1443,2364,3924,3807,2330,3921,1704,3463,2691,1511,1515, 572,3191,2205,
|
1671, 749,1443,2364,3924,3807,2330,3921,1704,3463,2691,1511,1515, 572,3191,2205,
|
||||||
2361, 224,2558, 479,1711, 963,3162, 440,4060,1905,2966,2947,3580,2647,3961,3842,
|
2361, 224,2558, 479,1711, 963,3162, 440,4060,1905,2966,2947,3580,2647,3961,3842,
|
||||||
2204, 869,4207, 970,2678,5626,2944,2956,1479,4048, 514,3595, 588,1346,2820,3409,
|
2204, 869,4207, 970,2678,5626,2944,2956,1479,4048, 514,3595, 588,1346,2820,3409,
|
||||||
|
@ -469,3 +469,4 @@ GB2312CharToFreqOrder = ( \
|
||||||
5867,5507,6273,4206,6274,4789,6098,6764,3619,3646,3833,3804,2394,3788,4936,3978,
|
5867,5507,6273,4206,6274,4789,6098,6764,3619,3646,3833,3804,2394,3788,4936,3978,
|
||||||
4866,4899,6099,6100,5559,6478,6765,3599,5868,6101,5869,5870,6275,6766,4527,6767)
|
4866,4899,6099,6100,5559,6478,6765,3599,5868,6101,5869,5870,6275,6766,4527,6767)
|
||||||
|
|
||||||
|
# flake8: noqa
|
||||||
|
|
8
thirdparty/chardet/gb2312prober.py
vendored
8
thirdparty/chardet/gb2312prober.py
vendored
|
@ -25,10 +25,10 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
from mbcharsetprober import MultiByteCharSetProber
|
from .mbcharsetprober import MultiByteCharSetProber
|
||||||
from codingstatemachine import CodingStateMachine
|
from .codingstatemachine import CodingStateMachine
|
||||||
from chardistribution import GB2312DistributionAnalysis
|
from .chardistribution import GB2312DistributionAnalysis
|
||||||
from mbcssm import GB2312SMModel
|
from .mbcssm import GB2312SMModel
|
||||||
|
|
||||||
class GB2312Prober(MultiByteCharSetProber):
|
class GB2312Prober(MultiByteCharSetProber):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
|
132
thirdparty/chardet/hebrewprober.py
vendored
132
thirdparty/chardet/hebrewprober.py
vendored
|
@ -25,8 +25,9 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
from charsetprober import CharSetProber
|
from .charsetprober import CharSetProber
|
||||||
import constants
|
from .constants import eNotMe, eDetecting
|
||||||
|
from .compat import wrap_ord
|
||||||
|
|
||||||
# This prober doesn't actually recognize a language or a charset.
|
# This prober doesn't actually recognize a language or a charset.
|
||||||
# It is a helper prober for the use of the Hebrew model probers
|
# It is a helper prober for the use of the Hebrew model probers
|
||||||
|
@ -126,28 +127,31 @@ import constants
|
||||||
# charset identified, either "windows-1255" or "ISO-8859-8".
|
# charset identified, either "windows-1255" or "ISO-8859-8".
|
||||||
|
|
||||||
# windows-1255 / ISO-8859-8 code points of interest
|
# windows-1255 / ISO-8859-8 code points of interest
|
||||||
FINAL_KAF = '\xea'
|
FINAL_KAF = 0xea
|
||||||
NORMAL_KAF = '\xeb'
|
NORMAL_KAF = 0xeb
|
||||||
FINAL_MEM = '\xed'
|
FINAL_MEM = 0xed
|
||||||
NORMAL_MEM = '\xee'
|
NORMAL_MEM = 0xee
|
||||||
FINAL_NUN = '\xef'
|
FINAL_NUN = 0xef
|
||||||
NORMAL_NUN = '\xf0'
|
NORMAL_NUN = 0xf0
|
||||||
FINAL_PE = '\xf3'
|
FINAL_PE = 0xf3
|
||||||
NORMAL_PE = '\xf4'
|
NORMAL_PE = 0xf4
|
||||||
FINAL_TSADI = '\xf5'
|
FINAL_TSADI = 0xf5
|
||||||
NORMAL_TSADI = '\xf6'
|
NORMAL_TSADI = 0xf6
|
||||||
|
|
||||||
# Minimum Visual vs Logical final letter score difference.
|
# Minimum Visual vs Logical final letter score difference.
|
||||||
# If the difference is below this, don't rely solely on the final letter score distance.
|
# If the difference is below this, don't rely solely on the final letter score
|
||||||
|
# distance.
|
||||||
MIN_FINAL_CHAR_DISTANCE = 5
|
MIN_FINAL_CHAR_DISTANCE = 5
|
||||||
|
|
||||||
# Minimum Visual vs Logical model score difference.
|
# Minimum Visual vs Logical model score difference.
|
||||||
# If the difference is below this, don't rely at all on the model score distance.
|
# If the difference is below this, don't rely at all on the model score
|
||||||
|
# distance.
|
||||||
MIN_MODEL_DISTANCE = 0.01
|
MIN_MODEL_DISTANCE = 0.01
|
||||||
|
|
||||||
VISUAL_HEBREW_NAME = "ISO-8859-8"
|
VISUAL_HEBREW_NAME = "ISO-8859-8"
|
||||||
LOGICAL_HEBREW_NAME = "windows-1255"
|
LOGICAL_HEBREW_NAME = "windows-1255"
|
||||||
|
|
||||||
|
|
||||||
class HebrewProber(CharSetProber):
|
class HebrewProber(CharSetProber):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
CharSetProber.__init__(self)
|
CharSetProber.__init__(self)
|
||||||
|
@ -159,8 +163,8 @@ class HebrewProber(CharSetProber):
|
||||||
self._mFinalCharLogicalScore = 0
|
self._mFinalCharLogicalScore = 0
|
||||||
self._mFinalCharVisualScore = 0
|
self._mFinalCharVisualScore = 0
|
||||||
# The two last characters seen in the previous buffer,
|
# The two last characters seen in the previous buffer,
|
||||||
# mPrev and mBeforePrev are initialized to space in order to simulate a word
|
# mPrev and mBeforePrev are initialized to space in order to simulate
|
||||||
# delimiter at the beginning of the data
|
# a word delimiter at the beginning of the data
|
||||||
self._mPrev = ' '
|
self._mPrev = ' '
|
||||||
self._mBeforePrev = ' '
|
self._mBeforePrev = ' '
|
||||||
# These probers are owned by the group prober.
|
# These probers are owned by the group prober.
|
||||||
|
@ -170,49 +174,52 @@ class HebrewProber(CharSetProber):
|
||||||
self._mVisualProber = visualProber
|
self._mVisualProber = visualProber
|
||||||
|
|
||||||
def is_final(self, c):
|
def is_final(self, c):
|
||||||
return c in [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI]
|
return wrap_ord(c) in [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE,
|
||||||
|
FINAL_TSADI]
|
||||||
|
|
||||||
def is_non_final(self, c):
|
def is_non_final(self, c):
|
||||||
# The normal Tsadi is not a good Non-Final letter due to words like
|
# The normal Tsadi is not a good Non-Final letter due to words like
|
||||||
# 'lechotet' (to chat) containing an apostrophe after the tsadi. This
|
# 'lechotet' (to chat) containing an apostrophe after the tsadi. This
|
||||||
# apostrophe is converted to a space in FilterWithoutEnglishLetters causing
|
# apostrophe is converted to a space in FilterWithoutEnglishLetters
|
||||||
# the Non-Final tsadi to appear at an end of a word even though this is not
|
# causing the Non-Final tsadi to appear at an end of a word even
|
||||||
# the case in the original text.
|
# though this is not the case in the original text.
|
||||||
# The letters Pe and Kaf rarely display a related behavior of not being a
|
# The letters Pe and Kaf rarely display a related behavior of not being
|
||||||
# good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for
|
# a good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak'
|
||||||
# example legally end with a Non-Final Pe or Kaf. However, the benefit of
|
# for example legally end with a Non-Final Pe or Kaf. However, the
|
||||||
# these letters as Non-Final letters outweighs the damage since these words
|
# benefit of these letters as Non-Final letters outweighs the damage
|
||||||
# are quite rare.
|
# since these words are quite rare.
|
||||||
return c in [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
|
return wrap_ord(c) in [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
|
||||||
|
|
||||||
def feed(self, aBuf):
|
def feed(self, aBuf):
|
||||||
# Final letter analysis for logical-visual decision.
|
# Final letter analysis for logical-visual decision.
|
||||||
# Look for evidence that the received buffer is either logical Hebrew or
|
# Look for evidence that the received buffer is either logical Hebrew
|
||||||
# visual Hebrew.
|
# or visual Hebrew.
|
||||||
# The following cases are checked:
|
# The following cases are checked:
|
||||||
# 1) A word longer than 1 letter, ending with a final letter. This is an
|
# 1) A word longer than 1 letter, ending with a final letter. This is
|
||||||
# indication that the text is laid out "naturally" since the final letter
|
# an indication that the text is laid out "naturally" since the
|
||||||
# really appears at the end. +1 for logical score.
|
# final letter really appears at the end. +1 for logical score.
|
||||||
# 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
|
# 2) A word longer than 1 letter, ending with a Non-Final letter. In
|
||||||
# Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with
|
# normal Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi,
|
||||||
# the Non-Final form of that letter. Exceptions to this rule are mentioned
|
# should not end with the Non-Final form of that letter. Exceptions
|
||||||
# above in isNonFinal(). This is an indication that the text is laid out
|
# to this rule are mentioned above in isNonFinal(). This is an
|
||||||
# backwards. +1 for visual score
|
# indication that the text is laid out backwards. +1 for visual
|
||||||
# 3) A word longer than 1 letter, starting with a final letter. Final letters
|
# score
|
||||||
# should not appear at the beginning of a word. This is an indication that
|
# 3) A word longer than 1 letter, starting with a final letter. Final
|
||||||
# the text is laid out backwards. +1 for visual score.
|
# letters should not appear at the beginning of a word. This is an
|
||||||
|
# indication that the text is laid out backwards. +1 for visual
|
||||||
|
# score.
|
||||||
#
|
#
|
||||||
# The visual score and logical score are accumulated throughout the text and
|
# The visual score and logical score are accumulated throughout the
|
||||||
# are finally checked against each other in GetCharSetName().
|
# text and are finally checked against each other in GetCharSetName().
|
||||||
# No checking for final letters in the middle of words is done since that case
|
# No checking for final letters in the middle of words is done since
|
||||||
# is not an indication for either Logical or Visual text.
|
# that case is not an indication for either Logical or Visual text.
|
||||||
#
|
#
|
||||||
# We automatically filter out all 7-bit characters (replace them with spaces)
|
# We automatically filter out all 7-bit characters (replace them with
|
||||||
# so the word boundary detection works properly. [MAP]
|
# spaces) so the word boundary detection works properly. [MAP]
|
||||||
|
|
||||||
if self.get_state() == constants.eNotMe:
|
if self.get_state() == eNotMe:
|
||||||
# Both model probers say it's not them. No reason to continue.
|
# Both model probers say it's not them. No reason to continue.
|
||||||
return constants.eNotMe
|
return eNotMe
|
||||||
|
|
||||||
aBuf = self.filter_high_bit_only(aBuf)
|
aBuf = self.filter_high_bit_only(aBuf)
|
||||||
|
|
||||||
|
@ -220,23 +227,27 @@ class HebrewProber(CharSetProber):
|
||||||
if cur == ' ':
|
if cur == ' ':
|
||||||
# We stand on a space - a word just ended
|
# We stand on a space - a word just ended
|
||||||
if self._mBeforePrev != ' ':
|
if self._mBeforePrev != ' ':
|
||||||
# next-to-last char was not a space so self._mPrev is not a 1 letter word
|
# next-to-last char was not a space so self._mPrev is not a
|
||||||
|
# 1 letter word
|
||||||
if self.is_final(self._mPrev):
|
if self.is_final(self._mPrev):
|
||||||
# case (1) [-2:not space][-1:final letter][cur:space]
|
# case (1) [-2:not space][-1:final letter][cur:space]
|
||||||
self._mFinalCharLogicalScore += 1
|
self._mFinalCharLogicalScore += 1
|
||||||
elif self.is_non_final(self._mPrev):
|
elif self.is_non_final(self._mPrev):
|
||||||
# case (2) [-2:not space][-1:Non-Final letter][cur:space]
|
# case (2) [-2:not space][-1:Non-Final letter][
|
||||||
|
# cur:space]
|
||||||
self._mFinalCharVisualScore += 1
|
self._mFinalCharVisualScore += 1
|
||||||
else:
|
else:
|
||||||
# Not standing on a space
|
# Not standing on a space
|
||||||
if (self._mBeforePrev == ' ') and (self.is_final(self._mPrev)) and (cur != ' '):
|
if ((self._mBeforePrev == ' ') and
|
||||||
|
(self.is_final(self._mPrev)) and (cur != ' ')):
|
||||||
# case (3) [-2:space][-1:final letter][cur:not space]
|
# case (3) [-2:space][-1:final letter][cur:not space]
|
||||||
self._mFinalCharVisualScore += 1
|
self._mFinalCharVisualScore += 1
|
||||||
self._mBeforePrev = self._mPrev
|
self._mBeforePrev = self._mPrev
|
||||||
self._mPrev = cur
|
self._mPrev = cur
|
||||||
|
|
||||||
# Forever detecting, till the end or until both model probers return eNotMe (handled above)
|
# Forever detecting, till the end or until both model probers return
|
||||||
return constants.eDetecting
|
# eNotMe (handled above)
|
||||||
|
return eDetecting
|
||||||
|
|
||||||
def get_charset_name(self):
|
def get_charset_name(self):
|
||||||
# Make the decision: is it Logical or Visual?
|
# Make the decision: is it Logical or Visual?
|
||||||
|
@ -248,22 +259,25 @@ class HebrewProber(CharSetProber):
|
||||||
return VISUAL_HEBREW_NAME
|
return VISUAL_HEBREW_NAME
|
||||||
|
|
||||||
# It's not dominant enough, try to rely on the model scores instead.
|
# It's not dominant enough, try to rely on the model scores instead.
|
||||||
modelsub = self._mLogicalProber.get_confidence() - self._mVisualProber.get_confidence()
|
modelsub = (self._mLogicalProber.get_confidence()
|
||||||
|
- self._mVisualProber.get_confidence())
|
||||||
if modelsub > MIN_MODEL_DISTANCE:
|
if modelsub > MIN_MODEL_DISTANCE:
|
||||||
return LOGICAL_HEBREW_NAME
|
return LOGICAL_HEBREW_NAME
|
||||||
if modelsub < -MIN_MODEL_DISTANCE:
|
if modelsub < -MIN_MODEL_DISTANCE:
|
||||||
return VISUAL_HEBREW_NAME
|
return VISUAL_HEBREW_NAME
|
||||||
|
|
||||||
# Still no good, back to final letter distance, maybe it'll save the day.
|
# Still no good, back to final letter distance, maybe it'll save the
|
||||||
|
# day.
|
||||||
if finalsub < 0.0:
|
if finalsub < 0.0:
|
||||||
return VISUAL_HEBREW_NAME
|
return VISUAL_HEBREW_NAME
|
||||||
|
|
||||||
# (finalsub > 0 - Logical) or (don't know what to do) default to Logical.
|
# (finalsub > 0 - Logical) or (don't know what to do) default to
|
||||||
|
# Logical.
|
||||||
return LOGICAL_HEBREW_NAME
|
return LOGICAL_HEBREW_NAME
|
||||||
|
|
||||||
def get_state(self):
|
def get_state(self):
|
||||||
# Remain active as long as any of the model probers are active.
|
# Remain active as long as any of the model probers are active.
|
||||||
if (self._mLogicalProber.get_state() == constants.eNotMe) and \
|
if (self._mLogicalProber.get_state() == eNotMe) and \
|
||||||
(self._mVisualProber.get_state() == constants.eNotMe):
|
(self._mVisualProber.get_state() == eNotMe):
|
||||||
return constants.eNotMe
|
return eNotMe
|
||||||
return constants.eDetecting
|
return eDetecting
|
||||||
|
|
4
thirdparty/chardet/jisfreq.py
vendored
4
thirdparty/chardet/jisfreq.py
vendored
|
@ -46,7 +46,7 @@ JIS_TYPICAL_DISTRIBUTION_RATIO = 3.0
|
||||||
# Char to FreqOrder table ,
|
# Char to FreqOrder table ,
|
||||||
JIS_TABLE_SIZE = 4368
|
JIS_TABLE_SIZE = 4368
|
||||||
|
|
||||||
JISCharToFreqOrder = ( \
|
JISCharToFreqOrder = (
|
||||||
40, 1, 6, 182, 152, 180, 295,2127, 285, 381,3295,4304,3068,4606,3165,3510, # 16
|
40, 1, 6, 182, 152, 180, 295,2127, 285, 381,3295,4304,3068,4606,3165,3510, # 16
|
||||||
3511,1822,2785,4607,1193,2226,5070,4608, 171,2996,1247, 18, 179,5071, 856,1661, # 32
|
3511,1822,2785,4607,1193,2226,5070,4608, 171,2996,1247, 18, 179,5071, 856,1661, # 32
|
||||||
1262,5072, 619, 127,3431,3512,3230,1899,1700, 232, 228,1294,1298, 284, 283,2041, # 48
|
1262,5072, 619, 127,3431,3512,3230,1899,1700, 232, 228,1294,1298, 284, 283,2041, # 48
|
||||||
|
@ -565,3 +565,5 @@ JISCharToFreqOrder = ( \
|
||||||
8224,8225,8226,8227,8228,8229,8230,8231,8232,8233,8234,8235,8236,8237,8238,8239, # 8240
|
8224,8225,8226,8227,8228,8229,8230,8231,8232,8233,8234,8235,8236,8237,8238,8239, # 8240
|
||||||
8240,8241,8242,8243,8244,8245,8246,8247,8248,8249,8250,8251,8252,8253,8254,8255, # 8256
|
8240,8241,8242,8243,8244,8245,8246,8247,8248,8249,8250,8251,8252,8253,8254,8255, # 8256
|
||||||
8256,8257,8258,8259,8260,8261,8262,8263,8264,8265,8266,8267,8268,8269,8270,8271) # 8272
|
8256,8257,8258,8259,8260,8261,8262,8263,8264,8265,8266,8267,8268,8269,8270,8271) # 8272
|
||||||
|
|
||||||
|
# flake8: noqa
|
||||||
|
|
85
thirdparty/chardet/jpcntx.py
vendored
85
thirdparty/chardet/jpcntx.py
vendored
|
@ -25,7 +25,7 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
import constants
|
from .compat import wrap_ord
|
||||||
|
|
||||||
NUM_OF_CATEGORY = 6
|
NUM_OF_CATEGORY = 6
|
||||||
DONT_KNOW = -1
|
DONT_KNOW = -1
|
||||||
|
@ -34,7 +34,7 @@ MAX_REL_THRESHOLD = 1000
|
||||||
MINIMUM_DATA_THRESHOLD = 4
|
MINIMUM_DATA_THRESHOLD = 4
|
||||||
|
|
||||||
# This is hiragana 2-char sequence table, the number in each cell represents its frequency category
|
# This is hiragana 2-char sequence table, the number in each cell represents its frequency category
|
||||||
jp2CharContext = ( \
|
jp2CharContext = (
|
||||||
(0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1),
|
(0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1),
|
||||||
(2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4),
|
(2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4),
|
||||||
(0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2),
|
(0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2),
|
||||||
|
@ -125,24 +125,31 @@ class JapaneseContextAnalysis:
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
self._mTotalRel = 0 # total sequence received
|
self._mTotalRel = 0 # total sequence received
|
||||||
self._mRelSample = [0] * NUM_OF_CATEGORY # category counters, each interger counts sequence in its category
|
# category counters, each interger counts sequence in its category
|
||||||
self._mNeedToSkipCharNum = 0 # if last byte in current buffer is not the last byte of a character, we need to know how many bytes to skip in next buffer
|
self._mRelSample = [0] * NUM_OF_CATEGORY
|
||||||
self._mLastCharOrder = -1 # The order of previous char
|
# if last byte in current buffer is not the last byte of a character,
|
||||||
self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made
|
# we need to know how many bytes to skip in next buffer
|
||||||
|
self._mNeedToSkipCharNum = 0
|
||||||
|
self._mLastCharOrder = -1 # The order of previous char
|
||||||
|
# If this flag is set to True, detection is done and conclusion has
|
||||||
|
# been made
|
||||||
|
self._mDone = False
|
||||||
|
|
||||||
def feed(self, aBuf, aLen):
|
def feed(self, aBuf, aLen):
|
||||||
if self._mDone: return
|
if self._mDone:
|
||||||
|
return
|
||||||
|
|
||||||
# The buffer we got is byte oriented, and a character may span in more than one
|
# The buffer we got is byte oriented, and a character may span in more than one
|
||||||
# buffers. In case the last one or two byte in last buffer is not complete, we
|
# buffers. In case the last one or two byte in last buffer is not
|
||||||
# record how many byte needed to complete that character and skip these bytes here.
|
# complete, we record how many byte needed to complete that character
|
||||||
# We can choose to record those bytes as well and analyse the character once it
|
# and skip these bytes here. We can choose to record those bytes as
|
||||||
# is complete, but since a character will not make much difference, by simply skipping
|
# well and analyse the character once it is complete, but since a
|
||||||
|
# character will not make much difference, by simply skipping
|
||||||
# this character will simply our logic and improve performance.
|
# this character will simply our logic and improve performance.
|
||||||
i = self._mNeedToSkipCharNum
|
i = self._mNeedToSkipCharNum
|
||||||
while i < aLen:
|
while i < aLen:
|
||||||
order, charLen = self.get_order(aBuf[i:i+2])
|
order, charLen = self.get_order(aBuf[i:i + 2])
|
||||||
i += charLen
|
i += charLen
|
||||||
if i > aLen:
|
if i > aLen:
|
||||||
self._mNeedToSkipCharNum = i - aLen
|
self._mNeedToSkipCharNum = i - aLen
|
||||||
|
@ -151,7 +158,7 @@ class JapaneseContextAnalysis:
|
||||||
if (order != -1) and (self._mLastCharOrder != -1):
|
if (order != -1) and (self._mLastCharOrder != -1):
|
||||||
self._mTotalRel += 1
|
self._mTotalRel += 1
|
||||||
if self._mTotalRel > MAX_REL_THRESHOLD:
|
if self._mTotalRel > MAX_REL_THRESHOLD:
|
||||||
self._mDone = constants.True
|
self._mDone = True
|
||||||
break
|
break
|
||||||
self._mRelSample[jp2CharContext[self._mLastCharOrder][order]] += 1
|
self._mRelSample[jp2CharContext[self._mLastCharOrder][order]] += 1
|
||||||
self._mLastCharOrder = order
|
self._mLastCharOrder = order
|
||||||
|
@ -166,45 +173,55 @@ class JapaneseContextAnalysis:
|
||||||
else:
|
else:
|
||||||
return DONT_KNOW
|
return DONT_KNOW
|
||||||
|
|
||||||
def get_order(self, aStr):
|
def get_order(self, aBuf):
|
||||||
return -1, 1
|
return -1, 1
|
||||||
|
|
||||||
class SJISContextAnalysis(JapaneseContextAnalysis):
|
class SJISContextAnalysis(JapaneseContextAnalysis):
|
||||||
def get_order(self, aStr):
|
def __init__(self):
|
||||||
if not aStr: return -1, 1
|
self.charset_name = "SHIFT_JIS"
|
||||||
|
|
||||||
|
def get_charset_name(self):
|
||||||
|
return self.charset_name
|
||||||
|
|
||||||
|
def get_order(self, aBuf):
|
||||||
|
if not aBuf:
|
||||||
|
return -1, 1
|
||||||
# find out current char's byte length
|
# find out current char's byte length
|
||||||
if ((aStr[0] >= '\x81') and (aStr[0] <= '\x9F')) or \
|
first_char = wrap_ord(aBuf[0])
|
||||||
((aStr[0] >= '\xE0') and (aStr[0] <= '\xFC')):
|
if ((0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC)):
|
||||||
charLen = 2
|
charLen = 2
|
||||||
|
if (first_char == 0x87) or (0xFA <= first_char <= 0xFC):
|
||||||
|
self.charset_name = "CP932"
|
||||||
else:
|
else:
|
||||||
charLen = 1
|
charLen = 1
|
||||||
|
|
||||||
# return its order if it is hiragana
|
# return its order if it is hiragana
|
||||||
if len(aStr) > 1:
|
if len(aBuf) > 1:
|
||||||
if (aStr[0] == '\202') and \
|
second_char = wrap_ord(aBuf[1])
|
||||||
(aStr[1] >= '\x9F') and \
|
if (first_char == 202) and (0x9F <= second_char <= 0xF1):
|
||||||
(aStr[1] <= '\xF1'):
|
return second_char - 0x9F, charLen
|
||||||
return ord(aStr[1]) - 0x9F, charLen
|
|
||||||
|
|
||||||
return -1, charLen
|
return -1, charLen
|
||||||
|
|
||||||
class EUCJPContextAnalysis(JapaneseContextAnalysis):
|
class EUCJPContextAnalysis(JapaneseContextAnalysis):
|
||||||
def get_order(self, aStr):
|
def get_order(self, aBuf):
|
||||||
if not aStr: return -1, 1
|
if not aBuf:
|
||||||
|
return -1, 1
|
||||||
# find out current char's byte length
|
# find out current char's byte length
|
||||||
if (aStr[0] == '\x8E') or \
|
first_char = wrap_ord(aBuf[0])
|
||||||
((aStr[0] >= '\xA1') and (aStr[0] <= '\xFE')):
|
if (first_char == 0x8E) or (0xA1 <= first_char <= 0xFE):
|
||||||
charLen = 2
|
charLen = 2
|
||||||
elif aStr[0] == '\x8F':
|
elif first_char == 0x8F:
|
||||||
charLen = 3
|
charLen = 3
|
||||||
else:
|
else:
|
||||||
charLen = 1
|
charLen = 1
|
||||||
|
|
||||||
# return its order if it is hiragana
|
# return its order if it is hiragana
|
||||||
if len(aStr) > 1:
|
if len(aBuf) > 1:
|
||||||
if (aStr[0] == '\xA4') and \
|
second_char = wrap_ord(aBuf[1])
|
||||||
(aStr[1] >= '\xA1') and \
|
if (first_char == 0xA4) and (0xA1 <= second_char <= 0xF3):
|
||||||
(aStr[1] <= '\xF3'):
|
return second_char - 0xA1, charLen
|
||||||
return ord(aStr[1]) - 0xA1, charLen
|
|
||||||
|
|
||||||
return -1, charLen
|
return -1, charLen
|
||||||
|
|
||||||
|
# flake8: noqa
|
||||||
|
|
19
thirdparty/chardet/langbulgarianmodel.py
vendored
19
thirdparty/chardet/langbulgarianmodel.py
vendored
|
@ -25,8 +25,6 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
import constants
|
|
||||||
|
|
||||||
# 255: Control characters that usually does not exist in any text
|
# 255: Control characters that usually does not exist in any text
|
||||||
# 254: Carriage/Return
|
# 254: Carriage/Return
|
||||||
# 253: symbol (punctuation) that does not belong to word
|
# 253: symbol (punctuation) that does not belong to word
|
||||||
|
@ -36,7 +34,7 @@ import constants
|
||||||
# this table is modified base on win1251BulgarianCharToOrderMap, so
|
# this table is modified base on win1251BulgarianCharToOrderMap, so
|
||||||
# only number <64 is sure valid
|
# only number <64 is sure valid
|
||||||
|
|
||||||
Latin5_BulgarianCharToOrderMap = ( \
|
Latin5_BulgarianCharToOrderMap = (
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||||
|
@ -55,7 +53,7 @@ Latin5_BulgarianCharToOrderMap = ( \
|
||||||
62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, # f0
|
62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, # f0
|
||||||
)
|
)
|
||||||
|
|
||||||
win1251BulgarianCharToOrderMap = ( \
|
win1251BulgarianCharToOrderMap = (
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||||
|
@ -80,7 +78,7 @@ win1251BulgarianCharToOrderMap = ( \
|
||||||
# first 1024 sequences:3.0618%
|
# first 1024 sequences:3.0618%
|
||||||
# rest sequences: 0.2992%
|
# rest sequences: 0.2992%
|
||||||
# negative sequences: 0.0020%
|
# negative sequences: 0.0020%
|
||||||
BulgarianLangModel = ( \
|
BulgarianLangModel = (
|
||||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,
|
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,
|
||||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2,
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2,
|
||||||
3,1,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,0,1,
|
3,1,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,0,1,
|
||||||
|
@ -211,18 +209,21 @@ BulgarianLangModel = ( \
|
||||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||||
)
|
)
|
||||||
|
|
||||||
Latin5BulgarianModel = { \
|
Latin5BulgarianModel = {
|
||||||
'charToOrderMap': Latin5_BulgarianCharToOrderMap,
|
'charToOrderMap': Latin5_BulgarianCharToOrderMap,
|
||||||
'precedenceMatrix': BulgarianLangModel,
|
'precedenceMatrix': BulgarianLangModel,
|
||||||
'mTypicalPositiveRatio': 0.969392,
|
'mTypicalPositiveRatio': 0.969392,
|
||||||
'keepEnglishLetter': constants.False,
|
'keepEnglishLetter': False,
|
||||||
'charsetName': "ISO-8859-5"
|
'charsetName': "ISO-8859-5"
|
||||||
}
|
}
|
||||||
|
|
||||||
Win1251BulgarianModel = { \
|
Win1251BulgarianModel = {
|
||||||
'charToOrderMap': win1251BulgarianCharToOrderMap,
|
'charToOrderMap': win1251BulgarianCharToOrderMap,
|
||||||
'precedenceMatrix': BulgarianLangModel,
|
'precedenceMatrix': BulgarianLangModel,
|
||||||
'mTypicalPositiveRatio': 0.969392,
|
'mTypicalPositiveRatio': 0.969392,
|
||||||
'keepEnglishLetter': constants.False,
|
'keepEnglishLetter': False,
|
||||||
'charsetName': "windows-1251"
|
'charsetName': "windows-1251"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# flake8: noqa
|
||||||
|
|
42
thirdparty/chardet/langcyrillicmodel.py
vendored
42
thirdparty/chardet/langcyrillicmodel.py
vendored
|
@ -25,11 +25,9 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
import constants
|
|
||||||
|
|
||||||
# KOI8-R language model
|
# KOI8-R language model
|
||||||
# Character Mapping Table:
|
# Character Mapping Table:
|
||||||
KOI8R_CharToOrderMap = ( \
|
KOI8R_CharToOrderMap = (
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||||
|
@ -48,7 +46,7 @@ KOI8R_CharToOrderMap = ( \
|
||||||
35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, # f0
|
35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, # f0
|
||||||
)
|
)
|
||||||
|
|
||||||
win1251_CharToOrderMap = ( \
|
win1251_CharToOrderMap = (
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||||
|
@ -67,7 +65,7 @@ win1251_CharToOrderMap = ( \
|
||||||
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
|
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
|
||||||
)
|
)
|
||||||
|
|
||||||
latin5_CharToOrderMap = ( \
|
latin5_CharToOrderMap = (
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||||
|
@ -86,7 +84,7 @@ latin5_CharToOrderMap = ( \
|
||||||
239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255,
|
239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255,
|
||||||
)
|
)
|
||||||
|
|
||||||
macCyrillic_CharToOrderMap = ( \
|
macCyrillic_CharToOrderMap = (
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||||
|
@ -105,7 +103,7 @@ macCyrillic_CharToOrderMap = ( \
|
||||||
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255,
|
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255,
|
||||||
)
|
)
|
||||||
|
|
||||||
IBM855_CharToOrderMap = ( \
|
IBM855_CharToOrderMap = (
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||||
|
@ -124,7 +122,7 @@ IBM855_CharToOrderMap = ( \
|
||||||
250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255,
|
250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255,
|
||||||
)
|
)
|
||||||
|
|
||||||
IBM866_CharToOrderMap = ( \
|
IBM866_CharToOrderMap = (
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||||
|
@ -149,7 +147,7 @@ IBM866_CharToOrderMap = ( \
|
||||||
# first 1024 sequences: 2.3389%
|
# first 1024 sequences: 2.3389%
|
||||||
# rest sequences: 0.1237%
|
# rest sequences: 0.1237%
|
||||||
# negative sequences: 0.0009%
|
# negative sequences: 0.0009%
|
||||||
RussianLangModel = ( \
|
RussianLangModel = (
|
||||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3,
|
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3,
|
||||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2,
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2,
|
||||||
3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,3,3,3,3,2,3,2,0,
|
3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,3,3,3,3,2,3,2,0,
|
||||||
|
@ -280,50 +278,52 @@ RussianLangModel = ( \
|
||||||
0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
|
0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
|
||||||
)
|
)
|
||||||
|
|
||||||
Koi8rModel = { \
|
Koi8rModel = {
|
||||||
'charToOrderMap': KOI8R_CharToOrderMap,
|
'charToOrderMap': KOI8R_CharToOrderMap,
|
||||||
'precedenceMatrix': RussianLangModel,
|
'precedenceMatrix': RussianLangModel,
|
||||||
'mTypicalPositiveRatio': 0.976601,
|
'mTypicalPositiveRatio': 0.976601,
|
||||||
'keepEnglishLetter': constants.False,
|
'keepEnglishLetter': False,
|
||||||
'charsetName': "KOI8-R"
|
'charsetName': "KOI8-R"
|
||||||
}
|
}
|
||||||
|
|
||||||
Win1251CyrillicModel = { \
|
Win1251CyrillicModel = {
|
||||||
'charToOrderMap': win1251_CharToOrderMap,
|
'charToOrderMap': win1251_CharToOrderMap,
|
||||||
'precedenceMatrix': RussianLangModel,
|
'precedenceMatrix': RussianLangModel,
|
||||||
'mTypicalPositiveRatio': 0.976601,
|
'mTypicalPositiveRatio': 0.976601,
|
||||||
'keepEnglishLetter': constants.False,
|
'keepEnglishLetter': False,
|
||||||
'charsetName': "windows-1251"
|
'charsetName': "windows-1251"
|
||||||
}
|
}
|
||||||
|
|
||||||
Latin5CyrillicModel = { \
|
Latin5CyrillicModel = {
|
||||||
'charToOrderMap': latin5_CharToOrderMap,
|
'charToOrderMap': latin5_CharToOrderMap,
|
||||||
'precedenceMatrix': RussianLangModel,
|
'precedenceMatrix': RussianLangModel,
|
||||||
'mTypicalPositiveRatio': 0.976601,
|
'mTypicalPositiveRatio': 0.976601,
|
||||||
'keepEnglishLetter': constants.False,
|
'keepEnglishLetter': False,
|
||||||
'charsetName': "ISO-8859-5"
|
'charsetName': "ISO-8859-5"
|
||||||
}
|
}
|
||||||
|
|
||||||
MacCyrillicModel = { \
|
MacCyrillicModel = {
|
||||||
'charToOrderMap': macCyrillic_CharToOrderMap,
|
'charToOrderMap': macCyrillic_CharToOrderMap,
|
||||||
'precedenceMatrix': RussianLangModel,
|
'precedenceMatrix': RussianLangModel,
|
||||||
'mTypicalPositiveRatio': 0.976601,
|
'mTypicalPositiveRatio': 0.976601,
|
||||||
'keepEnglishLetter': constants.False,
|
'keepEnglishLetter': False,
|
||||||
'charsetName': "MacCyrillic"
|
'charsetName': "MacCyrillic"
|
||||||
};
|
};
|
||||||
|
|
||||||
Ibm866Model = { \
|
Ibm866Model = {
|
||||||
'charToOrderMap': IBM866_CharToOrderMap,
|
'charToOrderMap': IBM866_CharToOrderMap,
|
||||||
'precedenceMatrix': RussianLangModel,
|
'precedenceMatrix': RussianLangModel,
|
||||||
'mTypicalPositiveRatio': 0.976601,
|
'mTypicalPositiveRatio': 0.976601,
|
||||||
'keepEnglishLetter': constants.False,
|
'keepEnglishLetter': False,
|
||||||
'charsetName': "IBM866"
|
'charsetName': "IBM866"
|
||||||
}
|
}
|
||||||
|
|
||||||
Ibm855Model = { \
|
Ibm855Model = {
|
||||||
'charToOrderMap': IBM855_CharToOrderMap,
|
'charToOrderMap': IBM855_CharToOrderMap,
|
||||||
'precedenceMatrix': RussianLangModel,
|
'precedenceMatrix': RussianLangModel,
|
||||||
'mTypicalPositiveRatio': 0.976601,
|
'mTypicalPositiveRatio': 0.976601,
|
||||||
'keepEnglishLetter': constants.False,
|
'keepEnglishLetter': False,
|
||||||
'charsetName': "IBM855"
|
'charsetName': "IBM855"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# flake8: noqa
|
||||||
|
|
18
thirdparty/chardet/langgreekmodel.py
vendored
18
thirdparty/chardet/langgreekmodel.py
vendored
|
@ -25,15 +25,13 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
import constants
|
|
||||||
|
|
||||||
# 255: Control characters that usually does not exist in any text
|
# 255: Control characters that usually does not exist in any text
|
||||||
# 254: Carriage/Return
|
# 254: Carriage/Return
|
||||||
# 253: symbol (punctuation) that does not belong to word
|
# 253: symbol (punctuation) that does not belong to word
|
||||||
# 252: 0 - 9
|
# 252: 0 - 9
|
||||||
|
|
||||||
# Character Mapping Table:
|
# Character Mapping Table:
|
||||||
Latin7_CharToOrderMap = ( \
|
Latin7_CharToOrderMap = (
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||||
|
@ -52,7 +50,7 @@ Latin7_CharToOrderMap = ( \
|
||||||
9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, # f0
|
9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, # f0
|
||||||
)
|
)
|
||||||
|
|
||||||
win1253_CharToOrderMap = ( \
|
win1253_CharToOrderMap = (
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||||
|
@ -77,7 +75,7 @@ win1253_CharToOrderMap = ( \
|
||||||
# first 1024 sequences:1.7001%
|
# first 1024 sequences:1.7001%
|
||||||
# rest sequences: 0.0359%
|
# rest sequences: 0.0359%
|
||||||
# negative sequences: 0.0148%
|
# negative sequences: 0.0148%
|
||||||
GreekLangModel = ( \
|
GreekLangModel = (
|
||||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
0,0,3,2,2,3,3,3,3,3,3,3,3,1,3,3,3,0,2,2,3,3,0,3,0,3,2,0,3,3,3,0,
|
0,0,3,2,2,3,3,3,3,3,3,3,3,1,3,3,3,0,2,2,3,3,0,3,0,3,2,0,3,3,3,0,
|
||||||
|
@ -208,18 +206,20 @@ GreekLangModel = ( \
|
||||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
)
|
)
|
||||||
|
|
||||||
Latin7GreekModel = { \
|
Latin7GreekModel = {
|
||||||
'charToOrderMap': Latin7_CharToOrderMap,
|
'charToOrderMap': Latin7_CharToOrderMap,
|
||||||
'precedenceMatrix': GreekLangModel,
|
'precedenceMatrix': GreekLangModel,
|
||||||
'mTypicalPositiveRatio': 0.982851,
|
'mTypicalPositiveRatio': 0.982851,
|
||||||
'keepEnglishLetter': constants.False,
|
'keepEnglishLetter': False,
|
||||||
'charsetName': "ISO-8859-7"
|
'charsetName': "ISO-8859-7"
|
||||||
}
|
}
|
||||||
|
|
||||||
Win1253GreekModel = { \
|
Win1253GreekModel = {
|
||||||
'charToOrderMap': win1253_CharToOrderMap,
|
'charToOrderMap': win1253_CharToOrderMap,
|
||||||
'precedenceMatrix': GreekLangModel,
|
'precedenceMatrix': GreekLangModel,
|
||||||
'mTypicalPositiveRatio': 0.982851,
|
'mTypicalPositiveRatio': 0.982851,
|
||||||
'keepEnglishLetter': constants.False,
|
'keepEnglishLetter': False,
|
||||||
'charsetName': "windows-1253"
|
'charsetName': "windows-1253"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# flake8: noqa
|
||||||
|
|
12
thirdparty/chardet/langhebrewmodel.py
vendored
12
thirdparty/chardet/langhebrewmodel.py
vendored
|
@ -27,8 +27,6 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
import constants
|
|
||||||
|
|
||||||
# 255: Control characters that usually does not exist in any text
|
# 255: Control characters that usually does not exist in any text
|
||||||
# 254: Carriage/Return
|
# 254: Carriage/Return
|
||||||
# 253: symbol (punctuation) that does not belong to word
|
# 253: symbol (punctuation) that does not belong to word
|
||||||
|
@ -36,7 +34,7 @@ import constants
|
||||||
|
|
||||||
# Windows-1255 language model
|
# Windows-1255 language model
|
||||||
# Character Mapping Table:
|
# Character Mapping Table:
|
||||||
win1255_CharToOrderMap = ( \
|
win1255_CharToOrderMap = (
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||||
|
@ -61,7 +59,7 @@ win1255_CharToOrderMap = ( \
|
||||||
# first 1024 sequences: 1.5981%
|
# first 1024 sequences: 1.5981%
|
||||||
# rest sequences: 0.087%
|
# rest sequences: 0.087%
|
||||||
# negative sequences: 0.0015%
|
# negative sequences: 0.0015%
|
||||||
HebrewLangModel = ( \
|
HebrewLangModel = (
|
||||||
0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0,
|
0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0,
|
||||||
3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1,
|
3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1,
|
||||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,
|
||||||
|
@ -192,10 +190,12 @@ HebrewLangModel = ( \
|
||||||
0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0,
|
0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0,
|
||||||
)
|
)
|
||||||
|
|
||||||
Win1255HebrewModel = { \
|
Win1255HebrewModel = {
|
||||||
'charToOrderMap': win1255_CharToOrderMap,
|
'charToOrderMap': win1255_CharToOrderMap,
|
||||||
'precedenceMatrix': HebrewLangModel,
|
'precedenceMatrix': HebrewLangModel,
|
||||||
'mTypicalPositiveRatio': 0.984004,
|
'mTypicalPositiveRatio': 0.984004,
|
||||||
'keepEnglishLetter': constants.False,
|
'keepEnglishLetter': False,
|
||||||
'charsetName': "windows-1255"
|
'charsetName': "windows-1255"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# flake8: noqa
|
||||||
|
|
18
thirdparty/chardet/langhungarianmodel.py
vendored
18
thirdparty/chardet/langhungarianmodel.py
vendored
|
@ -25,15 +25,13 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
import constants
|
|
||||||
|
|
||||||
# 255: Control characters that usually does not exist in any text
|
# 255: Control characters that usually does not exist in any text
|
||||||
# 254: Carriage/Return
|
# 254: Carriage/Return
|
||||||
# 253: symbol (punctuation) that does not belong to word
|
# 253: symbol (punctuation) that does not belong to word
|
||||||
# 252: 0 - 9
|
# 252: 0 - 9
|
||||||
|
|
||||||
# Character Mapping Table:
|
# Character Mapping Table:
|
||||||
Latin2_HungarianCharToOrderMap = ( \
|
Latin2_HungarianCharToOrderMap = (
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||||
|
@ -52,7 +50,7 @@ Latin2_HungarianCharToOrderMap = ( \
|
||||||
245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253,
|
245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253,
|
||||||
)
|
)
|
||||||
|
|
||||||
win1250HungarianCharToOrderMap = ( \
|
win1250HungarianCharToOrderMap = (
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||||
|
@ -77,7 +75,7 @@ win1250HungarianCharToOrderMap = ( \
|
||||||
# first 1024 sequences:5.2623%
|
# first 1024 sequences:5.2623%
|
||||||
# rest sequences: 0.8894%
|
# rest sequences: 0.8894%
|
||||||
# negative sequences: 0.0009%
|
# negative sequences: 0.0009%
|
||||||
HungarianLangModel = ( \
|
HungarianLangModel = (
|
||||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
|
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
|
||||||
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2,
|
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2,
|
||||||
3,2,2,3,3,3,3,3,2,3,3,3,3,3,3,1,2,3,3,3,3,2,3,3,1,1,3,3,0,1,1,1,
|
3,2,2,3,3,3,3,3,2,3,3,3,3,3,3,1,2,3,3,3,3,2,3,3,1,1,3,3,0,1,1,1,
|
||||||
|
@ -208,18 +206,20 @@ HungarianLangModel = ( \
|
||||||
0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
|
0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
|
||||||
)
|
)
|
||||||
|
|
||||||
Latin2HungarianModel = { \
|
Latin2HungarianModel = {
|
||||||
'charToOrderMap': Latin2_HungarianCharToOrderMap,
|
'charToOrderMap': Latin2_HungarianCharToOrderMap,
|
||||||
'precedenceMatrix': HungarianLangModel,
|
'precedenceMatrix': HungarianLangModel,
|
||||||
'mTypicalPositiveRatio': 0.947368,
|
'mTypicalPositiveRatio': 0.947368,
|
||||||
'keepEnglishLetter': constants.True,
|
'keepEnglishLetter': True,
|
||||||
'charsetName': "ISO-8859-2"
|
'charsetName': "ISO-8859-2"
|
||||||
}
|
}
|
||||||
|
|
||||||
Win1250HungarianModel = { \
|
Win1250HungarianModel = {
|
||||||
'charToOrderMap': win1250HungarianCharToOrderMap,
|
'charToOrderMap': win1250HungarianCharToOrderMap,
|
||||||
'precedenceMatrix': HungarianLangModel,
|
'precedenceMatrix': HungarianLangModel,
|
||||||
'mTypicalPositiveRatio': 0.947368,
|
'mTypicalPositiveRatio': 0.947368,
|
||||||
'keepEnglishLetter': constants.True,
|
'keepEnglishLetter': True,
|
||||||
'charsetName': "windows-1250"
|
'charsetName': "windows-1250"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# flake8: noqa
|
||||||
|
|
12
thirdparty/chardet/langthaimodel.py
vendored
12
thirdparty/chardet/langthaimodel.py
vendored
|
@ -25,8 +25,6 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
import constants
|
|
||||||
|
|
||||||
# 255: Control characters that usually does not exist in any text
|
# 255: Control characters that usually does not exist in any text
|
||||||
# 254: Carriage/Return
|
# 254: Carriage/Return
|
||||||
# 253: symbol (punctuation) that does not belong to word
|
# 253: symbol (punctuation) that does not belong to word
|
||||||
|
@ -35,7 +33,7 @@ import constants
|
||||||
# The following result for thai was collected from a limited sample (1M).
|
# The following result for thai was collected from a limited sample (1M).
|
||||||
|
|
||||||
# Character Mapping Table:
|
# Character Mapping Table:
|
||||||
TIS620CharToOrderMap = ( \
|
TIS620CharToOrderMap = (
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||||
|
@ -60,7 +58,7 @@ TIS620CharToOrderMap = ( \
|
||||||
# first 1024 sequences:7.3177%
|
# first 1024 sequences:7.3177%
|
||||||
# rest sequences: 1.0230%
|
# rest sequences: 1.0230%
|
||||||
# negative sequences: 0.0436%
|
# negative sequences: 0.0436%
|
||||||
ThaiLangModel = ( \
|
ThaiLangModel = (
|
||||||
0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3,
|
0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3,
|
||||||
0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2,
|
0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2,
|
||||||
3,0,3,3,2,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,3,0,3,2,3,0,2,2,2,3,
|
3,0,3,3,2,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,3,0,3,2,3,0,2,2,2,3,
|
||||||
|
@ -191,10 +189,12 @@ ThaiLangModel = ( \
|
||||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
)
|
)
|
||||||
|
|
||||||
TIS620ThaiModel = { \
|
TIS620ThaiModel = {
|
||||||
'charToOrderMap': TIS620CharToOrderMap,
|
'charToOrderMap': TIS620CharToOrderMap,
|
||||||
'precedenceMatrix': ThaiLangModel,
|
'precedenceMatrix': ThaiLangModel,
|
||||||
'mTypicalPositiveRatio': 0.926386,
|
'mTypicalPositiveRatio': 0.926386,
|
||||||
'keepEnglishLetter': constants.False,
|
'keepEnglishLetter': False,
|
||||||
'charsetName': "TIS-620"
|
'charsetName': "TIS-620"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# flake8: noqa
|
||||||
|
|
131
thirdparty/chardet/latin1prober.py
vendored
131
thirdparty/chardet/latin1prober.py
vendored
|
@ -26,73 +26,74 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
from charsetprober import CharSetProber
|
from .charsetprober import CharSetProber
|
||||||
import constants
|
from .constants import eNotMe
|
||||||
import operator
|
from .compat import wrap_ord
|
||||||
|
|
||||||
FREQ_CAT_NUM = 4
|
FREQ_CAT_NUM = 4
|
||||||
|
|
||||||
UDF = 0 # undefined
|
UDF = 0 # undefined
|
||||||
OTH = 1 # other
|
OTH = 1 # other
|
||||||
ASC = 2 # ascii capital letter
|
ASC = 2 # ascii capital letter
|
||||||
ASS = 3 # ascii small letter
|
ASS = 3 # ascii small letter
|
||||||
ACV = 4 # accent capital vowel
|
ACV = 4 # accent capital vowel
|
||||||
ACO = 5 # accent capital other
|
ACO = 5 # accent capital other
|
||||||
ASV = 6 # accent small vowel
|
ASV = 6 # accent small vowel
|
||||||
ASO = 7 # accent small other
|
ASO = 7 # accent small other
|
||||||
CLASS_NUM = 8 # total classes
|
CLASS_NUM = 8 # total classes
|
||||||
|
|
||||||
Latin1_CharToClass = ( \
|
Latin1_CharToClass = (
|
||||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07
|
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07
|
||||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F
|
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F
|
||||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17
|
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17
|
||||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F
|
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F
|
||||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27
|
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27
|
||||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F
|
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F
|
||||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37
|
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37
|
||||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F
|
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F
|
||||||
OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47
|
OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47
|
||||||
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F
|
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F
|
||||||
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57
|
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57
|
||||||
ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F
|
ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F
|
||||||
OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67
|
OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67
|
||||||
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F
|
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F
|
||||||
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77
|
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77
|
||||||
ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F
|
ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F
|
||||||
OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, # 80 - 87
|
OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, # 80 - 87
|
||||||
OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, # 88 - 8F
|
OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, # 88 - 8F
|
||||||
UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 90 - 97
|
UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 90 - 97
|
||||||
OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, # 98 - 9F
|
OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, # 98 - 9F
|
||||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A0 - A7
|
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A0 - A7
|
||||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A8 - AF
|
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A8 - AF
|
||||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7
|
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7
|
||||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B8 - BF
|
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B8 - BF
|
||||||
ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, # C0 - C7
|
ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, # C0 - C7
|
||||||
ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # C8 - CF
|
ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # C8 - CF
|
||||||
ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, # D0 - D7
|
ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, # D0 - D7
|
||||||
ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, # D8 - DF
|
ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, # D8 - DF
|
||||||
ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, # E0 - E7
|
ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, # E0 - E7
|
||||||
ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # E8 - EF
|
ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # E8 - EF
|
||||||
ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, # F0 - F7
|
ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, # F0 - F7
|
||||||
ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, # F8 - FF
|
ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, # F8 - FF
|
||||||
)
|
)
|
||||||
|
|
||||||
# 0 : illegal
|
# 0 : illegal
|
||||||
# 1 : very unlikely
|
# 1 : very unlikely
|
||||||
# 2 : normal
|
# 2 : normal
|
||||||
# 3 : very likely
|
# 3 : very likely
|
||||||
Latin1ClassModel = ( \
|
Latin1ClassModel = (
|
||||||
# UDF OTH ASC ASS ACV ACO ASV ASO
|
# UDF OTH ASC ASS ACV ACO ASV ASO
|
||||||
0, 0, 0, 0, 0, 0, 0, 0, # UDF
|
0, 0, 0, 0, 0, 0, 0, 0, # UDF
|
||||||
0, 3, 3, 3, 3, 3, 3, 3, # OTH
|
0, 3, 3, 3, 3, 3, 3, 3, # OTH
|
||||||
0, 3, 3, 3, 3, 3, 3, 3, # ASC
|
0, 3, 3, 3, 3, 3, 3, 3, # ASC
|
||||||
0, 3, 3, 3, 1, 1, 3, 3, # ASS
|
0, 3, 3, 3, 1, 1, 3, 3, # ASS
|
||||||
0, 3, 3, 3, 1, 2, 1, 2, # ACV
|
0, 3, 3, 3, 1, 2, 1, 2, # ACV
|
||||||
0, 3, 3, 3, 3, 3, 3, 3, # ACO
|
0, 3, 3, 3, 3, 3, 3, 3, # ACO
|
||||||
0, 3, 1, 3, 1, 1, 1, 3, # ASV
|
0, 3, 1, 3, 1, 1, 1, 3, # ASV
|
||||||
0, 3, 1, 3, 1, 1, 3, 3, # ASO
|
0, 3, 1, 3, 1, 1, 3, 3, # ASO
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class Latin1Prober(CharSetProber):
|
class Latin1Prober(CharSetProber):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
CharSetProber.__init__(self)
|
CharSetProber.__init__(self)
|
||||||
|
@ -109,10 +110,11 @@ class Latin1Prober(CharSetProber):
|
||||||
def feed(self, aBuf):
|
def feed(self, aBuf):
|
||||||
aBuf = self.filter_with_english_letters(aBuf)
|
aBuf = self.filter_with_english_letters(aBuf)
|
||||||
for c in aBuf:
|
for c in aBuf:
|
||||||
charClass = Latin1_CharToClass[ord(c)]
|
charClass = Latin1_CharToClass[wrap_ord(c)]
|
||||||
freq = Latin1ClassModel[(self._mLastCharClass * CLASS_NUM) + charClass]
|
freq = Latin1ClassModel[(self._mLastCharClass * CLASS_NUM)
|
||||||
|
+ charClass]
|
||||||
if freq == 0:
|
if freq == 0:
|
||||||
self._mState = constants.eNotMe
|
self._mState = eNotMe
|
||||||
break
|
break
|
||||||
self._mFreqCounter[freq] += 1
|
self._mFreqCounter[freq] += 1
|
||||||
self._mLastCharClass = charClass
|
self._mLastCharClass = charClass
|
||||||
|
@ -120,17 +122,18 @@ class Latin1Prober(CharSetProber):
|
||||||
return self.get_state()
|
return self.get_state()
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self):
|
||||||
if self.get_state() == constants.eNotMe:
|
if self.get_state() == eNotMe:
|
||||||
return 0.01
|
return 0.01
|
||||||
|
|
||||||
total = reduce(operator.add, self._mFreqCounter)
|
total = sum(self._mFreqCounter)
|
||||||
if total < 0.01:
|
if total < 0.01:
|
||||||
confidence = 0.0
|
confidence = 0.0
|
||||||
else:
|
else:
|
||||||
confidence = (self._mFreqCounter[3] / total) - (self._mFreqCounter[1] * 20.0 / total)
|
confidence = ((self._mFreqCounter[3] - self._mFreqCounter[1] * 20.0)
|
||||||
|
/ total)
|
||||||
if confidence < 0.0:
|
if confidence < 0.0:
|
||||||
confidence = 0.0
|
confidence = 0.0
|
||||||
# lower the confidence of latin1 so that other more accurate detector
|
# lower the confidence of latin1 so that other more accurate
|
||||||
# can take priority.
|
# detector can take priority.
|
||||||
confidence = confidence * 0.5
|
confidence = confidence * 0.73
|
||||||
return confidence
|
return confidence
|
||||||
|
|
30
thirdparty/chardet/mbcharsetprober.py
vendored
30
thirdparty/chardet/mbcharsetprober.py
vendored
|
@ -27,16 +27,17 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
import constants, sys
|
import sys
|
||||||
from constants import eStart, eError, eItsMe
|
from . import constants
|
||||||
from charsetprober import CharSetProber
|
from .charsetprober import CharSetProber
|
||||||
|
|
||||||
|
|
||||||
class MultiByteCharSetProber(CharSetProber):
|
class MultiByteCharSetProber(CharSetProber):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
CharSetProber.__init__(self)
|
CharSetProber.__init__(self)
|
||||||
self._mDistributionAnalyzer = None
|
self._mDistributionAnalyzer = None
|
||||||
self._mCodingSM = None
|
self._mCodingSM = None
|
||||||
self._mLastChar = ['\x00', '\x00']
|
self._mLastChar = [0, 0]
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
CharSetProber.reset(self)
|
CharSetProber.reset(self)
|
||||||
|
@ -44,36 +45,39 @@ class MultiByteCharSetProber(CharSetProber):
|
||||||
self._mCodingSM.reset()
|
self._mCodingSM.reset()
|
||||||
if self._mDistributionAnalyzer:
|
if self._mDistributionAnalyzer:
|
||||||
self._mDistributionAnalyzer.reset()
|
self._mDistributionAnalyzer.reset()
|
||||||
self._mLastChar = ['\x00', '\x00']
|
self._mLastChar = [0, 0]
|
||||||
|
|
||||||
def get_charset_name(self):
|
def get_charset_name(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def feed(self, aBuf):
|
def feed(self, aBuf):
|
||||||
aLen = len(aBuf)
|
aLen = len(aBuf)
|
||||||
for i in xrange(0, aLen):
|
for i in range(0, aLen):
|
||||||
codingState = self._mCodingSM.next_state(aBuf[i])
|
codingState = self._mCodingSM.next_state(aBuf[i])
|
||||||
if codingState == eError:
|
if codingState == constants.eError:
|
||||||
if constants._debug:
|
if constants._debug:
|
||||||
sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n')
|
sys.stderr.write(self.get_charset_name()
|
||||||
|
+ ' prober hit error at byte ' + str(i)
|
||||||
|
+ '\n')
|
||||||
self._mState = constants.eNotMe
|
self._mState = constants.eNotMe
|
||||||
break
|
break
|
||||||
elif codingState == eItsMe:
|
elif codingState == constants.eItsMe:
|
||||||
self._mState = constants.eFoundIt
|
self._mState = constants.eFoundIt
|
||||||
break
|
break
|
||||||
elif codingState == eStart:
|
elif codingState == constants.eStart:
|
||||||
charLen = self._mCodingSM.get_current_charlen()
|
charLen = self._mCodingSM.get_current_charlen()
|
||||||
if i == 0:
|
if i == 0:
|
||||||
self._mLastChar[1] = aBuf[0]
|
self._mLastChar[1] = aBuf[0]
|
||||||
self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
|
self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
|
||||||
else:
|
else:
|
||||||
self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen)
|
self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
|
||||||
|
charLen)
|
||||||
|
|
||||||
self._mLastChar[0] = aBuf[aLen - 1]
|
self._mLastChar[0] = aBuf[aLen - 1]
|
||||||
|
|
||||||
if self.get_state() == constants.eDetecting:
|
if self.get_state() == constants.eDetecting:
|
||||||
if self._mDistributionAnalyzer.got_enough_data() and \
|
if (self._mDistributionAnalyzer.got_enough_data() and
|
||||||
(self.get_confidence() > constants.SHORTCUT_THRESHOLD):
|
(self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
|
||||||
self._mState = constants.eFoundIt
|
self._mState = constants.eFoundIt
|
||||||
|
|
||||||
return self.get_state()
|
return self.get_state()
|
||||||
|
|
24
thirdparty/chardet/mbcsgroupprober.py
vendored
24
thirdparty/chardet/mbcsgroupprober.py
vendored
|
@ -27,24 +27,28 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
from charsetgroupprober import CharSetGroupProber
|
from .charsetgroupprober import CharSetGroupProber
|
||||||
from utf8prober import UTF8Prober
|
from .utf8prober import UTF8Prober
|
||||||
from sjisprober import SJISProber
|
from .sjisprober import SJISProber
|
||||||
from eucjpprober import EUCJPProber
|
from .eucjpprober import EUCJPProber
|
||||||
from gb2312prober import GB2312Prober
|
from .gb2312prober import GB2312Prober
|
||||||
from euckrprober import EUCKRProber
|
from .euckrprober import EUCKRProber
|
||||||
from big5prober import Big5Prober
|
from .cp949prober import CP949Prober
|
||||||
from euctwprober import EUCTWProber
|
from .big5prober import Big5Prober
|
||||||
|
from .euctwprober import EUCTWProber
|
||||||
|
|
||||||
|
|
||||||
class MBCSGroupProber(CharSetGroupProber):
|
class MBCSGroupProber(CharSetGroupProber):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
CharSetGroupProber.__init__(self)
|
CharSetGroupProber.__init__(self)
|
||||||
self._mProbers = [ \
|
self._mProbers = [
|
||||||
UTF8Prober(),
|
UTF8Prober(),
|
||||||
SJISProber(),
|
SJISProber(),
|
||||||
EUCJPProber(),
|
EUCJPProber(),
|
||||||
GB2312Prober(),
|
GB2312Prober(),
|
||||||
EUCKRProber(),
|
EUCKRProber(),
|
||||||
|
CP949Prober(),
|
||||||
Big5Prober(),
|
Big5Prober(),
|
||||||
EUCTWProber()]
|
EUCTWProber()
|
||||||
|
]
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
134
thirdparty/chardet/mbcssm.py
vendored
134
thirdparty/chardet/mbcssm.py
vendored
|
@ -25,11 +25,11 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
from constants import eStart, eError, eItsMe
|
from .constants import eStart, eError, eItsMe
|
||||||
|
|
||||||
# BIG5
|
# BIG5
|
||||||
|
|
||||||
BIG5_cls = ( \
|
BIG5_cls = (
|
||||||
1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as legal value
|
1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as legal value
|
||||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||||
1,1,1,1,1,1,1,1, # 10 - 17
|
1,1,1,1,1,1,1,1, # 10 - 17
|
||||||
|
@ -61,12 +61,14 @@ BIG5_cls = ( \
|
||||||
3,3,3,3,3,3,3,3, # e0 - e7
|
3,3,3,3,3,3,3,3, # e0 - e7
|
||||||
3,3,3,3,3,3,3,3, # e8 - ef
|
3,3,3,3,3,3,3,3, # e8 - ef
|
||||||
3,3,3,3,3,3,3,3, # f0 - f7
|
3,3,3,3,3,3,3,3, # f0 - f7
|
||||||
3,3,3,3,3,3,3,0) # f8 - ff
|
3,3,3,3,3,3,3,0 # f8 - ff
|
||||||
|
)
|
||||||
|
|
||||||
BIG5_st = ( \
|
BIG5_st = (
|
||||||
eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07
|
eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07
|
||||||
eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,#08-0f
|
eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,#08-0f
|
||||||
eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart)#10-17
|
eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart#10-17
|
||||||
|
)
|
||||||
|
|
||||||
Big5CharLenTable = (0, 1, 1, 2, 0)
|
Big5CharLenTable = (0, 1, 1, 2, 0)
|
||||||
|
|
||||||
|
@ -76,9 +78,49 @@ Big5SMModel = {'classTable': BIG5_cls,
|
||||||
'charLenTable': Big5CharLenTable,
|
'charLenTable': Big5CharLenTable,
|
||||||
'name': 'Big5'}
|
'name': 'Big5'}
|
||||||
|
|
||||||
|
# CP949
|
||||||
|
|
||||||
|
CP949_cls = (
|
||||||
|
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,0,0, # 00 - 0f
|
||||||
|
1,1,1,1,1,1,1,1, 1,1,1,0,1,1,1,1, # 10 - 1f
|
||||||
|
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 20 - 2f
|
||||||
|
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 30 - 3f
|
||||||
|
1,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, # 40 - 4f
|
||||||
|
4,4,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 50 - 5f
|
||||||
|
1,5,5,5,5,5,5,5, 5,5,5,5,5,5,5,5, # 60 - 6f
|
||||||
|
5,5,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 70 - 7f
|
||||||
|
0,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 80 - 8f
|
||||||
|
6,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 90 - 9f
|
||||||
|
6,7,7,7,7,7,7,7, 7,7,7,7,7,8,8,8, # a0 - af
|
||||||
|
7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7, # b0 - bf
|
||||||
|
7,7,7,7,7,7,9,2, 2,3,2,2,2,2,2,2, # c0 - cf
|
||||||
|
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # d0 - df
|
||||||
|
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # e0 - ef
|
||||||
|
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,0, # f0 - ff
|
||||||
|
)
|
||||||
|
|
||||||
|
CP949_st = (
|
||||||
|
#cls= 0 1 2 3 4 5 6 7 8 9 # previous state =
|
||||||
|
eError,eStart, 3,eError,eStart,eStart, 4, 5,eError, 6, # eStart
|
||||||
|
eError,eError,eError,eError,eError,eError,eError,eError,eError,eError, # eError
|
||||||
|
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe, # eItsMe
|
||||||
|
eError,eError,eStart,eStart,eError,eError,eError,eStart,eStart,eStart, # 3
|
||||||
|
eError,eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart, # 4
|
||||||
|
eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart, # 5
|
||||||
|
eError,eStart,eStart,eStart,eStart,eError,eError,eStart,eStart,eStart, # 6
|
||||||
|
)
|
||||||
|
|
||||||
|
CP949CharLenTable = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2)
|
||||||
|
|
||||||
|
CP949SMModel = {'classTable': CP949_cls,
|
||||||
|
'classFactor': 10,
|
||||||
|
'stateTable': CP949_st,
|
||||||
|
'charLenTable': CP949CharLenTable,
|
||||||
|
'name': 'CP949'}
|
||||||
|
|
||||||
# EUC-JP
|
# EUC-JP
|
||||||
|
|
||||||
EUCJP_cls = ( \
|
EUCJP_cls = (
|
||||||
4,4,4,4,4,4,4,4, # 00 - 07
|
4,4,4,4,4,4,4,4, # 00 - 07
|
||||||
4,4,4,4,4,4,5,5, # 08 - 0f
|
4,4,4,4,4,4,5,5, # 08 - 0f
|
||||||
4,4,4,4,4,4,4,4, # 10 - 17
|
4,4,4,4,4,4,4,4, # 10 - 17
|
||||||
|
@ -110,14 +152,16 @@ EUCJP_cls = ( \
|
||||||
0,0,0,0,0,0,0,0, # e0 - e7
|
0,0,0,0,0,0,0,0, # e0 - e7
|
||||||
0,0,0,0,0,0,0,0, # e8 - ef
|
0,0,0,0,0,0,0,0, # e8 - ef
|
||||||
0,0,0,0,0,0,0,0, # f0 - f7
|
0,0,0,0,0,0,0,0, # f0 - f7
|
||||||
0,0,0,0,0,0,0,5) # f8 - ff
|
0,0,0,0,0,0,0,5 # f8 - ff
|
||||||
|
)
|
||||||
|
|
||||||
EUCJP_st = ( \
|
EUCJP_st = (
|
||||||
3, 4, 3, 5,eStart,eError,eError,eError,#00-07
|
3, 4, 3, 5,eStart,eError,eError,eError,#00-07
|
||||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
|
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
|
||||||
eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError,#10-17
|
eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError,#10-17
|
||||||
eError,eError,eStart,eError,eError,eError, 3,eError,#18-1f
|
eError,eError,eStart,eError,eError,eError, 3,eError,#18-1f
|
||||||
3,eError,eError,eError,eStart,eStart,eStart,eStart)#20-27
|
3,eError,eError,eError,eStart,eStart,eStart,eStart#20-27
|
||||||
|
)
|
||||||
|
|
||||||
EUCJPCharLenTable = (2, 2, 2, 3, 1, 0)
|
EUCJPCharLenTable = (2, 2, 2, 3, 1, 0)
|
||||||
|
|
||||||
|
@ -129,7 +173,7 @@ EUCJPSMModel = {'classTable': EUCJP_cls,
|
||||||
|
|
||||||
# EUC-KR
|
# EUC-KR
|
||||||
|
|
||||||
EUCKR_cls = ( \
|
EUCKR_cls = (
|
||||||
1,1,1,1,1,1,1,1, # 00 - 07
|
1,1,1,1,1,1,1,1, # 00 - 07
|
||||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||||
1,1,1,1,1,1,1,1, # 10 - 17
|
1,1,1,1,1,1,1,1, # 10 - 17
|
||||||
|
@ -161,11 +205,13 @@ EUCKR_cls = ( \
|
||||||
2,2,2,2,2,2,2,2, # e0 - e7
|
2,2,2,2,2,2,2,2, # e0 - e7
|
||||||
2,2,2,2,2,2,2,2, # e8 - ef
|
2,2,2,2,2,2,2,2, # e8 - ef
|
||||||
2,2,2,2,2,2,2,2, # f0 - f7
|
2,2,2,2,2,2,2,2, # f0 - f7
|
||||||
2,2,2,2,2,2,2,0) # f8 - ff
|
2,2,2,2,2,2,2,0 # f8 - ff
|
||||||
|
)
|
||||||
|
|
||||||
EUCKR_st = (
|
EUCKR_st = (
|
||||||
eError,eStart, 3,eError,eError,eError,eError,eError,#00-07
|
eError,eStart, 3,eError,eError,eError,eError,eError,#00-07
|
||||||
eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart)#08-0f
|
eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart #08-0f
|
||||||
|
)
|
||||||
|
|
||||||
EUCKRCharLenTable = (0, 1, 2, 0)
|
EUCKRCharLenTable = (0, 1, 2, 0)
|
||||||
|
|
||||||
|
@ -177,7 +223,7 @@ EUCKRSMModel = {'classTable': EUCKR_cls,
|
||||||
|
|
||||||
# EUC-TW
|
# EUC-TW
|
||||||
|
|
||||||
EUCTW_cls = ( \
|
EUCTW_cls = (
|
||||||
2,2,2,2,2,2,2,2, # 00 - 07
|
2,2,2,2,2,2,2,2, # 00 - 07
|
||||||
2,2,2,2,2,2,0,0, # 08 - 0f
|
2,2,2,2,2,2,0,0, # 08 - 0f
|
||||||
2,2,2,2,2,2,2,2, # 10 - 17
|
2,2,2,2,2,2,2,2, # 10 - 17
|
||||||
|
@ -209,15 +255,17 @@ EUCTW_cls = ( \
|
||||||
3,3,3,3,3,3,3,3, # e0 - e7
|
3,3,3,3,3,3,3,3, # e0 - e7
|
||||||
3,3,3,3,3,3,3,3, # e8 - ef
|
3,3,3,3,3,3,3,3, # e8 - ef
|
||||||
3,3,3,3,3,3,3,3, # f0 - f7
|
3,3,3,3,3,3,3,3, # f0 - f7
|
||||||
3,3,3,3,3,3,3,0) # f8 - ff
|
3,3,3,3,3,3,3,0 # f8 - ff
|
||||||
|
)
|
||||||
|
|
||||||
EUCTW_st = ( \
|
EUCTW_st = (
|
||||||
eError,eError,eStart, 3, 3, 3, 4,eError,#00-07
|
eError,eError,eStart, 3, 3, 3, 4,eError,#00-07
|
||||||
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f
|
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f
|
||||||
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError,#10-17
|
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError,#10-17
|
||||||
eStart,eStart,eStart,eError,eError,eError,eError,eError,#18-1f
|
eStart,eStart,eStart,eError,eError,eError,eError,eError,#18-1f
|
||||||
5,eError,eError,eError,eStart,eError,eStart,eStart,#20-27
|
5,eError,eError,eError,eStart,eError,eStart,eStart,#20-27
|
||||||
eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart)#28-2f
|
eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart #28-2f
|
||||||
|
)
|
||||||
|
|
||||||
EUCTWCharLenTable = (0, 0, 1, 2, 2, 2, 3)
|
EUCTWCharLenTable = (0, 0, 1, 2, 2, 2, 3)
|
||||||
|
|
||||||
|
@ -229,7 +277,7 @@ EUCTWSMModel = {'classTable': EUCTW_cls,
|
||||||
|
|
||||||
# GB2312
|
# GB2312
|
||||||
|
|
||||||
GB2312_cls = ( \
|
GB2312_cls = (
|
||||||
1,1,1,1,1,1,1,1, # 00 - 07
|
1,1,1,1,1,1,1,1, # 00 - 07
|
||||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||||
1,1,1,1,1,1,1,1, # 10 - 17
|
1,1,1,1,1,1,1,1, # 10 - 17
|
||||||
|
@ -261,15 +309,17 @@ GB2312_cls = ( \
|
||||||
6,6,6,6,6,6,6,6, # e0 - e7
|
6,6,6,6,6,6,6,6, # e0 - e7
|
||||||
6,6,6,6,6,6,6,6, # e8 - ef
|
6,6,6,6,6,6,6,6, # e8 - ef
|
||||||
6,6,6,6,6,6,6,6, # f0 - f7
|
6,6,6,6,6,6,6,6, # f0 - f7
|
||||||
6,6,6,6,6,6,6,0) # f8 - ff
|
6,6,6,6,6,6,6,0 # f8 - ff
|
||||||
|
)
|
||||||
|
|
||||||
GB2312_st = ( \
|
GB2312_st = (
|
||||||
eError,eStart,eStart,eStart,eStart,eStart, 3,eError,#00-07
|
eError,eStart,eStart,eStart,eStart,eStart, 3,eError,#00-07
|
||||||
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f
|
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f
|
||||||
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,#10-17
|
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,#10-17
|
||||||
4,eError,eStart,eStart,eError,eError,eError,eError,#18-1f
|
4,eError,eStart,eStart,eError,eError,eError,eError,#18-1f
|
||||||
eError,eError, 5,eError,eError,eError,eItsMe,eError,#20-27
|
eError,eError, 5,eError,eError,eError,eItsMe,eError,#20-27
|
||||||
eError,eError,eStart,eStart,eStart,eStart,eStart,eStart)#28-2f
|
eError,eError,eStart,eStart,eStart,eStart,eStart,eStart #28-2f
|
||||||
|
)
|
||||||
|
|
||||||
# To be accurate, the length of class 6 can be either 2 or 4.
|
# To be accurate, the length of class 6 can be either 2 or 4.
|
||||||
# But it is not necessary to discriminate between the two since
|
# But it is not necessary to discriminate between the two since
|
||||||
|
@ -286,7 +336,7 @@ GB2312SMModel = {'classTable': GB2312_cls,
|
||||||
|
|
||||||
# Shift_JIS
|
# Shift_JIS
|
||||||
|
|
||||||
SJIS_cls = ( \
|
SJIS_cls = (
|
||||||
1,1,1,1,1,1,1,1, # 00 - 07
|
1,1,1,1,1,1,1,1, # 00 - 07
|
||||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||||
1,1,1,1,1,1,1,1, # 10 - 17
|
1,1,1,1,1,1,1,1, # 10 - 17
|
||||||
|
@ -303,7 +353,7 @@ SJIS_cls = ( \
|
||||||
2,2,2,2,2,2,2,2, # 68 - 6f
|
2,2,2,2,2,2,2,2, # 68 - 6f
|
||||||
2,2,2,2,2,2,2,2, # 70 - 77
|
2,2,2,2,2,2,2,2, # 70 - 77
|
||||||
2,2,2,2,2,2,2,1, # 78 - 7f
|
2,2,2,2,2,2,2,1, # 78 - 7f
|
||||||
3,3,3,3,3,3,3,3, # 80 - 87
|
3,3,3,3,3,2,2,3, # 80 - 87
|
||||||
3,3,3,3,3,3,3,3, # 88 - 8f
|
3,3,3,3,3,3,3,3, # 88 - 8f
|
||||||
3,3,3,3,3,3,3,3, # 90 - 97
|
3,3,3,3,3,3,3,3, # 90 - 97
|
||||||
3,3,3,3,3,3,3,3, # 98 - 9f
|
3,3,3,3,3,3,3,3, # 98 - 9f
|
||||||
|
@ -319,13 +369,15 @@ SJIS_cls = ( \
|
||||||
2,2,2,2,2,2,2,2, # d8 - df
|
2,2,2,2,2,2,2,2, # d8 - df
|
||||||
3,3,3,3,3,3,3,3, # e0 - e7
|
3,3,3,3,3,3,3,3, # e0 - e7
|
||||||
3,3,3,3,3,4,4,4, # e8 - ef
|
3,3,3,3,3,4,4,4, # e8 - ef
|
||||||
4,4,4,4,4,4,4,4, # f0 - f7
|
3,3,3,3,3,3,3,3, # f0 - f7
|
||||||
4,4,4,4,4,0,0,0) # f8 - ff
|
3,3,3,3,3,0,0,0) # f8 - ff
|
||||||
|
|
||||||
SJIS_st = ( \
|
|
||||||
|
SJIS_st = (
|
||||||
eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07
|
eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07
|
||||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
|
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
|
||||||
eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart)#10-17
|
eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart #10-17
|
||||||
|
)
|
||||||
|
|
||||||
SJISCharLenTable = (0, 1, 1, 2, 0, 0)
|
SJISCharLenTable = (0, 1, 1, 2, 0, 0)
|
||||||
|
|
||||||
|
@ -337,7 +389,7 @@ SJISSMModel = {'classTable': SJIS_cls,
|
||||||
|
|
||||||
# UCS2-BE
|
# UCS2-BE
|
||||||
|
|
||||||
UCS2BE_cls = ( \
|
UCS2BE_cls = (
|
||||||
0,0,0,0,0,0,0,0, # 00 - 07
|
0,0,0,0,0,0,0,0, # 00 - 07
|
||||||
0,0,1,0,0,2,0,0, # 08 - 0f
|
0,0,1,0,0,2,0,0, # 08 - 0f
|
||||||
0,0,0,0,0,0,0,0, # 10 - 17
|
0,0,0,0,0,0,0,0, # 10 - 17
|
||||||
|
@ -369,16 +421,18 @@ UCS2BE_cls = ( \
|
||||||
0,0,0,0,0,0,0,0, # e0 - e7
|
0,0,0,0,0,0,0,0, # e0 - e7
|
||||||
0,0,0,0,0,0,0,0, # e8 - ef
|
0,0,0,0,0,0,0,0, # e8 - ef
|
||||||
0,0,0,0,0,0,0,0, # f0 - f7
|
0,0,0,0,0,0,0,0, # f0 - f7
|
||||||
0,0,0,0,0,0,4,5) # f8 - ff
|
0,0,0,0,0,0,4,5 # f8 - ff
|
||||||
|
)
|
||||||
|
|
||||||
UCS2BE_st = ( \
|
UCS2BE_st = (
|
||||||
5, 7, 7,eError, 4, 3,eError,eError,#00-07
|
5, 7, 7,eError, 4, 3,eError,eError,#00-07
|
||||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
|
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
|
||||||
eItsMe,eItsMe, 6, 6, 6, 6,eError,eError,#10-17
|
eItsMe,eItsMe, 6, 6, 6, 6,eError,eError,#10-17
|
||||||
6, 6, 6, 6, 6,eItsMe, 6, 6,#18-1f
|
6, 6, 6, 6, 6,eItsMe, 6, 6,#18-1f
|
||||||
6, 6, 6, 6, 5, 7, 7,eError,#20-27
|
6, 6, 6, 6, 5, 7, 7,eError,#20-27
|
||||||
5, 8, 6, 6,eError, 6, 6, 6,#28-2f
|
5, 8, 6, 6,eError, 6, 6, 6,#28-2f
|
||||||
6, 6, 6, 6,eError,eError,eStart,eStart)#30-37
|
6, 6, 6, 6,eError,eError,eStart,eStart #30-37
|
||||||
|
)
|
||||||
|
|
||||||
UCS2BECharLenTable = (2, 2, 2, 0, 2, 2)
|
UCS2BECharLenTable = (2, 2, 2, 0, 2, 2)
|
||||||
|
|
||||||
|
@ -390,7 +444,7 @@ UCS2BESMModel = {'classTable': UCS2BE_cls,
|
||||||
|
|
||||||
# UCS2-LE
|
# UCS2-LE
|
||||||
|
|
||||||
UCS2LE_cls = ( \
|
UCS2LE_cls = (
|
||||||
0,0,0,0,0,0,0,0, # 00 - 07
|
0,0,0,0,0,0,0,0, # 00 - 07
|
||||||
0,0,1,0,0,2,0,0, # 08 - 0f
|
0,0,1,0,0,2,0,0, # 08 - 0f
|
||||||
0,0,0,0,0,0,0,0, # 10 - 17
|
0,0,0,0,0,0,0,0, # 10 - 17
|
||||||
|
@ -422,16 +476,18 @@ UCS2LE_cls = ( \
|
||||||
0,0,0,0,0,0,0,0, # e0 - e7
|
0,0,0,0,0,0,0,0, # e0 - e7
|
||||||
0,0,0,0,0,0,0,0, # e8 - ef
|
0,0,0,0,0,0,0,0, # e8 - ef
|
||||||
0,0,0,0,0,0,0,0, # f0 - f7
|
0,0,0,0,0,0,0,0, # f0 - f7
|
||||||
0,0,0,0,0,0,4,5) # f8 - ff
|
0,0,0,0,0,0,4,5 # f8 - ff
|
||||||
|
)
|
||||||
|
|
||||||
UCS2LE_st = ( \
|
UCS2LE_st = (
|
||||||
6, 6, 7, 6, 4, 3,eError,eError,#00-07
|
6, 6, 7, 6, 4, 3,eError,eError,#00-07
|
||||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
|
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
|
||||||
eItsMe,eItsMe, 5, 5, 5,eError,eItsMe,eError,#10-17
|
eItsMe,eItsMe, 5, 5, 5,eError,eItsMe,eError,#10-17
|
||||||
5, 5, 5,eError, 5,eError, 6, 6,#18-1f
|
5, 5, 5,eError, 5,eError, 6, 6,#18-1f
|
||||||
7, 6, 8, 8, 5, 5, 5,eError,#20-27
|
7, 6, 8, 8, 5, 5, 5,eError,#20-27
|
||||||
5, 5, 5,eError,eError,eError, 5, 5,#28-2f
|
5, 5, 5,eError,eError,eError, 5, 5,#28-2f
|
||||||
5, 5, 5,eError, 5,eError,eStart,eStart)#30-37
|
5, 5, 5,eError, 5,eError,eStart,eStart #30-37
|
||||||
|
)
|
||||||
|
|
||||||
UCS2LECharLenTable = (2, 2, 2, 2, 2, 2)
|
UCS2LECharLenTable = (2, 2, 2, 2, 2, 2)
|
||||||
|
|
||||||
|
@ -443,7 +499,7 @@ UCS2LESMModel = {'classTable': UCS2LE_cls,
|
||||||
|
|
||||||
# UTF-8
|
# UTF-8
|
||||||
|
|
||||||
UTF8_cls = ( \
|
UTF8_cls = (
|
||||||
1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as a legal value
|
1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as a legal value
|
||||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||||
1,1,1,1,1,1,1,1, # 10 - 17
|
1,1,1,1,1,1,1,1, # 10 - 17
|
||||||
|
@ -475,9 +531,10 @@ UTF8_cls = ( \
|
||||||
7,8,8,8,8,8,8,8, # e0 - e7
|
7,8,8,8,8,8,8,8, # e0 - e7
|
||||||
8,8,8,8,8,9,8,8, # e8 - ef
|
8,8,8,8,8,9,8,8, # e8 - ef
|
||||||
10,11,11,11,11,11,11,11, # f0 - f7
|
10,11,11,11,11,11,11,11, # f0 - f7
|
||||||
12,13,13,13,14,15,0,0) # f8 - ff
|
12,13,13,13,14,15,0,0 # f8 - ff
|
||||||
|
)
|
||||||
|
|
||||||
UTF8_st = ( \
|
UTF8_st = (
|
||||||
eError,eStart,eError,eError,eError,eError, 12, 10,#00-07
|
eError,eStart,eError,eError,eError,eError, 12, 10,#00-07
|
||||||
9, 11, 8, 7, 6, 5, 4, 3,#08-0f
|
9, 11, 8, 7, 6, 5, 4, 3,#08-0f
|
||||||
eError,eError,eError,eError,eError,eError,eError,eError,#10-17
|
eError,eError,eError,eError,eError,eError,eError,eError,#10-17
|
||||||
|
@ -503,7 +560,8 @@ UTF8_st = ( \
|
||||||
eError,eError, 12, 12, 12,eError,eError,eError,#b0-b7
|
eError,eError, 12, 12, 12,eError,eError,eError,#b0-b7
|
||||||
eError,eError,eError,eError,eError,eError,eError,eError,#b8-bf
|
eError,eError,eError,eError,eError,eError,eError,eError,#b8-bf
|
||||||
eError,eError,eStart,eStart,eStart,eStart,eError,eError,#c0-c7
|
eError,eError,eStart,eStart,eStart,eStart,eError,eError,#c0-c7
|
||||||
eError,eError,eError,eError,eError,eError,eError,eError)#c8-cf
|
eError,eError,eError,eError,eError,eError,eError,eError #c8-cf
|
||||||
|
)
|
||||||
|
|
||||||
UTF8CharLenTable = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6)
|
UTF8CharLenTable = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6)
|
||||||
|
|
||||||
|
|
46
thirdparty/chardet/sbcharsetprober.py
vendored
46
thirdparty/chardet/sbcharsetprober.py
vendored
|
@ -26,8 +26,10 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
import constants, sys
|
import sys
|
||||||
from charsetprober import CharSetProber
|
from . import constants
|
||||||
|
from .charsetprober import CharSetProber
|
||||||
|
from .compat import wrap_ord
|
||||||
|
|
||||||
SAMPLE_SIZE = 64
|
SAMPLE_SIZE = 64
|
||||||
SB_ENOUGH_REL_THRESHOLD = 1024
|
SB_ENOUGH_REL_THRESHOLD = 1024
|
||||||
|
@ -38,21 +40,26 @@ NUMBER_OF_SEQ_CAT = 4
|
||||||
POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1
|
POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1
|
||||||
#NEGATIVE_CAT = 0
|
#NEGATIVE_CAT = 0
|
||||||
|
|
||||||
|
|
||||||
class SingleByteCharSetProber(CharSetProber):
|
class SingleByteCharSetProber(CharSetProber):
|
||||||
def __init__(self, model, reversed=constants.False, nameProber=None):
|
def __init__(self, model, reversed=False, nameProber=None):
|
||||||
CharSetProber.__init__(self)
|
CharSetProber.__init__(self)
|
||||||
self._mModel = model
|
self._mModel = model
|
||||||
self._mReversed = reversed # TRUE if we need to reverse every pair in the model lookup
|
# TRUE if we need to reverse every pair in the model lookup
|
||||||
self._mNameProber = nameProber # Optional auxiliary prober for name decision
|
self._mReversed = reversed
|
||||||
|
# Optional auxiliary prober for name decision
|
||||||
|
self._mNameProber = nameProber
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
CharSetProber.reset(self)
|
CharSetProber.reset(self)
|
||||||
self._mLastOrder = 255 # char order of last character
|
# char order of last character
|
||||||
|
self._mLastOrder = 255
|
||||||
self._mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
|
self._mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
|
||||||
self._mTotalSeqs = 0
|
self._mTotalSeqs = 0
|
||||||
self._mTotalChar = 0
|
self._mTotalChar = 0
|
||||||
self._mFreqChar = 0 # characters that fall in our sampling range
|
# characters that fall in our sampling range
|
||||||
|
self._mFreqChar = 0
|
||||||
|
|
||||||
def get_charset_name(self):
|
def get_charset_name(self):
|
||||||
if self._mNameProber:
|
if self._mNameProber:
|
||||||
|
@ -67,7 +74,7 @@ class SingleByteCharSetProber(CharSetProber):
|
||||||
if not aLen:
|
if not aLen:
|
||||||
return self.get_state()
|
return self.get_state()
|
||||||
for c in aBuf:
|
for c in aBuf:
|
||||||
order = self._mModel['charToOrderMap'][ord(c)]
|
order = self._mModel['charToOrderMap'][wrap_ord(c)]
|
||||||
if order < SYMBOL_CAT_ORDER:
|
if order < SYMBOL_CAT_ORDER:
|
||||||
self._mTotalChar += 1
|
self._mTotalChar += 1
|
||||||
if order < SAMPLE_SIZE:
|
if order < SAMPLE_SIZE:
|
||||||
|
@ -75,9 +82,12 @@ class SingleByteCharSetProber(CharSetProber):
|
||||||
if self._mLastOrder < SAMPLE_SIZE:
|
if self._mLastOrder < SAMPLE_SIZE:
|
||||||
self._mTotalSeqs += 1
|
self._mTotalSeqs += 1
|
||||||
if not self._mReversed:
|
if not self._mReversed:
|
||||||
self._mSeqCounters[self._mModel['precedenceMatrix'][(self._mLastOrder * SAMPLE_SIZE) + order]] += 1
|
i = (self._mLastOrder * SAMPLE_SIZE) + order
|
||||||
else: # reverse the order of the letters in the lookup
|
model = self._mModel['precedenceMatrix'][i]
|
||||||
self._mSeqCounters[self._mModel['precedenceMatrix'][(order * SAMPLE_SIZE) + self._mLastOrder]] += 1
|
else: # reverse the order of the letters in the lookup
|
||||||
|
i = (order * SAMPLE_SIZE) + self._mLastOrder
|
||||||
|
model = self._mModel['precedenceMatrix'][i]
|
||||||
|
self._mSeqCounters[model] += 1
|
||||||
self._mLastOrder = order
|
self._mLastOrder = order
|
||||||
|
|
||||||
if self.get_state() == constants.eDetecting:
|
if self.get_state() == constants.eDetecting:
|
||||||
|
@ -85,11 +95,16 @@ class SingleByteCharSetProber(CharSetProber):
|
||||||
cf = self.get_confidence()
|
cf = self.get_confidence()
|
||||||
if cf > POSITIVE_SHORTCUT_THRESHOLD:
|
if cf > POSITIVE_SHORTCUT_THRESHOLD:
|
||||||
if constants._debug:
|
if constants._debug:
|
||||||
sys.stderr.write('%s confidence = %s, we have a winner\n' % (self._mModel['charsetName'], cf))
|
sys.stderr.write('%s confidence = %s, we have a'
|
||||||
|
'winner\n' %
|
||||||
|
(self._mModel['charsetName'], cf))
|
||||||
self._mState = constants.eFoundIt
|
self._mState = constants.eFoundIt
|
||||||
elif cf < NEGATIVE_SHORTCUT_THRESHOLD:
|
elif cf < NEGATIVE_SHORTCUT_THRESHOLD:
|
||||||
if constants._debug:
|
if constants._debug:
|
||||||
sys.stderr.write('%s confidence = %s, below negative shortcut threshhold %s\n' % (self._mModel['charsetName'], cf, NEGATIVE_SHORTCUT_THRESHOLD))
|
sys.stderr.write('%s confidence = %s, below negative'
|
||||||
|
'shortcut threshhold %s\n' %
|
||||||
|
(self._mModel['charsetName'], cf,
|
||||||
|
NEGATIVE_SHORTCUT_THRESHOLD))
|
||||||
self._mState = constants.eNotMe
|
self._mState = constants.eNotMe
|
||||||
|
|
||||||
return self.get_state()
|
return self.get_state()
|
||||||
|
@ -97,9 +112,8 @@ class SingleByteCharSetProber(CharSetProber):
|
||||||
def get_confidence(self):
|
def get_confidence(self):
|
||||||
r = 0.01
|
r = 0.01
|
||||||
if self._mTotalSeqs > 0:
|
if self._mTotalSeqs > 0:
|
||||||
# print self._mSeqCounters[POSITIVE_CAT], self._mTotalSeqs, self._mModel['mTypicalPositiveRatio']
|
r = ((1.0 * self._mSeqCounters[POSITIVE_CAT]) / self._mTotalSeqs
|
||||||
r = (1.0 * self._mSeqCounters[POSITIVE_CAT]) / self._mTotalSeqs / self._mModel['mTypicalPositiveRatio']
|
/ self._mModel['mTypicalPositiveRatio'])
|
||||||
# print r, self._mFreqChar, self._mTotalChar
|
|
||||||
r = r * self._mFreqChar / self._mTotalChar
|
r = r * self._mFreqChar / self._mTotalChar
|
||||||
if r >= 1.0:
|
if r >= 1.0:
|
||||||
r = 0.99
|
r = 0.99
|
||||||
|
|
35
thirdparty/chardet/sbcsgroupprober.py
vendored
35
thirdparty/chardet/sbcsgroupprober.py
vendored
|
@ -26,21 +26,23 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
import constants, sys
|
from .charsetgroupprober import CharSetGroupProber
|
||||||
from charsetgroupprober import CharSetGroupProber
|
from .sbcharsetprober import SingleByteCharSetProber
|
||||||
from sbcharsetprober import SingleByteCharSetProber
|
from .langcyrillicmodel import (Win1251CyrillicModel, Koi8rModel,
|
||||||
from langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model
|
Latin5CyrillicModel, MacCyrillicModel,
|
||||||
from langgreekmodel import Latin7GreekModel, Win1253GreekModel
|
Ibm866Model, Ibm855Model)
|
||||||
from langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
|
from .langgreekmodel import Latin7GreekModel, Win1253GreekModel
|
||||||
from langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
|
from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
|
||||||
from langthaimodel import TIS620ThaiModel
|
from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
|
||||||
from langhebrewmodel import Win1255HebrewModel
|
from .langthaimodel import TIS620ThaiModel
|
||||||
from hebrewprober import HebrewProber
|
from .langhebrewmodel import Win1255HebrewModel
|
||||||
|
from .hebrewprober import HebrewProber
|
||||||
|
|
||||||
|
|
||||||
class SBCSGroupProber(CharSetGroupProber):
|
class SBCSGroupProber(CharSetGroupProber):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
CharSetGroupProber.__init__(self)
|
CharSetGroupProber.__init__(self)
|
||||||
self._mProbers = [ \
|
self._mProbers = [
|
||||||
SingleByteCharSetProber(Win1251CyrillicModel),
|
SingleByteCharSetProber(Win1251CyrillicModel),
|
||||||
SingleByteCharSetProber(Koi8rModel),
|
SingleByteCharSetProber(Koi8rModel),
|
||||||
SingleByteCharSetProber(Latin5CyrillicModel),
|
SingleByteCharSetProber(Latin5CyrillicModel),
|
||||||
|
@ -54,11 +56,14 @@ class SBCSGroupProber(CharSetGroupProber):
|
||||||
SingleByteCharSetProber(Latin2HungarianModel),
|
SingleByteCharSetProber(Latin2HungarianModel),
|
||||||
SingleByteCharSetProber(Win1250HungarianModel),
|
SingleByteCharSetProber(Win1250HungarianModel),
|
||||||
SingleByteCharSetProber(TIS620ThaiModel),
|
SingleByteCharSetProber(TIS620ThaiModel),
|
||||||
]
|
]
|
||||||
hebrewProber = HebrewProber()
|
hebrewProber = HebrewProber()
|
||||||
logicalHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, constants.False, hebrewProber)
|
logicalHebrewProber = SingleByteCharSetProber(Win1255HebrewModel,
|
||||||
visualHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, constants.True, hebrewProber)
|
False, hebrewProber)
|
||||||
|
visualHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, True,
|
||||||
|
hebrewProber)
|
||||||
hebrewProber.set_model_probers(logicalHebrewProber, visualHebrewProber)
|
hebrewProber.set_model_probers(logicalHebrewProber, visualHebrewProber)
|
||||||
self._mProbers.extend([hebrewProber, logicalHebrewProber, visualHebrewProber])
|
self._mProbers.extend([hebrewProber, logicalHebrewProber,
|
||||||
|
visualHebrewProber])
|
||||||
|
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
42
thirdparty/chardet/sjisprober.py
vendored
42
thirdparty/chardet/sjisprober.py
vendored
|
@ -25,13 +25,14 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
from mbcharsetprober import MultiByteCharSetProber
|
import sys
|
||||||
from codingstatemachine import CodingStateMachine
|
from .mbcharsetprober import MultiByteCharSetProber
|
||||||
from chardistribution import SJISDistributionAnalysis
|
from .codingstatemachine import CodingStateMachine
|
||||||
from jpcntx import SJISContextAnalysis
|
from .chardistribution import SJISDistributionAnalysis
|
||||||
from mbcssm import SJISSMModel
|
from .jpcntx import SJISContextAnalysis
|
||||||
import constants, sys
|
from .mbcssm import SJISSMModel
|
||||||
from constants import eStart, eError, eItsMe
|
from . import constants
|
||||||
|
|
||||||
|
|
||||||
class SJISProber(MultiByteCharSetProber):
|
class SJISProber(MultiByteCharSetProber):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
@ -46,35 +47,40 @@ class SJISProber(MultiByteCharSetProber):
|
||||||
self._mContextAnalyzer.reset()
|
self._mContextAnalyzer.reset()
|
||||||
|
|
||||||
def get_charset_name(self):
|
def get_charset_name(self):
|
||||||
return "SHIFT_JIS"
|
return self._mContextAnalyzer.get_charset_name()
|
||||||
|
|
||||||
def feed(self, aBuf):
|
def feed(self, aBuf):
|
||||||
aLen = len(aBuf)
|
aLen = len(aBuf)
|
||||||
for i in xrange(0, aLen):
|
for i in range(0, aLen):
|
||||||
codingState = self._mCodingSM.next_state(aBuf[i])
|
codingState = self._mCodingSM.next_state(aBuf[i])
|
||||||
if codingState == eError:
|
if codingState == constants.eError:
|
||||||
if constants._debug:
|
if constants._debug:
|
||||||
sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n')
|
sys.stderr.write(self.get_charset_name()
|
||||||
|
+ ' prober hit error at byte ' + str(i)
|
||||||
|
+ '\n')
|
||||||
self._mState = constants.eNotMe
|
self._mState = constants.eNotMe
|
||||||
break
|
break
|
||||||
elif codingState == eItsMe:
|
elif codingState == constants.eItsMe:
|
||||||
self._mState = constants.eFoundIt
|
self._mState = constants.eFoundIt
|
||||||
break
|
break
|
||||||
elif codingState == eStart:
|
elif codingState == constants.eStart:
|
||||||
charLen = self._mCodingSM.get_current_charlen()
|
charLen = self._mCodingSM.get_current_charlen()
|
||||||
if i == 0:
|
if i == 0:
|
||||||
self._mLastChar[1] = aBuf[0]
|
self._mLastChar[1] = aBuf[0]
|
||||||
self._mContextAnalyzer.feed(self._mLastChar[2 - charLen :], charLen)
|
self._mContextAnalyzer.feed(self._mLastChar[2 - charLen:],
|
||||||
|
charLen)
|
||||||
self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
|
self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
|
||||||
else:
|
else:
|
||||||
self._mContextAnalyzer.feed(aBuf[i + 1 - charLen : i + 3 - charLen], charLen)
|
self._mContextAnalyzer.feed(aBuf[i + 1 - charLen:i + 3
|
||||||
self._mDistributionAnalyzer.feed(aBuf[i - 1 : i + 1], charLen)
|
- charLen], charLen)
|
||||||
|
self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
|
||||||
|
charLen)
|
||||||
|
|
||||||
self._mLastChar[0] = aBuf[aLen - 1]
|
self._mLastChar[0] = aBuf[aLen - 1]
|
||||||
|
|
||||||
if self.get_state() == constants.eDetecting:
|
if self.get_state() == constants.eDetecting:
|
||||||
if self._mContextAnalyzer.got_enough_data() and \
|
if (self._mContextAnalyzer.got_enough_data() and
|
||||||
(self.get_confidence() > constants.SHORTCUT_THRESHOLD):
|
(self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
|
||||||
self._mState = constants.eFoundIt
|
self._mState = constants.eFoundIt
|
||||||
|
|
||||||
return self.get_state()
|
return self.get_state()
|
||||||
|
|
20
thirdparty/chardet/test.py
vendored
20
thirdparty/chardet/test.py
vendored
|
@ -1,20 +0,0 @@
|
||||||
import sys, glob
|
|
||||||
sys.path.insert(0, '..')
|
|
||||||
from chardet.universaldetector import UniversalDetector
|
|
||||||
|
|
||||||
count = 0
|
|
||||||
u = UniversalDetector()
|
|
||||||
for f in glob.glob(sys.argv[1]):
|
|
||||||
print f.ljust(60),
|
|
||||||
u.reset()
|
|
||||||
for line in file(f, 'rb'):
|
|
||||||
u.feed(line)
|
|
||||||
if u.done: break
|
|
||||||
u.close()
|
|
||||||
result = u.result
|
|
||||||
if result['encoding']:
|
|
||||||
print result['encoding'], 'with confidence', result['confidence']
|
|
||||||
else:
|
|
||||||
print '******** no result'
|
|
||||||
count += 1
|
|
||||||
print count, 'tests'
|
|
88
thirdparty/chardet/universaldetector.py
vendored
88
thirdparty/chardet/universaldetector.py
vendored
|
@ -26,11 +26,13 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
import constants, sys
|
from . import constants
|
||||||
from latin1prober import Latin1Prober # windows-1252
|
import sys
|
||||||
from mbcsgroupprober import MBCSGroupProber # multi-byte character sets
|
import codecs
|
||||||
from sbcsgroupprober import SBCSGroupProber # single-byte character sets
|
from .latin1prober import Latin1Prober # windows-1252
|
||||||
from escprober import EscCharSetProber # ISO-2122, etc.
|
from .mbcsgroupprober import MBCSGroupProber # multi-byte character sets
|
||||||
|
from .sbcsgroupprober import SBCSGroupProber # single-byte character sets
|
||||||
|
from .escprober import EscCharSetProber # ISO-2122, etc.
|
||||||
import re
|
import re
|
||||||
|
|
||||||
MINIMUM_THRESHOLD = 0.20
|
MINIMUM_THRESHOLD = 0.20
|
||||||
|
@ -38,68 +40,78 @@ ePureAscii = 0
|
||||||
eEscAscii = 1
|
eEscAscii = 1
|
||||||
eHighbyte = 2
|
eHighbyte = 2
|
||||||
|
|
||||||
|
|
||||||
class UniversalDetector:
|
class UniversalDetector:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self._highBitDetector = re.compile(r'[\x80-\xFF]')
|
self._highBitDetector = re.compile(b'[\x80-\xFF]')
|
||||||
self._escDetector = re.compile(r'(\033|~{)')
|
self._escDetector = re.compile(b'(\033|~{)')
|
||||||
self._mEscCharSetProber = None
|
self._mEscCharSetProber = None
|
||||||
self._mCharSetProbers = []
|
self._mCharSetProbers = []
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
self.result = {'encoding': None, 'confidence': 0.0}
|
self.result = {'encoding': None, 'confidence': 0.0}
|
||||||
self.done = constants.False
|
self.done = False
|
||||||
self._mStart = constants.True
|
self._mStart = True
|
||||||
self._mGotData = constants.False
|
self._mGotData = False
|
||||||
self._mInputState = ePureAscii
|
self._mInputState = ePureAscii
|
||||||
self._mLastChar = ''
|
self._mLastChar = b''
|
||||||
if self._mEscCharSetProber:
|
if self._mEscCharSetProber:
|
||||||
self._mEscCharSetProber.reset()
|
self._mEscCharSetProber.reset()
|
||||||
for prober in self._mCharSetProbers:
|
for prober in self._mCharSetProbers:
|
||||||
prober.reset()
|
prober.reset()
|
||||||
|
|
||||||
def feed(self, aBuf):
|
def feed(self, aBuf):
|
||||||
if self.done: return
|
if self.done:
|
||||||
|
return
|
||||||
|
|
||||||
aLen = len(aBuf)
|
aLen = len(aBuf)
|
||||||
if not aLen: return
|
if not aLen:
|
||||||
|
return
|
||||||
|
|
||||||
if not self._mGotData:
|
if not self._mGotData:
|
||||||
# If the data starts with BOM, we know it is UTF
|
# If the data starts with BOM, we know it is UTF
|
||||||
if aBuf[:3] == '\xEF\xBB\xBF':
|
if aBuf[:3] == codecs.BOM_UTF8:
|
||||||
# EF BB BF UTF-8 with BOM
|
# EF BB BF UTF-8 with BOM
|
||||||
self.result = {'encoding': "UTF-8", 'confidence': 1.0}
|
self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0}
|
||||||
elif aBuf[:4] == '\xFF\xFE\x00\x00':
|
elif aBuf[:4] == codecs.BOM_UTF32_LE:
|
||||||
# FF FE 00 00 UTF-32, little-endian BOM
|
# FF FE 00 00 UTF-32, little-endian BOM
|
||||||
self.result = {'encoding': "UTF-32LE", 'confidence': 1.0}
|
self.result = {'encoding': "UTF-32LE", 'confidence': 1.0}
|
||||||
elif aBuf[:4] == '\x00\x00\xFE\xFF':
|
elif aBuf[:4] == codecs.BOM_UTF32_BE:
|
||||||
# 00 00 FE FF UTF-32, big-endian BOM
|
# 00 00 FE FF UTF-32, big-endian BOM
|
||||||
self.result = {'encoding': "UTF-32BE", 'confidence': 1.0}
|
self.result = {'encoding': "UTF-32BE", 'confidence': 1.0}
|
||||||
elif aBuf[:4] == '\xFE\xFF\x00\x00':
|
elif aBuf[:4] == b'\xFE\xFF\x00\x00':
|
||||||
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
|
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
|
||||||
self.result = {'encoding': "X-ISO-10646-UCS-4-3412", 'confidence': 1.0}
|
self.result = {
|
||||||
elif aBuf[:4] == '\x00\x00\xFF\xFE':
|
'encoding': "X-ISO-10646-UCS-4-3412",
|
||||||
|
'confidence': 1.0
|
||||||
|
}
|
||||||
|
elif aBuf[:4] == b'\x00\x00\xFF\xFE':
|
||||||
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
||||||
self.result = {'encoding': "X-ISO-10646-UCS-4-2143", 'confidence': 1.0}
|
self.result = {
|
||||||
elif aBuf[:2] == '\xFF\xFE':
|
'encoding': "X-ISO-10646-UCS-4-2143",
|
||||||
|
'confidence': 1.0
|
||||||
|
}
|
||||||
|
elif aBuf[:2] == codecs.BOM_LE:
|
||||||
# FF FE UTF-16, little endian BOM
|
# FF FE UTF-16, little endian BOM
|
||||||
self.result = {'encoding': "UTF-16LE", 'confidence': 1.0}
|
self.result = {'encoding': "UTF-16LE", 'confidence': 1.0}
|
||||||
elif aBuf[:2] == '\xFE\xFF':
|
elif aBuf[:2] == codecs.BOM_BE:
|
||||||
# FE FF UTF-16, big endian BOM
|
# FE FF UTF-16, big endian BOM
|
||||||
self.result = {'encoding': "UTF-16BE", 'confidence': 1.0}
|
self.result = {'encoding': "UTF-16BE", 'confidence': 1.0}
|
||||||
|
|
||||||
self._mGotData = constants.True
|
self._mGotData = True
|
||||||
if self.result['encoding'] and (self.result['confidence'] > 0.0):
|
if self.result['encoding'] and (self.result['confidence'] > 0.0):
|
||||||
self.done = constants.True
|
self.done = True
|
||||||
return
|
return
|
||||||
|
|
||||||
if self._mInputState == ePureAscii:
|
if self._mInputState == ePureAscii:
|
||||||
if self._highBitDetector.search(aBuf):
|
if self._highBitDetector.search(aBuf):
|
||||||
self._mInputState = eHighbyte
|
self._mInputState = eHighbyte
|
||||||
elif (self._mInputState == ePureAscii) and self._escDetector.search(self._mLastChar + aBuf):
|
elif ((self._mInputState == ePureAscii) and
|
||||||
|
self._escDetector.search(self._mLastChar + aBuf)):
|
||||||
self._mInputState = eEscAscii
|
self._mInputState = eEscAscii
|
||||||
|
|
||||||
self._mLastChar = aBuf[-1]
|
self._mLastChar = aBuf[-1:]
|
||||||
|
|
||||||
if self._mInputState == eEscAscii:
|
if self._mInputState == eEscAscii:
|
||||||
if not self._mEscCharSetProber:
|
if not self._mEscCharSetProber:
|
||||||
|
@ -107,24 +119,26 @@ class UniversalDetector:
|
||||||
if self._mEscCharSetProber.feed(aBuf) == constants.eFoundIt:
|
if self._mEscCharSetProber.feed(aBuf) == constants.eFoundIt:
|
||||||
self.result = {'encoding': self._mEscCharSetProber.get_charset_name(),
|
self.result = {'encoding': self._mEscCharSetProber.get_charset_name(),
|
||||||
'confidence': self._mEscCharSetProber.get_confidence()}
|
'confidence': self._mEscCharSetProber.get_confidence()}
|
||||||
self.done = constants.True
|
self.done = True
|
||||||
elif self._mInputState == eHighbyte:
|
elif self._mInputState == eHighbyte:
|
||||||
if not self._mCharSetProbers:
|
if not self._mCharSetProbers:
|
||||||
self._mCharSetProbers = [MBCSGroupProber(), SBCSGroupProber(), Latin1Prober()]
|
self._mCharSetProbers = [MBCSGroupProber(), SBCSGroupProber(),
|
||||||
|
Latin1Prober()]
|
||||||
for prober in self._mCharSetProbers:
|
for prober in self._mCharSetProbers:
|
||||||
if prober.feed(aBuf) == constants.eFoundIt:
|
if prober.feed(aBuf) == constants.eFoundIt:
|
||||||
self.result = {'encoding': prober.get_charset_name(),
|
self.result = {'encoding': prober.get_charset_name(),
|
||||||
'confidence': prober.get_confidence()}
|
'confidence': prober.get_confidence()}
|
||||||
self.done = constants.True
|
self.done = True
|
||||||
break
|
break
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
if self.done: return
|
if self.done:
|
||||||
|
return
|
||||||
if not self._mGotData:
|
if not self._mGotData:
|
||||||
if constants._debug:
|
if constants._debug:
|
||||||
sys.stderr.write('no data received!\n')
|
sys.stderr.write('no data received!\n')
|
||||||
return
|
return
|
||||||
self.done = constants.True
|
self.done = True
|
||||||
|
|
||||||
if self._mInputState == ePureAscii:
|
if self._mInputState == ePureAscii:
|
||||||
self.result = {'encoding': 'ascii', 'confidence': 1.0}
|
self.result = {'encoding': 'ascii', 'confidence': 1.0}
|
||||||
|
@ -135,7 +149,8 @@ class UniversalDetector:
|
||||||
maxProberConfidence = 0.0
|
maxProberConfidence = 0.0
|
||||||
maxProber = None
|
maxProber = None
|
||||||
for prober in self._mCharSetProbers:
|
for prober in self._mCharSetProbers:
|
||||||
if not prober: continue
|
if not prober:
|
||||||
|
continue
|
||||||
proberConfidence = prober.get_confidence()
|
proberConfidence = prober.get_confidence()
|
||||||
if proberConfidence > maxProberConfidence:
|
if proberConfidence > maxProberConfidence:
|
||||||
maxProberConfidence = proberConfidence
|
maxProberConfidence = proberConfidence
|
||||||
|
@ -148,7 +163,8 @@ class UniversalDetector:
|
||||||
if constants._debug:
|
if constants._debug:
|
||||||
sys.stderr.write('no probers hit minimum threshhold\n')
|
sys.stderr.write('no probers hit minimum threshhold\n')
|
||||||
for prober in self._mCharSetProbers[0].mProbers:
|
for prober in self._mCharSetProbers[0].mProbers:
|
||||||
if not prober: continue
|
if not prober:
|
||||||
sys.stderr.write('%s confidence = %s\n' % \
|
continue
|
||||||
(prober.get_charset_name(), \
|
sys.stderr.write('%s confidence = %s\n' %
|
||||||
|
(prober.get_charset_name(),
|
||||||
prober.get_confidence()))
|
prober.get_confidence()))
|
||||||
|
|
18
thirdparty/chardet/utf8prober.py
vendored
18
thirdparty/chardet/utf8prober.py
vendored
|
@ -25,14 +25,14 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
import constants, sys
|
from . import constants
|
||||||
from constants import eStart, eError, eItsMe
|
from .charsetprober import CharSetProber
|
||||||
from charsetprober import CharSetProber
|
from .codingstatemachine import CodingStateMachine
|
||||||
from codingstatemachine import CodingStateMachine
|
from .mbcssm import UTF8SMModel
|
||||||
from mbcssm import UTF8SMModel
|
|
||||||
|
|
||||||
ONE_CHAR_PROB = 0.5
|
ONE_CHAR_PROB = 0.5
|
||||||
|
|
||||||
|
|
||||||
class UTF8Prober(CharSetProber):
|
class UTF8Prober(CharSetProber):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
CharSetProber.__init__(self)
|
CharSetProber.__init__(self)
|
||||||
|
@ -50,13 +50,13 @@ class UTF8Prober(CharSetProber):
|
||||||
def feed(self, aBuf):
|
def feed(self, aBuf):
|
||||||
for c in aBuf:
|
for c in aBuf:
|
||||||
codingState = self._mCodingSM.next_state(c)
|
codingState = self._mCodingSM.next_state(c)
|
||||||
if codingState == eError:
|
if codingState == constants.eError:
|
||||||
self._mState = constants.eNotMe
|
self._mState = constants.eNotMe
|
||||||
break
|
break
|
||||||
elif codingState == eItsMe:
|
elif codingState == constants.eItsMe:
|
||||||
self._mState = constants.eFoundIt
|
self._mState = constants.eFoundIt
|
||||||
break
|
break
|
||||||
elif codingState == eStart:
|
elif codingState == constants.eStart:
|
||||||
if self._mCodingSM.get_current_charlen() >= 2:
|
if self._mCodingSM.get_current_charlen() >= 2:
|
||||||
self._mNumOfMBChar += 1
|
self._mNumOfMBChar += 1
|
||||||
|
|
||||||
|
@ -69,7 +69,7 @@ class UTF8Prober(CharSetProber):
|
||||||
def get_confidence(self):
|
def get_confidence(self):
|
||||||
unlike = 0.99
|
unlike = 0.99
|
||||||
if self._mNumOfMBChar < 6:
|
if self._mNumOfMBChar < 6:
|
||||||
for i in xrange(0, self._mNumOfMBChar):
|
for i in range(0, self._mNumOfMBChar):
|
||||||
unlike = unlike * ONE_CHAR_PROB
|
unlike = unlike * ONE_CHAR_PROB
|
||||||
return 1.0 - unlike
|
return 1.0 - unlike
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user