mirror of https://github.com/sqlmapproject/sqlmap.git (synced 2024-11-24 18:43:47 +03:00)

Adding new version of chardet

This commit is contained in:
parent d424d4cdc7
commit 439d003753
thirdparty/chardet/__init__.py (vendored) | 10
@@ -15,10 +15,16 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

__version__ = "2.0.1"
__version__ = "2.3.0"
from sys import version_info


def detect(aBuf):
    import universaldetector
    if ((version_info < (3, 0) and isinstance(aBuf, unicode)) or
            (version_info >= (3, 0) and not isinstance(aBuf, bytes))):
        raise ValueError('Expected a bytes object, not a unicode object')

    from . import universaldetector
    u = universaldetector.UniversalDetector()
    u.reset()
    u.feed(aBuf)
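For illustration, not part of this commit: the updated detect() above only accepts a bytes buffer. A minimal usage sketch, assuming the vendored package is importable as "chardet" and that 'somefile' exists.

# Illustrative usage sketch; import path and file name are assumptions.
import chardet

with open('somefile', 'rb') as f:            # read raw bytes, not decoded text
    result = chardet.detect(f.read())
print(result)                                # e.g. {'encoding': 'utf-8', 'confidence': 0.99}

# chardet.detect(u'text') now raises ValueError: only bytes are accepted
# (the version_info/isinstance check added in the hunk above).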
thirdparty/chardet/big5freq.py (vendored) | 4

@@ -45,7 +45,7 @@ BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75
#Char to FreqOrder table
BIG5_TABLE_SIZE = 5376

Big5CharToFreqOrder = ( \
Big5CharToFreqOrder = (
1,1801,1506, 255,1431, 198, 9, 82, 6,5008, 177, 202,3681,1256,2821, 110, # 16
3814, 33,3274, 261, 76, 44,2114, 16,2946,2187,1176, 659,3971, 26,3451,2653, # 32
1198,3972,3350,4202, 410,2215, 302, 590, 361,1964, 8, 204, 58,4510,5009,1932, # 48

@@ -921,3 +921,5 @@ Big5CharToFreqOrder = ( \
13936,13937,13938,13939,13940,13941,13942,13943,13944,13945,13946,13947,13948,13949,13950,13951, #13952
13952,13953,13954,13955,13956,13957,13958,13959,13960,13961,13962,13963,13964,13965,13966,13967, #13968
13968,13969,13970,13971,13972) #13973

# flake8: noqa
thirdparty/chardet/big5prober.py (vendored) | 9

@@ -25,10 +25,11 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

from mbcharsetprober import MultiByteCharSetProber
from codingstatemachine import CodingStateMachine
from chardistribution import Big5DistributionAnalysis
from mbcssm import Big5SMModel
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import Big5DistributionAnalysis
from .mbcssm import Big5SMModel


class Big5Prober(MultiByteCharSetProber):
    def __init__(self):
thirdparty/chardet/chardetect.py (vendored, new file) | 80
|
@ -0,0 +1,80 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Script which takes one or more file paths and reports on their detected
|
||||
encodings
|
||||
|
||||
Example::
|
||||
|
||||
% chardetect somefile someotherfile
|
||||
somefile: windows-1252 with confidence 0.5
|
||||
someotherfile: ascii with confidence 1.0
|
||||
|
||||
If no paths are provided, it takes its input from stdin.
|
||||
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import, print_function, unicode_literals
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from io import open
|
||||
|
||||
from chardet import __version__
|
||||
from chardet.universaldetector import UniversalDetector
|
||||
|
||||
|
||||
def description_of(lines, name='stdin'):
|
||||
"""
|
||||
Return a string describing the probable encoding of a file or
|
||||
list of strings.
|
||||
|
||||
:param lines: The lines to get the encoding of.
|
||||
:type lines: Iterable of bytes
|
||||
:param name: Name of file or collection of lines
|
||||
:type name: str
|
||||
"""
|
||||
u = UniversalDetector()
|
||||
for line in lines:
|
||||
u.feed(line)
|
||||
u.close()
|
||||
result = u.result
|
||||
if result['encoding']:
|
||||
return '{0}: {1} with confidence {2}'.format(name, result['encoding'],
|
||||
result['confidence'])
|
||||
else:
|
||||
return '{0}: no result'.format(name)
|
||||
|
||||
|
||||
def main(argv=None):
|
||||
'''
|
||||
Handles command line arguments and gets things started.
|
||||
|
||||
:param argv: List of arguments, as if specified on the command-line.
|
||||
If None, ``sys.argv[1:]`` is used instead.
|
||||
:type argv: list of str
|
||||
'''
|
||||
# Get command line arguments
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Takes one or more file paths and reports their detected \
|
||||
encodings",
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
||||
conflict_handler='resolve')
|
||||
parser.add_argument('input',
|
||||
help='File whose encoding we would like to determine.',
|
||||
type=argparse.FileType('rb'), nargs='*',
|
||||
default=[sys.stdin])
|
||||
parser.add_argument('--version', action='version',
|
||||
version='%(prog)s {0}'.format(__version__))
|
||||
args = parser.parse_args(argv)
|
||||
|
||||
for f in args.input:
|
||||
if f.isatty():
|
||||
print("You are running chardetect interactively. Press " +
|
||||
"CTRL-D twice at the start of a blank line to signal the " +
|
||||
"end of your input. If you want help, run chardetect " +
|
||||
"--help\n", file=sys.stderr)
|
||||
print(description_of(f, f.name))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
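For illustration, not part of this commit: the new chardetect.py added above is a small CLI wrapper around UniversalDetector; a sketch of both the command-line and the programmatic entry point, assuming the vendored package is importable as "chardet" and that 'somefile' exists.

# Illustrative sketch; file name and import path are assumptions.
#   $ chardetect somefile
#   somefile: windows-1252 with confidence 0.5
from chardet.chardetect import description_of

with open('somefile', 'rb') as f:
    print(description_of(f, f.name))   # same report, produced in-process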
thirdparty/chardet/chardistribution.py (vendored) | 127
|
@ -25,35 +25,51 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants
|
||||
from euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO
|
||||
from euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO
|
||||
from gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO
|
||||
from big5freq import Big5CharToFreqOrder, BIG5_TABLE_SIZE, BIG5_TYPICAL_DISTRIBUTION_RATIO
|
||||
from jisfreq import JISCharToFreqOrder, JIS_TABLE_SIZE, JIS_TYPICAL_DISTRIBUTION_RATIO
|
||||
from .euctwfreq import (EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE,
|
||||
EUCTW_TYPICAL_DISTRIBUTION_RATIO)
|
||||
from .euckrfreq import (EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE,
|
||||
EUCKR_TYPICAL_DISTRIBUTION_RATIO)
|
||||
from .gb2312freq import (GB2312CharToFreqOrder, GB2312_TABLE_SIZE,
|
||||
GB2312_TYPICAL_DISTRIBUTION_RATIO)
|
||||
from .big5freq import (Big5CharToFreqOrder, BIG5_TABLE_SIZE,
|
||||
BIG5_TYPICAL_DISTRIBUTION_RATIO)
|
||||
from .jisfreq import (JISCharToFreqOrder, JIS_TABLE_SIZE,
|
||||
JIS_TYPICAL_DISTRIBUTION_RATIO)
|
||||
from .compat import wrap_ord
|
||||
|
||||
ENOUGH_DATA_THRESHOLD = 1024
|
||||
SURE_YES = 0.99
|
||||
SURE_NO = 0.01
|
||||
MINIMUM_DATA_THRESHOLD = 3
|
||||
|
||||
|
||||
class CharDistributionAnalysis:
|
||||
def __init__(self):
|
||||
self._mCharToFreqOrder = None # Mapping table to get frequency order from char order (get from GetOrder())
|
||||
# Mapping table to get frequency order from char order (get from
|
||||
# GetOrder())
|
||||
self._mCharToFreqOrder = None
|
||||
self._mTableSize = None # Size of above table
|
||||
self._mTypicalDistributionRatio = None # This is a constant value which varies from language to language, used in calculating confidence. See http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html for further detail.
|
||||
# This is a constant value which varies from language to language,
|
||||
# used in calculating confidence. See
|
||||
# http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
|
||||
# for further detail.
|
||||
self._mTypicalDistributionRatio = None
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
"""reset analyser, clear any state"""
|
||||
self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made
|
||||
# If this flag is set to True, detection is done and conclusion has
|
||||
# been made
|
||||
self._mDone = False
|
||||
self._mTotalChars = 0 # Total characters encountered
|
||||
self._mFreqChars = 0 # The number of characters whose frequency order is less than 512
|
||||
# The number of characters whose frequency order is less than 512
|
||||
self._mFreqChars = 0
|
||||
|
||||
def feed(self, aStr, aCharLen):
|
||||
def feed(self, aBuf, aCharLen):
|
||||
"""feed a character with known length"""
|
||||
if aCharLen == 2:
|
||||
# we only care about 2-bytes character in our distribution analysis
|
||||
order = self.get_order(aStr)
|
||||
order = self.get_order(aBuf)
|
||||
else:
|
||||
order = -1
|
||||
if order >= 0:
|
||||
|
@ -65,12 +81,14 @@ class CharDistributionAnalysis:
|
|||
|
||||
def get_confidence(self):
|
||||
"""return confidence based on existing data"""
|
||||
# if we didn't receive any character in our consideration range, return negative answer
|
||||
if self._mTotalChars <= 0:
|
||||
# if we didn't receive any character in our consideration range,
|
||||
# return negative answer
|
||||
if self._mTotalChars <= 0 or self._mFreqChars <= MINIMUM_DATA_THRESHOLD:
|
||||
return SURE_NO
|
||||
|
||||
if self._mTotalChars != self._mFreqChars:
|
||||
r = self._mFreqChars / ((self._mTotalChars - self._mFreqChars) * self._mTypicalDistributionRatio)
|
||||
r = (self._mFreqChars / ((self._mTotalChars - self._mFreqChars)
|
||||
* self._mTypicalDistributionRatio))
|
||||
if r < SURE_YES:
|
||||
return r
|
||||
|
||||
|
@ -78,16 +96,18 @@ class CharDistributionAnalysis:
|
|||
return SURE_YES
|
||||
|
||||
def got_enough_data(self):
|
||||
# It is not necessary to receive all data to draw conclusion. For charset detection,
|
||||
# certain amount of data is enough
|
||||
# It is not necessary to receive all data to draw conclusion.
|
||||
# For charset detection, certain amount of data is enough
|
||||
return self._mTotalChars > ENOUGH_DATA_THRESHOLD
|
||||
|
||||
def get_order(self, aStr):
|
||||
# We do not handle characters based on the original encoding string, but
|
||||
# convert this encoding string to a number, here called order.
|
||||
# This allows multiple encodings of a language to share one frequency table.
|
||||
def get_order(self, aBuf):
|
||||
# We do not handle characters based on the original encoding string,
|
||||
# but convert this encoding string to a number, here called order.
|
||||
# This allows multiple encodings of a language to share one frequency
|
||||
# table.
|
||||
return -1
|
||||
|
||||
|
||||
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
|
||||
def __init__(self):
|
||||
CharDistributionAnalysis.__init__(self)
|
||||
|
@ -95,16 +115,18 @@ class EUCTWDistributionAnalysis(CharDistributionAnalysis):
|
|||
self._mTableSize = EUCTW_TABLE_SIZE
|
||||
self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
|
||||
|
||||
def get_order(self, aStr):
|
||||
def get_order(self, aBuf):
|
||||
# for euc-TW encoding, we are interested
|
||||
# first byte range: 0xc4 -- 0xfe
|
||||
# second byte range: 0xa1 -- 0xfe
|
||||
# no validation needed here. State machine has done that
|
||||
if aStr[0] >= '\xC4':
|
||||
return 94 * (ord(aStr[0]) - 0xC4) + ord(aStr[1]) - 0xA1
|
||||
first_char = wrap_ord(aBuf[0])
|
||||
if first_char >= 0xC4:
|
||||
return 94 * (first_char - 0xC4) + wrap_ord(aBuf[1]) - 0xA1
|
||||
else:
|
||||
return -1
|
||||
|
||||
|
||||
class EUCKRDistributionAnalysis(CharDistributionAnalysis):
|
||||
def __init__(self):
|
||||
CharDistributionAnalysis.__init__(self)
|
||||
|
@ -112,15 +134,17 @@ class EUCKRDistributionAnalysis(CharDistributionAnalysis):
|
|||
self._mTableSize = EUCKR_TABLE_SIZE
|
||||
self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
|
||||
|
||||
def get_order(self, aStr):
|
||||
def get_order(self, aBuf):
|
||||
# for euc-KR encoding, we are interested
|
||||
# first byte range: 0xb0 -- 0xfe
|
||||
# second byte range: 0xa1 -- 0xfe
|
||||
# no validation needed here. State machine has done that
|
||||
if aStr[0] >= '\xB0':
|
||||
return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1
|
||||
first_char = wrap_ord(aBuf[0])
|
||||
if first_char >= 0xB0:
|
||||
return 94 * (first_char - 0xB0) + wrap_ord(aBuf[1]) - 0xA1
|
||||
else:
|
||||
return -1;
|
||||
return -1
|
||||
|
||||
|
||||
class GB2312DistributionAnalysis(CharDistributionAnalysis):
|
||||
def __init__(self):
|
||||
|
@ -129,15 +153,17 @@ class GB2312DistributionAnalysis(CharDistributionAnalysis):
|
|||
self._mTableSize = GB2312_TABLE_SIZE
|
||||
self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
|
||||
|
||||
def get_order(self, aStr):
|
||||
def get_order(self, aBuf):
|
||||
# for GB2312 encoding, we are interested
|
||||
# first byte range: 0xb0 -- 0xfe
|
||||
# second byte range: 0xa1 -- 0xfe
|
||||
# no validation needed here. State machine has done that
|
||||
if (aStr[0] >= '\xB0') and (aStr[1] >= '\xA1'):
|
||||
return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1
|
||||
first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
|
||||
if (first_char >= 0xB0) and (second_char >= 0xA1):
|
||||
return 94 * (first_char - 0xB0) + second_char - 0xA1
|
||||
else:
|
||||
return -1;
|
||||
return -1
|
||||
|
||||
|
||||
class Big5DistributionAnalysis(CharDistributionAnalysis):
|
||||
def __init__(self):
|
||||
|
@ -146,19 +172,21 @@ class Big5DistributionAnalysis(CharDistributionAnalysis):
|
|||
self._mTableSize = BIG5_TABLE_SIZE
|
||||
self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
|
||||
|
||||
def get_order(self, aStr):
|
||||
def get_order(self, aBuf):
|
||||
# for big5 encoding, we are interested
|
||||
# first byte range: 0xa4 -- 0xfe
|
||||
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
|
||||
# no validation needed here. State machine has done that
|
||||
if aStr[0] >= '\xA4':
|
||||
if aStr[1] >= '\xA1':
|
||||
return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0xA1 + 63
|
||||
first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
|
||||
if first_char >= 0xA4:
|
||||
if second_char >= 0xA1:
|
||||
return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63
|
||||
else:
|
||||
return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0x40
|
||||
return 157 * (first_char - 0xA4) + second_char - 0x40
|
||||
else:
|
||||
return -1
|
||||
|
||||
|
||||
class SJISDistributionAnalysis(CharDistributionAnalysis):
|
||||
def __init__(self):
|
||||
CharDistributionAnalysis.__init__(self)
|
||||
|
@ -166,22 +194,24 @@ class SJISDistributionAnalysis(CharDistributionAnalysis):
|
|||
self._mTableSize = JIS_TABLE_SIZE
|
||||
self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
|
||||
|
||||
def get_order(self, aStr):
|
||||
def get_order(self, aBuf):
|
||||
# for sjis encoding, we are interested
|
||||
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
|
||||
# second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
|
||||
# no validation needed here. State machine has done that
|
||||
if (aStr[0] >= '\x81') and (aStr[0] <= '\x9F'):
|
||||
order = 188 * (ord(aStr[0]) - 0x81)
|
||||
elif (aStr[0] >= '\xE0') and (aStr[0] <= '\xEF'):
|
||||
order = 188 * (ord(aStr[0]) - 0xE0 + 31)
|
||||
first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
|
||||
if (first_char >= 0x81) and (first_char <= 0x9F):
|
||||
order = 188 * (first_char - 0x81)
|
||||
elif (first_char >= 0xE0) and (first_char <= 0xEF):
|
||||
order = 188 * (first_char - 0xE0 + 31)
|
||||
else:
|
||||
return -1;
|
||||
order = order + ord(aStr[1]) - 0x40
|
||||
if aStr[1] > '\x7F':
|
||||
return -1
|
||||
order = order + second_char - 0x40
|
||||
if second_char > 0x7F:
|
||||
order = -1
|
||||
return order
|
||||
|
||||
|
||||
class EUCJPDistributionAnalysis(CharDistributionAnalysis):
|
||||
def __init__(self):
|
||||
CharDistributionAnalysis.__init__(self)
|
||||
|
@ -189,12 +219,13 @@ class EUCJPDistributionAnalysis(CharDistributionAnalysis):
|
|||
self._mTableSize = JIS_TABLE_SIZE
|
||||
self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
|
||||
|
||||
def get_order(self, aStr):
|
||||
def get_order(self, aBuf):
|
||||
# for euc-JP encoding, we are interested
|
||||
# first byte range: 0xa0 -- 0xfe
|
||||
# second byte range: 0xa1 -- 0xfe
|
||||
# no validation needed here. State machine has done that
|
||||
if aStr[0] >= '\xA0':
|
||||
return 94 * (ord(aStr[0]) - 0xA1) + ord(aStr[1]) - 0xa1
|
||||
char = wrap_ord(aBuf[0])
|
||||
if char >= 0xA0:
|
||||
return 94 * (char - 0xA1) + wrap_ord(aBuf[1]) - 0xa1
|
||||
else:
|
||||
return -1
|
||||
|
|
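For illustration, not part of this commit: the get_order() methods rewritten above map a two-byte character to a frequency-table index with simple lead/trail-byte arithmetic; a worked sketch of the EUC-KR case (94 trail-byte values per lead byte).

# Illustrative sketch of the EUCKRDistributionAnalysis.get_order() arithmetic shown above.
def euckr_order(first_byte, second_byte):
    # valid lead bytes are 0xB0-0xFE, trail bytes 0xA1-0xFE; 94 trail values per row
    if first_byte >= 0xB0:
        return 94 * (first_byte - 0xB0) + second_byte - 0xA1
    return -1

print(euckr_order(0xB0, 0xA1))  # -> 0    (first two-byte EUC-KR hangul syllable)
print(euckr_order(0xC7, 0xD1))  # -> 2210 (94 * 23 + 48)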
thirdparty/chardet/charsetgroupprober.py (vendored) | 34
|
@ -25,8 +25,10 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants, sys
|
||||
from charsetprober import CharSetProber
|
||||
from . import constants
|
||||
import sys
|
||||
from .charsetprober import CharSetProber
|
||||
|
||||
|
||||
class CharSetGroupProber(CharSetProber):
|
||||
def __init__(self):
|
||||
|
@ -41,28 +43,32 @@ class CharSetGroupProber(CharSetProber):
|
|||
for prober in self._mProbers:
|
||||
if prober:
|
||||
prober.reset()
|
||||
prober.active = constants.True
|
||||
prober.active = True
|
||||
self._mActiveNum += 1
|
||||
self._mBestGuessProber = None
|
||||
|
||||
def get_charset_name(self):
|
||||
if not self._mBestGuessProber:
|
||||
self.get_confidence()
|
||||
if not self._mBestGuessProber: return None
|
||||
if not self._mBestGuessProber:
|
||||
return None
|
||||
# self._mBestGuessProber = self._mProbers[0]
|
||||
return self._mBestGuessProber.get_charset_name()
|
||||
|
||||
def feed(self, aBuf):
|
||||
for prober in self._mProbers:
|
||||
if not prober: continue
|
||||
if not prober.active: continue
|
||||
if not prober:
|
||||
continue
|
||||
if not prober.active:
|
||||
continue
|
||||
st = prober.feed(aBuf)
|
||||
if not st: continue
|
||||
if not st:
|
||||
continue
|
||||
if st == constants.eFoundIt:
|
||||
self._mBestGuessProber = prober
|
||||
return self.get_state()
|
||||
elif st == constants.eNotMe:
|
||||
prober.active = constants.False
|
||||
prober.active = False
|
||||
self._mActiveNum -= 1
|
||||
if self._mActiveNum <= 0:
|
||||
self._mState = constants.eNotMe
|
||||
|
@ -78,18 +84,22 @@ class CharSetGroupProber(CharSetProber):
|
|||
bestConf = 0.0
|
||||
self._mBestGuessProber = None
|
||||
for prober in self._mProbers:
|
||||
if not prober: continue
|
||||
if not prober:
|
||||
continue
|
||||
if not prober.active:
|
||||
if constants._debug:
|
||||
sys.stderr.write(prober.get_charset_name() + ' not active\n')
|
||||
sys.stderr.write(prober.get_charset_name()
|
||||
+ ' not active\n')
|
||||
continue
|
||||
cf = prober.get_confidence()
|
||||
if constants._debug:
|
||||
sys.stderr.write('%s confidence = %s\n' % (prober.get_charset_name(), cf))
|
||||
sys.stderr.write('%s confidence = %s\n' %
|
||||
(prober.get_charset_name(), cf))
|
||||
if bestConf < cf:
|
||||
bestConf = cf
|
||||
self._mBestGuessProber = prober
|
||||
if not self._mBestGuessProber: return 0.0
|
||||
if not self._mBestGuessProber:
|
||||
return 0.0
|
||||
return bestConf
|
||||
# else:
|
||||
# self._mBestGuessProber = self._mProbers[0]
|
||||
|
|
thirdparty/chardet/charsetprober.py (vendored) | 8

@@ -26,7 +26,9 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

import constants, re
from . import constants
import re


class CharSetProber:
    def __init__(self):

@@ -48,11 +50,11 @@ class CharSetProber:
        return 0.0

    def filter_high_bit_only(self, aBuf):
        aBuf = re.sub(r'([\x00-\x7F])+', ' ', aBuf)
        aBuf = re.sub(b'([\x00-\x7F])+', b' ', aBuf)
        return aBuf

    def filter_without_english_letters(self, aBuf):
        aBuf = re.sub(r'([A-Za-z])+', ' ', aBuf)
        aBuf = re.sub(b'([A-Za-z])+', b' ', aBuf)
        return aBuf

    def filter_with_english_letters(self, aBuf):
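For illustration, not part of this commit: the r'...' to b'...' pattern change above keeps these filters working on Python 3 byte buffers; a quick sketch of what filter_high_bit_only's substitution does.

# Illustrative sketch: runs of ASCII bytes collapse to a single space, so only
# high-bit (non-ASCII) bytes remain for distribution analysis.
import re

buf = b'abc \xe4\xf5 def \xc8'
print(re.sub(b'([\x00-\x7F])+', b' ', buf))   # -> b' \xe4\xf5 \xc8'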
thirdparty/chardet/codingstatemachine.py (vendored) | 11

@@ -25,7 +25,9 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

from constants import eStart, eError, eItsMe
from .constants import eStart
from .compat import wrap_ord


class CodingStateMachine:
    def __init__(self, sm):

@@ -40,12 +42,15 @@ class CodingStateMachine:
    def next_state(self, c):
        # for each byte we get its class
        # if it is first byte, we also get byte length
        byteCls = self._mModel['classTable'][ord(c)]
        # PY3K: aBuf is a byte stream, so c is an int, not a byte
        byteCls = self._mModel['classTable'][wrap_ord(c)]
        if self._mCurrentState == eStart:
            self._mCurrentBytePos = 0
            self._mCurrentCharLen = self._mModel['charLenTable'][byteCls]
        # from byte's class and stateTable, we get its next state
        self._mCurrentState = self._mModel['stateTable'][self._mCurrentState * self._mModel['classFactor'] + byteCls]
        curr_state = (self._mCurrentState * self._mModel['classFactor']
                      + byteCls)
        self._mCurrentState = self._mModel['stateTable'][curr_state]
        self._mCurrentBytePos += 1
        return self._mCurrentState
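For illustration, not part of this commit: next_state() above treats stateTable as a flattened 2-D matrix indexed by state row and byte-class column; a toy sketch of the same lookup (the model dict below is hypothetical, not one of chardet's real SM models).

# Illustrative sketch of the row * classFactor + column indexing used above.
eStart, eError, eItsMe = 0, 1, 2

toy_model = {
    'classTable': [0] * 256,          # every byte falls in class 0 in this toy model
    'classFactor': 4,                 # number of byte classes per state row
    'stateTable': (eStart, eError, eItsMe, eError,   # row for state eStart
                   eError, eError, eError, eError),  # row for state eError
    'charLenTable': (1, 0, 0, 0),
}

state = eStart
byte_cls = toy_model['classTable'][0x41]            # class of byte 0x41 -> 0
state = toy_model['stateTable'][state * toy_model['classFactor'] + byte_cls]
print(state)  # -> 0 (still eStart)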
thirdparty/chardet/compat.py (vendored, new file) | 34

@@ -0,0 +1,34 @@
######################## BEGIN LICENSE BLOCK ########################
# Contributor(s):
#   Ian Cordasco - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

import sys


if sys.version_info < (3, 0):
    base_str = (str, unicode)
else:
    base_str = (bytes, str)


def wrap_ord(a):
    if sys.version_info < (3, 0) and isinstance(a, base_str):
        return ord(a)
    else:
        return a
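For illustration, not part of this commit: wrap_ord() exists because indexing a byte string yields a one-character str on Python 2 but an int on Python 3; a usage sketch, assuming the vendored package is importable as "chardet".

# Illustrative usage sketch; import path is an assumption.
from chardet.compat import wrap_ord

buf = b'\xc4\xa1'
c = buf[0]           # Python 2: the str '\xc4'; Python 3: the int 196
print(wrap_ord(c))   # -> 196 on both, so table lookups like classTable[wrap_ord(c)]
                     #    behave identically across interpreter versions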
thirdparty/chardet/constants.py (vendored) | 8

@@ -37,11 +37,3 @@ eError = 1
eItsMe = 2

SHORTCUT_THRESHOLD = 0.95

import __builtin__
if not hasattr(__builtin__, 'False'):
    False = 0
    True = 1
else:
    False = __builtin__.False
    True = __builtin__.True
thirdparty/chardet/cp949prober.py (vendored, new file) | 44
|
@ -0,0 +1,44 @@
|
|||
######################## BEGIN LICENSE BLOCK ########################
|
||||
# The Original Code is mozilla.org code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 1998
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Mark Pilgrim - port to Python
|
||||
#
|
||||
# This library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from .mbcharsetprober import MultiByteCharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .chardistribution import EUCKRDistributionAnalysis
|
||||
from .mbcssm import CP949SMModel
|
||||
|
||||
|
||||
class CP949Prober(MultiByteCharSetProber):
|
||||
def __init__(self):
|
||||
MultiByteCharSetProber.__init__(self)
|
||||
self._mCodingSM = CodingStateMachine(CP949SMModel)
|
||||
# NOTE: CP949 is a superset of EUC-KR, so the distribution should be
|
||||
# not different.
|
||||
self._mDistributionAnalyzer = EUCKRDistributionAnalysis()
|
||||
self.reset()
|
||||
|
||||
def get_charset_name(self):
|
||||
return "CP949"
|
thirdparty/chardet/escprober.py (vendored) | 31
|
@ -25,15 +25,18 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants, sys
|
||||
from escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel
|
||||
from charsetprober import CharSetProber
|
||||
from codingstatemachine import CodingStateMachine
|
||||
from . import constants
|
||||
from .escsm import (HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel,
|
||||
ISO2022KRSMModel)
|
||||
from .charsetprober import CharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .compat import wrap_ord
|
||||
|
||||
|
||||
class EscCharSetProber(CharSetProber):
|
||||
def __init__(self):
|
||||
CharSetProber.__init__(self)
|
||||
self._mCodingSM = [ \
|
||||
self._mCodingSM = [
|
||||
CodingStateMachine(HZSMModel),
|
||||
CodingStateMachine(ISO2022CNSMModel),
|
||||
CodingStateMachine(ISO2022JPSMModel),
|
||||
|
@ -44,8 +47,9 @@ class EscCharSetProber(CharSetProber):
|
|||
def reset(self):
|
||||
CharSetProber.reset(self)
|
||||
for codingSM in self._mCodingSM:
|
||||
if not codingSM: continue
|
||||
codingSM.active = constants.True
|
||||
if not codingSM:
|
||||
continue
|
||||
codingSM.active = True
|
||||
codingSM.reset()
|
||||
self._mActiveSM = len(self._mCodingSM)
|
||||
self._mDetectedCharset = None
|
||||
|
@ -61,19 +65,22 @@ class EscCharSetProber(CharSetProber):
|
|||
|
||||
def feed(self, aBuf):
|
||||
for c in aBuf:
|
||||
# PY3K: aBuf is a byte array, so c is an int, not a byte
|
||||
for codingSM in self._mCodingSM:
|
||||
if not codingSM: continue
|
||||
if not codingSM.active: continue
|
||||
codingState = codingSM.next_state(c)
|
||||
if not codingSM:
|
||||
continue
|
||||
if not codingSM.active:
|
||||
continue
|
||||
codingState = codingSM.next_state(wrap_ord(c))
|
||||
if codingState == constants.eError:
|
||||
codingSM.active = constants.False
|
||||
codingSM.active = False
|
||||
self._mActiveSM -= 1
|
||||
if self._mActiveSM <= 0:
|
||||
self._mState = constants.eNotMe
|
||||
return self.get_state()
|
||||
elif codingState == constants.eItsMe:
|
||||
self._mState = constants.eFoundIt
|
||||
self._mDetectedCharset = codingSM.get_coding_state_machine()
|
||||
self._mDetectedCharset = codingSM.get_coding_state_machine() # nopep8
|
||||
return self.get_state()
|
||||
|
||||
return self.get_state()
|
||||
|
|
thirdparty/chardet/escsm.py (vendored) | 20
|
@ -25,9 +25,9 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from constants import eStart, eError, eItsMe
|
||||
from .constants import eStart, eError, eItsMe
|
||||
|
||||
HZ_cls = ( \
|
||||
HZ_cls = (
|
||||
1,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,0,0,0,0,0,0, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
|
@ -62,7 +62,7 @@ HZ_cls = ( \
|
|||
1,1,1,1,1,1,1,1, # f8 - ff
|
||||
)
|
||||
|
||||
HZ_st = ( \
|
||||
HZ_st = (
|
||||
eStart,eError, 3,eStart,eStart,eStart,eError,eError,# 00-07
|
||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f
|
||||
eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError,# 10-17
|
||||
|
@ -79,7 +79,7 @@ HZSMModel = {'classTable': HZ_cls,
|
|||
'charLenTable': HZCharLenTable,
|
||||
'name': "HZ-GB-2312"}
|
||||
|
||||
ISO2022CN_cls = ( \
|
||||
ISO2022CN_cls = (
|
||||
2,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,0,0,0,0,0,0, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
|
@ -114,7 +114,7 @@ ISO2022CN_cls = ( \
|
|||
2,2,2,2,2,2,2,2, # f8 - ff
|
||||
)
|
||||
|
||||
ISO2022CN_st = ( \
|
||||
ISO2022CN_st = (
|
||||
eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07
|
||||
eStart,eError,eError,eError,eError,eError,eError,eError,# 08-0f
|
||||
eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17
|
||||
|
@ -133,7 +133,7 @@ ISO2022CNSMModel = {'classTable': ISO2022CN_cls,
|
|||
'charLenTable': ISO2022CNCharLenTable,
|
||||
'name': "ISO-2022-CN"}
|
||||
|
||||
ISO2022JP_cls = ( \
|
||||
ISO2022JP_cls = (
|
||||
2,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,0,0,0,0,2,2, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
|
@ -168,7 +168,7 @@ ISO2022JP_cls = ( \
|
|||
2,2,2,2,2,2,2,2, # f8 - ff
|
||||
)
|
||||
|
||||
ISO2022JP_st = ( \
|
||||
ISO2022JP_st = (
|
||||
eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07
|
||||
eStart,eStart,eError,eError,eError,eError,eError,eError,# 08-0f
|
||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17
|
||||
|
@ -188,7 +188,7 @@ ISO2022JPSMModel = {'classTable': ISO2022JP_cls,
|
|||
'charLenTable': ISO2022JPCharLenTable,
|
||||
'name': "ISO-2022-JP"}
|
||||
|
||||
ISO2022KR_cls = ( \
|
||||
ISO2022KR_cls = (
|
||||
2,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,0,0,0,0,0,0, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
|
@ -223,7 +223,7 @@ ISO2022KR_cls = ( \
|
|||
2,2,2,2,2,2,2,2, # f8 - ff
|
||||
)
|
||||
|
||||
ISO2022KR_st = ( \
|
||||
ISO2022KR_st = (
|
||||
eStart, 3,eError,eStart,eStart,eStart,eError,eError,# 00-07
|
||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f
|
||||
eItsMe,eItsMe,eError,eError,eError, 4,eError,eError,# 10-17
|
||||
|
@ -238,3 +238,5 @@ ISO2022KRSMModel = {'classTable': ISO2022KR_cls,
|
|||
'stateTable': ISO2022KR_st,
|
||||
'charLenTable': ISO2022KRCharLenTable,
|
||||
'name': "ISO-2022-KR"}
|
||||
|
||||
# flake8: noqa
|
||||
|
|
thirdparty/chardet/eucjpprober.py (vendored) | 35
|
@ -25,13 +25,14 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants, sys
|
||||
from constants import eStart, eError, eItsMe
|
||||
from mbcharsetprober import MultiByteCharSetProber
|
||||
from codingstatemachine import CodingStateMachine
|
||||
from chardistribution import EUCJPDistributionAnalysis
|
||||
from jpcntx import EUCJPContextAnalysis
|
||||
from mbcssm import EUCJPSMModel
|
||||
import sys
|
||||
from . import constants
|
||||
from .mbcharsetprober import MultiByteCharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .chardistribution import EUCJPDistributionAnalysis
|
||||
from .jpcntx import EUCJPContextAnalysis
|
||||
from .mbcssm import EUCJPSMModel
|
||||
|
||||
|
||||
class EUCJPProber(MultiByteCharSetProber):
|
||||
def __init__(self):
|
||||
|
@ -50,17 +51,20 @@ class EUCJPProber(MultiByteCharSetProber):
|
|||
|
||||
def feed(self, aBuf):
|
||||
aLen = len(aBuf)
|
||||
for i in xrange(0, aLen):
|
||||
for i in range(0, aLen):
|
||||
# PY3K: aBuf is a byte array, so aBuf[i] is an int, not a byte
|
||||
codingState = self._mCodingSM.next_state(aBuf[i])
|
||||
if codingState == eError:
|
||||
if codingState == constants.eError:
|
||||
if constants._debug:
|
||||
sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n')
|
||||
sys.stderr.write(self.get_charset_name()
|
||||
+ ' prober hit error at byte ' + str(i)
|
||||
+ '\n')
|
||||
self._mState = constants.eNotMe
|
||||
break
|
||||
elif codingState == eItsMe:
|
||||
elif codingState == constants.eItsMe:
|
||||
self._mState = constants.eFoundIt
|
||||
break
|
||||
elif codingState == eStart:
|
||||
elif codingState == constants.eStart:
|
||||
charLen = self._mCodingSM.get_current_charlen()
|
||||
if i == 0:
|
||||
self._mLastChar[1] = aBuf[0]
|
||||
|
@ -68,13 +72,14 @@ class EUCJPProber(MultiByteCharSetProber):
|
|||
self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
|
||||
else:
|
||||
self._mContextAnalyzer.feed(aBuf[i - 1:i + 1], charLen)
|
||||
self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen)
|
||||
self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
|
||||
charLen)
|
||||
|
||||
self._mLastChar[0] = aBuf[aLen - 1]
|
||||
|
||||
if self.get_state() == constants.eDetecting:
|
||||
if self._mContextAnalyzer.got_enough_data() and \
|
||||
(self.get_confidence() > constants.SHORTCUT_THRESHOLD):
|
||||
if (self._mContextAnalyzer.got_enough_data() and
|
||||
(self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
|
||||
self._mState = constants.eFoundIt
|
||||
|
||||
return self.get_state()
|
||||
|
|
thirdparty/chardet/euckrfreq.py (vendored) | 2

@@ -592,3 +592,5 @@ EUCKRCharToFreqOrder = ( \
8704,8705,8706,8707,8708,8709,8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,
8720,8721,8722,8723,8724,8725,8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,
8736,8737,8738,8739,8740,8741)

# flake8: noqa
thirdparty/chardet/euckrprober.py (vendored) | 9

@@ -25,10 +25,11 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

from mbcharsetprober import MultiByteCharSetProber
from codingstatemachine import CodingStateMachine
from chardistribution import EUCKRDistributionAnalysis
from mbcssm import EUCKRSMModel
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import EUCKRDistributionAnalysis
from .mbcssm import EUCKRSMModel


class EUCKRProber(MultiByteCharSetProber):
    def __init__(self):
thirdparty/chardet/euctwfreq.py (vendored) | 4

@@ -46,7 +46,7 @@ EUCTW_TYPICAL_DISTRIBUTION_RATIO = 0.75
# Char to FreqOrder table ,
EUCTW_TABLE_SIZE = 8102

EUCTWCharToFreqOrder = ( \
EUCTWCharToFreqOrder = (
1,1800,1506, 255,1431, 198, 9, 82, 6,7310, 177, 202,3615,1256,2808, 110, # 2742
3735, 33,3241, 261, 76, 44,2113, 16,2931,2184,1176, 659,3868, 26,3404,2643, # 2758
1198,3869,3313,4060, 410,2211, 302, 590, 361,1963, 8, 204, 58,4296,7311,1931, # 2774

@@ -424,3 +424,5 @@ EUCTWCharToFreqOrder = ( \
8694,8695,8696,8697,8698,8699,8700,8701,8702,8703,8704,8705,8706,8707,8708,8709, # 8710
8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,8720,8721,8722,8723,8724,8725, # 8726
8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,8736,8737,8738,8739,8740,8741) # 8742

# flake8: noqa
thirdparty/chardet/euctwprober.py (vendored) | 8

@@ -25,10 +25,10 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

from mbcharsetprober import MultiByteCharSetProber
from codingstatemachine import CodingStateMachine
from chardistribution import EUCTWDistributionAnalysis
from mbcssm import EUCTWSMModel
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import EUCTWDistributionAnalysis
from .mbcssm import EUCTWSMModel

class EUCTWProber(MultiByteCharSetProber):
    def __init__(self):
thirdparty/chardet/gb2312freq.py (vendored) | 3

@@ -43,7 +43,7 @@ GB2312_TYPICAL_DISTRIBUTION_RATIO = 0.9

GB2312_TABLE_SIZE = 3760

GB2312CharToFreqOrder = ( \
GB2312CharToFreqOrder = (
1671, 749,1443,2364,3924,3807,2330,3921,1704,3463,2691,1511,1515, 572,3191,2205,
2361, 224,2558, 479,1711, 963,3162, 440,4060,1905,2966,2947,3580,2647,3961,3842,
2204, 869,4207, 970,2678,5626,2944,2956,1479,4048, 514,3595, 588,1346,2820,3409,

@@ -469,3 +469,4 @@ GB2312CharToFreqOrder = ( \
5867,5507,6273,4206,6274,4789,6098,6764,3619,3646,3833,3804,2394,3788,4936,3978,
4866,4899,6099,6100,5559,6478,6765,3599,5868,6101,5869,5870,6275,6766,4527,6767)

# flake8: noqa
thirdparty/chardet/gb2312prober.py (vendored) | 8

@@ -25,10 +25,10 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

from mbcharsetprober import MultiByteCharSetProber
from codingstatemachine import CodingStateMachine
from chardistribution import GB2312DistributionAnalysis
from mbcssm import GB2312SMModel
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import GB2312DistributionAnalysis
from .mbcssm import GB2312SMModel

class GB2312Prober(MultiByteCharSetProber):
    def __init__(self):
thirdparty/chardet/hebrewprober.py (vendored) | 132
|
@ -25,8 +25,9 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from charsetprober import CharSetProber
|
||||
import constants
|
||||
from .charsetprober import CharSetProber
|
||||
from .constants import eNotMe, eDetecting
|
||||
from .compat import wrap_ord
|
||||
|
||||
# This prober doesn't actually recognize a language or a charset.
|
||||
# It is a helper prober for the use of the Hebrew model probers
|
||||
|
@ -126,28 +127,31 @@ import constants
|
|||
# charset identified, either "windows-1255" or "ISO-8859-8".
|
||||
|
||||
# windows-1255 / ISO-8859-8 code points of interest
|
||||
FINAL_KAF = '\xea'
|
||||
NORMAL_KAF = '\xeb'
|
||||
FINAL_MEM = '\xed'
|
||||
NORMAL_MEM = '\xee'
|
||||
FINAL_NUN = '\xef'
|
||||
NORMAL_NUN = '\xf0'
|
||||
FINAL_PE = '\xf3'
|
||||
NORMAL_PE = '\xf4'
|
||||
FINAL_TSADI = '\xf5'
|
||||
NORMAL_TSADI = '\xf6'
|
||||
FINAL_KAF = 0xea
|
||||
NORMAL_KAF = 0xeb
|
||||
FINAL_MEM = 0xed
|
||||
NORMAL_MEM = 0xee
|
||||
FINAL_NUN = 0xef
|
||||
NORMAL_NUN = 0xf0
|
||||
FINAL_PE = 0xf3
|
||||
NORMAL_PE = 0xf4
|
||||
FINAL_TSADI = 0xf5
|
||||
NORMAL_TSADI = 0xf6
|
||||
|
||||
# Minimum Visual vs Logical final letter score difference.
|
||||
# If the difference is below this, don't rely solely on the final letter score distance.
|
||||
# If the difference is below this, don't rely solely on the final letter score
|
||||
# distance.
|
||||
MIN_FINAL_CHAR_DISTANCE = 5
|
||||
|
||||
# Minimum Visual vs Logical model score difference.
|
||||
# If the difference is below this, don't rely at all on the model score distance.
|
||||
# If the difference is below this, don't rely at all on the model score
|
||||
# distance.
|
||||
MIN_MODEL_DISTANCE = 0.01
|
||||
|
||||
VISUAL_HEBREW_NAME = "ISO-8859-8"
|
||||
LOGICAL_HEBREW_NAME = "windows-1255"
|
||||
|
||||
|
||||
class HebrewProber(CharSetProber):
|
||||
def __init__(self):
|
||||
CharSetProber.__init__(self)
|
||||
|
@ -159,8 +163,8 @@ class HebrewProber(CharSetProber):
|
|||
self._mFinalCharLogicalScore = 0
|
||||
self._mFinalCharVisualScore = 0
|
||||
# The two last characters seen in the previous buffer,
|
||||
# mPrev and mBeforePrev are initialized to space in order to simulate a word
|
||||
# delimiter at the beginning of the data
|
||||
# mPrev and mBeforePrev are initialized to space in order to simulate
|
||||
# a word delimiter at the beginning of the data
|
||||
self._mPrev = ' '
|
||||
self._mBeforePrev = ' '
|
||||
# These probers are owned by the group prober.
|
||||
|
@ -170,49 +174,52 @@ class HebrewProber(CharSetProber):
|
|||
self._mVisualProber = visualProber
|
||||
|
||||
def is_final(self, c):
|
||||
return c in [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI]
|
||||
return wrap_ord(c) in [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE,
|
||||
FINAL_TSADI]
|
||||
|
||||
def is_non_final(self, c):
|
||||
# The normal Tsadi is not a good Non-Final letter due to words like
|
||||
# 'lechotet' (to chat) containing an apostrophe after the tsadi. This
|
||||
# apostrophe is converted to a space in FilterWithoutEnglishLetters causing
|
||||
# the Non-Final tsadi to appear at an end of a word even though this is not
|
||||
# the case in the original text.
|
||||
# The letters Pe and Kaf rarely display a related behavior of not being a
|
||||
# good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for
|
||||
# example legally end with a Non-Final Pe or Kaf. However, the benefit of
|
||||
# these letters as Non-Final letters outweighs the damage since these words
|
||||
# are quite rare.
|
||||
return c in [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
|
||||
# apostrophe is converted to a space in FilterWithoutEnglishLetters
|
||||
# causing the Non-Final tsadi to appear at an end of a word even
|
||||
# though this is not the case in the original text.
|
||||
# The letters Pe and Kaf rarely display a related behavior of not being
|
||||
# a good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak'
|
||||
# for example legally end with a Non-Final Pe or Kaf. However, the
|
||||
# benefit of these letters as Non-Final letters outweighs the damage
|
||||
# since these words are quite rare.
|
||||
return wrap_ord(c) in [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
|
||||
|
||||
def feed(self, aBuf):
|
||||
# Final letter analysis for logical-visual decision.
|
||||
# Look for evidence that the received buffer is either logical Hebrew or
|
||||
# visual Hebrew.
|
||||
# Look for evidence that the received buffer is either logical Hebrew
|
||||
# or visual Hebrew.
|
||||
# The following cases are checked:
|
||||
# 1) A word longer than 1 letter, ending with a final letter. This is an
|
||||
# indication that the text is laid out "naturally" since the final letter
|
||||
# really appears at the end. +1 for logical score.
|
||||
# 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
|
||||
# Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with
|
||||
# the Non-Final form of that letter. Exceptions to this rule are mentioned
|
||||
# above in isNonFinal(). This is an indication that the text is laid out
|
||||
# backwards. +1 for visual score
|
||||
# 3) A word longer than 1 letter, starting with a final letter. Final letters
|
||||
# should not appear at the beginning of a word. This is an indication that
|
||||
# the text is laid out backwards. +1 for visual score.
|
||||
# 1) A word longer than 1 letter, ending with a final letter. This is
|
||||
# an indication that the text is laid out "naturally" since the
|
||||
# final letter really appears at the end. +1 for logical score.
|
||||
# 2) A word longer than 1 letter, ending with a Non-Final letter. In
|
||||
# normal Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi,
|
||||
# should not end with the Non-Final form of that letter. Exceptions
|
||||
# to this rule are mentioned above in isNonFinal(). This is an
|
||||
# indication that the text is laid out backwards. +1 for visual
|
||||
# score
|
||||
# 3) A word longer than 1 letter, starting with a final letter. Final
|
||||
# letters should not appear at the beginning of a word. This is an
|
||||
# indication that the text is laid out backwards. +1 for visual
|
||||
# score.
|
||||
#
|
||||
# The visual score and logical score are accumulated throughout the text and
|
||||
# are finally checked against each other in GetCharSetName().
|
||||
# No checking for final letters in the middle of words is done since that case
|
||||
# is not an indication for either Logical or Visual text.
|
||||
# The visual score and logical score are accumulated throughout the
|
||||
# text and are finally checked against each other in GetCharSetName().
|
||||
# No checking for final letters in the middle of words is done since
|
||||
# that case is not an indication for either Logical or Visual text.
|
||||
#
|
||||
# We automatically filter out all 7-bit characters (replace them with spaces)
|
||||
# so the word boundary detection works properly. [MAP]
|
||||
# We automatically filter out all 7-bit characters (replace them with
|
||||
# spaces) so the word boundary detection works properly. [MAP]
|
||||
|
||||
if self.get_state() == constants.eNotMe:
|
||||
if self.get_state() == eNotMe:
|
||||
# Both model probers say it's not them. No reason to continue.
|
||||
return constants.eNotMe
|
||||
return eNotMe
|
||||
|
||||
aBuf = self.filter_high_bit_only(aBuf)
|
||||
|
||||
|
@ -220,23 +227,27 @@ class HebrewProber(CharSetProber):
|
|||
if cur == ' ':
|
||||
# We stand on a space - a word just ended
|
||||
if self._mBeforePrev != ' ':
|
||||
# next-to-last char was not a space so self._mPrev is not a 1 letter word
|
||||
# next-to-last char was not a space so self._mPrev is not a
|
||||
# 1 letter word
|
||||
if self.is_final(self._mPrev):
|
||||
# case (1) [-2:not space][-1:final letter][cur:space]
|
||||
self._mFinalCharLogicalScore += 1
|
||||
elif self.is_non_final(self._mPrev):
|
||||
# case (2) [-2:not space][-1:Non-Final letter][cur:space]
|
||||
# case (2) [-2:not space][-1:Non-Final letter][
|
||||
# cur:space]
|
||||
self._mFinalCharVisualScore += 1
|
||||
else:
|
||||
# Not standing on a space
|
||||
if (self._mBeforePrev == ' ') and (self.is_final(self._mPrev)) and (cur != ' '):
|
||||
if ((self._mBeforePrev == ' ') and
|
||||
(self.is_final(self._mPrev)) and (cur != ' ')):
|
||||
# case (3) [-2:space][-1:final letter][cur:not space]
|
||||
self._mFinalCharVisualScore += 1
|
||||
self._mBeforePrev = self._mPrev
|
||||
self._mPrev = cur
|
||||
|
||||
# Forever detecting, till the end or until both model probers return eNotMe (handled above)
|
||||
return constants.eDetecting
|
||||
# Forever detecting, till the end or until both model probers return
|
||||
# eNotMe (handled above)
|
||||
return eDetecting
|
||||
|
||||
def get_charset_name(self):
|
||||
# Make the decision: is it Logical or Visual?
|
||||
|
@ -248,22 +259,25 @@ class HebrewProber(CharSetProber):
|
|||
return VISUAL_HEBREW_NAME
|
||||
|
||||
# It's not dominant enough, try to rely on the model scores instead.
|
||||
modelsub = self._mLogicalProber.get_confidence() - self._mVisualProber.get_confidence()
|
||||
modelsub = (self._mLogicalProber.get_confidence()
|
||||
- self._mVisualProber.get_confidence())
|
||||
if modelsub > MIN_MODEL_DISTANCE:
|
||||
return LOGICAL_HEBREW_NAME
|
||||
if modelsub < -MIN_MODEL_DISTANCE:
|
||||
return VISUAL_HEBREW_NAME
|
||||
|
||||
# Still no good, back to final letter distance, maybe it'll save the day.
|
||||
# Still no good, back to final letter distance, maybe it'll save the
|
||||
# day.
|
||||
if finalsub < 0.0:
|
||||
return VISUAL_HEBREW_NAME
|
||||
|
||||
# (finalsub > 0 - Logical) or (don't know what to do) default to Logical.
|
||||
# (finalsub > 0 - Logical) or (don't know what to do) default to
|
||||
# Logical.
|
||||
return LOGICAL_HEBREW_NAME
|
||||
|
||||
def get_state(self):
|
||||
# Remain active as long as any of the model probers are active.
|
||||
if (self._mLogicalProber.get_state() == constants.eNotMe) and \
|
||||
(self._mVisualProber.get_state() == constants.eNotMe):
|
||||
return constants.eNotMe
|
||||
return constants.eDetecting
|
||||
if (self._mLogicalProber.get_state() == eNotMe) and \
|
||||
(self._mVisualProber.get_state() == eNotMe):
|
||||
return eNotMe
|
||||
return eDetecting
|
||||
|
|
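For illustration, not part of this commit: HebrewProber.feed() above decides between logical (windows-1255) and visual (ISO-8859-8) Hebrew by checking whether word-final positions carry final-form letters; a Python 3 sketch of that scoring rule on a single word, using the windows-1255 code points listed in the hunk above (the example word bytes are an assumption).

# Illustrative sketch of the final-letter heuristic.
FINAL_FORMS = {0xea, 0xed, 0xef, 0xf3, 0xf5}      # Kaf, Mem, Nun, Pe, Tsadi (final form)
NON_FINAL_FORMS = {0xeb, 0xee, 0xf0, 0xf4}        # the same letters, non-final form

def score_word(word_bytes):
    """Return (+1, 0) for a 'logical' hint, (0, +1) for a 'visual' hint."""
    last = word_bytes[-1]
    if last in FINAL_FORMS:          # word ends with a final form -> text reads naturally
        return 1, 0
    if last in NON_FINAL_FORMS:      # word ends with a non-final form -> likely reversed
        return 0, 1
    return 0, 0

print(score_word(b'\xf9\xec\xe5\xed'))  # "shalom", ends with final Mem (0xed) -> (1, 0)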
thirdparty/chardet/jisfreq.py (vendored) | 4

@@ -46,7 +46,7 @@ JIS_TYPICAL_DISTRIBUTION_RATIO = 3.0
# Char to FreqOrder table ,
JIS_TABLE_SIZE = 4368

JISCharToFreqOrder = ( \
JISCharToFreqOrder = (
40, 1, 6, 182, 152, 180, 295,2127, 285, 381,3295,4304,3068,4606,3165,3510, # 16
3511,1822,2785,4607,1193,2226,5070,4608, 171,2996,1247, 18, 179,5071, 856,1661, # 32
1262,5072, 619, 127,3431,3512,3230,1899,1700, 232, 228,1294,1298, 284, 283,2041, # 48

@@ -565,3 +565,5 @@ JISCharToFreqOrder = ( \
8224,8225,8226,8227,8228,8229,8230,8231,8232,8233,8234,8235,8236,8237,8238,8239, # 8240
8240,8241,8242,8243,8244,8245,8246,8247,8248,8249,8250,8251,8252,8253,8254,8255, # 8256
8256,8257,8258,8259,8260,8261,8262,8263,8264,8265,8266,8267,8268,8269,8270,8271) # 8272

# flake8: noqa
thirdparty/chardet/jpcntx.py (vendored) | 79
|
@ -25,7 +25,7 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants
|
||||
from .compat import wrap_ord
|
||||
|
||||
NUM_OF_CATEGORY = 6
|
||||
DONT_KNOW = -1
|
||||
|
@ -34,7 +34,7 @@ MAX_REL_THRESHOLD = 1000
|
|||
MINIMUM_DATA_THRESHOLD = 4
|
||||
|
||||
# This is hiragana 2-char sequence table, the number in each cell represents its frequency category
|
||||
jp2CharContext = ( \
|
||||
jp2CharContext = (
|
||||
(0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1),
|
||||
(2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4),
|
||||
(0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2),
|
||||
|
@ -126,19 +126,26 @@ class JapaneseContextAnalysis:
|
|||
|
||||
def reset(self):
|
||||
self._mTotalRel = 0 # total sequence received
|
||||
self._mRelSample = [0] * NUM_OF_CATEGORY # category counters, each interger counts sequence in its category
|
||||
self._mNeedToSkipCharNum = 0 # if last byte in current buffer is not the last byte of a character, we need to know how many bytes to skip in next buffer
|
||||
# category counters, each interger counts sequence in its category
|
||||
self._mRelSample = [0] * NUM_OF_CATEGORY
|
||||
# if last byte in current buffer is not the last byte of a character,
|
||||
# we need to know how many bytes to skip in next buffer
|
||||
self._mNeedToSkipCharNum = 0
|
||||
self._mLastCharOrder = -1 # The order of previous char
|
||||
self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made
|
||||
# If this flag is set to True, detection is done and conclusion has
|
||||
# been made
|
||||
self._mDone = False
|
||||
|
||||
def feed(self, aBuf, aLen):
|
||||
if self._mDone: return
|
||||
if self._mDone:
|
||||
return
|
||||
|
||||
# The buffer we got is byte oriented, and a character may span in more than one
|
||||
# buffers. In case the last one or two byte in last buffer is not complete, we
|
||||
# record how many byte needed to complete that character and skip these bytes here.
|
||||
# We can choose to record those bytes as well and analyse the character once it
|
||||
# is complete, but since a character will not make much difference, by simply skipping
|
||||
# buffers. In case the last one or two byte in last buffer is not
|
||||
# complete, we record how many byte needed to complete that character
|
||||
# and skip these bytes here. We can choose to record those bytes as
|
||||
# well and analyse the character once it is complete, but since a
|
||||
# character will not make much difference, by simply skipping
|
||||
# this character will simply our logic and improve performance.
|
||||
i = self._mNeedToSkipCharNum
|
||||
while i < aLen:
|
||||
|
@ -151,7 +158,7 @@ class JapaneseContextAnalysis:
|
|||
if (order != -1) and (self._mLastCharOrder != -1):
|
||||
self._mTotalRel += 1
|
||||
if self._mTotalRel > MAX_REL_THRESHOLD:
|
||||
self._mDone = constants.True
|
||||
self._mDone = True
|
||||
break
|
||||
self._mRelSample[jp2CharContext[self._mLastCharOrder][order]] += 1
|
||||
self._mLastCharOrder = order
|
||||
|
@ -166,45 +173,55 @@ class JapaneseContextAnalysis:
|
|||
else:
|
||||
return DONT_KNOW
|
||||
|
||||
def get_order(self, aStr):
|
||||
def get_order(self, aBuf):
|
||||
return -1, 1
|
||||
|
||||
class SJISContextAnalysis(JapaneseContextAnalysis):
|
||||
def get_order(self, aStr):
|
||||
if not aStr: return -1, 1
|
||||
def __init__(self):
|
||||
self.charset_name = "SHIFT_JIS"
|
||||
|
||||
def get_charset_name(self):
|
||||
return self.charset_name
|
||||
|
||||
def get_order(self, aBuf):
|
||||
if not aBuf:
|
||||
return -1, 1
|
||||
# find out current char's byte length
|
||||
if ((aStr[0] >= '\x81') and (aStr[0] <= '\x9F')) or \
|
||||
((aStr[0] >= '\xE0') and (aStr[0] <= '\xFC')):
|
||||
first_char = wrap_ord(aBuf[0])
|
||||
if ((0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC)):
|
||||
charLen = 2
|
||||
if (first_char == 0x87) or (0xFA <= first_char <= 0xFC):
|
||||
self.charset_name = "CP932"
|
||||
else:
|
||||
charLen = 1
|
||||
|
||||
# return its order if it is hiragana
|
||||
if len(aStr) > 1:
|
||||
if (aStr[0] == '\202') and \
|
||||
(aStr[1] >= '\x9F') and \
|
||||
(aStr[1] <= '\xF1'):
|
||||
return ord(aStr[1]) - 0x9F, charLen
|
||||
if len(aBuf) > 1:
|
||||
second_char = wrap_ord(aBuf[1])
|
||||
if (first_char == 202) and (0x9F <= second_char <= 0xF1):
|
||||
return second_char - 0x9F, charLen
|
||||
|
||||
return -1, charLen
|
||||
|
||||
class EUCJPContextAnalysis(JapaneseContextAnalysis):
|
||||
def get_order(self, aStr):
|
||||
if not aStr: return -1, 1
|
||||
def get_order(self, aBuf):
|
||||
if not aBuf:
|
||||
return -1, 1
|
||||
# find out current char's byte length
|
||||
if (aStr[0] == '\x8E') or \
|
||||
((aStr[0] >= '\xA1') and (aStr[0] <= '\xFE')):
|
||||
first_char = wrap_ord(aBuf[0])
|
||||
if (first_char == 0x8E) or (0xA1 <= first_char <= 0xFE):
|
||||
charLen = 2
|
||||
elif aStr[0] == '\x8F':
|
||||
elif first_char == 0x8F:
|
||||
charLen = 3
|
||||
else:
|
||||
charLen = 1
|
||||
|
||||
# return its order if it is hiragana
|
||||
if len(aStr) > 1:
|
||||
if (aStr[0] == '\xA4') and \
|
||||
(aStr[1] >= '\xA1') and \
|
||||
(aStr[1] <= '\xF3'):
|
||||
return ord(aStr[1]) - 0xA1, charLen
|
||||
if len(aBuf) > 1:
|
||||
second_char = wrap_ord(aBuf[1])
|
||||
if (first_char == 0xA4) and (0xA1 <= second_char <= 0xF3):
|
||||
return second_char - 0xA1, charLen
|
||||
|
||||
return -1, charLen
|
||||
|
||||
# flake8: noqa
|
||||
|
|
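For illustration, not part of this commit: the rewritten SJISContextAnalysis.get_order() above derives the character length from the lead byte and flags CP932-only lead bytes; a small sketch of that lead-byte classification.

# Illustrative sketch of the SHIFT_JIS lead-byte logic used above.
def sjis_char_len(first_byte):
    if 0x81 <= first_byte <= 0x9F or 0xE0 <= first_byte <= 0xFC:
        return 2                      # double-byte character
    return 1                          # single byte (ASCII or half-width kana)

print(sjis_char_len(0x82))   # -> 2 (0x82 0xA0 is hiragana "a" in Shift_JIS)
print(sjis_char_len(0x41))   # -> 1 (plain ASCII 'A')
# Lead bytes 0x87 and 0xFA-0xFC belong to Microsoft's CP932 extension, which is
# why the prober above renames itself from "SHIFT_JIS" to "CP932" on seeing them.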
thirdparty/chardet/langbulgarianmodel.py (vendored) | 19
|
@ -25,8 +25,6 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants
|
||||
|
||||
# 255: Control characters that usually does not exist in any text
|
||||
# 254: Carriage/Return
|
||||
# 253: symbol (punctuation) that does not belong to word
|
||||
|
@ -36,7 +34,7 @@ import constants
|
|||
# this table is modified base on win1251BulgarianCharToOrderMap, so
|
||||
# only number <64 is sure valid
|
||||
|
||||
Latin5_BulgarianCharToOrderMap = ( \
|
||||
Latin5_BulgarianCharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -55,7 +53,7 @@ Latin5_BulgarianCharToOrderMap = ( \
|
|||
62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, # f0
|
||||
)
|
||||
|
||||
win1251BulgarianCharToOrderMap = ( \
|
||||
win1251BulgarianCharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -80,7 +78,7 @@ win1251BulgarianCharToOrderMap = ( \
|
|||
# first 1024 sequences:3.0618%
|
||||
# rest sequences: 0.2992%
|
||||
# negative sequences: 0.0020%
|
||||
BulgarianLangModel = ( \
|
||||
BulgarianLangModel = (
|
||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2,
|
||||
3,1,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,0,1,
|
||||
|
@ -211,18 +209,21 @@ BulgarianLangModel = ( \
|
|||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
)
|
||||
|
||||
Latin5BulgarianModel = { \
|
||||
Latin5BulgarianModel = {
|
||||
'charToOrderMap': Latin5_BulgarianCharToOrderMap,
|
||||
'precedenceMatrix': BulgarianLangModel,
|
||||
'mTypicalPositiveRatio': 0.969392,
|
||||
'keepEnglishLetter': constants.False,
|
||||
'keepEnglishLetter': False,
|
||||
'charsetName': "ISO-8859-5"
|
||||
}
|
||||
|
||||
Win1251BulgarianModel = { \
|
||||
Win1251BulgarianModel = {
|
||||
'charToOrderMap': win1251BulgarianCharToOrderMap,
|
||||
'precedenceMatrix': BulgarianLangModel,
|
||||
'mTypicalPositiveRatio': 0.969392,
|
||||
'keepEnglishLetter': constants.False,
|
||||
'keepEnglishLetter': False,
|
||||
'charsetName': "windows-1251"
|
||||
}
|
||||
|
||||
|
||||
# flake8: noqa
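The language-model modules in this commit only drop the constants import, remove the line-continuation backslashes, and switch to Python's own True/False; each model remains a plain dict. As a rough illustration (not the vendored prober), this is how such a dict is consumed, with SAMPLE_SIZE assumed to be the 64-character sampling window used by sbcharsetprober.py later in this commit:

SAMPLE_SIZE = 64  # the probers only track the 64 most frequent characters

def sequence_category(model, prev_byte, cur_byte):
    # map raw byte values to frequency orders, then look up how plausible the
    # pair is in the language's precedence matrix (0 = negative .. 3 = positive)
    prev_order = model['charToOrderMap'][prev_byte]
    cur_order = model['charToOrderMap'][cur_byte]
    if prev_order < SAMPLE_SIZE and cur_order < SAMPLE_SIZE:
        return model['precedenceMatrix'][prev_order * SAMPLE_SIZE + cur_order]
    return None  # at least one character is outside the sampled range

For example, sequence_category(Win1251BulgarianModel, 0xE0, 0xE1) rates the pair of Cyrillic letters at those windows-1251 code points.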
|
||||
|
|
42
thirdparty/chardet/langcyrillicmodel.py
vendored
|
@ -25,11 +25,9 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants
|
||||
|
||||
# KOI8-R language model
|
||||
# Character Mapping Table:
|
||||
KOI8R_CharToOrderMap = ( \
|
||||
KOI8R_CharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -48,7 +46,7 @@ KOI8R_CharToOrderMap = ( \
|
|||
35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, # f0
|
||||
)
|
||||
|
||||
win1251_CharToOrderMap = ( \
|
||||
win1251_CharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -67,7 +65,7 @@ win1251_CharToOrderMap = ( \
|
|||
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
|
||||
)
|
||||
|
||||
latin5_CharToOrderMap = ( \
|
||||
latin5_CharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -86,7 +84,7 @@ latin5_CharToOrderMap = ( \
|
|||
239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255,
|
||||
)
|
||||
|
||||
macCyrillic_CharToOrderMap = ( \
|
||||
macCyrillic_CharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -105,7 +103,7 @@ macCyrillic_CharToOrderMap = ( \
|
|||
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255,
|
||||
)
|
||||
|
||||
IBM855_CharToOrderMap = ( \
|
||||
IBM855_CharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -124,7 +122,7 @@ IBM855_CharToOrderMap = ( \
|
|||
250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255,
|
||||
)
|
||||
|
||||
IBM866_CharToOrderMap = ( \
|
||||
IBM866_CharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -149,7 +147,7 @@ IBM866_CharToOrderMap = ( \
|
|||
# first 1024 sequences: 2.3389%
|
||||
# rest sequences: 0.1237%
|
||||
# negative sequences: 0.0009%
|
||||
RussianLangModel = ( \
|
||||
RussianLangModel = (
|
||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2,
|
||||
3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,3,3,3,3,2,3,2,0,
|
||||
|
@ -280,50 +278,52 @@ RussianLangModel = ( \
|
|||
0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
|
||||
)
|
||||
|
||||
Koi8rModel = { \
|
||||
Koi8rModel = {
|
||||
'charToOrderMap': KOI8R_CharToOrderMap,
|
||||
'precedenceMatrix': RussianLangModel,
|
||||
'mTypicalPositiveRatio': 0.976601,
|
||||
'keepEnglishLetter': constants.False,
|
||||
'keepEnglishLetter': False,
|
||||
'charsetName': "KOI8-R"
|
||||
}
|
||||
|
||||
Win1251CyrillicModel = { \
|
||||
Win1251CyrillicModel = {
|
||||
'charToOrderMap': win1251_CharToOrderMap,
|
||||
'precedenceMatrix': RussianLangModel,
|
||||
'mTypicalPositiveRatio': 0.976601,
|
||||
'keepEnglishLetter': constants.False,
|
||||
'keepEnglishLetter': False,
|
||||
'charsetName': "windows-1251"
|
||||
}
|
||||
|
||||
Latin5CyrillicModel = { \
|
||||
Latin5CyrillicModel = {
|
||||
'charToOrderMap': latin5_CharToOrderMap,
|
||||
'precedenceMatrix': RussianLangModel,
|
||||
'mTypicalPositiveRatio': 0.976601,
|
||||
'keepEnglishLetter': constants.False,
|
||||
'keepEnglishLetter': False,
|
||||
'charsetName': "ISO-8859-5"
|
||||
}
|
||||
|
||||
MacCyrillicModel = { \
|
||||
MacCyrillicModel = {
|
||||
'charToOrderMap': macCyrillic_CharToOrderMap,
|
||||
'precedenceMatrix': RussianLangModel,
|
||||
'mTypicalPositiveRatio': 0.976601,
|
||||
'keepEnglishLetter': constants.False,
|
||||
'keepEnglishLetter': False,
|
||||
'charsetName': "MacCyrillic"
|
||||
};
|
||||
|
||||
Ibm866Model = { \
|
||||
Ibm866Model = {
|
||||
'charToOrderMap': IBM866_CharToOrderMap,
|
||||
'precedenceMatrix': RussianLangModel,
|
||||
'mTypicalPositiveRatio': 0.976601,
|
||||
'keepEnglishLetter': constants.False,
|
||||
'keepEnglishLetter': False,
|
||||
'charsetName': "IBM866"
|
||||
}
|
||||
|
||||
Ibm855Model = { \
|
||||
Ibm855Model = {
|
||||
'charToOrderMap': IBM855_CharToOrderMap,
|
||||
'precedenceMatrix': RussianLangModel,
|
||||
'mTypicalPositiveRatio': 0.976601,
|
||||
'keepEnglishLetter': constants.False,
|
||||
'keepEnglishLetter': False,
|
||||
'charsetName': "IBM855"
|
||||
}
|
||||
|
||||
# flake8: noqa
|
||||
|
|
18
thirdparty/chardet/langgreekmodel.py
vendored
|
@ -25,15 +25,13 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants
|
||||
|
||||
# 255: Control characters that usually does not exist in any text
|
||||
# 254: Carriage/Return
|
||||
# 253: symbol (punctuation) that does not belong to word
|
||||
# 252: 0 - 9
|
||||
|
||||
# Character Mapping Table:
|
||||
Latin7_CharToOrderMap = ( \
|
||||
Latin7_CharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -52,7 +50,7 @@ Latin7_CharToOrderMap = ( \
|
|||
9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, # f0
|
||||
)
|
||||
|
||||
win1253_CharToOrderMap = ( \
|
||||
win1253_CharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -77,7 +75,7 @@ win1253_CharToOrderMap = ( \
|
|||
# first 1024 sequences:1.7001%
|
||||
# rest sequences: 0.0359%
|
||||
# negative sequences: 0.0148%
|
||||
GreekLangModel = ( \
|
||||
GreekLangModel = (
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,3,2,2,3,3,3,3,3,3,3,3,1,3,3,3,0,2,2,3,3,0,3,0,3,2,0,3,3,3,0,
|
||||
|
@ -208,18 +206,20 @@ GreekLangModel = ( \
|
|||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
)
|
||||
|
||||
Latin7GreekModel = { \
|
||||
Latin7GreekModel = {
|
||||
'charToOrderMap': Latin7_CharToOrderMap,
|
||||
'precedenceMatrix': GreekLangModel,
|
||||
'mTypicalPositiveRatio': 0.982851,
|
||||
'keepEnglishLetter': constants.False,
|
||||
'keepEnglishLetter': False,
|
||||
'charsetName': "ISO-8859-7"
|
||||
}
|
||||
|
||||
Win1253GreekModel = { \
|
||||
Win1253GreekModel = {
|
||||
'charToOrderMap': win1253_CharToOrderMap,
|
||||
'precedenceMatrix': GreekLangModel,
|
||||
'mTypicalPositiveRatio': 0.982851,
|
||||
'keepEnglishLetter': constants.False,
|
||||
'keepEnglishLetter': False,
|
||||
'charsetName': "windows-1253"
|
||||
}
|
||||
|
||||
# flake8: noqa
|
||||
|
|
12
thirdparty/chardet/langhebrewmodel.py
vendored
|
@ -27,8 +27,6 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants
|
||||
|
||||
# 255: Control characters that usually does not exist in any text
|
||||
# 254: Carriage/Return
|
||||
# 253: symbol (punctuation) that does not belong to word
|
||||
|
@ -36,7 +34,7 @@ import constants
|
|||
|
||||
# Windows-1255 language model
|
||||
# Character Mapping Table:
|
||||
win1255_CharToOrderMap = ( \
|
||||
win1255_CharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -61,7 +59,7 @@ win1255_CharToOrderMap = ( \
|
|||
# first 1024 sequences: 1.5981%
|
||||
# rest sequences: 0.087%
|
||||
# negative sequences: 0.0015%
|
||||
HebrewLangModel = ( \
|
||||
HebrewLangModel = (
|
||||
0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0,
|
||||
3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,
|
||||
|
@ -192,10 +190,12 @@ HebrewLangModel = ( \
|
|||
0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0,
|
||||
)
|
||||
|
||||
Win1255HebrewModel = { \
|
||||
Win1255HebrewModel = {
|
||||
'charToOrderMap': win1255_CharToOrderMap,
|
||||
'precedenceMatrix': HebrewLangModel,
|
||||
'mTypicalPositiveRatio': 0.984004,
|
||||
'keepEnglishLetter': constants.False,
|
||||
'keepEnglishLetter': False,
|
||||
'charsetName': "windows-1255"
|
||||
}
|
||||
|
||||
# flake8: noqa
|
||||
|
|
18
thirdparty/chardet/langhungarianmodel.py
vendored
|
@ -25,15 +25,13 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants
|
||||
|
||||
# 255: Control characters that usually does not exist in any text
|
||||
# 254: Carriage/Return
|
||||
# 253: symbol (punctuation) that does not belong to word
|
||||
# 252: 0 - 9
|
||||
|
||||
# Character Mapping Table:
|
||||
Latin2_HungarianCharToOrderMap = ( \
|
||||
Latin2_HungarianCharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -52,7 +50,7 @@ Latin2_HungarianCharToOrderMap = ( \
|
|||
245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253,
|
||||
)
|
||||
|
||||
win1250HungarianCharToOrderMap = ( \
|
||||
win1250HungarianCharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -77,7 +75,7 @@ win1250HungarianCharToOrderMap = ( \
|
|||
# first 1024 sequences:5.2623%
|
||||
# rest sequences: 0.8894%
|
||||
# negative sequences: 0.0009%
|
||||
HungarianLangModel = ( \
|
||||
HungarianLangModel = (
|
||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
|
||||
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2,
|
||||
3,2,2,3,3,3,3,3,2,3,3,3,3,3,3,1,2,3,3,3,3,2,3,3,1,1,3,3,0,1,1,1,
|
||||
|
@ -208,18 +206,20 @@ HungarianLangModel = ( \
|
|||
0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
|
||||
)
|
||||
|
||||
Latin2HungarianModel = { \
|
||||
Latin2HungarianModel = {
|
||||
'charToOrderMap': Latin2_HungarianCharToOrderMap,
|
||||
'precedenceMatrix': HungarianLangModel,
|
||||
'mTypicalPositiveRatio': 0.947368,
|
||||
'keepEnglishLetter': constants.True,
|
||||
'keepEnglishLetter': True,
|
||||
'charsetName': "ISO-8859-2"
|
||||
}
|
||||
|
||||
Win1250HungarianModel = { \
|
||||
Win1250HungarianModel = {
|
||||
'charToOrderMap': win1250HungarianCharToOrderMap,
|
||||
'precedenceMatrix': HungarianLangModel,
|
||||
'mTypicalPositiveRatio': 0.947368,
|
||||
'keepEnglishLetter': constants.True,
|
||||
'keepEnglishLetter': True,
|
||||
'charsetName': "windows-1250"
|
||||
}
|
||||
|
||||
# flake8: noqa
|
||||
|
|
12
thirdparty/chardet/langthaimodel.py
vendored
|
@ -25,8 +25,6 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants
|
||||
|
||||
# 255: Control characters that usually does not exist in any text
|
||||
# 254: Carriage/Return
|
||||
# 253: symbol (punctuation) that does not belong to word
|
||||
|
@ -35,7 +33,7 @@ import constants
|
|||
# The following result for thai was collected from a limited sample (1M).
|
||||
|
||||
# Character Mapping Table:
|
||||
TIS620CharToOrderMap = ( \
|
||||
TIS620CharToOrderMap = (
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
|
||||
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
|
||||
|
@ -60,7 +58,7 @@ TIS620CharToOrderMap = ( \
|
|||
# first 1024 sequences:7.3177%
|
||||
# rest sequences: 1.0230%
|
||||
# negative sequences: 0.0436%
|
||||
ThaiLangModel = ( \
|
||||
ThaiLangModel = (
|
||||
0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3,
|
||||
0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2,
|
||||
3,0,3,3,2,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,3,0,3,2,3,0,2,2,2,3,
|
||||
|
@ -191,10 +189,12 @@ ThaiLangModel = ( \
|
|||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
)
|
||||
|
||||
TIS620ThaiModel = { \
|
||||
TIS620ThaiModel = {
|
||||
'charToOrderMap': TIS620CharToOrderMap,
|
||||
'precedenceMatrix': ThaiLangModel,
|
||||
'mTypicalPositiveRatio': 0.926386,
|
||||
'keepEnglishLetter': constants.False,
|
||||
'keepEnglishLetter': False,
|
||||
'charsetName': "TIS-620"
|
||||
}
|
||||
|
||||
# flake8: noqa
|
||||
|
|
31
thirdparty/chardet/latin1prober.py
vendored
|
@ -26,9 +26,9 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from charsetprober import CharSetProber
|
||||
import constants
|
||||
import operator
|
||||
from .charsetprober import CharSetProber
|
||||
from .constants import eNotMe
|
||||
from .compat import wrap_ord
|
||||
|
||||
FREQ_CAT_NUM = 4
|
||||
|
||||
|
@ -42,7 +42,7 @@ ASV = 6 # accent small vowel
|
|||
ASO = 7 # accent small other
|
||||
CLASS_NUM = 8 # total classes
|
||||
|
||||
Latin1_CharToClass = ( \
|
||||
Latin1_CharToClass = (
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17
|
||||
|
@ -81,7 +81,7 @@ Latin1_CharToClass = ( \
|
|||
# 1 : very unlikely
|
||||
# 2 : normal
|
||||
# 3 : very likely
|
||||
Latin1ClassModel = ( \
|
||||
Latin1ClassModel = (
|
||||
# UDF OTH ASC ASS ACV ACO ASV ASO
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # UDF
|
||||
0, 3, 3, 3, 3, 3, 3, 3, # OTH
|
||||
|
@ -93,6 +93,7 @@ Latin1ClassModel = ( \
|
|||
0, 3, 1, 3, 1, 1, 3, 3, # ASO
|
||||
)
|
||||
|
||||
|
||||
class Latin1Prober(CharSetProber):
|
||||
def __init__(self):
|
||||
CharSetProber.__init__(self)
|
||||
|
@ -109,10 +110,11 @@ class Latin1Prober(CharSetProber):
|
|||
def feed(self, aBuf):
|
||||
aBuf = self.filter_with_english_letters(aBuf)
|
||||
for c in aBuf:
|
||||
charClass = Latin1_CharToClass[ord(c)]
|
||||
freq = Latin1ClassModel[(self._mLastCharClass * CLASS_NUM) + charClass]
|
||||
charClass = Latin1_CharToClass[wrap_ord(c)]
|
||||
freq = Latin1ClassModel[(self._mLastCharClass * CLASS_NUM)
|
||||
+ charClass]
|
||||
if freq == 0:
|
||||
self._mState = constants.eNotMe
|
||||
self._mState = eNotMe
|
||||
break
|
||||
self._mFreqCounter[freq] += 1
|
||||
self._mLastCharClass = charClass
|
||||
|
@ -120,17 +122,18 @@ class Latin1Prober(CharSetProber):
|
|||
return self.get_state()
|
||||
|
||||
def get_confidence(self):
|
||||
if self.get_state() == constants.eNotMe:
|
||||
if self.get_state() == eNotMe:
|
||||
return 0.01
|
||||
|
||||
total = reduce(operator.add, self._mFreqCounter)
|
||||
total = sum(self._mFreqCounter)
|
||||
if total < 0.01:
|
||||
confidence = 0.0
|
||||
else:
|
||||
confidence = (self._mFreqCounter[3] / total) - (self._mFreqCounter[1] * 20.0 / total)
|
||||
confidence = ((self._mFreqCounter[3] - self._mFreqCounter[1] * 20.0)
|
||||
/ total)
|
||||
if confidence < 0.0:
|
||||
confidence = 0.0
|
||||
# lower the confidence of latin1 so that other more accurate detector
|
||||
# can take priority.
|
||||
confidence = confidence * 0.5
|
||||
# lower the confidence of latin1 so that other more accurate
|
||||
# detector can take priority.
|
||||
confidence = confidence * 0.73
|
||||
return confidence
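The reshaped confidence expression above is algebraically identical to the old one ((f3/total) - (f1*20/total) equals (f3 - f1*20)/total); the only behavioural change in this hunk is the damping factor, raised from 0.5 to 0.73. A quick check with invented counter values (index 1 = "very unlikely" pairs, index 3 = "very likely" pairs):

freq_counter = [0, 1, 10, 40]              # hypothetical _mFreqCounter contents
total = float(sum(freq_counter))
old_expr = (freq_counter[3] / total) - (freq_counter[1] * 20.0 / total)
new_expr = (freq_counter[3] - freq_counter[1] * 20.0) / total
assert abs(old_expr - new_expr) < 1e-12
confidence = max(new_expr, 0.0) * 0.73     # new damping factor (was 0.5)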
|
||||
|
|
30
thirdparty/chardet/mbcharsetprober.py
vendored
|
@ -27,16 +27,17 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants, sys
|
||||
from constants import eStart, eError, eItsMe
|
||||
from charsetprober import CharSetProber
|
||||
import sys
|
||||
from . import constants
|
||||
from .charsetprober import CharSetProber
|
||||
|
||||
|
||||
class MultiByteCharSetProber(CharSetProber):
|
||||
def __init__(self):
|
||||
CharSetProber.__init__(self)
|
||||
self._mDistributionAnalyzer = None
|
||||
self._mCodingSM = None
|
||||
self._mLastChar = ['\x00', '\x00']
|
||||
self._mLastChar = [0, 0]
|
||||
|
||||
def reset(self):
|
||||
CharSetProber.reset(self)
|
||||
|
@ -44,36 +45,39 @@ class MultiByteCharSetProber(CharSetProber):
|
|||
self._mCodingSM.reset()
|
||||
if self._mDistributionAnalyzer:
|
||||
self._mDistributionAnalyzer.reset()
|
||||
self._mLastChar = ['\x00', '\x00']
|
||||
self._mLastChar = [0, 0]
|
||||
|
||||
def get_charset_name(self):
|
||||
pass
|
||||
|
||||
def feed(self, aBuf):
|
||||
aLen = len(aBuf)
|
||||
for i in xrange(0, aLen):
|
||||
for i in range(0, aLen):
|
||||
codingState = self._mCodingSM.next_state(aBuf[i])
|
||||
if codingState == eError:
|
||||
if codingState == constants.eError:
|
||||
if constants._debug:
|
||||
sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n')
|
||||
sys.stderr.write(self.get_charset_name()
|
||||
+ ' prober hit error at byte ' + str(i)
|
||||
+ '\n')
|
||||
self._mState = constants.eNotMe
|
||||
break
|
||||
elif codingState == eItsMe:
|
||||
elif codingState == constants.eItsMe:
|
||||
self._mState = constants.eFoundIt
|
||||
break
|
||||
elif codingState == eStart:
|
||||
elif codingState == constants.eStart:
|
||||
charLen = self._mCodingSM.get_current_charlen()
|
||||
if i == 0:
|
||||
self._mLastChar[1] = aBuf[0]
|
||||
self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
|
||||
else:
|
||||
self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen)
|
||||
self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
|
||||
charLen)
|
||||
|
||||
self._mLastChar[0] = aBuf[aLen - 1]
|
||||
|
||||
if self.get_state() == constants.eDetecting:
|
||||
if self._mDistributionAnalyzer.got_enough_data() and \
|
||||
(self.get_confidence() > constants.SHORTCUT_THRESHOLD):
|
||||
if (self._mDistributionAnalyzer.got_enough_data() and
|
||||
(self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
|
||||
self._mState = constants.eFoundIt
|
||||
|
||||
return self.get_state()
|
||||
|
|
24
thirdparty/chardet/mbcsgroupprober.py
vendored
|
@ -27,24 +27,28 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from charsetgroupprober import CharSetGroupProber
|
||||
from utf8prober import UTF8Prober
|
||||
from sjisprober import SJISProber
|
||||
from eucjpprober import EUCJPProber
|
||||
from gb2312prober import GB2312Prober
|
||||
from euckrprober import EUCKRProber
|
||||
from big5prober import Big5Prober
|
||||
from euctwprober import EUCTWProber
|
||||
from .charsetgroupprober import CharSetGroupProber
|
||||
from .utf8prober import UTF8Prober
|
||||
from .sjisprober import SJISProber
|
||||
from .eucjpprober import EUCJPProber
|
||||
from .gb2312prober import GB2312Prober
|
||||
from .euckrprober import EUCKRProber
|
||||
from .cp949prober import CP949Prober
|
||||
from .big5prober import Big5Prober
|
||||
from .euctwprober import EUCTWProber
|
||||
|
||||
|
||||
class MBCSGroupProber(CharSetGroupProber):
|
||||
def __init__(self):
|
||||
CharSetGroupProber.__init__(self)
|
||||
self._mProbers = [ \
|
||||
self._mProbers = [
|
||||
UTF8Prober(),
|
||||
SJISProber(),
|
||||
EUCJPProber(),
|
||||
GB2312Prober(),
|
||||
EUCKRProber(),
|
||||
CP949Prober(),
|
||||
Big5Prober(),
|
||||
EUCTWProber()]
|
||||
EUCTWProber()
|
||||
]
|
||||
self.reset()
|
||||
|
|
134
thirdparty/chardet/mbcssm.py
vendored
|
@ -25,11 +25,11 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from constants import eStart, eError, eItsMe
|
||||
from .constants import eStart, eError, eItsMe
|
||||
|
||||
# BIG5
|
||||
|
||||
BIG5_cls = ( \
|
||||
BIG5_cls = (
|
||||
1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as legal value
|
||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||
1,1,1,1,1,1,1,1, # 10 - 17
|
||||
|
@ -61,12 +61,14 @@ BIG5_cls = ( \
|
|||
3,3,3,3,3,3,3,3, # e0 - e7
|
||||
3,3,3,3,3,3,3,3, # e8 - ef
|
||||
3,3,3,3,3,3,3,3, # f0 - f7
|
||||
3,3,3,3,3,3,3,0) # f8 - ff
|
||||
3,3,3,3,3,3,3,0 # f8 - ff
|
||||
)
|
||||
|
||||
BIG5_st = ( \
|
||||
BIG5_st = (
|
||||
eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07
|
||||
eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,#08-0f
|
||||
eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart)#10-17
|
||||
eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart#10-17
|
||||
)
|
||||
|
||||
Big5CharLenTable = (0, 1, 1, 2, 0)
|
||||
|
||||
|
@ -76,9 +78,49 @@ Big5SMModel = {'classTable': BIG5_cls,
|
|||
'charLenTable': Big5CharLenTable,
|
||||
'name': 'Big5'}
|
||||
|
||||
# CP949
|
||||
|
||||
CP949_cls = (
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,0,0, # 00 - 0f
|
||||
1,1,1,1,1,1,1,1, 1,1,1,0,1,1,1,1, # 10 - 1f
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 20 - 2f
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 30 - 3f
|
||||
1,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, # 40 - 4f
|
||||
4,4,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 50 - 5f
|
||||
1,5,5,5,5,5,5,5, 5,5,5,5,5,5,5,5, # 60 - 6f
|
||||
5,5,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 70 - 7f
|
||||
0,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 80 - 8f
|
||||
6,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 90 - 9f
|
||||
6,7,7,7,7,7,7,7, 7,7,7,7,7,8,8,8, # a0 - af
|
||||
7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7, # b0 - bf
|
||||
7,7,7,7,7,7,9,2, 2,3,2,2,2,2,2,2, # c0 - cf
|
||||
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # d0 - df
|
||||
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # e0 - ef
|
||||
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,0, # f0 - ff
|
||||
)
|
||||
|
||||
CP949_st = (
|
||||
#cls= 0 1 2 3 4 5 6 7 8 9 # previous state =
|
||||
eError,eStart, 3,eError,eStart,eStart, 4, 5,eError, 6, # eStart
|
||||
eError,eError,eError,eError,eError,eError,eError,eError,eError,eError, # eError
|
||||
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe, # eItsMe
|
||||
eError,eError,eStart,eStart,eError,eError,eError,eStart,eStart,eStart, # 3
|
||||
eError,eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart, # 4
|
||||
eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart, # 5
|
||||
eError,eStart,eStart,eStart,eStart,eError,eError,eStart,eStart,eStart, # 6
|
||||
)
|
||||
|
||||
CP949CharLenTable = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2)
|
||||
|
||||
CP949SMModel = {'classTable': CP949_cls,
|
||||
'classFactor': 10,
|
||||
'stateTable': CP949_st,
|
||||
'charLenTable': CP949CharLenTable,
|
||||
'name': 'CP949'}
|
||||
|
||||
# EUC-JP
|
||||
|
||||
EUCJP_cls = ( \
|
||||
EUCJP_cls = (
|
||||
4,4,4,4,4,4,4,4, # 00 - 07
|
||||
4,4,4,4,4,4,5,5, # 08 - 0f
|
||||
4,4,4,4,4,4,4,4, # 10 - 17
|
||||
|
@ -110,14 +152,16 @@ EUCJP_cls = ( \
|
|||
0,0,0,0,0,0,0,0, # e0 - e7
|
||||
0,0,0,0,0,0,0,0, # e8 - ef
|
||||
0,0,0,0,0,0,0,0, # f0 - f7
|
||||
0,0,0,0,0,0,0,5) # f8 - ff
|
||||
0,0,0,0,0,0,0,5 # f8 - ff
|
||||
)
|
||||
|
||||
EUCJP_st = ( \
|
||||
EUCJP_st = (
|
||||
3, 4, 3, 5,eStart,eError,eError,eError,#00-07
|
||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
|
||||
eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError,#10-17
|
||||
eError,eError,eStart,eError,eError,eError, 3,eError,#18-1f
|
||||
3,eError,eError,eError,eStart,eStart,eStart,eStart)#20-27
|
||||
3,eError,eError,eError,eStart,eStart,eStart,eStart#20-27
|
||||
)
|
||||
|
||||
EUCJPCharLenTable = (2, 2, 2, 3, 1, 0)
|
||||
|
||||
|
@ -129,7 +173,7 @@ EUCJPSMModel = {'classTable': EUCJP_cls,
|
|||
|
||||
# EUC-KR
|
||||
|
||||
EUCKR_cls = ( \
|
||||
EUCKR_cls = (
|
||||
1,1,1,1,1,1,1,1, # 00 - 07
|
||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||
1,1,1,1,1,1,1,1, # 10 - 17
|
||||
|
@ -161,11 +205,13 @@ EUCKR_cls = ( \
|
|||
2,2,2,2,2,2,2,2, # e0 - e7
|
||||
2,2,2,2,2,2,2,2, # e8 - ef
|
||||
2,2,2,2,2,2,2,2, # f0 - f7
|
||||
2,2,2,2,2,2,2,0) # f8 - ff
|
||||
2,2,2,2,2,2,2,0 # f8 - ff
|
||||
)
|
||||
|
||||
EUCKR_st = (
|
||||
eError,eStart, 3,eError,eError,eError,eError,eError,#00-07
|
||||
eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart)#08-0f
|
||||
eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart #08-0f
|
||||
)
|
||||
|
||||
EUCKRCharLenTable = (0, 1, 2, 0)
|
||||
|
||||
|
@ -177,7 +223,7 @@ EUCKRSMModel = {'classTable': EUCKR_cls,
|
|||
|
||||
# EUC-TW
|
||||
|
||||
EUCTW_cls = ( \
|
||||
EUCTW_cls = (
|
||||
2,2,2,2,2,2,2,2, # 00 - 07
|
||||
2,2,2,2,2,2,0,0, # 08 - 0f
|
||||
2,2,2,2,2,2,2,2, # 10 - 17
|
||||
|
@ -209,15 +255,17 @@ EUCTW_cls = ( \
|
|||
3,3,3,3,3,3,3,3, # e0 - e7
|
||||
3,3,3,3,3,3,3,3, # e8 - ef
|
||||
3,3,3,3,3,3,3,3, # f0 - f7
|
||||
3,3,3,3,3,3,3,0) # f8 - ff
|
||||
3,3,3,3,3,3,3,0 # f8 - ff
|
||||
)
|
||||
|
||||
EUCTW_st = ( \
|
||||
EUCTW_st = (
|
||||
eError,eError,eStart, 3, 3, 3, 4,eError,#00-07
|
||||
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f
|
||||
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError,#10-17
|
||||
eStart,eStart,eStart,eError,eError,eError,eError,eError,#18-1f
|
||||
5,eError,eError,eError,eStart,eError,eStart,eStart,#20-27
|
||||
eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart)#28-2f
|
||||
eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart #28-2f
|
||||
)
|
||||
|
||||
EUCTWCharLenTable = (0, 0, 1, 2, 2, 2, 3)
|
||||
|
||||
|
@ -229,7 +277,7 @@ EUCTWSMModel = {'classTable': EUCTW_cls,
|
|||
|
||||
# GB2312
|
||||
|
||||
GB2312_cls = ( \
|
||||
GB2312_cls = (
|
||||
1,1,1,1,1,1,1,1, # 00 - 07
|
||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||
1,1,1,1,1,1,1,1, # 10 - 17
|
||||
|
@ -261,15 +309,17 @@ GB2312_cls = ( \
|
|||
6,6,6,6,6,6,6,6, # e0 - e7
|
||||
6,6,6,6,6,6,6,6, # e8 - ef
|
||||
6,6,6,6,6,6,6,6, # f0 - f7
|
||||
6,6,6,6,6,6,6,0) # f8 - ff
|
||||
6,6,6,6,6,6,6,0 # f8 - ff
|
||||
)
|
||||
|
||||
GB2312_st = ( \
|
||||
GB2312_st = (
|
||||
eError,eStart,eStart,eStart,eStart,eStart, 3,eError,#00-07
|
||||
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f
|
||||
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,#10-17
|
||||
4,eError,eStart,eStart,eError,eError,eError,eError,#18-1f
|
||||
eError,eError, 5,eError,eError,eError,eItsMe,eError,#20-27
|
||||
eError,eError,eStart,eStart,eStart,eStart,eStart,eStart)#28-2f
|
||||
eError,eError,eStart,eStart,eStart,eStart,eStart,eStart #28-2f
|
||||
)
|
||||
|
||||
# To be accurate, the length of class 6 can be either 2 or 4.
|
||||
# But it is not necessary to discriminate between the two since
|
||||
|
@ -286,7 +336,7 @@ GB2312SMModel = {'classTable': GB2312_cls,
|
|||
|
||||
# Shift_JIS
|
||||
|
||||
SJIS_cls = ( \
|
||||
SJIS_cls = (
|
||||
1,1,1,1,1,1,1,1, # 00 - 07
|
||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||
1,1,1,1,1,1,1,1, # 10 - 17
|
||||
|
@ -303,7 +353,7 @@ SJIS_cls = ( \
|
|||
2,2,2,2,2,2,2,2, # 68 - 6f
|
||||
2,2,2,2,2,2,2,2, # 70 - 77
|
||||
2,2,2,2,2,2,2,1, # 78 - 7f
|
||||
3,3,3,3,3,3,3,3, # 80 - 87
|
||||
3,3,3,3,3,2,2,3, # 80 - 87
|
||||
3,3,3,3,3,3,3,3, # 88 - 8f
|
||||
3,3,3,3,3,3,3,3, # 90 - 97
|
||||
3,3,3,3,3,3,3,3, # 98 - 9f
|
||||
|
@ -319,13 +369,15 @@ SJIS_cls = ( \
|
|||
2,2,2,2,2,2,2,2, # d8 - df
|
||||
3,3,3,3,3,3,3,3, # e0 - e7
|
||||
3,3,3,3,3,4,4,4, # e8 - ef
|
||||
4,4,4,4,4,4,4,4, # f0 - f7
|
||||
4,4,4,4,4,0,0,0) # f8 - ff
|
||||
3,3,3,3,3,3,3,3, # f0 - f7
|
||||
3,3,3,3,3,0,0,0) # f8 - ff
|
||||
|
||||
SJIS_st = ( \
|
||||
|
||||
SJIS_st = (
|
||||
eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07
|
||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
|
||||
eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart)#10-17
|
||||
eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart #10-17
|
||||
)
|
||||
|
||||
SJISCharLenTable = (0, 1, 1, 2, 0, 0)
|
||||
|
||||
|
@ -337,7 +389,7 @@ SJISSMModel = {'classTable': SJIS_cls,
|
|||
|
||||
# UCS2-BE
|
||||
|
||||
UCS2BE_cls = ( \
|
||||
UCS2BE_cls = (
|
||||
0,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,1,0,0,2,0,0, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
|
@ -369,16 +421,18 @@ UCS2BE_cls = ( \
|
|||
0,0,0,0,0,0,0,0, # e0 - e7
|
||||
0,0,0,0,0,0,0,0, # e8 - ef
|
||||
0,0,0,0,0,0,0,0, # f0 - f7
|
||||
0,0,0,0,0,0,4,5) # f8 - ff
|
||||
0,0,0,0,0,0,4,5 # f8 - ff
|
||||
)
|
||||
|
||||
UCS2BE_st = ( \
|
||||
UCS2BE_st = (
|
||||
5, 7, 7,eError, 4, 3,eError,eError,#00-07
|
||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
|
||||
eItsMe,eItsMe, 6, 6, 6, 6,eError,eError,#10-17
|
||||
6, 6, 6, 6, 6,eItsMe, 6, 6,#18-1f
|
||||
6, 6, 6, 6, 5, 7, 7,eError,#20-27
|
||||
5, 8, 6, 6,eError, 6, 6, 6,#28-2f
|
||||
6, 6, 6, 6,eError,eError,eStart,eStart)#30-37
|
||||
6, 6, 6, 6,eError,eError,eStart,eStart #30-37
|
||||
)
|
||||
|
||||
UCS2BECharLenTable = (2, 2, 2, 0, 2, 2)
|
||||
|
||||
|
@ -390,7 +444,7 @@ UCS2BESMModel = {'classTable': UCS2BE_cls,
|
|||
|
||||
# UCS2-LE
|
||||
|
||||
UCS2LE_cls = ( \
|
||||
UCS2LE_cls = (
|
||||
0,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,1,0,0,2,0,0, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
|
@ -422,16 +476,18 @@ UCS2LE_cls = ( \
|
|||
0,0,0,0,0,0,0,0, # e0 - e7
|
||||
0,0,0,0,0,0,0,0, # e8 - ef
|
||||
0,0,0,0,0,0,0,0, # f0 - f7
|
||||
0,0,0,0,0,0,4,5) # f8 - ff
|
||||
0,0,0,0,0,0,4,5 # f8 - ff
|
||||
)
|
||||
|
||||
UCS2LE_st = ( \
|
||||
UCS2LE_st = (
|
||||
6, 6, 7, 6, 4, 3,eError,eError,#00-07
|
||||
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
|
||||
eItsMe,eItsMe, 5, 5, 5,eError,eItsMe,eError,#10-17
|
||||
5, 5, 5,eError, 5,eError, 6, 6,#18-1f
|
||||
7, 6, 8, 8, 5, 5, 5,eError,#20-27
|
||||
5, 5, 5,eError,eError,eError, 5, 5,#28-2f
|
||||
5, 5, 5,eError, 5,eError,eStart,eStart)#30-37
|
||||
5, 5, 5,eError, 5,eError,eStart,eStart #30-37
|
||||
)
|
||||
|
||||
UCS2LECharLenTable = (2, 2, 2, 2, 2, 2)
|
||||
|
||||
|
@ -443,7 +499,7 @@ UCS2LESMModel = {'classTable': UCS2LE_cls,
|
|||
|
||||
# UTF-8
|
||||
|
||||
UTF8_cls = ( \
|
||||
UTF8_cls = (
|
||||
1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as a legal value
|
||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||
1,1,1,1,1,1,1,1, # 10 - 17
|
||||
|
@ -475,9 +531,10 @@ UTF8_cls = ( \
|
|||
7,8,8,8,8,8,8,8, # e0 - e7
|
||||
8,8,8,8,8,9,8,8, # e8 - ef
|
||||
10,11,11,11,11,11,11,11, # f0 - f7
|
||||
12,13,13,13,14,15,0,0) # f8 - ff
|
||||
12,13,13,13,14,15,0,0 # f8 - ff
|
||||
)
|
||||
|
||||
UTF8_st = ( \
|
||||
UTF8_st = (
|
||||
eError,eStart,eError,eError,eError,eError, 12, 10,#00-07
|
||||
9, 11, 8, 7, 6, 5, 4, 3,#08-0f
|
||||
eError,eError,eError,eError,eError,eError,eError,eError,#10-17
|
||||
|
@ -503,7 +560,8 @@ UTF8_st = ( \
|
|||
eError,eError, 12, 12, 12,eError,eError,eError,#b0-b7
|
||||
eError,eError,eError,eError,eError,eError,eError,eError,#b8-bf
|
||||
eError,eError,eStart,eStart,eStart,eStart,eError,eError,#c0-c7
|
||||
eError,eError,eError,eError,eError,eError,eError,eError)#c8-cf
|
||||
eError,eError,eError,eError,eError,eError,eError,eError #c8-cf
|
||||
)
|
||||
|
||||
UTF8CharLenTable = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6)
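Every *_cls / *_st / *CharLenTable triple above drives the same tiny state machine: a byte is first mapped to a character class, then the flattened state table is indexed by current_state * classFactor + class (classFactor being the number of classes, e.g. 10 in CP949SMModel above). An illustrative walker, not the vendored codingstatemachine.py, with the chardet constants assumed to be eStart=0, eError=1, eItsMe=2:

eStart, eError, eItsMe = 0, 1, 2     # assumed to match chardet.constants

def run_state_machine(sm_model, data):
    # walk a sequence of byte values (ints) through a model such as SJISSMModel
    state = eStart
    for byte in data:
        byte_class = sm_model['classTable'][byte]
        state = sm_model['stateTable'][state * sm_model['classFactor'] + byte_class]
        if state in (eError, eItsMe):
            break                     # definite reject or definite match
    return state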
|
||||
|
||||
|
|
44
thirdparty/chardet/sbcharsetprober.py
vendored
|
@ -26,8 +26,10 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants, sys
|
||||
from charsetprober import CharSetProber
|
||||
import sys
|
||||
from . import constants
|
||||
from .charsetprober import CharSetProber
|
||||
from .compat import wrap_ord
|
||||
|
||||
SAMPLE_SIZE = 64
|
||||
SB_ENOUGH_REL_THRESHOLD = 1024
|
||||
|
@ -38,21 +40,26 @@ NUMBER_OF_SEQ_CAT = 4
|
|||
POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1
|
||||
#NEGATIVE_CAT = 0
|
||||
|
||||
|
||||
class SingleByteCharSetProber(CharSetProber):
|
||||
def __init__(self, model, reversed=constants.False, nameProber=None):
|
||||
def __init__(self, model, reversed=False, nameProber=None):
|
||||
CharSetProber.__init__(self)
|
||||
self._mModel = model
|
||||
self._mReversed = reversed # TRUE if we need to reverse every pair in the model lookup
|
||||
self._mNameProber = nameProber # Optional auxiliary prober for name decision
|
||||
# TRUE if we need to reverse every pair in the model lookup
|
||||
self._mReversed = reversed
|
||||
# Optional auxiliary prober for name decision
|
||||
self._mNameProber = nameProber
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
CharSetProber.reset(self)
|
||||
self._mLastOrder = 255 # char order of last character
|
||||
# char order of last character
|
||||
self._mLastOrder = 255
|
||||
self._mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
|
||||
self._mTotalSeqs = 0
|
||||
self._mTotalChar = 0
|
||||
self._mFreqChar = 0 # characters that fall in our sampling range
|
||||
# characters that fall in our sampling range
|
||||
self._mFreqChar = 0
|
||||
|
||||
def get_charset_name(self):
|
||||
if self._mNameProber:
|
||||
|
@ -67,7 +74,7 @@ class SingleByteCharSetProber(CharSetProber):
|
|||
if not aLen:
|
||||
return self.get_state()
|
||||
for c in aBuf:
|
||||
order = self._mModel['charToOrderMap'][ord(c)]
|
||||
order = self._mModel['charToOrderMap'][wrap_ord(c)]
|
||||
if order < SYMBOL_CAT_ORDER:
|
||||
self._mTotalChar += 1
|
||||
if order < SAMPLE_SIZE:
|
||||
|
@ -75,9 +82,12 @@ class SingleByteCharSetProber(CharSetProber):
|
|||
if self._mLastOrder < SAMPLE_SIZE:
|
||||
self._mTotalSeqs += 1
|
||||
if not self._mReversed:
|
||||
self._mSeqCounters[self._mModel['precedenceMatrix'][(self._mLastOrder * SAMPLE_SIZE) + order]] += 1
|
||||
i = (self._mLastOrder * SAMPLE_SIZE) + order
|
||||
model = self._mModel['precedenceMatrix'][i]
|
||||
else: # reverse the order of the letters in the lookup
|
||||
self._mSeqCounters[self._mModel['precedenceMatrix'][(order * SAMPLE_SIZE) + self._mLastOrder]] += 1
|
||||
i = (order * SAMPLE_SIZE) + self._mLastOrder
|
||||
model = self._mModel['precedenceMatrix'][i]
|
||||
self._mSeqCounters[model] += 1
|
||||
self._mLastOrder = order
|
||||
|
||||
if self.get_state() == constants.eDetecting:
|
||||
|
@ -85,11 +95,16 @@ class SingleByteCharSetProber(CharSetProber):
|
|||
cf = self.get_confidence()
|
||||
if cf > POSITIVE_SHORTCUT_THRESHOLD:
|
||||
if constants._debug:
|
||||
sys.stderr.write('%s confidence = %s, we have a winner\n' % (self._mModel['charsetName'], cf))
|
||||
sys.stderr.write('%s confidence = %s, we have a'
|
||||
'winner\n' %
|
||||
(self._mModel['charsetName'], cf))
|
||||
self._mState = constants.eFoundIt
|
||||
elif cf < NEGATIVE_SHORTCUT_THRESHOLD:
|
||||
if constants._debug:
|
||||
sys.stderr.write('%s confidence = %s, below negative shortcut threshhold %s\n' % (self._mModel['charsetName'], cf, NEGATIVE_SHORTCUT_THRESHOLD))
|
||||
sys.stderr.write('%s confidence = %s, below negative'
|
||||
'shortcut threshhold %s\n' %
|
||||
(self._mModel['charsetName'], cf,
|
||||
NEGATIVE_SHORTCUT_THRESHOLD))
|
||||
self._mState = constants.eNotMe
|
||||
|
||||
return self.get_state()
|
||||
|
@ -97,9 +112,8 @@ class SingleByteCharSetProber(CharSetProber):
|
|||
def get_confidence(self):
|
||||
r = 0.01
|
||||
if self._mTotalSeqs > 0:
|
||||
# print self._mSeqCounters[POSITIVE_CAT], self._mTotalSeqs, self._mModel['mTypicalPositiveRatio']
|
||||
r = (1.0 * self._mSeqCounters[POSITIVE_CAT]) / self._mTotalSeqs / self._mModel['mTypicalPositiveRatio']
|
||||
# print r, self._mFreqChar, self._mTotalChar
|
||||
r = ((1.0 * self._mSeqCounters[POSITIVE_CAT]) / self._mTotalSeqs
|
||||
/ self._mModel['mTypicalPositiveRatio'])
|
||||
r = r * self._mFreqChar / self._mTotalChar
|
||||
if r >= 1.0:
|
||||
r = 0.99
|
||||
|
|
33
thirdparty/chardet/sbcsgroupprober.py
vendored
|
@ -26,21 +26,23 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants, sys
|
||||
from charsetgroupprober import CharSetGroupProber
|
||||
from sbcharsetprober import SingleByteCharSetProber
|
||||
from langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model
|
||||
from langgreekmodel import Latin7GreekModel, Win1253GreekModel
|
||||
from langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
|
||||
from langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
|
||||
from langthaimodel import TIS620ThaiModel
|
||||
from langhebrewmodel import Win1255HebrewModel
|
||||
from hebrewprober import HebrewProber
|
||||
from .charsetgroupprober import CharSetGroupProber
|
||||
from .sbcharsetprober import SingleByteCharSetProber
|
||||
from .langcyrillicmodel import (Win1251CyrillicModel, Koi8rModel,
|
||||
Latin5CyrillicModel, MacCyrillicModel,
|
||||
Ibm866Model, Ibm855Model)
|
||||
from .langgreekmodel import Latin7GreekModel, Win1253GreekModel
|
||||
from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
|
||||
from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
|
||||
from .langthaimodel import TIS620ThaiModel
|
||||
from .langhebrewmodel import Win1255HebrewModel
|
||||
from .hebrewprober import HebrewProber
|
||||
|
||||
|
||||
class SBCSGroupProber(CharSetGroupProber):
|
||||
def __init__(self):
|
||||
CharSetGroupProber.__init__(self)
|
||||
self._mProbers = [ \
|
||||
self._mProbers = [
|
||||
SingleByteCharSetProber(Win1251CyrillicModel),
|
||||
SingleByteCharSetProber(Koi8rModel),
|
||||
SingleByteCharSetProber(Latin5CyrillicModel),
|
||||
|
@ -56,9 +58,12 @@ class SBCSGroupProber(CharSetGroupProber):
|
|||
SingleByteCharSetProber(TIS620ThaiModel),
|
||||
]
|
||||
hebrewProber = HebrewProber()
|
||||
logicalHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, constants.False, hebrewProber)
|
||||
visualHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, constants.True, hebrewProber)
|
||||
logicalHebrewProber = SingleByteCharSetProber(Win1255HebrewModel,
|
||||
False, hebrewProber)
|
||||
visualHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, True,
|
||||
hebrewProber)
|
||||
hebrewProber.set_model_probers(logicalHebrewProber, visualHebrewProber)
|
||||
self._mProbers.extend([hebrewProber, logicalHebrewProber, visualHebrewProber])
|
||||
self._mProbers.extend([hebrewProber, logicalHebrewProber,
|
||||
visualHebrewProber])
|
||||
|
||||
self.reset()
|
||||
|
|
42
thirdparty/chardet/sjisprober.py
vendored
|
@ -25,13 +25,14 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from mbcharsetprober import MultiByteCharSetProber
|
||||
from codingstatemachine import CodingStateMachine
|
||||
from chardistribution import SJISDistributionAnalysis
|
||||
from jpcntx import SJISContextAnalysis
|
||||
from mbcssm import SJISSMModel
|
||||
import constants, sys
|
||||
from constants import eStart, eError, eItsMe
|
||||
import sys
|
||||
from .mbcharsetprober import MultiByteCharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .chardistribution import SJISDistributionAnalysis
|
||||
from .jpcntx import SJISContextAnalysis
|
||||
from .mbcssm import SJISSMModel
|
||||
from . import constants
|
||||
|
||||
|
||||
class SJISProber(MultiByteCharSetProber):
|
||||
def __init__(self):
|
||||
|
@ -46,35 +47,40 @@ class SJISProber(MultiByteCharSetProber):
|
|||
self._mContextAnalyzer.reset()
|
||||
|
||||
def get_charset_name(self):
|
||||
return "SHIFT_JIS"
|
||||
return self._mContextAnalyzer.get_charset_name()
|
||||
|
||||
def feed(self, aBuf):
|
||||
aLen = len(aBuf)
|
||||
for i in xrange(0, aLen):
|
||||
for i in range(0, aLen):
|
||||
codingState = self._mCodingSM.next_state(aBuf[i])
|
||||
if codingState == eError:
|
||||
if codingState == constants.eError:
|
||||
if constants._debug:
|
||||
sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n')
|
||||
sys.stderr.write(self.get_charset_name()
|
||||
+ ' prober hit error at byte ' + str(i)
|
||||
+ '\n')
|
||||
self._mState = constants.eNotMe
|
||||
break
|
||||
elif codingState == eItsMe:
|
||||
elif codingState == constants.eItsMe:
|
||||
self._mState = constants.eFoundIt
|
||||
break
|
||||
elif codingState == eStart:
|
||||
elif codingState == constants.eStart:
|
||||
charLen = self._mCodingSM.get_current_charlen()
|
||||
if i == 0:
|
||||
self._mLastChar[1] = aBuf[0]
|
||||
self._mContextAnalyzer.feed(self._mLastChar[2 - charLen :], charLen)
|
||||
self._mContextAnalyzer.feed(self._mLastChar[2 - charLen:],
|
||||
charLen)
|
||||
self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
|
||||
else:
|
||||
self._mContextAnalyzer.feed(aBuf[i + 1 - charLen : i + 3 - charLen], charLen)
|
||||
self._mDistributionAnalyzer.feed(aBuf[i - 1 : i + 1], charLen)
|
||||
self._mContextAnalyzer.feed(aBuf[i + 1 - charLen:i + 3
|
||||
- charLen], charLen)
|
||||
self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
|
||||
charLen)
|
||||
|
||||
self._mLastChar[0] = aBuf[aLen - 1]
|
||||
|
||||
if self.get_state() == constants.eDetecting:
|
||||
if self._mContextAnalyzer.got_enough_data() and \
|
||||
(self.get_confidence() > constants.SHORTCUT_THRESHOLD):
|
||||
if (self._mContextAnalyzer.got_enough_data() and
|
||||
(self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
|
||||
self._mState = constants.eFoundIt
|
||||
|
||||
return self.get_state()
|
||||
|
|
20
thirdparty/chardet/test.py
vendored
|
@ -1,20 +0,0 @@
|
|||
import sys, glob
|
||||
sys.path.insert(0, '..')
|
||||
from chardet.universaldetector import UniversalDetector
|
||||
|
||||
count = 0
|
||||
u = UniversalDetector()
|
||||
for f in glob.glob(sys.argv[1]):
|
||||
print f.ljust(60),
|
||||
u.reset()
|
||||
for line in file(f, 'rb'):
|
||||
u.feed(line)
|
||||
if u.done: break
|
||||
u.close()
|
||||
result = u.result
|
||||
if result['encoding']:
|
||||
print result['encoding'], 'with confidence', result['confidence']
|
||||
else:
|
||||
print '******** no result'
|
||||
count += 1
|
||||
print count, 'tests'
|
88
thirdparty/chardet/universaldetector.py
vendored
|
@ -26,11 +26,13 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants, sys
|
||||
from latin1prober import Latin1Prober # windows-1252
|
||||
from mbcsgroupprober import MBCSGroupProber # multi-byte character sets
|
||||
from sbcsgroupprober import SBCSGroupProber # single-byte character sets
|
||||
from escprober import EscCharSetProber # ISO-2122, etc.
|
||||
from . import constants
|
||||
import sys
|
||||
import codecs
|
||||
from .latin1prober import Latin1Prober # windows-1252
|
||||
from .mbcsgroupprober import MBCSGroupProber # multi-byte character sets
|
||||
from .sbcsgroupprober import SBCSGroupProber # single-byte character sets
|
||||
from .escprober import EscCharSetProber # ISO-2122, etc.
|
||||
import re
|
||||
|
||||
MINIMUM_THRESHOLD = 0.20
|
||||
|
@ -38,68 +40,78 @@ ePureAscii = 0
|
|||
eEscAscii = 1
|
||||
eHighbyte = 2
|
||||
|
||||
|
||||
class UniversalDetector:
|
||||
def __init__(self):
|
||||
self._highBitDetector = re.compile(r'[\x80-\xFF]')
|
||||
self._escDetector = re.compile(r'(\033|~{)')
|
||||
self._highBitDetector = re.compile(b'[\x80-\xFF]')
|
||||
self._escDetector = re.compile(b'(\033|~{)')
|
||||
self._mEscCharSetProber = None
|
||||
self._mCharSetProbers = []
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
self.result = {'encoding': None, 'confidence': 0.0}
|
||||
self.done = constants.False
|
||||
self._mStart = constants.True
|
||||
self._mGotData = constants.False
|
||||
self.done = False
|
||||
self._mStart = True
|
||||
self._mGotData = False
|
||||
self._mInputState = ePureAscii
|
||||
self._mLastChar = ''
|
||||
self._mLastChar = b''
|
||||
if self._mEscCharSetProber:
|
||||
self._mEscCharSetProber.reset()
|
||||
for prober in self._mCharSetProbers:
|
||||
prober.reset()
|
||||
|
||||
def feed(self, aBuf):
|
||||
if self.done: return
|
||||
if self.done:
|
||||
return
|
||||
|
||||
aLen = len(aBuf)
|
||||
if not aLen: return
|
||||
if not aLen:
|
||||
return
|
||||
|
||||
if not self._mGotData:
|
||||
# If the data starts with BOM, we know it is UTF
|
||||
if aBuf[:3] == '\xEF\xBB\xBF':
|
||||
if aBuf[:3] == codecs.BOM_UTF8:
|
||||
# EF BB BF UTF-8 with BOM
|
||||
self.result = {'encoding': "UTF-8", 'confidence': 1.0}
|
||||
elif aBuf[:4] == '\xFF\xFE\x00\x00':
|
||||
self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0}
|
||||
elif aBuf[:4] == codecs.BOM_UTF32_LE:
|
||||
# FF FE 00 00 UTF-32, little-endian BOM
|
||||
self.result = {'encoding': "UTF-32LE", 'confidence': 1.0}
|
||||
elif aBuf[:4] == '\x00\x00\xFE\xFF':
|
||||
elif aBuf[:4] == codecs.BOM_UTF32_BE:
|
||||
# 00 00 FE FF UTF-32, big-endian BOM
|
||||
self.result = {'encoding': "UTF-32BE", 'confidence': 1.0}
|
||||
elif aBuf[:4] == '\xFE\xFF\x00\x00':
|
||||
elif aBuf[:4] == b'\xFE\xFF\x00\x00':
|
||||
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
|
||||
self.result = {'encoding': "X-ISO-10646-UCS-4-3412", 'confidence': 1.0}
|
||||
elif aBuf[:4] == '\x00\x00\xFF\xFE':
|
||||
self.result = {
|
||||
'encoding': "X-ISO-10646-UCS-4-3412",
|
||||
'confidence': 1.0
|
||||
}
|
||||
elif aBuf[:4] == b'\x00\x00\xFF\xFE':
|
||||
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
||||
self.result = {'encoding': "X-ISO-10646-UCS-4-2143", 'confidence': 1.0}
|
||||
elif aBuf[:2] == '\xFF\xFE':
|
||||
self.result = {
|
||||
'encoding': "X-ISO-10646-UCS-4-2143",
|
||||
'confidence': 1.0
|
||||
}
|
||||
elif aBuf[:2] == codecs.BOM_LE:
|
||||
# FF FE UTF-16, little endian BOM
|
||||
self.result = {'encoding': "UTF-16LE", 'confidence': 1.0}
|
||||
elif aBuf[:2] == '\xFE\xFF':
|
||||
elif aBuf[:2] == codecs.BOM_BE:
|
||||
# FE FF UTF-16, big endian BOM
|
||||
self.result = {'encoding': "UTF-16BE", 'confidence': 1.0}
|
||||
|
||||
self._mGotData = constants.True
|
||||
self._mGotData = True
|
||||
if self.result['encoding'] and (self.result['confidence'] > 0.0):
|
||||
self.done = constants.True
|
||||
self.done = True
|
||||
return
|
||||
|
||||
if self._mInputState == ePureAscii:
|
||||
if self._highBitDetector.search(aBuf):
|
||||
self._mInputState = eHighbyte
|
||||
elif (self._mInputState == ePureAscii) and self._escDetector.search(self._mLastChar + aBuf):
|
||||
elif ((self._mInputState == ePureAscii) and
|
||||
self._escDetector.search(self._mLastChar + aBuf)):
|
||||
self._mInputState = eEscAscii
|
||||
|
||||
self._mLastChar = aBuf[-1]
|
||||
self._mLastChar = aBuf[-1:]
|
||||
|
||||
if self._mInputState == eEscAscii:
|
||||
if not self._mEscCharSetProber:
|
||||
|
@ -107,24 +119,26 @@ class UniversalDetector:
|
|||
if self._mEscCharSetProber.feed(aBuf) == constants.eFoundIt:
|
||||
self.result = {'encoding': self._mEscCharSetProber.get_charset_name(),
|
||||
'confidence': self._mEscCharSetProber.get_confidence()}
|
||||
self.done = constants.True
|
||||
self.done = True
|
||||
elif self._mInputState == eHighbyte:
|
||||
if not self._mCharSetProbers:
|
||||
self._mCharSetProbers = [MBCSGroupProber(), SBCSGroupProber(), Latin1Prober()]
|
||||
self._mCharSetProbers = [MBCSGroupProber(), SBCSGroupProber(),
|
||||
Latin1Prober()]
|
||||
for prober in self._mCharSetProbers:
|
||||
if prober.feed(aBuf) == constants.eFoundIt:
|
||||
self.result = {'encoding': prober.get_charset_name(),
|
||||
'confidence': prober.get_confidence()}
|
||||
self.done = constants.True
|
||||
self.done = True
|
||||
break
|
||||
|
||||
def close(self):
|
||||
if self.done: return
|
||||
if self.done:
|
||||
return
|
||||
if not self._mGotData:
|
||||
if constants._debug:
|
||||
sys.stderr.write('no data received!\n')
|
||||
return
|
||||
self.done = constants.True
|
||||
self.done = True
|
||||
|
||||
if self._mInputState == ePureAscii:
|
||||
self.result = {'encoding': 'ascii', 'confidence': 1.0}
|
||||
|
@ -135,7 +149,8 @@ class UniversalDetector:
|
|||
maxProberConfidence = 0.0
|
||||
maxProber = None
|
||||
for prober in self._mCharSetProbers:
|
||||
if not prober: continue
|
||||
if not prober:
|
||||
continue
|
||||
proberConfidence = prober.get_confidence()
|
||||
if proberConfidence > maxProberConfidence:
|
||||
maxProberConfidence = proberConfidence
|
||||
|
@ -148,7 +163,8 @@ class UniversalDetector:
|
|||
if constants._debug:
|
||||
sys.stderr.write('no probers hit minimum threshhold\n')
|
||||
for prober in self._mCharSetProbers[0].mProbers:
|
||||
if not prober: continue
|
||||
sys.stderr.write('%s confidence = %s\n' % \
|
||||
(prober.get_charset_name(), \
|
||||
if not prober:
|
||||
continue
|
||||
sys.stderr.write('%s confidence = %s\n' %
|
||||
(prober.get_charset_name(),
|
||||
prober.get_confidence()))
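The universaldetector.py hunk above replaces hard-coded BOM byte strings with constants from the codecs module and reports a UTF-8 BOM as UTF-8-SIG. A self-contained sketch of the same lookup, keeping the order that tests the 4-byte UTF-32 BOMs before the 2-byte UTF-16 ones (codecs.BOM_UTF32_LE starts with codecs.BOM_LE):

import codecs

def bom_encoding(data):
    # return the encoding implied by a leading BOM, or None (illustration only)
    if data[:3] == codecs.BOM_UTF8:
        return 'UTF-8-SIG'
    if data[:4] == codecs.BOM_UTF32_LE:
        return 'UTF-32LE'
    if data[:4] == codecs.BOM_UTF32_BE:
        return 'UTF-32BE'
    if data[:2] == codecs.BOM_LE:
        return 'UTF-16LE'
    if data[:2] == codecs.BOM_BE:
        return 'UTF-16BE'
    return None

assert bom_encoding(b'\xff\xfe\x00\x00rest') == 'UTF-32LE'
assert bom_encoding(b'\xff\xfeab') == 'UTF-16LE'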
|
||||
|
|
18
thirdparty/chardet/utf8prober.py
vendored
|
@ -25,14 +25,14 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import constants, sys
|
||||
from constants import eStart, eError, eItsMe
|
||||
from charsetprober import CharSetProber
|
||||
from codingstatemachine import CodingStateMachine
|
||||
from mbcssm import UTF8SMModel
|
||||
from . import constants
|
||||
from .charsetprober import CharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .mbcssm import UTF8SMModel
|
||||
|
||||
ONE_CHAR_PROB = 0.5
|
||||
|
||||
|
||||
class UTF8Prober(CharSetProber):
|
||||
def __init__(self):
|
||||
CharSetProber.__init__(self)
|
||||
|
@ -50,13 +50,13 @@ class UTF8Prober(CharSetProber):
|
|||
def feed(self, aBuf):
|
||||
for c in aBuf:
|
||||
codingState = self._mCodingSM.next_state(c)
|
||||
if codingState == eError:
|
||||
if codingState == constants.eError:
|
||||
self._mState = constants.eNotMe
|
||||
break
|
||||
elif codingState == eItsMe:
|
||||
elif codingState == constants.eItsMe:
|
||||
self._mState = constants.eFoundIt
|
||||
break
|
||||
elif codingState == eStart:
|
||||
elif codingState == constants.eStart:
|
||||
if self._mCodingSM.get_current_charlen() >= 2:
|
||||
self._mNumOfMBChar += 1
|
||||
|
||||
|
@ -69,7 +69,7 @@ class UTF8Prober(CharSetProber):
|
|||
def get_confidence(self):
|
||||
unlike = 0.99
|
||||
if self._mNumOfMBChar < 6:
|
||||
for i in xrange(0, self._mNumOfMBChar):
|
||||
for i in range(0, self._mNumOfMBChar):
|
||||
unlike = unlike * ONE_CHAR_PROB
|
||||
return 1.0 - unlike
|
||||
else:
|
||||
|
|
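For reference, the UTF-8 confidence hunk above (this page truncates it at the else: branch) amounts to confidence = 1 - 0.99 * 0.5**n while fewer than six multi-byte characters have been seen. A stand-alone version, with the truncated else branch assumed to return the remaining 0.99 directly:

ONE_CHAR_PROB = 0.5

def utf8_confidence(num_mb_chars):
    unlike = 0.99
    if num_mb_chars < 6:
        for _ in range(num_mb_chars):
            unlike *= ONE_CHAR_PROB     # each multi-byte char halves the doubt
        return 1.0 - unlike
    return unlike                       # assumption: the else branch returns 0.99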