Adding new version of chardet

Miroslav Stampar 2015-10-09 13:35:48 +02:00
parent d424d4cdc7
commit 439d003753
39 changed files with 1499 additions and 1148 deletions

View File

@ -15,10 +15,16 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
__version__ = "2.0.1" __version__ = "2.3.0"
from sys import version_info
def detect(aBuf): def detect(aBuf):
import universaldetector if ((version_info < (3, 0) and isinstance(aBuf, unicode)) or
(version_info >= (3, 0) and not isinstance(aBuf, bytes))):
raise ValueError('Expected a bytes object, not a unicode object')
from . import universaldetector
u = universaldetector.UniversalDetector() u = universaldetector.UniversalDetector()
u.reset() u.reset()
u.feed(aBuf) u.feed(aBuf)
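
Illustrative sketch of what the bumped API means for callers (assuming the package is importable under its upstream name chardet; in sqlmap it is vendored under thirdparty/): from 2.3.0 on, detect() only accepts byte strings, so text must be encoded first. The sample bytes are an assumed windows-1251 snippet.

    import chardet

    print(chardet.__version__)                         # '2.3.0' after this commit
    result = chardet.detect(b'\xcf\xf0\xe8\xe2\xe5\xf2' * 10)   # assumed windows-1251 sample, repeated
    print(result)                                      # dict with 'encoding' and 'confidence'
    try:
        chardet.detect(u'already decoded text')        # unicode on Python 2, str on Python 3
    except ValueError as exc:
        print(exc)                                     # 'Expected a bytes object, not a unicode object'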

View File

@ -45,7 +45,7 @@ BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75
#Char to FreqOrder table #Char to FreqOrder table
BIG5_TABLE_SIZE = 5376 BIG5_TABLE_SIZE = 5376
Big5CharToFreqOrder = ( \ Big5CharToFreqOrder = (
1,1801,1506, 255,1431, 198, 9, 82, 6,5008, 177, 202,3681,1256,2821, 110, # 16 1,1801,1506, 255,1431, 198, 9, 82, 6,5008, 177, 202,3681,1256,2821, 110, # 16
3814, 33,3274, 261, 76, 44,2114, 16,2946,2187,1176, 659,3971, 26,3451,2653, # 32 3814, 33,3274, 261, 76, 44,2114, 16,2946,2187,1176, 659,3971, 26,3451,2653, # 32
1198,3972,3350,4202, 410,2215, 302, 590, 361,1964, 8, 204, 58,4510,5009,1932, # 48 1198,3972,3350,4202, 410,2215, 302, 590, 361,1964, 8, 204, 58,4510,5009,1932, # 48
@ -921,3 +921,5 @@ Big5CharToFreqOrder = ( \
13936,13937,13938,13939,13940,13941,13942,13943,13944,13945,13946,13947,13948,13949,13950,13951, #13952 13936,13937,13938,13939,13940,13941,13942,13943,13944,13945,13946,13947,13948,13949,13950,13951, #13952
13952,13953,13954,13955,13956,13957,13958,13959,13960,13961,13962,13963,13964,13965,13966,13967, #13968 13952,13953,13954,13955,13956,13957,13958,13959,13960,13961,13962,13963,13964,13965,13966,13967, #13968
13968,13969,13970,13971,13972) #13973 13968,13969,13970,13971,13972) #13973
# flake8: noqa

View File

@ -25,10 +25,11 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from mbcharsetprober import MultiByteCharSetProber from .mbcharsetprober import MultiByteCharSetProber
from codingstatemachine import CodingStateMachine from .codingstatemachine import CodingStateMachine
from chardistribution import Big5DistributionAnalysis from .chardistribution import Big5DistributionAnalysis
from mbcssm import Big5SMModel from .mbcssm import Big5SMModel
class Big5Prober(MultiByteCharSetProber): class Big5Prober(MultiByteCharSetProber):
def __init__(self): def __init__(self):

thirdparty/chardet/chardetect.py (vendored, normal file, 80 lines added)
View File

@ -0,0 +1,80 @@
#!/usr/bin/env python
"""
Script which takes one or more file paths and reports on their detected
encodings
Example::
% chardetect somefile someotherfile
somefile: windows-1252 with confidence 0.5
someotherfile: ascii with confidence 1.0
If no paths are provided, it takes its input from stdin.
"""
from __future__ import absolute_import, print_function, unicode_literals
import argparse
import sys
from io import open
from chardet import __version__
from chardet.universaldetector import UniversalDetector
def description_of(lines, name='stdin'):
"""
Return a string describing the probable encoding of a file or
list of strings.
:param lines: The lines to get the encoding of.
:type lines: Iterable of bytes
:param name: Name of file or collection of lines
:type name: str
"""
u = UniversalDetector()
for line in lines:
u.feed(line)
u.close()
result = u.result
if result['encoding']:
return '{0}: {1} with confidence {2}'.format(name, result['encoding'],
result['confidence'])
else:
return '{0}: no result'.format(name)
def main(argv=None):
'''
Handles command line arguments and gets things started.
:param argv: List of arguments, as if specified on the command-line.
If None, ``sys.argv[1:]`` is used instead.
:type argv: list of str
'''
# Get command line arguments
parser = argparse.ArgumentParser(
description="Takes one or more file paths and reports their detected \
encodings",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
conflict_handler='resolve')
parser.add_argument('input',
help='File whose encoding we would like to determine.',
type=argparse.FileType('rb'), nargs='*',
default=[sys.stdin])
parser.add_argument('--version', action='version',
version='%(prog)s {0}'.format(__version__))
args = parser.parse_args(argv)
for f in args.input:
if f.isatty():
print("You are running chardetect interactively. Press " +
"CTRL-D twice at the start of a blank line to signal the " +
"end of your input. If you want help, run chardetect " +
"--help\n", file=sys.stderr)
print(description_of(f, f.name))
if __name__ == '__main__':
main()
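
A hedged usage sketch of the helper defined above (the input path is hypothetical, and the import assumes the upstream module name chardet.chardetect): description_of() accepts any iterable of byte lines.

    from chardet.chardetect import description_of

    with open('somefile', 'rb') as fp:            # hypothetical input file
        print(description_of(fp, fp.name))        # e.g. 'somefile: ascii with confidence 1.0'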

View File

@ -25,35 +25,51 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants from .euctwfreq import (EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE,
from euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO EUCTW_TYPICAL_DISTRIBUTION_RATIO)
from euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO from .euckrfreq import (EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE,
from gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO EUCKR_TYPICAL_DISTRIBUTION_RATIO)
from big5freq import Big5CharToFreqOrder, BIG5_TABLE_SIZE, BIG5_TYPICAL_DISTRIBUTION_RATIO from .gb2312freq import (GB2312CharToFreqOrder, GB2312_TABLE_SIZE,
from jisfreq import JISCharToFreqOrder, JIS_TABLE_SIZE, JIS_TYPICAL_DISTRIBUTION_RATIO GB2312_TYPICAL_DISTRIBUTION_RATIO)
from .big5freq import (Big5CharToFreqOrder, BIG5_TABLE_SIZE,
BIG5_TYPICAL_DISTRIBUTION_RATIO)
from .jisfreq import (JISCharToFreqOrder, JIS_TABLE_SIZE,
JIS_TYPICAL_DISTRIBUTION_RATIO)
from .compat import wrap_ord
ENOUGH_DATA_THRESHOLD = 1024 ENOUGH_DATA_THRESHOLD = 1024
SURE_YES = 0.99 SURE_YES = 0.99
SURE_NO = 0.01 SURE_NO = 0.01
MINIMUM_DATA_THRESHOLD = 3
class CharDistributionAnalysis: class CharDistributionAnalysis:
def __init__(self): def __init__(self):
self._mCharToFreqOrder = None # Mapping table to get frequency order from char order (get from GetOrder()) # Mapping table to get frequency order from char order (get from
self._mTableSize = None # Size of above table # GetOrder())
self._mTypicalDistributionRatio = None # This is a constant value which varies from language to language, used in calculating confidence. See http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html for further detail. self._mCharToFreqOrder = None
self._mTableSize = None # Size of above table
# This is a constant value which varies from language to language,
# used in calculating confidence. See
# http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
# for further detail.
self._mTypicalDistributionRatio = None
self.reset() self.reset()
def reset(self): def reset(self):
"""reset analyser, clear any state""" """reset analyser, clear any state"""
self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made # If this flag is set to True, detection is done and conclusion has
self._mTotalChars = 0 # Total characters encountered # been made
self._mFreqChars = 0 # The number of characters whose frequency order is less than 512 self._mDone = False
self._mTotalChars = 0 # Total characters encountered
# The number of characters whose frequency order is less than 512
self._mFreqChars = 0
def feed(self, aStr, aCharLen): def feed(self, aBuf, aCharLen):
"""feed a character with known length""" """feed a character with known length"""
if aCharLen == 2: if aCharLen == 2:
# we only care about 2-bytes character in our distribution analysis # we only care about 2-bytes character in our distribution analysis
order = self.get_order(aStr) order = self.get_order(aBuf)
else: else:
order = -1 order = -1
if order >= 0: if order >= 0:
@ -65,12 +81,14 @@ class CharDistributionAnalysis:
def get_confidence(self): def get_confidence(self):
"""return confidence based on existing data""" """return confidence based on existing data"""
# if we didn't receive any character in our consideration range, return negative answer # if we didn't receive any character in our consideration range,
if self._mTotalChars <= 0: # return negative answer
if self._mTotalChars <= 0 or self._mFreqChars <= MINIMUM_DATA_THRESHOLD:
return SURE_NO return SURE_NO
if self._mTotalChars != self._mFreqChars: if self._mTotalChars != self._mFreqChars:
r = self._mFreqChars / ((self._mTotalChars - self._mFreqChars) * self._mTypicalDistributionRatio) r = (self._mFreqChars / ((self._mTotalChars - self._mFreqChars)
* self._mTypicalDistributionRatio))
if r < SURE_YES: if r < SURE_YES:
return r return r
@ -78,16 +96,18 @@ class CharDistributionAnalysis:
return SURE_YES return SURE_YES
def got_enough_data(self): def got_enough_data(self):
# It is not necessary to receive all data to draw conclusion. For charset detection, # It is not necessary to receive all data to draw conclusion.
# certain amount of data is enough # For charset detection, certain amount of data is enough
return self._mTotalChars > ENOUGH_DATA_THRESHOLD return self._mTotalChars > ENOUGH_DATA_THRESHOLD
def get_order(self, aStr): def get_order(self, aBuf):
# We do not handle characters based on the original encoding string, but # We do not handle characters based on the original encoding string,
# convert this encoding string to a number, here called order. # but convert this encoding string to a number, here called order.
# This allows multiple encodings of a language to share one frequency table. # This allows multiple encodings of a language to share one frequency
# table.
return -1 return -1
class EUCTWDistributionAnalysis(CharDistributionAnalysis): class EUCTWDistributionAnalysis(CharDistributionAnalysis):
def __init__(self): def __init__(self):
CharDistributionAnalysis.__init__(self) CharDistributionAnalysis.__init__(self)
@ -95,16 +115,18 @@ class EUCTWDistributionAnalysis(CharDistributionAnalysis):
self._mTableSize = EUCTW_TABLE_SIZE self._mTableSize = EUCTW_TABLE_SIZE
self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aStr): def get_order(self, aBuf):
# for euc-TW encoding, we are interested # for euc-TW encoding, we are interested
# first byte range: 0xc4 -- 0xfe # first byte range: 0xc4 -- 0xfe
# second byte range: 0xa1 -- 0xfe # second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that # no validation needed here. State machine has done that
if aStr[0] >= '\xC4': first_char = wrap_ord(aBuf[0])
return 94 * (ord(aStr[0]) - 0xC4) + ord(aStr[1]) - 0xA1 if first_char >= 0xC4:
return 94 * (first_char - 0xC4) + wrap_ord(aBuf[1]) - 0xA1
else: else:
return -1 return -1
class EUCKRDistributionAnalysis(CharDistributionAnalysis): class EUCKRDistributionAnalysis(CharDistributionAnalysis):
def __init__(self): def __init__(self):
CharDistributionAnalysis.__init__(self) CharDistributionAnalysis.__init__(self)
@ -112,15 +134,17 @@ class EUCKRDistributionAnalysis(CharDistributionAnalysis):
self._mTableSize = EUCKR_TABLE_SIZE self._mTableSize = EUCKR_TABLE_SIZE
self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aStr): def get_order(self, aBuf):
# for euc-KR encoding, we are interested # for euc-KR encoding, we are interested
# first byte range: 0xb0 -- 0xfe # first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe # second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that # no validation needed here. State machine has done that
if aStr[0] >= '\xB0': first_char = wrap_ord(aBuf[0])
return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1 if first_char >= 0xB0:
return 94 * (first_char - 0xB0) + wrap_ord(aBuf[1]) - 0xA1
else: else:
return -1; return -1
class GB2312DistributionAnalysis(CharDistributionAnalysis): class GB2312DistributionAnalysis(CharDistributionAnalysis):
def __init__(self): def __init__(self):
@ -129,15 +153,17 @@ class GB2312DistributionAnalysis(CharDistributionAnalysis):
self._mTableSize = GB2312_TABLE_SIZE self._mTableSize = GB2312_TABLE_SIZE
self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aStr): def get_order(self, aBuf):
# for GB2312 encoding, we are interested # for GB2312 encoding, we are interested
# first byte range: 0xb0 -- 0xfe # first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe # second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that # no validation needed here. State machine has done that
if (aStr[0] >= '\xB0') and (aStr[1] >= '\xA1'): first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1 if (first_char >= 0xB0) and (second_char >= 0xA1):
return 94 * (first_char - 0xB0) + second_char - 0xA1
else: else:
return -1; return -1
class Big5DistributionAnalysis(CharDistributionAnalysis): class Big5DistributionAnalysis(CharDistributionAnalysis):
def __init__(self): def __init__(self):
@ -146,19 +172,21 @@ class Big5DistributionAnalysis(CharDistributionAnalysis):
self._mTableSize = BIG5_TABLE_SIZE self._mTableSize = BIG5_TABLE_SIZE
self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aStr): def get_order(self, aBuf):
# for big5 encoding, we are interested # for big5 encoding, we are interested
# first byte range: 0xa4 -- 0xfe # first byte range: 0xa4 -- 0xfe
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe # second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
# no validation needed here. State machine has done that # no validation needed here. State machine has done that
if aStr[0] >= '\xA4': first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
if aStr[1] >= '\xA1': if first_char >= 0xA4:
return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0xA1 + 63 if second_char >= 0xA1:
return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63
else: else:
return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0x40 return 157 * (first_char - 0xA4) + second_char - 0x40
else: else:
return -1 return -1
class SJISDistributionAnalysis(CharDistributionAnalysis): class SJISDistributionAnalysis(CharDistributionAnalysis):
def __init__(self): def __init__(self):
CharDistributionAnalysis.__init__(self) CharDistributionAnalysis.__init__(self)
@ -166,22 +194,24 @@ class SJISDistributionAnalysis(CharDistributionAnalysis):
self._mTableSize = JIS_TABLE_SIZE self._mTableSize = JIS_TABLE_SIZE
self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aStr): def get_order(self, aBuf):
# for sjis encoding, we are interested # for sjis encoding, we are interested
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe # first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
# second byte range: 0x40 -- 0x7e, 0x81 -- oxfe # second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
# no validation needed here. State machine has done that # no validation needed here. State machine has done that
if (aStr[0] >= '\x81') and (aStr[0] <= '\x9F'): first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
order = 188 * (ord(aStr[0]) - 0x81) if (first_char >= 0x81) and (first_char <= 0x9F):
elif (aStr[0] >= '\xE0') and (aStr[0] <= '\xEF'): order = 188 * (first_char - 0x81)
order = 188 * (ord(aStr[0]) - 0xE0 + 31) elif (first_char >= 0xE0) and (first_char <= 0xEF):
order = 188 * (first_char - 0xE0 + 31)
else: else:
return -1; return -1
order = order + ord(aStr[1]) - 0x40 order = order + second_char - 0x40
if aStr[1] > '\x7F': if second_char > 0x7F:
order =- 1 order = -1
return order return order
class EUCJPDistributionAnalysis(CharDistributionAnalysis): class EUCJPDistributionAnalysis(CharDistributionAnalysis):
def __init__(self): def __init__(self):
CharDistributionAnalysis.__init__(self) CharDistributionAnalysis.__init__(self)
@ -189,12 +219,13 @@ class EUCJPDistributionAnalysis(CharDistributionAnalysis):
self._mTableSize = JIS_TABLE_SIZE self._mTableSize = JIS_TABLE_SIZE
self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aStr): def get_order(self, aBuf):
# for euc-JP encoding, we are interested # for euc-JP encoding, we are interested
# first byte range: 0xa0 -- 0xfe # first byte range: 0xa0 -- 0xfe
# second byte range: 0xa1 -- 0xfe # second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that # no validation needed here. State machine has done that
if aStr[0] >= '\xA0': char = wrap_ord(aBuf[0])
return 94 * (ord(aStr[0]) - 0xA1) + ord(aStr[1]) - 0xa1 if char >= 0xA0:
return 94 * (char - 0xA1) + wrap_ord(aBuf[1]) - 0xa1
else: else:
return -1 return -1
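
Worked illustration of the byte-oriented get_order() helpers above (the byte pairs are assumed to be valid EUC-KR and Big5 lead/trail sequences): the analyzers now receive raw buffers and normalize bytes with wrap_ord() instead of comparing one-character strings.

    from chardet.chardistribution import (EUCKRDistributionAnalysis,
                                          Big5DistributionAnalysis)

    euckr = EUCKRDistributionAnalysis()
    print(euckr.get_order(b'\xb0\xa1'))   # 0 = 94 * (0xB0 - 0xB0) + (0xA1 - 0xA1)
    big5 = Big5DistributionAnalysis()
    print(big5.get_order(b'\xa4\x40'))    # 0 = 157 * (0xA4 - 0xA4) + (0x40 - 0x40)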

View File

@ -25,8 +25,10 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants, sys from . import constants
from charsetprober import CharSetProber import sys
from .charsetprober import CharSetProber
class CharSetGroupProber(CharSetProber): class CharSetGroupProber(CharSetProber):
def __init__(self): def __init__(self):
@ -41,28 +43,32 @@ class CharSetGroupProber(CharSetProber):
for prober in self._mProbers: for prober in self._mProbers:
if prober: if prober:
prober.reset() prober.reset()
prober.active = constants.True prober.active = True
self._mActiveNum += 1 self._mActiveNum += 1
self._mBestGuessProber = None self._mBestGuessProber = None
def get_charset_name(self): def get_charset_name(self):
if not self._mBestGuessProber: if not self._mBestGuessProber:
self.get_confidence() self.get_confidence()
if not self._mBestGuessProber: return None if not self._mBestGuessProber:
return None
# self._mBestGuessProber = self._mProbers[0] # self._mBestGuessProber = self._mProbers[0]
return self._mBestGuessProber.get_charset_name() return self._mBestGuessProber.get_charset_name()
def feed(self, aBuf): def feed(self, aBuf):
for prober in self._mProbers: for prober in self._mProbers:
if not prober: continue if not prober:
if not prober.active: continue continue
if not prober.active:
continue
st = prober.feed(aBuf) st = prober.feed(aBuf)
if not st: continue if not st:
continue
if st == constants.eFoundIt: if st == constants.eFoundIt:
self._mBestGuessProber = prober self._mBestGuessProber = prober
return self.get_state() return self.get_state()
elif st == constants.eNotMe: elif st == constants.eNotMe:
prober.active = constants.False prober.active = False
self._mActiveNum -= 1 self._mActiveNum -= 1
if self._mActiveNum <= 0: if self._mActiveNum <= 0:
self._mState = constants.eNotMe self._mState = constants.eNotMe
@ -78,18 +84,22 @@ class CharSetGroupProber(CharSetProber):
bestConf = 0.0 bestConf = 0.0
self._mBestGuessProber = None self._mBestGuessProber = None
for prober in self._mProbers: for prober in self._mProbers:
if not prober: continue if not prober:
continue
if not prober.active: if not prober.active:
if constants._debug: if constants._debug:
sys.stderr.write(prober.get_charset_name() + ' not active\n') sys.stderr.write(prober.get_charset_name()
+ ' not active\n')
continue continue
cf = prober.get_confidence() cf = prober.get_confidence()
if constants._debug: if constants._debug:
sys.stderr.write('%s confidence = %s\n' % (prober.get_charset_name(), cf)) sys.stderr.write('%s confidence = %s\n' %
(prober.get_charset_name(), cf))
if bestConf < cf: if bestConf < cf:
bestConf = cf bestConf = cf
self._mBestGuessProber = prober self._mBestGuessProber = prober
if not self._mBestGuessProber: return 0.0 if not self._mBestGuessProber:
return 0.0
return bestConf return bestConf
# else: # else:
# self._mBestGuessProber = self._mProbers[0] # self._mBestGuessProber = self._mProbers[0]

View File

@ -26,7 +26,9 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants, re from . import constants
import re
class CharSetProber: class CharSetProber:
def __init__(self): def __init__(self):
@ -48,11 +50,11 @@ class CharSetProber:
return 0.0 return 0.0
def filter_high_bit_only(self, aBuf): def filter_high_bit_only(self, aBuf):
aBuf = re.sub(r'([\x00-\x7F])+', ' ', aBuf) aBuf = re.sub(b'([\x00-\x7F])+', b' ', aBuf)
return aBuf return aBuf
def filter_without_english_letters(self, aBuf): def filter_without_english_letters(self, aBuf):
aBuf = re.sub(r'([A-Za-z])+', ' ', aBuf) aBuf = re.sub(b'([A-Za-z])+', b' ', aBuf)
return aBuf return aBuf
def filter_with_english_letters(self, aBuf): def filter_with_english_letters(self, aBuf):
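
Quick sanity sketch of the switch to bytes patterns above (buffer contents assumed): with b'...' patterns the high-bit filter keeps working when aBuf is a Python 3 bytes object.

    import re

    buf = b'hello \xe4\xbd\xa0\xe5\xa5\xbd world'
    print(re.sub(b'([\x00-\x7F])+', b' ', buf))   # b' \xe4\xbd\xa0\xe5\xa5\xbd '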

View File

@ -25,7 +25,9 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from constants import eStart, eError, eItsMe from .constants import eStart
from .compat import wrap_ord
class CodingStateMachine: class CodingStateMachine:
def __init__(self, sm): def __init__(self, sm):
@ -40,12 +42,15 @@ class CodingStateMachine:
def next_state(self, c): def next_state(self, c):
# for each byte we get its class # for each byte we get its class
# if it is first byte, we also get byte length # if it is first byte, we also get byte length
byteCls = self._mModel['classTable'][ord(c)] # PY3K: aBuf is a byte stream, so c is an int, not a byte
byteCls = self._mModel['classTable'][wrap_ord(c)]
if self._mCurrentState == eStart: if self._mCurrentState == eStart:
self._mCurrentBytePos = 0 self._mCurrentBytePos = 0
self._mCurrentCharLen = self._mModel['charLenTable'][byteCls] self._mCurrentCharLen = self._mModel['charLenTable'][byteCls]
# from byte's class and stateTable, we get its next state # from byte's class and stateTable, we get its next state
self._mCurrentState = self._mModel['stateTable'][self._mCurrentState * self._mModel['classFactor'] + byteCls] curr_state = (self._mCurrentState * self._mModel['classFactor']
+ byteCls)
self._mCurrentState = self._mModel['stateTable'][curr_state]
self._mCurrentBytePos += 1 self._mCurrentBytePos += 1
return self._mCurrentState return self._mCurrentState
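
Rough sketch of how next_state() is driven (the HZ model comes from the escsm.py hunks further down; get_coding_state_machine() is assumed to return the model name as in upstream chardet, and the resulting state value is not asserted): each byte is mapped to a class via classTable, then the flat stateTable is indexed by state * classFactor + class.

    from chardet.codingstatemachine import CodingStateMachine
    from chardet.escsm import HZSMModel

    sm = CodingStateMachine(HZSMModel)
    for byte in b'~{':                    # the HZ-GB-2312 shift-in sequence
        state = sm.next_state(byte)       # wrap_ord() accepts str bytes (py2) and ints (py3)
    print(state, sm.get_coding_state_machine())   # final state plus the model name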

thirdparty/chardet/compat.py (vendored, normal file, 34 lines added)
View File

@ -0,0 +1,34 @@
######################## BEGIN LICENSE BLOCK ########################
# Contributor(s):
# Ian Cordasco - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import sys
if sys.version_info < (3, 0):
base_str = (str, unicode)
else:
base_str = (bytes, str)
def wrap_ord(a):
if sys.version_info < (3, 0) and isinstance(a, base_str):
return ord(a)
else:
return a
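
Minimal sketch of what wrap_ord() papers over: indexing a byte string yields one-character str objects on Python 2 but ints on Python 3, and the probers need an integer code point either way.

    from chardet.compat import wrap_ord

    buf = b'\xa4\xa1'
    print([wrap_ord(b) for b in buf])     # [164, 161] on both Python 2 and Python 3
    print(wrap_ord(buf[0]) - 0xA1)        # 3, the kind of lead-byte arithmetic the probers do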

View File

@ -37,11 +37,3 @@ eError = 1
eItsMe = 2 eItsMe = 2
SHORTCUT_THRESHOLD = 0.95 SHORTCUT_THRESHOLD = 0.95
import __builtin__
if not hasattr(__builtin__, 'False'):
False = 0
True = 1
else:
False = __builtin__.False
True = __builtin__.True

thirdparty/chardet/cp949prober.py (vendored, normal file, 44 lines added)
View File

@ -0,0 +1,44 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import EUCKRDistributionAnalysis
from .mbcssm import CP949SMModel
class CP949Prober(MultiByteCharSetProber):
def __init__(self):
MultiByteCharSetProber.__init__(self)
self._mCodingSM = CodingStateMachine(CP949SMModel)
# NOTE: CP949 is a superset of EUC-KR, so the distribution should be
# not different.
self._mDistributionAnalyzer = EUCKRDistributionAnalysis()
self.reset()
def get_charset_name(self):
return "CP949"

View File

@ -25,27 +25,31 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants, sys from . import constants
from escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel from .escsm import (HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel,
from charsetprober import CharSetProber ISO2022KRSMModel)
from codingstatemachine import CodingStateMachine from .charsetprober import CharSetProber
from .codingstatemachine import CodingStateMachine
from .compat import wrap_ord
class EscCharSetProber(CharSetProber): class EscCharSetProber(CharSetProber):
def __init__(self): def __init__(self):
CharSetProber.__init__(self) CharSetProber.__init__(self)
self._mCodingSM = [ \ self._mCodingSM = [
CodingStateMachine(HZSMModel), CodingStateMachine(HZSMModel),
CodingStateMachine(ISO2022CNSMModel), CodingStateMachine(ISO2022CNSMModel),
CodingStateMachine(ISO2022JPSMModel), CodingStateMachine(ISO2022JPSMModel),
CodingStateMachine(ISO2022KRSMModel) CodingStateMachine(ISO2022KRSMModel)
] ]
self.reset() self.reset()
def reset(self): def reset(self):
CharSetProber.reset(self) CharSetProber.reset(self)
for codingSM in self._mCodingSM: for codingSM in self._mCodingSM:
if not codingSM: continue if not codingSM:
codingSM.active = constants.True continue
codingSM.active = True
codingSM.reset() codingSM.reset()
self._mActiveSM = len(self._mCodingSM) self._mActiveSM = len(self._mCodingSM)
self._mDetectedCharset = None self._mDetectedCharset = None
@ -61,19 +65,22 @@ class EscCharSetProber(CharSetProber):
def feed(self, aBuf): def feed(self, aBuf):
for c in aBuf: for c in aBuf:
# PY3K: aBuf is a byte array, so c is an int, not a byte
for codingSM in self._mCodingSM: for codingSM in self._mCodingSM:
if not codingSM: continue if not codingSM:
if not codingSM.active: continue continue
codingState = codingSM.next_state(c) if not codingSM.active:
continue
codingState = codingSM.next_state(wrap_ord(c))
if codingState == constants.eError: if codingState == constants.eError:
codingSM.active = constants.False codingSM.active = False
self._mActiveSM -= 1 self._mActiveSM -= 1
if self._mActiveSM <= 0: if self._mActiveSM <= 0:
self._mState = constants.eNotMe self._mState = constants.eNotMe
return self.get_state() return self.get_state()
elif codingState == constants.eItsMe: elif codingState == constants.eItsMe:
self._mState = constants.eFoundIt self._mState = constants.eFoundIt
self._mDetectedCharset = codingSM.get_coding_state_machine() self._mDetectedCharset = codingSM.get_coding_state_machine() # nopep8
return self.get_state() return self.get_state()
return self.get_state() return self.get_state()
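
Sketch of the escape-sequence prober in action (the byte sample is assumed to be ISO-2022-JP text wrapped in ESC $ B ... ESC ( B, and get_charset_name() is assumed to return the detected model name as in upstream chardet):

    from chardet import constants
    from chardet.escprober import EscCharSetProber

    prober = EscCharSetProber()
    state = prober.feed(b'\x1b$B$3$s$K$A$O\x1b(B')
    if state == constants.eFoundIt:
        print(prober.get_charset_name())          # expected: 'ISO-2022-JP'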

View File

@ -25,9 +25,9 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from constants import eStart, eError, eItsMe from .constants import eStart, eError, eItsMe
HZ_cls = ( \ HZ_cls = (
1,0,0,0,0,0,0,0, # 00 - 07 1,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,0,0, # 08 - 0f 0,0,0,0,0,0,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17 0,0,0,0,0,0,0,0, # 10 - 17
@ -62,7 +62,7 @@ HZ_cls = ( \
1,1,1,1,1,1,1,1, # f8 - ff 1,1,1,1,1,1,1,1, # f8 - ff
) )
HZ_st = ( \ HZ_st = (
eStart,eError, 3,eStart,eStart,eStart,eError,eError,# 00-07 eStart,eError, 3,eStart,eStart,eStart,eError,eError,# 00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f
eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError,# 10-17 eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError,# 10-17
@ -79,7 +79,7 @@ HZSMModel = {'classTable': HZ_cls,
'charLenTable': HZCharLenTable, 'charLenTable': HZCharLenTable,
'name': "HZ-GB-2312"} 'name': "HZ-GB-2312"}
ISO2022CN_cls = ( \ ISO2022CN_cls = (
2,0,0,0,0,0,0,0, # 00 - 07 2,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,0,0, # 08 - 0f 0,0,0,0,0,0,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17 0,0,0,0,0,0,0,0, # 10 - 17
@ -114,7 +114,7 @@ ISO2022CN_cls = ( \
2,2,2,2,2,2,2,2, # f8 - ff 2,2,2,2,2,2,2,2, # f8 - ff
) )
ISO2022CN_st = ( \ ISO2022CN_st = (
eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07 eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07
eStart,eError,eError,eError,eError,eError,eError,eError,# 08-0f eStart,eError,eError,eError,eError,eError,eError,eError,# 08-0f
eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17 eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17
@ -133,7 +133,7 @@ ISO2022CNSMModel = {'classTable': ISO2022CN_cls,
'charLenTable': ISO2022CNCharLenTable, 'charLenTable': ISO2022CNCharLenTable,
'name': "ISO-2022-CN"} 'name': "ISO-2022-CN"}
ISO2022JP_cls = ( \ ISO2022JP_cls = (
2,0,0,0,0,0,0,0, # 00 - 07 2,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,2,2, # 08 - 0f 0,0,0,0,0,0,2,2, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17 0,0,0,0,0,0,0,0, # 10 - 17
@ -168,7 +168,7 @@ ISO2022JP_cls = ( \
2,2,2,2,2,2,2,2, # f8 - ff 2,2,2,2,2,2,2,2, # f8 - ff
) )
ISO2022JP_st = ( \ ISO2022JP_st = (
eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07 eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07
eStart,eStart,eError,eError,eError,eError,eError,eError,# 08-0f eStart,eStart,eError,eError,eError,eError,eError,eError,# 08-0f
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17 eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17
@ -188,7 +188,7 @@ ISO2022JPSMModel = {'classTable': ISO2022JP_cls,
'charLenTable': ISO2022JPCharLenTable, 'charLenTable': ISO2022JPCharLenTable,
'name': "ISO-2022-JP"} 'name': "ISO-2022-JP"}
ISO2022KR_cls = ( \ ISO2022KR_cls = (
2,0,0,0,0,0,0,0, # 00 - 07 2,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,0,0, # 08 - 0f 0,0,0,0,0,0,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17 0,0,0,0,0,0,0,0, # 10 - 17
@ -223,7 +223,7 @@ ISO2022KR_cls = ( \
2,2,2,2,2,2,2,2, # f8 - ff 2,2,2,2,2,2,2,2, # f8 - ff
) )
ISO2022KR_st = ( \ ISO2022KR_st = (
eStart, 3,eError,eStart,eStart,eStart,eError,eError,# 00-07 eStart, 3,eError,eStart,eStart,eStart,eError,eError,# 00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f
eItsMe,eItsMe,eError,eError,eError, 4,eError,eError,# 10-17 eItsMe,eItsMe,eError,eError,eError, 4,eError,eError,# 10-17
@ -238,3 +238,5 @@ ISO2022KRSMModel = {'classTable': ISO2022KR_cls,
'stateTable': ISO2022KR_st, 'stateTable': ISO2022KR_st,
'charLenTable': ISO2022KRCharLenTable, 'charLenTable': ISO2022KRCharLenTable,
'name': "ISO-2022-KR"} 'name': "ISO-2022-KR"}
# flake8: noqa

View File

@ -25,13 +25,14 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants, sys import sys
from constants import eStart, eError, eItsMe from . import constants
from mbcharsetprober import MultiByteCharSetProber from .mbcharsetprober import MultiByteCharSetProber
from codingstatemachine import CodingStateMachine from .codingstatemachine import CodingStateMachine
from chardistribution import EUCJPDistributionAnalysis from .chardistribution import EUCJPDistributionAnalysis
from jpcntx import EUCJPContextAnalysis from .jpcntx import EUCJPContextAnalysis
from mbcssm import EUCJPSMModel from .mbcssm import EUCJPSMModel
class EUCJPProber(MultiByteCharSetProber): class EUCJPProber(MultiByteCharSetProber):
def __init__(self): def __init__(self):
@ -50,31 +51,35 @@ class EUCJPProber(MultiByteCharSetProber):
def feed(self, aBuf): def feed(self, aBuf):
aLen = len(aBuf) aLen = len(aBuf)
for i in xrange(0, aLen): for i in range(0, aLen):
# PY3K: aBuf is a byte array, so aBuf[i] is an int, not a byte
codingState = self._mCodingSM.next_state(aBuf[i]) codingState = self._mCodingSM.next_state(aBuf[i])
if codingState == eError: if codingState == constants.eError:
if constants._debug: if constants._debug:
sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n') sys.stderr.write(self.get_charset_name()
+ ' prober hit error at byte ' + str(i)
+ '\n')
self._mState = constants.eNotMe self._mState = constants.eNotMe
break break
elif codingState == eItsMe: elif codingState == constants.eItsMe:
self._mState = constants.eFoundIt self._mState = constants.eFoundIt
break break
elif codingState == eStart: elif codingState == constants.eStart:
charLen = self._mCodingSM.get_current_charlen() charLen = self._mCodingSM.get_current_charlen()
if i == 0: if i == 0:
self._mLastChar[1] = aBuf[0] self._mLastChar[1] = aBuf[0]
self._mContextAnalyzer.feed(self._mLastChar, charLen) self._mContextAnalyzer.feed(self._mLastChar, charLen)
self._mDistributionAnalyzer.feed(self._mLastChar, charLen) self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
else: else:
self._mContextAnalyzer.feed(aBuf[i-1:i+1], charLen) self._mContextAnalyzer.feed(aBuf[i - 1:i + 1], charLen)
self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen) self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
charLen)
self._mLastChar[0] = aBuf[aLen - 1] self._mLastChar[0] = aBuf[aLen - 1]
if self.get_state() == constants.eDetecting: if self.get_state() == constants.eDetecting:
if self._mContextAnalyzer.got_enough_data() and \ if (self._mContextAnalyzer.got_enough_data() and
(self.get_confidence() > constants.SHORTCUT_THRESHOLD): (self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
self._mState = constants.eFoundIt self._mState = constants.eFoundIt
return self.get_state() return self.get_state()
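
End-to-end sketch of the reworked feed() loop (the sample is assumed to be Japanese text in EUC-JP, repeated to pass the context analyser's data thresholds):

    from chardet.eucjpprober import EUCJPProber

    prober = EUCJPProber()
    prober.feed(b'\xa4\xb3\xa4\xf3\xa4\xcb\xa4\xc1\xa4\xcf ' * 20)
    print(prober.get_charset_name(), prober.get_confidence())   # 'EUC-JP' plus a confidence value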

View File

@ -592,3 +592,5 @@ EUCKRCharToFreqOrder = ( \
8704,8705,8706,8707,8708,8709,8710,8711,8712,8713,8714,8715,8716,8717,8718,8719, 8704,8705,8706,8707,8708,8709,8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,
8720,8721,8722,8723,8724,8725,8726,8727,8728,8729,8730,8731,8732,8733,8734,8735, 8720,8721,8722,8723,8724,8725,8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,
8736,8737,8738,8739,8740,8741) 8736,8737,8738,8739,8740,8741)
# flake8: noqa

View File

@ -25,10 +25,11 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from mbcharsetprober import MultiByteCharSetProber from .mbcharsetprober import MultiByteCharSetProber
from codingstatemachine import CodingStateMachine from .codingstatemachine import CodingStateMachine
from chardistribution import EUCKRDistributionAnalysis from .chardistribution import EUCKRDistributionAnalysis
from mbcssm import EUCKRSMModel from .mbcssm import EUCKRSMModel
class EUCKRProber(MultiByteCharSetProber): class EUCKRProber(MultiByteCharSetProber):
def __init__(self): def __init__(self):

View File

@ -46,7 +46,7 @@ EUCTW_TYPICAL_DISTRIBUTION_RATIO = 0.75
# Char to FreqOrder table , # Char to FreqOrder table ,
EUCTW_TABLE_SIZE = 8102 EUCTW_TABLE_SIZE = 8102
EUCTWCharToFreqOrder = ( \ EUCTWCharToFreqOrder = (
1,1800,1506, 255,1431, 198, 9, 82, 6,7310, 177, 202,3615,1256,2808, 110, # 2742 1,1800,1506, 255,1431, 198, 9, 82, 6,7310, 177, 202,3615,1256,2808, 110, # 2742
3735, 33,3241, 261, 76, 44,2113, 16,2931,2184,1176, 659,3868, 26,3404,2643, # 2758 3735, 33,3241, 261, 76, 44,2113, 16,2931,2184,1176, 659,3868, 26,3404,2643, # 2758
1198,3869,3313,4060, 410,2211, 302, 590, 361,1963, 8, 204, 58,4296,7311,1931, # 2774 1198,3869,3313,4060, 410,2211, 302, 590, 361,1963, 8, 204, 58,4296,7311,1931, # 2774
@ -424,3 +424,5 @@ EUCTWCharToFreqOrder = ( \
8694,8695,8696,8697,8698,8699,8700,8701,8702,8703,8704,8705,8706,8707,8708,8709, # 8710 8694,8695,8696,8697,8698,8699,8700,8701,8702,8703,8704,8705,8706,8707,8708,8709, # 8710
8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,8720,8721,8722,8723,8724,8725, # 8726 8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,8720,8721,8722,8723,8724,8725, # 8726
8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,8736,8737,8738,8739,8740,8741) # 8742 8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,8736,8737,8738,8739,8740,8741) # 8742
# flake8: noqa

View File

@ -25,10 +25,10 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from mbcharsetprober import MultiByteCharSetProber from .mbcharsetprober import MultiByteCharSetProber
from codingstatemachine import CodingStateMachine from .codingstatemachine import CodingStateMachine
from chardistribution import EUCTWDistributionAnalysis from .chardistribution import EUCTWDistributionAnalysis
from mbcssm import EUCTWSMModel from .mbcssm import EUCTWSMModel
class EUCTWProber(MultiByteCharSetProber): class EUCTWProber(MultiByteCharSetProber):
def __init__(self): def __init__(self):

View File

@ -43,7 +43,7 @@ GB2312_TYPICAL_DISTRIBUTION_RATIO = 0.9
GB2312_TABLE_SIZE = 3760 GB2312_TABLE_SIZE = 3760
GB2312CharToFreqOrder = ( \ GB2312CharToFreqOrder = (
1671, 749,1443,2364,3924,3807,2330,3921,1704,3463,2691,1511,1515, 572,3191,2205, 1671, 749,1443,2364,3924,3807,2330,3921,1704,3463,2691,1511,1515, 572,3191,2205,
2361, 224,2558, 479,1711, 963,3162, 440,4060,1905,2966,2947,3580,2647,3961,3842, 2361, 224,2558, 479,1711, 963,3162, 440,4060,1905,2966,2947,3580,2647,3961,3842,
2204, 869,4207, 970,2678,5626,2944,2956,1479,4048, 514,3595, 588,1346,2820,3409, 2204, 869,4207, 970,2678,5626,2944,2956,1479,4048, 514,3595, 588,1346,2820,3409,
@ -469,3 +469,4 @@ GB2312CharToFreqOrder = ( \
5867,5507,6273,4206,6274,4789,6098,6764,3619,3646,3833,3804,2394,3788,4936,3978, 5867,5507,6273,4206,6274,4789,6098,6764,3619,3646,3833,3804,2394,3788,4936,3978,
4866,4899,6099,6100,5559,6478,6765,3599,5868,6101,5869,5870,6275,6766,4527,6767) 4866,4899,6099,6100,5559,6478,6765,3599,5868,6101,5869,5870,6275,6766,4527,6767)
# flake8: noqa

View File

@ -25,10 +25,10 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from mbcharsetprober import MultiByteCharSetProber from .mbcharsetprober import MultiByteCharSetProber
from codingstatemachine import CodingStateMachine from .codingstatemachine import CodingStateMachine
from chardistribution import GB2312DistributionAnalysis from .chardistribution import GB2312DistributionAnalysis
from mbcssm import GB2312SMModel from .mbcssm import GB2312SMModel
class GB2312Prober(MultiByteCharSetProber): class GB2312Prober(MultiByteCharSetProber):
def __init__(self): def __init__(self):

View File

@ -25,8 +25,9 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from charsetprober import CharSetProber from .charsetprober import CharSetProber
import constants from .constants import eNotMe, eDetecting
from .compat import wrap_ord
# This prober doesn't actually recognize a language or a charset. # This prober doesn't actually recognize a language or a charset.
# It is a helper prober for the use of the Hebrew model probers # It is a helper prober for the use of the Hebrew model probers
@ -126,28 +127,31 @@ import constants
# charset identified, either "windows-1255" or "ISO-8859-8". # charset identified, either "windows-1255" or "ISO-8859-8".
# windows-1255 / ISO-8859-8 code points of interest # windows-1255 / ISO-8859-8 code points of interest
FINAL_KAF = '\xea' FINAL_KAF = 0xea
NORMAL_KAF = '\xeb' NORMAL_KAF = 0xeb
FINAL_MEM = '\xed' FINAL_MEM = 0xed
NORMAL_MEM = '\xee' NORMAL_MEM = 0xee
FINAL_NUN = '\xef' FINAL_NUN = 0xef
NORMAL_NUN = '\xf0' NORMAL_NUN = 0xf0
FINAL_PE = '\xf3' FINAL_PE = 0xf3
NORMAL_PE = '\xf4' NORMAL_PE = 0xf4
FINAL_TSADI = '\xf5' FINAL_TSADI = 0xf5
NORMAL_TSADI = '\xf6' NORMAL_TSADI = 0xf6
# Minimum Visual vs Logical final letter score difference. # Minimum Visual vs Logical final letter score difference.
# If the difference is below this, don't rely solely on the final letter score distance. # If the difference is below this, don't rely solely on the final letter score
# distance.
MIN_FINAL_CHAR_DISTANCE = 5 MIN_FINAL_CHAR_DISTANCE = 5
# Minimum Visual vs Logical model score difference. # Minimum Visual vs Logical model score difference.
# If the difference is below this, don't rely at all on the model score distance. # If the difference is below this, don't rely at all on the model score
# distance.
MIN_MODEL_DISTANCE = 0.01 MIN_MODEL_DISTANCE = 0.01
VISUAL_HEBREW_NAME = "ISO-8859-8" VISUAL_HEBREW_NAME = "ISO-8859-8"
LOGICAL_HEBREW_NAME = "windows-1255" LOGICAL_HEBREW_NAME = "windows-1255"
class HebrewProber(CharSetProber): class HebrewProber(CharSetProber):
def __init__(self): def __init__(self):
CharSetProber.__init__(self) CharSetProber.__init__(self)
@ -159,8 +163,8 @@ class HebrewProber(CharSetProber):
self._mFinalCharLogicalScore = 0 self._mFinalCharLogicalScore = 0
self._mFinalCharVisualScore = 0 self._mFinalCharVisualScore = 0
# The two last characters seen in the previous buffer, # The two last characters seen in the previous buffer,
# mPrev and mBeforePrev are initialized to space in order to simulate a word # mPrev and mBeforePrev are initialized to space in order to simulate
# delimiter at the beginning of the data # a word delimiter at the beginning of the data
self._mPrev = ' ' self._mPrev = ' '
self._mBeforePrev = ' ' self._mBeforePrev = ' '
# These probers are owned by the group prober. # These probers are owned by the group prober.
@ -170,49 +174,52 @@ class HebrewProber(CharSetProber):
self._mVisualProber = visualProber self._mVisualProber = visualProber
def is_final(self, c): def is_final(self, c):
return c in [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI] return wrap_ord(c) in [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE,
FINAL_TSADI]
def is_non_final(self, c): def is_non_final(self, c):
# The normal Tsadi is not a good Non-Final letter due to words like # The normal Tsadi is not a good Non-Final letter due to words like
# 'lechotet' (to chat) containing an apostrophe after the tsadi. This # 'lechotet' (to chat) containing an apostrophe after the tsadi. This
# apostrophe is converted to a space in FilterWithoutEnglishLetters causing # apostrophe is converted to a space in FilterWithoutEnglishLetters
# the Non-Final tsadi to appear at an end of a word even though this is not # causing the Non-Final tsadi to appear at an end of a word even
# the case in the original text. # though this is not the case in the original text.
# The letters Pe and Kaf rarely display a related behavior of not being a # The letters Pe and Kaf rarely display a related behavior of not being
# good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for # a good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak'
# example legally end with a Non-Final Pe or Kaf. However, the benefit of # for example legally end with a Non-Final Pe or Kaf. However, the
# these letters as Non-Final letters outweighs the damage since these words # benefit of these letters as Non-Final letters outweighs the damage
# are quite rare. # since these words are quite rare.
return c in [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE] return wrap_ord(c) in [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
def feed(self, aBuf): def feed(self, aBuf):
# Final letter analysis for logical-visual decision. # Final letter analysis for logical-visual decision.
# Look for evidence that the received buffer is either logical Hebrew or # Look for evidence that the received buffer is either logical Hebrew
# visual Hebrew. # or visual Hebrew.
# The following cases are checked: # The following cases are checked:
# 1) A word longer than 1 letter, ending with a final letter. This is an # 1) A word longer than 1 letter, ending with a final letter. This is
# indication that the text is laid out "naturally" since the final letter # an indication that the text is laid out "naturally" since the
# really appears at the end. +1 for logical score. # final letter really appears at the end. +1 for logical score.
# 2) A word longer than 1 letter, ending with a Non-Final letter. In normal # 2) A word longer than 1 letter, ending with a Non-Final letter. In
# Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with # normal Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi,
# the Non-Final form of that letter. Exceptions to this rule are mentioned # should not end with the Non-Final form of that letter. Exceptions
# above in isNonFinal(). This is an indication that the text is laid out # to this rule are mentioned above in isNonFinal(). This is an
# backwards. +1 for visual score # indication that the text is laid out backwards. +1 for visual
# 3) A word longer than 1 letter, starting with a final letter. Final letters # score
# should not appear at the beginning of a word. This is an indication that # 3) A word longer than 1 letter, starting with a final letter. Final
# the text is laid out backwards. +1 for visual score. # letters should not appear at the beginning of a word. This is an
# indication that the text is laid out backwards. +1 for visual
# score.
# #
# The visual score and logical score are accumulated throughout the text and # The visual score and logical score are accumulated throughout the
# are finally checked against each other in GetCharSetName(). # text and are finally checked against each other in GetCharSetName().
# No checking for final letters in the middle of words is done since that case # No checking for final letters in the middle of words is done since
# is not an indication for either Logical or Visual text. # that case is not an indication for either Logical or Visual text.
# #
# We automatically filter out all 7-bit characters (replace them with spaces) # We automatically filter out all 7-bit characters (replace them with
# so the word boundary detection works properly. [MAP] # spaces) so the word boundary detection works properly. [MAP]
if self.get_state() == constants.eNotMe: if self.get_state() == eNotMe:
# Both model probers say it's not them. No reason to continue. # Both model probers say it's not them. No reason to continue.
return constants.eNotMe return eNotMe
aBuf = self.filter_high_bit_only(aBuf) aBuf = self.filter_high_bit_only(aBuf)
@ -220,23 +227,27 @@ class HebrewProber(CharSetProber):
if cur == ' ': if cur == ' ':
# We stand on a space - a word just ended # We stand on a space - a word just ended
if self._mBeforePrev != ' ': if self._mBeforePrev != ' ':
# next-to-last char was not a space so self._mPrev is not a 1 letter word # next-to-last char was not a space so self._mPrev is not a
# 1 letter word
if self.is_final(self._mPrev): if self.is_final(self._mPrev):
# case (1) [-2:not space][-1:final letter][cur:space] # case (1) [-2:not space][-1:final letter][cur:space]
self._mFinalCharLogicalScore += 1 self._mFinalCharLogicalScore += 1
elif self.is_non_final(self._mPrev): elif self.is_non_final(self._mPrev):
# case (2) [-2:not space][-1:Non-Final letter][cur:space] # case (2) [-2:not space][-1:Non-Final letter][
# cur:space]
self._mFinalCharVisualScore += 1 self._mFinalCharVisualScore += 1
else: else:
# Not standing on a space # Not standing on a space
if (self._mBeforePrev == ' ') and (self.is_final(self._mPrev)) and (cur != ' '): if ((self._mBeforePrev == ' ') and
(self.is_final(self._mPrev)) and (cur != ' ')):
# case (3) [-2:space][-1:final letter][cur:not space] # case (3) [-2:space][-1:final letter][cur:not space]
self._mFinalCharVisualScore += 1 self._mFinalCharVisualScore += 1
self._mBeforePrev = self._mPrev self._mBeforePrev = self._mPrev
self._mPrev = cur self._mPrev = cur
# Forever detecting, till the end or until both model probers return eNotMe (handled above) # Forever detecting, till the end or until both model probers return
return constants.eDetecting # eNotMe (handled above)
return eDetecting
def get_charset_name(self): def get_charset_name(self):
# Make the decision: is it Logical or Visual? # Make the decision: is it Logical or Visual?
@ -248,22 +259,25 @@ class HebrewProber(CharSetProber):
return VISUAL_HEBREW_NAME return VISUAL_HEBREW_NAME
# It's not dominant enough, try to rely on the model scores instead. # It's not dominant enough, try to rely on the model scores instead.
modelsub = self._mLogicalProber.get_confidence() - self._mVisualProber.get_confidence() modelsub = (self._mLogicalProber.get_confidence()
- self._mVisualProber.get_confidence())
if modelsub > MIN_MODEL_DISTANCE: if modelsub > MIN_MODEL_DISTANCE:
return LOGICAL_HEBREW_NAME return LOGICAL_HEBREW_NAME
if modelsub < -MIN_MODEL_DISTANCE: if modelsub < -MIN_MODEL_DISTANCE:
return VISUAL_HEBREW_NAME return VISUAL_HEBREW_NAME
# Still no good, back to final letter distance, maybe it'll save the day. # Still no good, back to final letter distance, maybe it'll save the
# day.
if finalsub < 0.0: if finalsub < 0.0:
return VISUAL_HEBREW_NAME return VISUAL_HEBREW_NAME
# (finalsub > 0 - Logical) or (don't know what to do) default to Logical. # (finalsub > 0 - Logical) or (don't know what to do) default to
# Logical.
return LOGICAL_HEBREW_NAME return LOGICAL_HEBREW_NAME
def get_state(self): def get_state(self):
# Remain active as long as any of the model probers are active. # Remain active as long as any of the model probers are active.
if (self._mLogicalProber.get_state() == constants.eNotMe) and \ if (self._mLogicalProber.get_state() == eNotMe) and \
(self._mVisualProber.get_state() == constants.eNotMe): (self._mVisualProber.get_state() == eNotMe):
return constants.eNotMe return eNotMe
return constants.eDetecting return eDetecting
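
Small sketch of the byte-level final-letter helpers above (the byte values are the windows-1255 / ISO-8859-8 code points listed in the hunk): with wrap_ord() the same check works on Python 2 str bytes and Python 3 ints.

    from chardet.hebrewprober import HebrewProber, FINAL_KAF, NORMAL_KAF

    prober = HebrewProber()
    print(prober.is_final(b'\xea'[0]))            # True: 0xea is FINAL_KAF
    print(prober.is_non_final(b'\xeb'[0]))        # True: 0xeb is NORMAL_KAF
    print(FINAL_KAF, NORMAL_KAF)                  # 234 235, plain ints after this change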

View File

@ -46,7 +46,7 @@ JIS_TYPICAL_DISTRIBUTION_RATIO = 3.0
# Char to FreqOrder table , # Char to FreqOrder table ,
JIS_TABLE_SIZE = 4368 JIS_TABLE_SIZE = 4368
JISCharToFreqOrder = ( \ JISCharToFreqOrder = (
40, 1, 6, 182, 152, 180, 295,2127, 285, 381,3295,4304,3068,4606,3165,3510, # 16 40, 1, 6, 182, 152, 180, 295,2127, 285, 381,3295,4304,3068,4606,3165,3510, # 16
3511,1822,2785,4607,1193,2226,5070,4608, 171,2996,1247, 18, 179,5071, 856,1661, # 32 3511,1822,2785,4607,1193,2226,5070,4608, 171,2996,1247, 18, 179,5071, 856,1661, # 32
1262,5072, 619, 127,3431,3512,3230,1899,1700, 232, 228,1294,1298, 284, 283,2041, # 48 1262,5072, 619, 127,3431,3512,3230,1899,1700, 232, 228,1294,1298, 284, 283,2041, # 48
@ -565,3 +565,5 @@ JISCharToFreqOrder = ( \
8224,8225,8226,8227,8228,8229,8230,8231,8232,8233,8234,8235,8236,8237,8238,8239, # 8240 8224,8225,8226,8227,8228,8229,8230,8231,8232,8233,8234,8235,8236,8237,8238,8239, # 8240
8240,8241,8242,8243,8244,8245,8246,8247,8248,8249,8250,8251,8252,8253,8254,8255, # 8256 8240,8241,8242,8243,8244,8245,8246,8247,8248,8249,8250,8251,8252,8253,8254,8255, # 8256
8256,8257,8258,8259,8260,8261,8262,8263,8264,8265,8266,8267,8268,8269,8270,8271) # 8272 8256,8257,8258,8259,8260,8261,8262,8263,8264,8265,8266,8267,8268,8269,8270,8271) # 8272
# flake8: noqa

View File

@ -25,7 +25,7 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants from .compat import wrap_ord
NUM_OF_CATEGORY = 6 NUM_OF_CATEGORY = 6
DONT_KNOW = -1 DONT_KNOW = -1
@ -34,7 +34,7 @@ MAX_REL_THRESHOLD = 1000
MINIMUM_DATA_THRESHOLD = 4 MINIMUM_DATA_THRESHOLD = 4
# This is hiragana 2-char sequence table, the number in each cell represents its frequency category # This is hiragana 2-char sequence table, the number in each cell represents its frequency category
jp2CharContext = ( \ jp2CharContext = (
(0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1), (0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1),
(2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4), (2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4),
(0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2), (0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2),
@ -125,24 +125,31 @@ class JapaneseContextAnalysis:
self.reset() self.reset()
def reset(self): def reset(self):
self._mTotalRel = 0 # total sequence received self._mTotalRel = 0 # total sequence received
self._mRelSample = [0] * NUM_OF_CATEGORY # category counters, each interger counts sequence in its category # category counters, each interger counts sequence in its category
self._mNeedToSkipCharNum = 0 # if last byte in current buffer is not the last byte of a character, we need to know how many bytes to skip in next buffer self._mRelSample = [0] * NUM_OF_CATEGORY
self._mLastCharOrder = -1 # The order of previous char # if last byte in current buffer is not the last byte of a character,
self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made # we need to know how many bytes to skip in next buffer
self._mNeedToSkipCharNum = 0
self._mLastCharOrder = -1 # The order of previous char
# If this flag is set to True, detection is done and conclusion has
# been made
self._mDone = False
def feed(self, aBuf, aLen): def feed(self, aBuf, aLen):
if self._mDone: return if self._mDone:
return
# The buffer we got is byte oriented, and a character may span in more than one # The buffer we got is byte oriented, and a character may span in more than one
# buffers. In case the last one or two byte in last buffer is not complete, we # buffers. In case the last one or two byte in last buffer is not
# record how many byte needed to complete that character and skip these bytes here. # complete, we record how many byte needed to complete that character
# We can choose to record those bytes as well and analyse the character once it # and skip these bytes here. We can choose to record those bytes as
# is complete, but since a character will not make much difference, by simply skipping # well and analyse the character once it is complete, but since a
# character will not make much difference, by simply skipping
# this character will simply our logic and improve performance. # this character will simply our logic and improve performance.
i = self._mNeedToSkipCharNum i = self._mNeedToSkipCharNum
while i < aLen: while i < aLen:
order, charLen = self.get_order(aBuf[i:i+2]) order, charLen = self.get_order(aBuf[i:i + 2])
i += charLen i += charLen
if i > aLen: if i > aLen:
self._mNeedToSkipCharNum = i - aLen self._mNeedToSkipCharNum = i - aLen
@ -151,7 +158,7 @@ class JapaneseContextAnalysis:
if (order != -1) and (self._mLastCharOrder != -1): if (order != -1) and (self._mLastCharOrder != -1):
self._mTotalRel += 1 self._mTotalRel += 1
if self._mTotalRel > MAX_REL_THRESHOLD: if self._mTotalRel > MAX_REL_THRESHOLD:
self._mDone = constants.True self._mDone = True
break break
self._mRelSample[jp2CharContext[self._mLastCharOrder][order]] += 1 self._mRelSample[jp2CharContext[self._mLastCharOrder][order]] += 1
self._mLastCharOrder = order self._mLastCharOrder = order
@ -166,45 +173,55 @@ class JapaneseContextAnalysis:
else: else:
return DONT_KNOW return DONT_KNOW
def get_order(self, aStr): def get_order(self, aBuf):
return -1, 1 return -1, 1
class SJISContextAnalysis(JapaneseContextAnalysis): class SJISContextAnalysis(JapaneseContextAnalysis):
def get_order(self, aStr): def __init__(self):
if not aStr: return -1, 1 self.charset_name = "SHIFT_JIS"
def get_charset_name(self):
return self.charset_name
def get_order(self, aBuf):
if not aBuf:
return -1, 1
# find out current char's byte length # find out current char's byte length
if ((aStr[0] >= '\x81') and (aStr[0] <= '\x9F')) or \ first_char = wrap_ord(aBuf[0])
((aStr[0] >= '\xE0') and (aStr[0] <= '\xFC')): if ((0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC)):
charLen = 2 charLen = 2
if (first_char == 0x87) or (0xFA <= first_char <= 0xFC):
self.charset_name = "CP932"
else: else:
charLen = 1 charLen = 1
# return its order if it is hiragana # return its order if it is hiragana
if len(aStr) > 1: if len(aBuf) > 1:
if (aStr[0] == '\202') and \ second_char = wrap_ord(aBuf[1])
(aStr[1] >= '\x9F') and \ if (first_char == 202) and (0x9F <= second_char <= 0xF1):
(aStr[1] <= '\xF1'): return second_char - 0x9F, charLen
return ord(aStr[1]) - 0x9F, charLen
return -1, charLen return -1, charLen
class EUCJPContextAnalysis(JapaneseContextAnalysis): class EUCJPContextAnalysis(JapaneseContextAnalysis):
def get_order(self, aStr): def get_order(self, aBuf):
if not aStr: return -1, 1 if not aBuf:
return -1, 1
# find out current char's byte length # find out current char's byte length
if (aStr[0] == '\x8E') or \ first_char = wrap_ord(aBuf[0])
((aStr[0] >= '\xA1') and (aStr[0] <= '\xFE')): if (first_char == 0x8E) or (0xA1 <= first_char <= 0xFE):
charLen = 2 charLen = 2
elif aStr[0] == '\x8F': elif first_char == 0x8F:
charLen = 3 charLen = 3
else: else:
charLen = 1 charLen = 1
# return its order if it is hiragana # return its order if it is hiragana
if len(aStr) > 1: if len(aBuf) > 1:
if (aStr[0] == '\xA4') and \ second_char = wrap_ord(aBuf[1])
(aStr[1] >= '\xA1') and \ if (first_char == 0xA4) and (0xA1 <= second_char <= 0xF3):
(aStr[1] <= '\xF3'): return second_char - 0xA1, charLen
return ord(aStr[1]) - 0xA1, charLen
return -1, charLen return -1, charLen
# flake8: noqa
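The SJIS lead-byte handling above is what lets this version distinguish plain SHIFT_JIS from its CP932 superset. A minimal standalone sketch of that decision (the helper name and its per-byte return value are illustrative only; the prober itself keeps the charset name as sticky state):

def sjis_lead_byte_info(first_byte):
    # Lead bytes 0x81-0x9F and 0xE0-0xFC start a two-byte character;
    # 0x87 and 0xFA-0xFC only occur in the CP932 extension areas.
    char_len = 2 if (0x81 <= first_byte <= 0x9F or 0xE0 <= first_byte <= 0xFC) else 1
    charset = "CP932" if (first_byte == 0x87 or 0xFA <= first_byte <= 0xFC) else "SHIFT_JIS"
    return char_len, charset

print(sjis_lead_byte_info(0x82))  # (2, 'SHIFT_JIS') - ordinary two-byte lead
print(sjis_lead_byte_info(0xFA))  # (2, 'CP932')     - extension-only lead byte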
View File
@ -25,8 +25,6 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants
# 255: Control characters that usually does not exist in any text # 255: Control characters that usually does not exist in any text
# 254: Carriage/Return # 254: Carriage/Return
# 253: symbol (punctuation) that does not belong to word # 253: symbol (punctuation) that does not belong to word
@ -36,7 +34,7 @@ import constants
# this table is modified base on win1251BulgarianCharToOrderMap, so # this table is modified base on win1251BulgarianCharToOrderMap, so
# only number <64 is sure valid # only number <64 is sure valid
Latin5_BulgarianCharToOrderMap = ( \ Latin5_BulgarianCharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -55,7 +53,7 @@ Latin5_BulgarianCharToOrderMap = ( \
62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, # f0 62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, # f0
) )
win1251BulgarianCharToOrderMap = ( \ win1251BulgarianCharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -80,7 +78,7 @@ win1251BulgarianCharToOrderMap = ( \
# first 1024 sequences:3.0618% # first 1024 sequences:3.0618%
# rest sequences: 0.2992% # rest sequences: 0.2992%
# negative sequences: 0.0020% # negative sequences: 0.0020%
BulgarianLangModel = ( \ BulgarianLangModel = (
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3, 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2,
3,1,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,0,1, 3,1,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,0,1,
@ -211,18 +209,21 @@ BulgarianLangModel = ( \
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
) )
Latin5BulgarianModel = { \ Latin5BulgarianModel = {
'charToOrderMap': Latin5_BulgarianCharToOrderMap, 'charToOrderMap': Latin5_BulgarianCharToOrderMap,
'precedenceMatrix': BulgarianLangModel, 'precedenceMatrix': BulgarianLangModel,
'mTypicalPositiveRatio': 0.969392, 'mTypicalPositiveRatio': 0.969392,
'keepEnglishLetter': constants.False, 'keepEnglishLetter': False,
'charsetName': "ISO-8859-5" 'charsetName': "ISO-8859-5"
} }
Win1251BulgarianModel = { \ Win1251BulgarianModel = {
'charToOrderMap': win1251BulgarianCharToOrderMap, 'charToOrderMap': win1251BulgarianCharToOrderMap,
'precedenceMatrix': BulgarianLangModel, 'precedenceMatrix': BulgarianLangModel,
'mTypicalPositiveRatio': 0.969392, 'mTypicalPositiveRatio': 0.969392,
'keepEnglishLetter': constants.False, 'keepEnglishLetter': False,
'charsetName': "windows-1251" 'charsetName': "windows-1251"
} }
# flake8: noqa
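Both model dicts above share the same five keys: charToOrderMap turns a byte into a frequency order and precedenceMatrix scores consecutive order pairs. A hedged sketch of how such a dict is read (the indexing mirrors the SingleByteCharSetProber change later in this commit; the sample bytes and the import path are assumptions):

from chardet.langbulgarianmodel import Win1251BulgarianModel  # path may differ in this tree

SAMPLE_SIZE = 64  # size of the sampled frequency range used by the prober
prev_order = Win1251BulgarianModel['charToOrderMap'][0xE0]  # windows-1251 byte for Cyrillic 'а'
cur_order = Win1251BulgarianModel['charToOrderMap'][0xED]   # windows-1251 byte for Cyrillic 'н'
if prev_order < SAMPLE_SIZE and cur_order < SAMPLE_SIZE:
    # prints the 0-3 category: how typical the 'ан' pair is for Bulgarian text
    print(Win1251BulgarianModel['precedenceMatrix'][prev_order * SAMPLE_SIZE + cur_order])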
View File
@ -25,11 +25,9 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants
# KOI8-R language model # KOI8-R language model
# Character Mapping Table: # Character Mapping Table:
KOI8R_CharToOrderMap = ( \ KOI8R_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -48,7 +46,7 @@ KOI8R_CharToOrderMap = ( \
35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, # f0 35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, # f0
) )
win1251_CharToOrderMap = ( \ win1251_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -67,7 +65,7 @@ win1251_CharToOrderMap = ( \
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
) )
latin5_CharToOrderMap = ( \ latin5_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -86,7 +84,7 @@ latin5_CharToOrderMap = ( \
239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255, 239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255,
) )
macCyrillic_CharToOrderMap = ( \ macCyrillic_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -105,7 +103,7 @@ macCyrillic_CharToOrderMap = ( \
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255, 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255,
) )
IBM855_CharToOrderMap = ( \ IBM855_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -124,7 +122,7 @@ IBM855_CharToOrderMap = ( \
250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255, 250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255,
) )
IBM866_CharToOrderMap = ( \ IBM866_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -149,7 +147,7 @@ IBM866_CharToOrderMap = ( \
# first 1024 sequences: 2.3389% # first 1024 sequences: 2.3389%
# rest sequences: 0.1237% # rest sequences: 0.1237%
# negative sequences: 0.0009% # negative sequences: 0.0009%
RussianLangModel = ( \ RussianLangModel = (
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3, 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2,
3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,3,3,3,3,2,3,2,0, 3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,3,3,3,3,2,3,2,0,
@ -280,50 +278,52 @@ RussianLangModel = ( \
0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0, 0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
) )
Koi8rModel = { \ Koi8rModel = {
'charToOrderMap': KOI8R_CharToOrderMap, 'charToOrderMap': KOI8R_CharToOrderMap,
'precedenceMatrix': RussianLangModel, 'precedenceMatrix': RussianLangModel,
'mTypicalPositiveRatio': 0.976601, 'mTypicalPositiveRatio': 0.976601,
'keepEnglishLetter': constants.False, 'keepEnglishLetter': False,
'charsetName': "KOI8-R" 'charsetName': "KOI8-R"
} }
Win1251CyrillicModel = { \ Win1251CyrillicModel = {
'charToOrderMap': win1251_CharToOrderMap, 'charToOrderMap': win1251_CharToOrderMap,
'precedenceMatrix': RussianLangModel, 'precedenceMatrix': RussianLangModel,
'mTypicalPositiveRatio': 0.976601, 'mTypicalPositiveRatio': 0.976601,
'keepEnglishLetter': constants.False, 'keepEnglishLetter': False,
'charsetName': "windows-1251" 'charsetName': "windows-1251"
} }
Latin5CyrillicModel = { \ Latin5CyrillicModel = {
'charToOrderMap': latin5_CharToOrderMap, 'charToOrderMap': latin5_CharToOrderMap,
'precedenceMatrix': RussianLangModel, 'precedenceMatrix': RussianLangModel,
'mTypicalPositiveRatio': 0.976601, 'mTypicalPositiveRatio': 0.976601,
'keepEnglishLetter': constants.False, 'keepEnglishLetter': False,
'charsetName': "ISO-8859-5" 'charsetName': "ISO-8859-5"
} }
MacCyrillicModel = { \ MacCyrillicModel = {
'charToOrderMap': macCyrillic_CharToOrderMap, 'charToOrderMap': macCyrillic_CharToOrderMap,
'precedenceMatrix': RussianLangModel, 'precedenceMatrix': RussianLangModel,
'mTypicalPositiveRatio': 0.976601, 'mTypicalPositiveRatio': 0.976601,
'keepEnglishLetter': constants.False, 'keepEnglishLetter': False,
'charsetName': "MacCyrillic" 'charsetName': "MacCyrillic"
}; };
Ibm866Model = { \ Ibm866Model = {
'charToOrderMap': IBM866_CharToOrderMap, 'charToOrderMap': IBM866_CharToOrderMap,
'precedenceMatrix': RussianLangModel, 'precedenceMatrix': RussianLangModel,
'mTypicalPositiveRatio': 0.976601, 'mTypicalPositiveRatio': 0.976601,
'keepEnglishLetter': constants.False, 'keepEnglishLetter': False,
'charsetName': "IBM866" 'charsetName': "IBM866"
} }
Ibm855Model = { \ Ibm855Model = {
'charToOrderMap': IBM855_CharToOrderMap, 'charToOrderMap': IBM855_CharToOrderMap,
'precedenceMatrix': RussianLangModel, 'precedenceMatrix': RussianLangModel,
'mTypicalPositiveRatio': 0.976601, 'mTypicalPositiveRatio': 0.976601,
'keepEnglishLetter': constants.False, 'keepEnglishLetter': False,
'charsetName': "IBM855" 'charsetName': "IBM855"
} }
# flake8: noqa
View File
@ -25,15 +25,13 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants
# 255: Control characters that usually does not exist in any text # 255: Control characters that usually does not exist in any text
# 254: Carriage/Return # 254: Carriage/Return
# 253: symbol (punctuation) that does not belong to word # 253: symbol (punctuation) that does not belong to word
# 252: 0 - 9 # 252: 0 - 9
# Character Mapping Table: # Character Mapping Table:
Latin7_CharToOrderMap = ( \ Latin7_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -52,7 +50,7 @@ Latin7_CharToOrderMap = ( \
9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, # f0 9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, # f0
) )
win1253_CharToOrderMap = ( \ win1253_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -77,7 +75,7 @@ win1253_CharToOrderMap = ( \
# first 1024 sequences:1.7001% # first 1024 sequences:1.7001%
# rest sequences: 0.0359% # rest sequences: 0.0359%
# negative sequences: 0.0148% # negative sequences: 0.0148%
GreekLangModel = ( \ GreekLangModel = (
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,3,2,2,3,3,3,3,3,3,3,3,1,3,3,3,0,2,2,3,3,0,3,0,3,2,0,3,3,3,0, 0,0,3,2,2,3,3,3,3,3,3,3,3,1,3,3,3,0,2,2,3,3,0,3,0,3,2,0,3,3,3,0,
@ -208,18 +206,20 @@ GreekLangModel = ( \
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
) )
Latin7GreekModel = { \ Latin7GreekModel = {
'charToOrderMap': Latin7_CharToOrderMap, 'charToOrderMap': Latin7_CharToOrderMap,
'precedenceMatrix': GreekLangModel, 'precedenceMatrix': GreekLangModel,
'mTypicalPositiveRatio': 0.982851, 'mTypicalPositiveRatio': 0.982851,
'keepEnglishLetter': constants.False, 'keepEnglishLetter': False,
'charsetName': "ISO-8859-7" 'charsetName': "ISO-8859-7"
} }
Win1253GreekModel = { \ Win1253GreekModel = {
'charToOrderMap': win1253_CharToOrderMap, 'charToOrderMap': win1253_CharToOrderMap,
'precedenceMatrix': GreekLangModel, 'precedenceMatrix': GreekLangModel,
'mTypicalPositiveRatio': 0.982851, 'mTypicalPositiveRatio': 0.982851,
'keepEnglishLetter': constants.False, 'keepEnglishLetter': False,
'charsetName': "windows-1253" 'charsetName': "windows-1253"
} }
# flake8: noqa
View File
@ -27,8 +27,6 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants
# 255: Control characters that usually does not exist in any text # 255: Control characters that usually does not exist in any text
# 254: Carriage/Return # 254: Carriage/Return
# 253: symbol (punctuation) that does not belong to word # 253: symbol (punctuation) that does not belong to word
@ -36,7 +34,7 @@ import constants
# Windows-1255 language model # Windows-1255 language model
# Character Mapping Table: # Character Mapping Table:
win1255_CharToOrderMap = ( \ win1255_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -61,7 +59,7 @@ win1255_CharToOrderMap = ( \
# first 1024 sequences: 1.5981% # first 1024 sequences: 1.5981%
# rest sequences: 0.087% # rest sequences: 0.087%
# negative sequences: 0.0015% # negative sequences: 0.0015%
HebrewLangModel = ( \ HebrewLangModel = (
0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0, 0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0,
3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1, 3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,
@ -192,10 +190,12 @@ HebrewLangModel = ( \
0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0, 0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0,
) )
Win1255HebrewModel = { \ Win1255HebrewModel = {
'charToOrderMap': win1255_CharToOrderMap, 'charToOrderMap': win1255_CharToOrderMap,
'precedenceMatrix': HebrewLangModel, 'precedenceMatrix': HebrewLangModel,
'mTypicalPositiveRatio': 0.984004, 'mTypicalPositiveRatio': 0.984004,
'keepEnglishLetter': constants.False, 'keepEnglishLetter': False,
'charsetName': "windows-1255" 'charsetName': "windows-1255"
} }
# flake8: noqa
View File
@ -25,15 +25,13 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants
# 255: Control characters that usually does not exist in any text # 255: Control characters that usually does not exist in any text
# 254: Carriage/Return # 254: Carriage/Return
# 253: symbol (punctuation) that does not belong to word # 253: symbol (punctuation) that does not belong to word
# 252: 0 - 9 # 252: 0 - 9
# Character Mapping Table: # Character Mapping Table:
Latin2_HungarianCharToOrderMap = ( \ Latin2_HungarianCharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -52,7 +50,7 @@ Latin2_HungarianCharToOrderMap = ( \
245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253, 245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253,
) )
win1250HungarianCharToOrderMap = ( \ win1250HungarianCharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -77,7 +75,7 @@ win1250HungarianCharToOrderMap = ( \
# first 1024 sequences:5.2623% # first 1024 sequences:5.2623%
# rest sequences: 0.8894% # rest sequences: 0.8894%
# negative sequences: 0.0009% # negative sequences: 0.0009%
HungarianLangModel = ( \ HungarianLangModel = (
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2, 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2,
3,2,2,3,3,3,3,3,2,3,3,3,3,3,3,1,2,3,3,3,3,2,3,3,1,1,3,3,0,1,1,1, 3,2,2,3,3,3,3,3,2,3,3,3,3,3,3,1,2,3,3,3,3,2,3,3,1,1,3,3,0,1,1,1,
@ -208,18 +206,20 @@ HungarianLangModel = ( \
0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0, 0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
) )
Latin2HungarianModel = { \ Latin2HungarianModel = {
'charToOrderMap': Latin2_HungarianCharToOrderMap, 'charToOrderMap': Latin2_HungarianCharToOrderMap,
'precedenceMatrix': HungarianLangModel, 'precedenceMatrix': HungarianLangModel,
'mTypicalPositiveRatio': 0.947368, 'mTypicalPositiveRatio': 0.947368,
'keepEnglishLetter': constants.True, 'keepEnglishLetter': True,
'charsetName': "ISO-8859-2" 'charsetName': "ISO-8859-2"
} }
Win1250HungarianModel = { \ Win1250HungarianModel = {
'charToOrderMap': win1250HungarianCharToOrderMap, 'charToOrderMap': win1250HungarianCharToOrderMap,
'precedenceMatrix': HungarianLangModel, 'precedenceMatrix': HungarianLangModel,
'mTypicalPositiveRatio': 0.947368, 'mTypicalPositiveRatio': 0.947368,
'keepEnglishLetter': constants.True, 'keepEnglishLetter': True,
'charsetName': "windows-1250" 'charsetName': "windows-1250"
} }
# flake8: noqa
View File
@ -25,8 +25,6 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants
# 255: Control characters that usually does not exist in any text # 255: Control characters that usually does not exist in any text
# 254: Carriage/Return # 254: Carriage/Return
# 253: symbol (punctuation) that does not belong to word # 253: symbol (punctuation) that does not belong to word
@ -35,7 +33,7 @@ import constants
# The following result for thai was collected from a limited sample (1M). # The following result for thai was collected from a limited sample (1M).
# Character Mapping Table: # Character Mapping Table:
TIS620CharToOrderMap = ( \ TIS620CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -60,7 +58,7 @@ TIS620CharToOrderMap = ( \
# first 1024 sequences:7.3177% # first 1024 sequences:7.3177%
# rest sequences: 1.0230% # rest sequences: 1.0230%
# negative sequences: 0.0436% # negative sequences: 0.0436%
ThaiLangModel = ( \ ThaiLangModel = (
0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3, 0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3,
0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2, 0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2,
3,0,3,3,2,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,3,0,3,2,3,0,2,2,2,3, 3,0,3,3,2,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,3,0,3,2,3,0,2,2,2,3,
@ -191,10 +189,12 @@ ThaiLangModel = ( \
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
) )
TIS620ThaiModel = { \ TIS620ThaiModel = {
'charToOrderMap': TIS620CharToOrderMap, 'charToOrderMap': TIS620CharToOrderMap,
'precedenceMatrix': ThaiLangModel, 'precedenceMatrix': ThaiLangModel,
'mTypicalPositiveRatio': 0.926386, 'mTypicalPositiveRatio': 0.926386,
'keepEnglishLetter': constants.False, 'keepEnglishLetter': False,
'charsetName': "TIS-620" 'charsetName': "TIS-620"
} }
# flake8: noqa
View File
@ -26,73 +26,74 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from charsetprober import CharSetProber from .charsetprober import CharSetProber
import constants from .constants import eNotMe
import operator from .compat import wrap_ord
FREQ_CAT_NUM = 4 FREQ_CAT_NUM = 4
UDF = 0 # undefined UDF = 0 # undefined
OTH = 1 # other OTH = 1 # other
ASC = 2 # ascii capital letter ASC = 2 # ascii capital letter
ASS = 3 # ascii small letter ASS = 3 # ascii small letter
ACV = 4 # accent capital vowel ACV = 4 # accent capital vowel
ACO = 5 # accent capital other ACO = 5 # accent capital other
ASV = 6 # accent small vowel ASV = 6 # accent small vowel
ASO = 7 # accent small other ASO = 7 # accent small other
CLASS_NUM = 8 # total classes CLASS_NUM = 8 # total classes
Latin1_CharToClass = ( \ Latin1_CharToClass = (
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F
OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47 OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57
ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F
OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67 OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77 ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77
ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F
OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, # 80 - 87 OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, # 80 - 87
OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, # 88 - 8F OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, # 88 - 8F
UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 90 - 97 UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 90 - 97
OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, # 98 - 9F OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, # 98 - 9F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A0 - A7 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A0 - A7
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A8 - AF OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A8 - AF
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B8 - BF OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B8 - BF
ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, # C0 - C7 ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, # C0 - C7
ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # C8 - CF ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # C8 - CF
ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, # D0 - D7 ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, # D0 - D7
ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, # D8 - DF ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, # D8 - DF
ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, # E0 - E7 ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, # E0 - E7
ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # E8 - EF ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # E8 - EF
ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, # F0 - F7 ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, # F0 - F7
ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, # F8 - FF ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, # F8 - FF
) )
# 0 : illegal # 0 : illegal
# 1 : very unlikely # 1 : very unlikely
# 2 : normal # 2 : normal
# 3 : very likely # 3 : very likely
Latin1ClassModel = ( \ Latin1ClassModel = (
# UDF OTH ASC ASS ACV ACO ASV ASO # UDF OTH ASC ASS ACV ACO ASV ASO
0, 0, 0, 0, 0, 0, 0, 0, # UDF 0, 0, 0, 0, 0, 0, 0, 0, # UDF
0, 3, 3, 3, 3, 3, 3, 3, # OTH 0, 3, 3, 3, 3, 3, 3, 3, # OTH
0, 3, 3, 3, 3, 3, 3, 3, # ASC 0, 3, 3, 3, 3, 3, 3, 3, # ASC
0, 3, 3, 3, 1, 1, 3, 3, # ASS 0, 3, 3, 3, 1, 1, 3, 3, # ASS
0, 3, 3, 3, 1, 2, 1, 2, # ACV 0, 3, 3, 3, 1, 2, 1, 2, # ACV
0, 3, 3, 3, 3, 3, 3, 3, # ACO 0, 3, 3, 3, 3, 3, 3, 3, # ACO
0, 3, 1, 3, 1, 1, 1, 3, # ASV 0, 3, 1, 3, 1, 1, 1, 3, # ASV
0, 3, 1, 3, 1, 1, 3, 3, # ASO 0, 3, 1, 3, 1, 1, 3, 3, # ASO
) )
class Latin1Prober(CharSetProber): class Latin1Prober(CharSetProber):
def __init__(self): def __init__(self):
CharSetProber.__init__(self) CharSetProber.__init__(self)
@ -109,10 +110,11 @@ class Latin1Prober(CharSetProber):
def feed(self, aBuf): def feed(self, aBuf):
aBuf = self.filter_with_english_letters(aBuf) aBuf = self.filter_with_english_letters(aBuf)
for c in aBuf: for c in aBuf:
charClass = Latin1_CharToClass[ord(c)] charClass = Latin1_CharToClass[wrap_ord(c)]
freq = Latin1ClassModel[(self._mLastCharClass * CLASS_NUM) + charClass] freq = Latin1ClassModel[(self._mLastCharClass * CLASS_NUM)
+ charClass]
if freq == 0: if freq == 0:
self._mState = constants.eNotMe self._mState = eNotMe
break break
self._mFreqCounter[freq] += 1 self._mFreqCounter[freq] += 1
self._mLastCharClass = charClass self._mLastCharClass = charClass
@ -120,17 +122,18 @@ class Latin1Prober(CharSetProber):
return self.get_state() return self.get_state()
def get_confidence(self): def get_confidence(self):
if self.get_state() == constants.eNotMe: if self.get_state() == eNotMe:
return 0.01 return 0.01
total = reduce(operator.add, self._mFreqCounter) total = sum(self._mFreqCounter)
if total < 0.01: if total < 0.01:
confidence = 0.0 confidence = 0.0
else: else:
confidence = (self._mFreqCounter[3] / total) - (self._mFreqCounter[1] * 20.0 / total) confidence = ((self._mFreqCounter[3] - self._mFreqCounter[1] * 20.0)
/ total)
if confidence < 0.0: if confidence < 0.0:
confidence = 0.0 confidence = 0.0
# lower the confidence of latin1 so that other more accurate detector # lower the confidence of latin1 so that other more accurate
# can take priority. # detector can take priority.
confidence = confidence * 0.5 confidence = confidence * 0.73
return confidence return confidence
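The reworked get_confidence() above rewards "very likely" class transitions, penalises "very unlikely" ones twenty-fold, and scales the result by 0.73 so more accurate probers win ties. A small standalone restatement of that arithmetic with hypothetical counters (the function name and the numbers are not from the library):

def latin1_confidence(freq_counter):
    # freq_counter[i] counts transitions rated i (0 = illegal .. 3 = very likely),
    # matching _mFreqCounter in the prober above.
    total = sum(freq_counter)
    if total < 0.01:
        return 0.0
    confidence = (freq_counter[3] - freq_counter[1] * 20.0) / total
    if confidence < 0.0:
        confidence = 0.0
    return confidence * 0.73

print(latin1_confidence([0, 1, 39, 60]))  # (60 - 20) / 100 * 0.73, i.e. about 0.292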
View File
@ -27,16 +27,17 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants, sys import sys
from constants import eStart, eError, eItsMe from . import constants
from charsetprober import CharSetProber from .charsetprober import CharSetProber
class MultiByteCharSetProber(CharSetProber): class MultiByteCharSetProber(CharSetProber):
def __init__(self): def __init__(self):
CharSetProber.__init__(self) CharSetProber.__init__(self)
self._mDistributionAnalyzer = None self._mDistributionAnalyzer = None
self._mCodingSM = None self._mCodingSM = None
self._mLastChar = ['\x00', '\x00'] self._mLastChar = [0, 0]
def reset(self): def reset(self):
CharSetProber.reset(self) CharSetProber.reset(self)
@ -44,36 +45,39 @@ class MultiByteCharSetProber(CharSetProber):
self._mCodingSM.reset() self._mCodingSM.reset()
if self._mDistributionAnalyzer: if self._mDistributionAnalyzer:
self._mDistributionAnalyzer.reset() self._mDistributionAnalyzer.reset()
self._mLastChar = ['\x00', '\x00'] self._mLastChar = [0, 0]
def get_charset_name(self): def get_charset_name(self):
pass pass
def feed(self, aBuf): def feed(self, aBuf):
aLen = len(aBuf) aLen = len(aBuf)
for i in xrange(0, aLen): for i in range(0, aLen):
codingState = self._mCodingSM.next_state(aBuf[i]) codingState = self._mCodingSM.next_state(aBuf[i])
if codingState == eError: if codingState == constants.eError:
if constants._debug: if constants._debug:
sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n') sys.stderr.write(self.get_charset_name()
+ ' prober hit error at byte ' + str(i)
+ '\n')
self._mState = constants.eNotMe self._mState = constants.eNotMe
break break
elif codingState == eItsMe: elif codingState == constants.eItsMe:
self._mState = constants.eFoundIt self._mState = constants.eFoundIt
break break
elif codingState == eStart: elif codingState == constants.eStart:
charLen = self._mCodingSM.get_current_charlen() charLen = self._mCodingSM.get_current_charlen()
if i == 0: if i == 0:
self._mLastChar[1] = aBuf[0] self._mLastChar[1] = aBuf[0]
self._mDistributionAnalyzer.feed(self._mLastChar, charLen) self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
else: else:
self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen) self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
charLen)
self._mLastChar[0] = aBuf[aLen - 1] self._mLastChar[0] = aBuf[aLen - 1]
if self.get_state() == constants.eDetecting: if self.get_state() == constants.eDetecting:
if self._mDistributionAnalyzer.got_enough_data() and \ if (self._mDistributionAnalyzer.got_enough_data() and
(self.get_confidence() > constants.SHORTCUT_THRESHOLD): (self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
self._mState = constants.eFoundIt self._mState = constants.eFoundIt
return self.get_state() return self.get_state()
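The _mLastChar bookkeeping above exists so a multi-byte character split across two feed() calls is still analysed. A hedged usage sketch with a concrete subclass (the sample bytes and import path are assumptions; the confidence printed depends on how much data is fed):

from chardet.big5prober import Big5Prober  # import path may differ in this tree

prober = Big5Prober()
prober.feed(b'\xa4')            # lead byte of a Big5 character arrives alone...
prober.feed(b'\xa4\xa4\xe5')    # ...and is stitched to its trail byte via _mLastChar
print(prober.get_charset_name(), prober.get_confidence())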
View File
@ -27,24 +27,28 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from charsetgroupprober import CharSetGroupProber from .charsetgroupprober import CharSetGroupProber
from utf8prober import UTF8Prober from .utf8prober import UTF8Prober
from sjisprober import SJISProber from .sjisprober import SJISProber
from eucjpprober import EUCJPProber from .eucjpprober import EUCJPProber
from gb2312prober import GB2312Prober from .gb2312prober import GB2312Prober
from euckrprober import EUCKRProber from .euckrprober import EUCKRProber
from big5prober import Big5Prober from .cp949prober import CP949Prober
from euctwprober import EUCTWProber from .big5prober import Big5Prober
from .euctwprober import EUCTWProber
class MBCSGroupProber(CharSetGroupProber): class MBCSGroupProber(CharSetGroupProber):
def __init__(self): def __init__(self):
CharSetGroupProber.__init__(self) CharSetGroupProber.__init__(self)
self._mProbers = [ \ self._mProbers = [
UTF8Prober(), UTF8Prober(),
SJISProber(), SJISProber(),
EUCJPProber(), EUCJPProber(),
GB2312Prober(), GB2312Prober(),
EUCKRProber(), EUCKRProber(),
CP949Prober(),
Big5Prober(), Big5Prober(),
EUCTWProber()] EUCTWProber()
]
self.reset() self.reset()
View File
@ -25,11 +25,11 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from constants import eStart, eError, eItsMe from .constants import eStart, eError, eItsMe
# BIG5 # BIG5
BIG5_cls = ( \ BIG5_cls = (
1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as legal value 1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as legal value
1,1,1,1,1,1,0,0, # 08 - 0f 1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17 1,1,1,1,1,1,1,1, # 10 - 17
@ -61,12 +61,14 @@ BIG5_cls = ( \
3,3,3,3,3,3,3,3, # e0 - e7 3,3,3,3,3,3,3,3, # e0 - e7
3,3,3,3,3,3,3,3, # e8 - ef 3,3,3,3,3,3,3,3, # e8 - ef
3,3,3,3,3,3,3,3, # f0 - f7 3,3,3,3,3,3,3,3, # f0 - f7
3,3,3,3,3,3,3,0) # f8 - ff 3,3,3,3,3,3,3,0 # f8 - ff
)
BIG5_st = ( \ BIG5_st = (
eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07 eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07
eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,#08-0f eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,#08-0f
eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart)#10-17 eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart#10-17
)
Big5CharLenTable = (0, 1, 1, 2, 0) Big5CharLenTable = (0, 1, 1, 2, 0)
@ -76,9 +78,49 @@ Big5SMModel = {'classTable': BIG5_cls,
'charLenTable': Big5CharLenTable, 'charLenTable': Big5CharLenTable,
'name': 'Big5'} 'name': 'Big5'}
# CP949
CP949_cls = (
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,0,0, # 00 - 0f
1,1,1,1,1,1,1,1, 1,1,1,0,1,1,1,1, # 10 - 1f
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 20 - 2f
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 30 - 3f
1,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, # 40 - 4f
4,4,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 50 - 5f
1,5,5,5,5,5,5,5, 5,5,5,5,5,5,5,5, # 60 - 6f
5,5,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 70 - 7f
0,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 80 - 8f
6,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 90 - 9f
6,7,7,7,7,7,7,7, 7,7,7,7,7,8,8,8, # a0 - af
7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7, # b0 - bf
7,7,7,7,7,7,9,2, 2,3,2,2,2,2,2,2, # c0 - cf
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # d0 - df
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # e0 - ef
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,0, # f0 - ff
)
CP949_st = (
#cls= 0 1 2 3 4 5 6 7 8 9 # previous state =
eError,eStart, 3,eError,eStart,eStart, 4, 5,eError, 6, # eStart
eError,eError,eError,eError,eError,eError,eError,eError,eError,eError, # eError
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe, # eItsMe
eError,eError,eStart,eStart,eError,eError,eError,eStart,eStart,eStart, # 3
eError,eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart, # 4
eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart, # 5
eError,eStart,eStart,eStart,eStart,eError,eError,eStart,eStart,eStart, # 6
)
CP949CharLenTable = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2)
CP949SMModel = {'classTable': CP949_cls,
'classFactor': 10,
'stateTable': CP949_st,
'charLenTable': CP949CharLenTable,
'name': 'CP949'}
# EUC-JP # EUC-JP
EUCJP_cls = ( \ EUCJP_cls = (
4,4,4,4,4,4,4,4, # 00 - 07 4,4,4,4,4,4,4,4, # 00 - 07
4,4,4,4,4,4,5,5, # 08 - 0f 4,4,4,4,4,4,5,5, # 08 - 0f
4,4,4,4,4,4,4,4, # 10 - 17 4,4,4,4,4,4,4,4, # 10 - 17
@ -110,14 +152,16 @@ EUCJP_cls = ( \
0,0,0,0,0,0,0,0, # e0 - e7 0,0,0,0,0,0,0,0, # e0 - e7
0,0,0,0,0,0,0,0, # e8 - ef 0,0,0,0,0,0,0,0, # e8 - ef
0,0,0,0,0,0,0,0, # f0 - f7 0,0,0,0,0,0,0,0, # f0 - f7
0,0,0,0,0,0,0,5) # f8 - ff 0,0,0,0,0,0,0,5 # f8 - ff
)
EUCJP_st = ( \ EUCJP_st = (
3, 4, 3, 5,eStart,eError,eError,eError,#00-07 3, 4, 3, 5,eStart,eError,eError,eError,#00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError,#10-17 eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError,#10-17
eError,eError,eStart,eError,eError,eError, 3,eError,#18-1f eError,eError,eStart,eError,eError,eError, 3,eError,#18-1f
3,eError,eError,eError,eStart,eStart,eStart,eStart)#20-27 3,eError,eError,eError,eStart,eStart,eStart,eStart#20-27
)
EUCJPCharLenTable = (2, 2, 2, 3, 1, 0) EUCJPCharLenTable = (2, 2, 2, 3, 1, 0)
@ -129,7 +173,7 @@ EUCJPSMModel = {'classTable': EUCJP_cls,
# EUC-KR # EUC-KR
EUCKR_cls = ( \ EUCKR_cls = (
1,1,1,1,1,1,1,1, # 00 - 07 1,1,1,1,1,1,1,1, # 00 - 07
1,1,1,1,1,1,0,0, # 08 - 0f 1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17 1,1,1,1,1,1,1,1, # 10 - 17
@ -161,11 +205,13 @@ EUCKR_cls = ( \
2,2,2,2,2,2,2,2, # e0 - e7 2,2,2,2,2,2,2,2, # e0 - e7
2,2,2,2,2,2,2,2, # e8 - ef 2,2,2,2,2,2,2,2, # e8 - ef
2,2,2,2,2,2,2,2, # f0 - f7 2,2,2,2,2,2,2,2, # f0 - f7
2,2,2,2,2,2,2,0) # f8 - ff 2,2,2,2,2,2,2,0 # f8 - ff
)
EUCKR_st = ( EUCKR_st = (
eError,eStart, 3,eError,eError,eError,eError,eError,#00-07 eError,eStart, 3,eError,eError,eError,eError,eError,#00-07
eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart)#08-0f eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart #08-0f
)
EUCKRCharLenTable = (0, 1, 2, 0) EUCKRCharLenTable = (0, 1, 2, 0)
@ -177,7 +223,7 @@ EUCKRSMModel = {'classTable': EUCKR_cls,
# EUC-TW # EUC-TW
EUCTW_cls = ( \ EUCTW_cls = (
2,2,2,2,2,2,2,2, # 00 - 07 2,2,2,2,2,2,2,2, # 00 - 07
2,2,2,2,2,2,0,0, # 08 - 0f 2,2,2,2,2,2,0,0, # 08 - 0f
2,2,2,2,2,2,2,2, # 10 - 17 2,2,2,2,2,2,2,2, # 10 - 17
@ -209,15 +255,17 @@ EUCTW_cls = ( \
3,3,3,3,3,3,3,3, # e0 - e7 3,3,3,3,3,3,3,3, # e0 - e7
3,3,3,3,3,3,3,3, # e8 - ef 3,3,3,3,3,3,3,3, # e8 - ef
3,3,3,3,3,3,3,3, # f0 - f7 3,3,3,3,3,3,3,3, # f0 - f7
3,3,3,3,3,3,3,0) # f8 - ff 3,3,3,3,3,3,3,0 # f8 - ff
)
EUCTW_st = ( \ EUCTW_st = (
eError,eError,eStart, 3, 3, 3, 4,eError,#00-07 eError,eError,eStart, 3, 3, 3, 4,eError,#00-07
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError,#10-17 eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError,#10-17
eStart,eStart,eStart,eError,eError,eError,eError,eError,#18-1f eStart,eStart,eStart,eError,eError,eError,eError,eError,#18-1f
5,eError,eError,eError,eStart,eError,eStart,eStart,#20-27 5,eError,eError,eError,eStart,eError,eStart,eStart,#20-27
eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart)#28-2f eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart #28-2f
)
EUCTWCharLenTable = (0, 0, 1, 2, 2, 2, 3) EUCTWCharLenTable = (0, 0, 1, 2, 2, 2, 3)
@ -229,7 +277,7 @@ EUCTWSMModel = {'classTable': EUCTW_cls,
# GB2312 # GB2312
GB2312_cls = ( \ GB2312_cls = (
1,1,1,1,1,1,1,1, # 00 - 07 1,1,1,1,1,1,1,1, # 00 - 07
1,1,1,1,1,1,0,0, # 08 - 0f 1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17 1,1,1,1,1,1,1,1, # 10 - 17
@ -261,15 +309,17 @@ GB2312_cls = ( \
6,6,6,6,6,6,6,6, # e0 - e7 6,6,6,6,6,6,6,6, # e0 - e7
6,6,6,6,6,6,6,6, # e8 - ef 6,6,6,6,6,6,6,6, # e8 - ef
6,6,6,6,6,6,6,6, # f0 - f7 6,6,6,6,6,6,6,6, # f0 - f7
6,6,6,6,6,6,6,0) # f8 - ff 6,6,6,6,6,6,6,0 # f8 - ff
)
GB2312_st = ( \ GB2312_st = (
eError,eStart,eStart,eStart,eStart,eStart, 3,eError,#00-07 eError,eStart,eStart,eStart,eStart,eStart, 3,eError,#00-07
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,#10-17 eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,#10-17
4,eError,eStart,eStart,eError,eError,eError,eError,#18-1f 4,eError,eStart,eStart,eError,eError,eError,eError,#18-1f
eError,eError, 5,eError,eError,eError,eItsMe,eError,#20-27 eError,eError, 5,eError,eError,eError,eItsMe,eError,#20-27
eError,eError,eStart,eStart,eStart,eStart,eStart,eStart)#28-2f eError,eError,eStart,eStart,eStart,eStart,eStart,eStart #28-2f
)
# To be accurate, the length of class 6 can be either 2 or 4. # To be accurate, the length of class 6 can be either 2 or 4.
# But it is not necessary to discriminate between the two since # But it is not necessary to discriminate between the two since
@ -286,7 +336,7 @@ GB2312SMModel = {'classTable': GB2312_cls,
# Shift_JIS # Shift_JIS
SJIS_cls = ( \ SJIS_cls = (
1,1,1,1,1,1,1,1, # 00 - 07 1,1,1,1,1,1,1,1, # 00 - 07
1,1,1,1,1,1,0,0, # 08 - 0f 1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17 1,1,1,1,1,1,1,1, # 10 - 17
@ -303,7 +353,7 @@ SJIS_cls = ( \
2,2,2,2,2,2,2,2, # 68 - 6f 2,2,2,2,2,2,2,2, # 68 - 6f
2,2,2,2,2,2,2,2, # 70 - 77 2,2,2,2,2,2,2,2, # 70 - 77
2,2,2,2,2,2,2,1, # 78 - 7f 2,2,2,2,2,2,2,1, # 78 - 7f
3,3,3,3,3,3,3,3, # 80 - 87 3,3,3,3,3,2,2,3, # 80 - 87
3,3,3,3,3,3,3,3, # 88 - 8f 3,3,3,3,3,3,3,3, # 88 - 8f
3,3,3,3,3,3,3,3, # 90 - 97 3,3,3,3,3,3,3,3, # 90 - 97
3,3,3,3,3,3,3,3, # 98 - 9f 3,3,3,3,3,3,3,3, # 98 - 9f
@ -319,13 +369,15 @@ SJIS_cls = ( \
2,2,2,2,2,2,2,2, # d8 - df 2,2,2,2,2,2,2,2, # d8 - df
3,3,3,3,3,3,3,3, # e0 - e7 3,3,3,3,3,3,3,3, # e0 - e7
3,3,3,3,3,4,4,4, # e8 - ef 3,3,3,3,3,4,4,4, # e8 - ef
4,4,4,4,4,4,4,4, # f0 - f7 3,3,3,3,3,3,3,3, # f0 - f7
4,4,4,4,4,0,0,0) # f8 - ff 3,3,3,3,3,0,0,0) # f8 - ff
SJIS_st = ( \
SJIS_st = (
eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07 eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart)#10-17 eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart #10-17
)
SJISCharLenTable = (0, 1, 1, 2, 0, 0) SJISCharLenTable = (0, 1, 1, 2, 0, 0)
@ -337,7 +389,7 @@ SJISSMModel = {'classTable': SJIS_cls,
# UCS2-BE # UCS2-BE
UCS2BE_cls = ( \ UCS2BE_cls = (
0,0,0,0,0,0,0,0, # 00 - 07 0,0,0,0,0,0,0,0, # 00 - 07
0,0,1,0,0,2,0,0, # 08 - 0f 0,0,1,0,0,2,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17 0,0,0,0,0,0,0,0, # 10 - 17
@ -369,16 +421,18 @@ UCS2BE_cls = ( \
0,0,0,0,0,0,0,0, # e0 - e7 0,0,0,0,0,0,0,0, # e0 - e7
0,0,0,0,0,0,0,0, # e8 - ef 0,0,0,0,0,0,0,0, # e8 - ef
0,0,0,0,0,0,0,0, # f0 - f7 0,0,0,0,0,0,0,0, # f0 - f7
0,0,0,0,0,0,4,5) # f8 - ff 0,0,0,0,0,0,4,5 # f8 - ff
)
UCS2BE_st = ( \ UCS2BE_st = (
5, 7, 7,eError, 4, 3,eError,eError,#00-07 5, 7, 7,eError, 4, 3,eError,eError,#00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe, 6, 6, 6, 6,eError,eError,#10-17 eItsMe,eItsMe, 6, 6, 6, 6,eError,eError,#10-17
6, 6, 6, 6, 6,eItsMe, 6, 6,#18-1f 6, 6, 6, 6, 6,eItsMe, 6, 6,#18-1f
6, 6, 6, 6, 5, 7, 7,eError,#20-27 6, 6, 6, 6, 5, 7, 7,eError,#20-27
5, 8, 6, 6,eError, 6, 6, 6,#28-2f 5, 8, 6, 6,eError, 6, 6, 6,#28-2f
6, 6, 6, 6,eError,eError,eStart,eStart)#30-37 6, 6, 6, 6,eError,eError,eStart,eStart #30-37
)
UCS2BECharLenTable = (2, 2, 2, 0, 2, 2) UCS2BECharLenTable = (2, 2, 2, 0, 2, 2)
@ -390,7 +444,7 @@ UCS2BESMModel = {'classTable': UCS2BE_cls,
# UCS2-LE # UCS2-LE
UCS2LE_cls = ( \ UCS2LE_cls = (
0,0,0,0,0,0,0,0, # 00 - 07 0,0,0,0,0,0,0,0, # 00 - 07
0,0,1,0,0,2,0,0, # 08 - 0f 0,0,1,0,0,2,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17 0,0,0,0,0,0,0,0, # 10 - 17
@ -422,16 +476,18 @@ UCS2LE_cls = ( \
0,0,0,0,0,0,0,0, # e0 - e7 0,0,0,0,0,0,0,0, # e0 - e7
0,0,0,0,0,0,0,0, # e8 - ef 0,0,0,0,0,0,0,0, # e8 - ef
0,0,0,0,0,0,0,0, # f0 - f7 0,0,0,0,0,0,0,0, # f0 - f7
0,0,0,0,0,0,4,5) # f8 - ff 0,0,0,0,0,0,4,5 # f8 - ff
)
UCS2LE_st = ( \ UCS2LE_st = (
6, 6, 7, 6, 4, 3,eError,eError,#00-07 6, 6, 7, 6, 4, 3,eError,eError,#00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe, 5, 5, 5,eError,eItsMe,eError,#10-17 eItsMe,eItsMe, 5, 5, 5,eError,eItsMe,eError,#10-17
5, 5, 5,eError, 5,eError, 6, 6,#18-1f 5, 5, 5,eError, 5,eError, 6, 6,#18-1f
7, 6, 8, 8, 5, 5, 5,eError,#20-27 7, 6, 8, 8, 5, 5, 5,eError,#20-27
5, 5, 5,eError,eError,eError, 5, 5,#28-2f 5, 5, 5,eError,eError,eError, 5, 5,#28-2f
5, 5, 5,eError, 5,eError,eStart,eStart)#30-37 5, 5, 5,eError, 5,eError,eStart,eStart #30-37
)
UCS2LECharLenTable = (2, 2, 2, 2, 2, 2) UCS2LECharLenTable = (2, 2, 2, 2, 2, 2)
@ -443,7 +499,7 @@ UCS2LESMModel = {'classTable': UCS2LE_cls,
# UTF-8 # UTF-8
UTF8_cls = ( \ UTF8_cls = (
1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as a legal value 1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as a legal value
1,1,1,1,1,1,0,0, # 08 - 0f 1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17 1,1,1,1,1,1,1,1, # 10 - 17
@ -475,9 +531,10 @@ UTF8_cls = ( \
7,8,8,8,8,8,8,8, # e0 - e7 7,8,8,8,8,8,8,8, # e0 - e7
8,8,8,8,8,9,8,8, # e8 - ef 8,8,8,8,8,9,8,8, # e8 - ef
10,11,11,11,11,11,11,11, # f0 - f7 10,11,11,11,11,11,11,11, # f0 - f7
12,13,13,13,14,15,0,0) # f8 - ff 12,13,13,13,14,15,0,0 # f8 - ff
)
UTF8_st = ( \ UTF8_st = (
eError,eStart,eError,eError,eError,eError, 12, 10,#00-07 eError,eStart,eError,eError,eError,eError, 12, 10,#00-07
9, 11, 8, 7, 6, 5, 4, 3,#08-0f 9, 11, 8, 7, 6, 5, 4, 3,#08-0f
eError,eError,eError,eError,eError,eError,eError,eError,#10-17 eError,eError,eError,eError,eError,eError,eError,eError,#10-17
@ -503,7 +560,8 @@ UTF8_st = ( \
eError,eError, 12, 12, 12,eError,eError,eError,#b0-b7 eError,eError, 12, 12, 12,eError,eError,eError,#b0-b7
eError,eError,eError,eError,eError,eError,eError,eError,#b8-bf eError,eError,eError,eError,eError,eError,eError,eError,#b8-bf
eError,eError,eStart,eStart,eStart,eStart,eError,eError,#c0-c7 eError,eError,eStart,eStart,eStart,eStart,eError,eError,#c0-c7
eError,eError,eError,eError,eError,eError,eError,eError)#c8-cf eError,eError,eError,eError,eError,eError,eError,eError #c8-cf
)
UTF8CharLenTable = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6) UTF8CharLenTable = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6)
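These *SMModel dicts, including the new CP949 one, are consumed by CodingStateMachine (imported elsewhere, not part of this diff): each byte is mapped to a class via classTable, the flattened stateTable is indexed by current_state * classFactor + byte_class, and charLenTable gives the expected character length per class. A standalone sketch under that assumption (the sm_step helper is illustrative, not the library's code):

from chardet.constants import eStart, eError   # import path may differ in this tree
from chardet.mbcssm import CP949SMModel

def sm_step(model, state, byte_value):
    # One table-driven transition over an *SMModel dict like the ones above.
    byte_class = model['classTable'][byte_value]
    return model['stateTable'][state * model['classFactor'] + byte_class]

state = eStart
for b in (0xB0, 0xA1):                         # a plausible two-byte CP949 sequence
    state = sm_step(CP949SMModel, state, b)
print(state != eError)                          # True: the pair is accepted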
View File
@ -26,8 +26,10 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants, sys import sys
from charsetprober import CharSetProber from . import constants
from .charsetprober import CharSetProber
from .compat import wrap_ord
SAMPLE_SIZE = 64 SAMPLE_SIZE = 64
SB_ENOUGH_REL_THRESHOLD = 1024 SB_ENOUGH_REL_THRESHOLD = 1024
@ -38,21 +40,26 @@ NUMBER_OF_SEQ_CAT = 4
POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1 POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1
#NEGATIVE_CAT = 0 #NEGATIVE_CAT = 0
class SingleByteCharSetProber(CharSetProber): class SingleByteCharSetProber(CharSetProber):
def __init__(self, model, reversed=constants.False, nameProber=None): def __init__(self, model, reversed=False, nameProber=None):
CharSetProber.__init__(self) CharSetProber.__init__(self)
self._mModel = model self._mModel = model
self._mReversed = reversed # TRUE if we need to reverse every pair in the model lookup # TRUE if we need to reverse every pair in the model lookup
self._mNameProber = nameProber # Optional auxiliary prober for name decision self._mReversed = reversed
# Optional auxiliary prober for name decision
self._mNameProber = nameProber
self.reset() self.reset()
def reset(self): def reset(self):
CharSetProber.reset(self) CharSetProber.reset(self)
self._mLastOrder = 255 # char order of last character # char order of last character
self._mLastOrder = 255
self._mSeqCounters = [0] * NUMBER_OF_SEQ_CAT self._mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
self._mTotalSeqs = 0 self._mTotalSeqs = 0
self._mTotalChar = 0 self._mTotalChar = 0
self._mFreqChar = 0 # characters that fall in our sampling range # characters that fall in our sampling range
self._mFreqChar = 0
def get_charset_name(self): def get_charset_name(self):
if self._mNameProber: if self._mNameProber:
@ -67,7 +74,7 @@ class SingleByteCharSetProber(CharSetProber):
if not aLen: if not aLen:
return self.get_state() return self.get_state()
for c in aBuf: for c in aBuf:
order = self._mModel['charToOrderMap'][ord(c)] order = self._mModel['charToOrderMap'][wrap_ord(c)]
if order < SYMBOL_CAT_ORDER: if order < SYMBOL_CAT_ORDER:
self._mTotalChar += 1 self._mTotalChar += 1
if order < SAMPLE_SIZE: if order < SAMPLE_SIZE:
@ -75,9 +82,12 @@ class SingleByteCharSetProber(CharSetProber):
if self._mLastOrder < SAMPLE_SIZE: if self._mLastOrder < SAMPLE_SIZE:
self._mTotalSeqs += 1 self._mTotalSeqs += 1
if not self._mReversed: if not self._mReversed:
self._mSeqCounters[self._mModel['precedenceMatrix'][(self._mLastOrder * SAMPLE_SIZE) + order]] += 1 i = (self._mLastOrder * SAMPLE_SIZE) + order
else: # reverse the order of the letters in the lookup model = self._mModel['precedenceMatrix'][i]
self._mSeqCounters[self._mModel['precedenceMatrix'][(order * SAMPLE_SIZE) + self._mLastOrder]] += 1 else: # reverse the order of the letters in the lookup
i = (order * SAMPLE_SIZE) + self._mLastOrder
model = self._mModel['precedenceMatrix'][i]
self._mSeqCounters[model] += 1
self._mLastOrder = order self._mLastOrder = order
if self.get_state() == constants.eDetecting: if self.get_state() == constants.eDetecting:
@ -85,11 +95,16 @@ class SingleByteCharSetProber(CharSetProber):
cf = self.get_confidence() cf = self.get_confidence()
if cf > POSITIVE_SHORTCUT_THRESHOLD: if cf > POSITIVE_SHORTCUT_THRESHOLD:
if constants._debug: if constants._debug:
sys.stderr.write('%s confidence = %s, we have a winner\n' % (self._mModel['charsetName'], cf)) sys.stderr.write('%s confidence = %s, we have a'
'winner\n' %
(self._mModel['charsetName'], cf))
self._mState = constants.eFoundIt self._mState = constants.eFoundIt
elif cf < NEGATIVE_SHORTCUT_THRESHOLD: elif cf < NEGATIVE_SHORTCUT_THRESHOLD:
if constants._debug: if constants._debug:
sys.stderr.write('%s confidence = %s, below negative shortcut threshhold %s\n' % (self._mModel['charsetName'], cf, NEGATIVE_SHORTCUT_THRESHOLD)) sys.stderr.write('%s confidence = %s, below negative'
'shortcut threshhold %s\n' %
(self._mModel['charsetName'], cf,
NEGATIVE_SHORTCUT_THRESHOLD))
self._mState = constants.eNotMe self._mState = constants.eNotMe
return self.get_state() return self.get_state()
@ -97,9 +112,8 @@ class SingleByteCharSetProber(CharSetProber):
def get_confidence(self): def get_confidence(self):
r = 0.01 r = 0.01
if self._mTotalSeqs > 0: if self._mTotalSeqs > 0:
# print self._mSeqCounters[POSITIVE_CAT], self._mTotalSeqs, self._mModel['mTypicalPositiveRatio'] r = ((1.0 * self._mSeqCounters[POSITIVE_CAT]) / self._mTotalSeqs
r = (1.0 * self._mSeqCounters[POSITIVE_CAT]) / self._mTotalSeqs / self._mModel['mTypicalPositiveRatio'] / self._mModel['mTypicalPositiveRatio'])
# print r, self._mFreqChar, self._mTotalChar
r = r * self._mFreqChar / self._mTotalChar r = r * self._mFreqChar / self._mTotalChar
if r >= 1.0: if r >= 1.0:
r = 0.99 r = 0.99
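The get_confidence() cleanup above keeps the original formula: the share of "positive" two-letter sequences, normalised by the language's typical ratio, then scaled by how much of the input fell inside the sampled character range. A worked restatement with hypothetical counts (the function name and numbers are not from the library):

def sb_confidence(positive_seqs, total_seqs, typical_positive_ratio,
                  freq_chars, total_chars):
    if total_seqs <= 0:
        return 0.01
    r = 1.0 * positive_seqs / total_seqs / typical_positive_ratio
    r = r * freq_chars / total_chars
    if r >= 1.0:
        r = 0.99
    return r

# e.g. 500 of 600 sequences positive, 550 of 700 chars in the sampled range,
# using the Bulgarian models' mTypicalPositiveRatio of 0.969392:
print(round(sb_confidence(500, 600, 0.969392, 550, 700), 3))  # about 0.675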
View File
@@ -26,21 +26,23 @@
 # 02110-1301 USA
 ######################### END LICENSE BLOCK #########################

-import constants, sys
-from charsetgroupprober import CharSetGroupProber
-from sbcharsetprober import SingleByteCharSetProber
-from langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model
-from langgreekmodel import Latin7GreekModel, Win1253GreekModel
-from langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
-from langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
-from langthaimodel import TIS620ThaiModel
-from langhebrewmodel import Win1255HebrewModel
-from hebrewprober import HebrewProber
+from .charsetgroupprober import CharSetGroupProber
+from .sbcharsetprober import SingleByteCharSetProber
+from .langcyrillicmodel import (Win1251CyrillicModel, Koi8rModel,
+                                Latin5CyrillicModel, MacCyrillicModel,
+                                Ibm866Model, Ibm855Model)
+from .langgreekmodel import Latin7GreekModel, Win1253GreekModel
+from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
+from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
+from .langthaimodel import TIS620ThaiModel
+from .langhebrewmodel import Win1255HebrewModel
+from .hebrewprober import HebrewProber

+
 class SBCSGroupProber(CharSetGroupProber):
     def __init__(self):
         CharSetGroupProber.__init__(self)
-        self._mProbers = [ \
+        self._mProbers = [
             SingleByteCharSetProber(Win1251CyrillicModel),
             SingleByteCharSetProber(Koi8rModel),
             SingleByteCharSetProber(Latin5CyrillicModel),
@@ -54,11 +56,14 @@ class SBCSGroupProber(CharSetGroupProber):
             SingleByteCharSetProber(Latin2HungarianModel),
             SingleByteCharSetProber(Win1250HungarianModel),
             SingleByteCharSetProber(TIS620ThaiModel),
         ]
         hebrewProber = HebrewProber()
-        logicalHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, constants.False, hebrewProber)
-        visualHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, constants.True, hebrewProber)
+        logicalHebrewProber = SingleByteCharSetProber(Win1255HebrewModel,
+                                                      False, hebrewProber)
+        visualHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, True,
+                                                     hebrewProber)
         hebrewProber.set_model_probers(logicalHebrewProber, visualHebrewProber)
-        self._mProbers.extend([hebrewProber, logicalHebrewProber, visualHebrewProber])
+        self._mProbers.extend([hebrewProber, logicalHebrewProber,
+                               visualHebrewProber])

         self.reset()
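
The group prober keeps the same set of single-byte probers, with the paired logical/visual Hebrew probers still coordinated by HebrewProber; only the relative imports, the line wrapping, and the switch from constants.False/True to plain booleans change. A rough usage sketch of the group prober, not part of the commit and with confidence values that depend heavily on the sample:

from chardet.sbcsgroupprober import SBCSGroupProber

prober = SBCSGroupProber()
# feed raw bytes; here a Cyrillic sample encoded as windows-1251
prober.feed((u'Привет, мир! ' * 20).encode('windows-1251'))
print(prober.get_charset_name(), prober.get_confidence())
# typically reports windows-1251 (or another Cyrillic model) with a
# confidence comfortably above the detector's 0.20 minimum threshold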

View File

@@ -25,13 +25,14 @@
 # 02110-1301 USA
 ######################### END LICENSE BLOCK #########################

-from mbcharsetprober import MultiByteCharSetProber
-from codingstatemachine import CodingStateMachine
-from chardistribution import SJISDistributionAnalysis
-from jpcntx import SJISContextAnalysis
-from mbcssm import SJISSMModel
-import constants, sys
-from constants import eStart, eError, eItsMe
+import sys
+from .mbcharsetprober import MultiByteCharSetProber
+from .codingstatemachine import CodingStateMachine
+from .chardistribution import SJISDistributionAnalysis
+from .jpcntx import SJISContextAnalysis
+from .mbcssm import SJISSMModel
+from . import constants

+
 class SJISProber(MultiByteCharSetProber):
     def __init__(self):
@@ -46,35 +47,40 @@ class SJISProber(MultiByteCharSetProber):
         self._mContextAnalyzer.reset()

     def get_charset_name(self):
-        return "SHIFT_JIS"
+        return self._mContextAnalyzer.get_charset_name()

     def feed(self, aBuf):
         aLen = len(aBuf)
-        for i in xrange(0, aLen):
+        for i in range(0, aLen):
             codingState = self._mCodingSM.next_state(aBuf[i])
-            if codingState == eError:
+            if codingState == constants.eError:
                 if constants._debug:
-                    sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n')
+                    sys.stderr.write(self.get_charset_name()
+                                     + ' prober hit error at byte ' + str(i)
+                                     + '\n')
                 self._mState = constants.eNotMe
                 break
-            elif codingState == eItsMe:
+            elif codingState == constants.eItsMe:
                 self._mState = constants.eFoundIt
                 break
-            elif codingState == eStart:
+            elif codingState == constants.eStart:
                 charLen = self._mCodingSM.get_current_charlen()
                 if i == 0:
                     self._mLastChar[1] = aBuf[0]
-                    self._mContextAnalyzer.feed(self._mLastChar[2 - charLen :], charLen)
+                    self._mContextAnalyzer.feed(self._mLastChar[2 - charLen:],
+                                                charLen)
                     self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
                 else:
-                    self._mContextAnalyzer.feed(aBuf[i + 1 - charLen : i + 3 - charLen], charLen)
-                    self._mDistributionAnalyzer.feed(aBuf[i - 1 : i + 1], charLen)
+                    self._mContextAnalyzer.feed(aBuf[i + 1 - charLen:i + 3
+                                                     - charLen], charLen)
+                    self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
+                                                     charLen)

         self._mLastChar[0] = aBuf[aLen - 1]

         if self.get_state() == constants.eDetecting:
-            if self._mContextAnalyzer.got_enough_data() and \
-               (self.get_confidence() > constants.SHORTCUT_THRESHOLD):
+            if (self._mContextAnalyzer.got_enough_data() and
+               (self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
                 self._mState = constants.eFoundIt

         return self.get_state()
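
Beyond the relative imports and wrapping, the behavioural change here is get_charset_name(): it now defers to the context analyzer, which in this chardet version can report CP932 instead of plain SHIFT_JIS when the wider Windows code page fits better. A quick end-to-end check through the public API, illustrative only; the exact confidence depends on the sample:

import chardet

sample = (u'コンニチハ、世界。' * 50).encode('shift_jis')
print(chardet.detect(sample))
# usually something like {'encoding': 'SHIFT_JIS', 'confidence': 0.9...};
# samples relying on Windows-specific extensions may come back as CP932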

View File

@@ -1,20 +0,0 @@
-import sys, glob
-sys.path.insert(0, '..')
-from chardet.universaldetector import UniversalDetector
-
-count = 0
-u = UniversalDetector()
-for f in glob.glob(sys.argv[1]):
-    print f.ljust(60),
-    u.reset()
-    for line in file(f, 'rb'):
-        u.feed(line)
-        if u.done: break
-    u.close()
-    result = u.result
-    if result['encoding']:
-        print result['encoding'], 'with confidence', result['confidence']
-    else:
-        print '******** no result'
-    count += 1
-print count, 'tests'
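
The deleted helper was a Python 2-only smoke test (print statements, the removed file() builtin). A rough Python 3 equivalent built on the same UniversalDetector API might look like the sketch below; it is not part of the commit:

import glob
import sys

from chardet.universaldetector import UniversalDetector

count = 0
u = UniversalDetector()
for name in glob.glob(sys.argv[1]):
    u.reset()
    with open(name, 'rb') as fp:  # bytes in, as the detector expects
        for line in fp:
            u.feed(line)
            if u.done:
                break
    u.close()
    result = u.result
    if result['encoding']:
        print('%-60s %s with confidence %s' % (name, result['encoding'],
                                               result['confidence']))
    else:
        print('%-60s ******** no result' % name)
    count += 1
print(count, 'tests')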

View File

@@ -26,11 +26,13 @@
 # 02110-1301 USA
 ######################### END LICENSE BLOCK #########################

-import constants, sys
-from latin1prober import Latin1Prober # windows-1252
-from mbcsgroupprober import MBCSGroupProber # multi-byte character sets
-from sbcsgroupprober import SBCSGroupProber # single-byte character sets
-from escprober import EscCharSetProber # ISO-2122, etc.
+from . import constants
+import sys
+import codecs
+from .latin1prober import Latin1Prober  # windows-1252
+from .mbcsgroupprober import MBCSGroupProber  # multi-byte character sets
+from .sbcsgroupprober import SBCSGroupProber  # single-byte character sets
+from .escprober import EscCharSetProber  # ISO-2122, etc.
 import re

 MINIMUM_THRESHOLD = 0.20
@@ -38,68 +40,78 @@ ePureAscii = 0
 eEscAscii = 1
 eHighbyte = 2

+
 class UniversalDetector:
     def __init__(self):
-        self._highBitDetector = re.compile(r'[\x80-\xFF]')
-        self._escDetector = re.compile(r'(\033|~{)')
+        self._highBitDetector = re.compile(b'[\x80-\xFF]')
+        self._escDetector = re.compile(b'(\033|~{)')
         self._mEscCharSetProber = None
         self._mCharSetProbers = []
         self.reset()

     def reset(self):
         self.result = {'encoding': None, 'confidence': 0.0}
-        self.done = constants.False
-        self._mStart = constants.True
-        self._mGotData = constants.False
+        self.done = False
+        self._mStart = True
+        self._mGotData = False
         self._mInputState = ePureAscii
-        self._mLastChar = ''
+        self._mLastChar = b''
         if self._mEscCharSetProber:
             self._mEscCharSetProber.reset()
         for prober in self._mCharSetProbers:
             prober.reset()

     def feed(self, aBuf):
-        if self.done: return
+        if self.done:
+            return

         aLen = len(aBuf)
-        if not aLen: return
+        if not aLen:
+            return

         if not self._mGotData:
             # If the data starts with BOM, we know it is UTF
-            if aBuf[:3] == '\xEF\xBB\xBF':
+            if aBuf[:3] == codecs.BOM_UTF8:
                 # EF BB BF UTF-8 with BOM
-                self.result = {'encoding': "UTF-8", 'confidence': 1.0}
-            elif aBuf[:4] == '\xFF\xFE\x00\x00':
+                self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0}
+            elif aBuf[:4] == codecs.BOM_UTF32_LE:
                 # FF FE 00 00 UTF-32, little-endian BOM
                 self.result = {'encoding': "UTF-32LE", 'confidence': 1.0}
-            elif aBuf[:4] == '\x00\x00\xFE\xFF':
+            elif aBuf[:4] == codecs.BOM_UTF32_BE:
                 # 00 00 FE FF UTF-32, big-endian BOM
                 self.result = {'encoding': "UTF-32BE", 'confidence': 1.0}
-            elif aBuf[:4] == '\xFE\xFF\x00\x00':
+            elif aBuf[:4] == b'\xFE\xFF\x00\x00':
                 # FE FF 00 00 UCS-4, unusual octet order BOM (3412)
-                self.result = {'encoding': "X-ISO-10646-UCS-4-3412", 'confidence': 1.0}
-            elif aBuf[:4] == '\x00\x00\xFF\xFE':
+                self.result = {
+                    'encoding': "X-ISO-10646-UCS-4-3412",
+                    'confidence': 1.0
+                }
+            elif aBuf[:4] == b'\x00\x00\xFF\xFE':
                 # 00 00 FF FE UCS-4, unusual octet order BOM (2143)
-                self.result = {'encoding': "X-ISO-10646-UCS-4-2143", 'confidence': 1.0}
-            elif aBuf[:2] == '\xFF\xFE':
+                self.result = {
+                    'encoding': "X-ISO-10646-UCS-4-2143",
+                    'confidence': 1.0
+                }
+            elif aBuf[:2] == codecs.BOM_LE:
                 # FF FE UTF-16, little endian BOM
                 self.result = {'encoding': "UTF-16LE", 'confidence': 1.0}
-            elif aBuf[:2] == '\xFE\xFF':
+            elif aBuf[:2] == codecs.BOM_BE:
                 # FE FF UTF-16, big endian BOM
                 self.result = {'encoding': "UTF-16BE", 'confidence': 1.0}

-            self._mGotData = constants.True
+            self._mGotData = True
             if self.result['encoding'] and (self.result['confidence'] > 0.0):
-                self.done = constants.True
+                self.done = True
                 return

         if self._mInputState == ePureAscii:
             if self._highBitDetector.search(aBuf):
                 self._mInputState = eHighbyte
-            elif (self._mInputState == ePureAscii) and self._escDetector.search(self._mLastChar + aBuf):
+            elif ((self._mInputState == ePureAscii) and
+                    self._escDetector.search(self._mLastChar + aBuf)):
                 self._mInputState = eEscAscii

-        self._mLastChar = aBuf[-1]
+        self._mLastChar = aBuf[-1:]

         if self._mInputState == eEscAscii:
             if not self._mEscCharSetProber:
@@ -107,24 +119,26 @@ class UniversalDetector:
             if self._mEscCharSetProber.feed(aBuf) == constants.eFoundIt:
                 self.result = {'encoding': self._mEscCharSetProber.get_charset_name(),
                                'confidence': self._mEscCharSetProber.get_confidence()}
-                self.done = constants.True
+                self.done = True
         elif self._mInputState == eHighbyte:
             if not self._mCharSetProbers:
-                self._mCharSetProbers = [MBCSGroupProber(), SBCSGroupProber(), Latin1Prober()]
+                self._mCharSetProbers = [MBCSGroupProber(), SBCSGroupProber(),
+                                         Latin1Prober()]
             for prober in self._mCharSetProbers:
                 if prober.feed(aBuf) == constants.eFoundIt:
                     self.result = {'encoding': prober.get_charset_name(),
                                    'confidence': prober.get_confidence()}
-                    self.done = constants.True
+                    self.done = True
                     break

     def close(self):
-        if self.done: return
+        if self.done:
+            return
         if not self._mGotData:
             if constants._debug:
                 sys.stderr.write('no data received!\n')
             return
-        self.done = constants.True
+        self.done = True

         if self._mInputState == ePureAscii:
             self.result = {'encoding': 'ascii', 'confidence': 1.0}
@@ -135,7 +149,8 @@ class UniversalDetector:
             maxProberConfidence = 0.0
             maxProber = None
             for prober in self._mCharSetProbers:
-                if not prober: continue
+                if not prober:
+                    continue
                 proberConfidence = prober.get_confidence()
                 if proberConfidence > maxProberConfidence:
                     maxProberConfidence = proberConfidence
@@ -148,7 +163,8 @@ class UniversalDetector:
         if constants._debug:
             sys.stderr.write('no probers hit minimum threshhold\n')
             for prober in self._mCharSetProbers[0].mProbers:
-                if not prober: continue
-                sys.stderr.write('%s confidence = %s\n' % \
-                                 (prober.get_charset_name(), \
+                if not prober:
+                    continue
+                sys.stderr.write('%s confidence = %s\n' %
+                                 (prober.get_charset_name(),
                                   prober.get_confidence()))
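
Two user-visible effects of the universaldetector changes: BOM comparisons now go through bytes and the codecs constants, and a UTF-8 byte-order mark is reported as "UTF-8-SIG" rather than "UTF-8". A quick check through the public API, illustrative and not part of the commit:

import codecs

import chardet

print(chardet.detect(codecs.BOM_UTF8 + b'hello'))
# expected: {'encoding': 'UTF-8-SIG', 'confidence': 1.0}
print(chardet.detect(codecs.BOM_UTF16_LE + u'hello'.encode('utf-16-le')))
# expected: {'encoding': 'UTF-16LE', 'confidence': 1.0}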

View File

@@ -25,14 +25,14 @@
 # 02110-1301 USA
 ######################### END LICENSE BLOCK #########################

-import constants, sys
-from constants import eStart, eError, eItsMe
-from charsetprober import CharSetProber
-from codingstatemachine import CodingStateMachine
-from mbcssm import UTF8SMModel
+from . import constants
+from .charsetprober import CharSetProber
+from .codingstatemachine import CodingStateMachine
+from .mbcssm import UTF8SMModel

 ONE_CHAR_PROB = 0.5

+
 class UTF8Prober(CharSetProber):
     def __init__(self):
         CharSetProber.__init__(self)
@@ -50,13 +50,13 @@ class UTF8Prober(CharSetProber):
     def feed(self, aBuf):
         for c in aBuf:
             codingState = self._mCodingSM.next_state(c)
-            if codingState == eError:
+            if codingState == constants.eError:
                 self._mState = constants.eNotMe
                 break
-            elif codingState == eItsMe:
+            elif codingState == constants.eItsMe:
                 self._mState = constants.eFoundIt
                 break
-            elif codingState == eStart:
+            elif codingState == constants.eStart:
                 if self._mCodingSM.get_current_charlen() >= 2:
                     self._mNumOfMBChar += 1
@@ -69,7 +69,7 @@ class UTF8Prober(CharSetProber):
     def get_confidence(self):
         unlike = 0.99
         if self._mNumOfMBChar < 6:
-            for i in xrange(0, self._mNumOfMBChar):
+            for i in range(0, self._mNumOfMBChar):
                 unlike = unlike * ONE_CHAR_PROB
             return 1.0 - unlike
         else:
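
For fewer than six multi-byte sequences, the UTF-8 confidence works out to 1 - 0.99 * 0.5**n, so every additional multi-byte character halves the remaining doubt. A standalone restatement of that arithmetic, with illustrative names:

ONE_CHAR_PROB = 0.5

def utf8_confidence(num_mb_chars):
    # Mirrors UTF8Prober.get_confidence() for the n < 6 branch; the else
    # branch in the prober simply returns the remaining 0.99.
    unlike = 0.99
    if num_mb_chars < 6:
        for _ in range(num_mb_chars):
            unlike *= ONE_CHAR_PROB
        return 1.0 - unlike
    return 0.99

for n in range(7):
    print(n, round(utf8_confidence(n), 4))
# 0 -> 0.01, 1 -> 0.505, 2 -> 0.7525, ..., 5 -> 0.9691, 6+ -> 0.99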