Adding new version of chardet

This commit is contained in:
Miroslav Stampar 2015-10-09 13:35:48 +02:00
parent d424d4cdc7
commit 439d003753
39 changed files with 1499 additions and 1148 deletions

View File

@ -15,10 +15,16 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
__version__ = "2.0.1"
__version__ = "2.3.0"
from sys import version_info
def detect(aBuf):
import universaldetector
if ((version_info < (3, 0) and isinstance(aBuf, unicode)) or
(version_info >= (3, 0) and not isinstance(aBuf, bytes))):
raise ValueError('Expected a bytes object, not a unicode object')
from . import universaldetector
u = universaldetector.UniversalDetector()
u.reset()
u.feed(aBuf)

View File

@ -45,7 +45,7 @@ BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75
#Char to FreqOrder table
BIG5_TABLE_SIZE = 5376
Big5CharToFreqOrder = ( \
Big5CharToFreqOrder = (
1,1801,1506, 255,1431, 198, 9, 82, 6,5008, 177, 202,3681,1256,2821, 110, # 16
3814, 33,3274, 261, 76, 44,2114, 16,2946,2187,1176, 659,3971, 26,3451,2653, # 32
1198,3972,3350,4202, 410,2215, 302, 590, 361,1964, 8, 204, 58,4510,5009,1932, # 48
@ -921,3 +921,5 @@ Big5CharToFreqOrder = ( \
13936,13937,13938,13939,13940,13941,13942,13943,13944,13945,13946,13947,13948,13949,13950,13951, #13952
13952,13953,13954,13955,13956,13957,13958,13959,13960,13961,13962,13963,13964,13965,13966,13967, #13968
13968,13969,13970,13971,13972) #13973
# flake8: noqa

View File

@ -25,10 +25,11 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from mbcharsetprober import MultiByteCharSetProber
from codingstatemachine import CodingStateMachine
from chardistribution import Big5DistributionAnalysis
from mbcssm import Big5SMModel
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import Big5DistributionAnalysis
from .mbcssm import Big5SMModel
class Big5Prober(MultiByteCharSetProber):
def __init__(self):

80
thirdparty/chardet/chardetect.py vendored Normal file
View File

@ -0,0 +1,80 @@
#!/usr/bin/env python
"""
Script which takes one or more file paths and reports on their detected
encodings
Example::
% chardetect somefile someotherfile
somefile: windows-1252 with confidence 0.5
someotherfile: ascii with confidence 1.0
If no paths are provided, it takes its input from stdin.
"""
from __future__ import absolute_import, print_function, unicode_literals
import argparse
import sys
from io import open
from chardet import __version__
from chardet.universaldetector import UniversalDetector
def description_of(lines, name='stdin'):
"""
Return a string describing the probable encoding of a file or
list of strings.
:param lines: The lines to get the encoding of.
:type lines: Iterable of bytes
:param name: Name of file or collection of lines
:type name: str
"""
u = UniversalDetector()
for line in lines:
u.feed(line)
u.close()
result = u.result
if result['encoding']:
return '{0}: {1} with confidence {2}'.format(name, result['encoding'],
result['confidence'])
else:
return '{0}: no result'.format(name)
def main(argv=None):
'''
Handles command line arguments and gets things started.
:param argv: List of arguments, as if specified on the command-line.
If None, ``sys.argv[1:]`` is used instead.
:type argv: list of str
'''
# Get command line arguments
parser = argparse.ArgumentParser(
description="Takes one or more file paths and reports their detected \
encodings",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
conflict_handler='resolve')
parser.add_argument('input',
help='File whose encoding we would like to determine.',
type=argparse.FileType('rb'), nargs='*',
default=[sys.stdin])
parser.add_argument('--version', action='version',
version='%(prog)s {0}'.format(__version__))
args = parser.parse_args(argv)
for f in args.input:
if f.isatty():
print("You are running chardetect interactively. Press " +
"CTRL-D twice at the start of a blank line to signal the " +
"end of your input. If you want help, run chardetect " +
"--help\n", file=sys.stderr)
print(description_of(f, f.name))
if __name__ == '__main__':
main()

View File

@ -25,35 +25,51 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants
from euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO
from euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO
from gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO
from big5freq import Big5CharToFreqOrder, BIG5_TABLE_SIZE, BIG5_TYPICAL_DISTRIBUTION_RATIO
from jisfreq import JISCharToFreqOrder, JIS_TABLE_SIZE, JIS_TYPICAL_DISTRIBUTION_RATIO
from .euctwfreq import (EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE,
EUCTW_TYPICAL_DISTRIBUTION_RATIO)
from .euckrfreq import (EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE,
EUCKR_TYPICAL_DISTRIBUTION_RATIO)
from .gb2312freq import (GB2312CharToFreqOrder, GB2312_TABLE_SIZE,
GB2312_TYPICAL_DISTRIBUTION_RATIO)
from .big5freq import (Big5CharToFreqOrder, BIG5_TABLE_SIZE,
BIG5_TYPICAL_DISTRIBUTION_RATIO)
from .jisfreq import (JISCharToFreqOrder, JIS_TABLE_SIZE,
JIS_TYPICAL_DISTRIBUTION_RATIO)
from .compat import wrap_ord
ENOUGH_DATA_THRESHOLD = 1024
SURE_YES = 0.99
SURE_NO = 0.01
MINIMUM_DATA_THRESHOLD = 3
class CharDistributionAnalysis:
def __init__(self):
self._mCharToFreqOrder = None # Mapping table to get frequency order from char order (get from GetOrder())
# Mapping table to get frequency order from char order (get from
# GetOrder())
self._mCharToFreqOrder = None
self._mTableSize = None # Size of above table
self._mTypicalDistributionRatio = None # This is a constant value which varies from language to language, used in calculating confidence. See http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html for further detail.
# This is a constant value which varies from language to language,
# used in calculating confidence. See
# http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
# for further detail.
self._mTypicalDistributionRatio = None
self.reset()
def reset(self):
"""reset analyser, clear any state"""
self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made
# If this flag is set to True, detection is done and conclusion has
# been made
self._mDone = False
self._mTotalChars = 0 # Total characters encountered
self._mFreqChars = 0 # The number of characters whose frequency order is less than 512
# The number of characters whose frequency order is less than 512
self._mFreqChars = 0
def feed(self, aStr, aCharLen):
def feed(self, aBuf, aCharLen):
"""feed a character with known length"""
if aCharLen == 2:
# we only care about 2-bytes character in our distribution analysis
order = self.get_order(aStr)
order = self.get_order(aBuf)
else:
order = -1
if order >= 0:
@ -65,12 +81,14 @@ class CharDistributionAnalysis:
def get_confidence(self):
"""return confidence based on existing data"""
# if we didn't receive any character in our consideration range, return negative answer
if self._mTotalChars <= 0:
# if we didn't receive any character in our consideration range,
# return negative answer
if self._mTotalChars <= 0 or self._mFreqChars <= MINIMUM_DATA_THRESHOLD:
return SURE_NO
if self._mTotalChars != self._mFreqChars:
r = self._mFreqChars / ((self._mTotalChars - self._mFreqChars) * self._mTypicalDistributionRatio)
r = (self._mFreqChars / ((self._mTotalChars - self._mFreqChars)
* self._mTypicalDistributionRatio))
if r < SURE_YES:
return r
@ -78,16 +96,18 @@ class CharDistributionAnalysis:
return SURE_YES
def got_enough_data(self):
# It is not necessary to receive all data to draw conclusion. For charset detection,
# certain amount of data is enough
# It is not necessary to receive all data to draw conclusion.
# For charset detection, certain amount of data is enough
return self._mTotalChars > ENOUGH_DATA_THRESHOLD
def get_order(self, aStr):
# We do not handle characters based on the original encoding string, but
# convert this encoding string to a number, here called order.
# This allows multiple encodings of a language to share one frequency table.
def get_order(self, aBuf):
# We do not handle characters based on the original encoding string,
# but convert this encoding string to a number, here called order.
# This allows multiple encodings of a language to share one frequency
# table.
return -1
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
CharDistributionAnalysis.__init__(self)
@ -95,16 +115,18 @@ class EUCTWDistributionAnalysis(CharDistributionAnalysis):
self._mTableSize = EUCTW_TABLE_SIZE
self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aStr):
def get_order(self, aBuf):
# for euc-TW encoding, we are interested
# first byte range: 0xc4 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
if aStr[0] >= '\xC4':
return 94 * (ord(aStr[0]) - 0xC4) + ord(aStr[1]) - 0xA1
first_char = wrap_ord(aBuf[0])
if first_char >= 0xC4:
return 94 * (first_char - 0xC4) + wrap_ord(aBuf[1]) - 0xA1
else:
return -1
class EUCKRDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
CharDistributionAnalysis.__init__(self)
@ -112,15 +134,17 @@ class EUCKRDistributionAnalysis(CharDistributionAnalysis):
self._mTableSize = EUCKR_TABLE_SIZE
self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aStr):
def get_order(self, aBuf):
# for euc-KR encoding, we are interested
# first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
if aStr[0] >= '\xB0':
return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1
first_char = wrap_ord(aBuf[0])
if first_char >= 0xB0:
return 94 * (first_char - 0xB0) + wrap_ord(aBuf[1]) - 0xA1
else:
return -1;
return -1
class GB2312DistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
@ -129,15 +153,17 @@ class GB2312DistributionAnalysis(CharDistributionAnalysis):
self._mTableSize = GB2312_TABLE_SIZE
self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aStr):
def get_order(self, aBuf):
# for GB2312 encoding, we are interested
# first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
if (aStr[0] >= '\xB0') and (aStr[1] >= '\xA1'):
return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1
first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
if (first_char >= 0xB0) and (second_char >= 0xA1):
return 94 * (first_char - 0xB0) + second_char - 0xA1
else:
return -1;
return -1
class Big5DistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
@ -146,19 +172,21 @@ class Big5DistributionAnalysis(CharDistributionAnalysis):
self._mTableSize = BIG5_TABLE_SIZE
self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aStr):
def get_order(self, aBuf):
# for big5 encoding, we are interested
# first byte range: 0xa4 -- 0xfe
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
# no validation needed here. State machine has done that
if aStr[0] >= '\xA4':
if aStr[1] >= '\xA1':
return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0xA1 + 63
first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
if first_char >= 0xA4:
if second_char >= 0xA1:
return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63
else:
return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0x40
return 157 * (first_char - 0xA4) + second_char - 0x40
else:
return -1
class SJISDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
CharDistributionAnalysis.__init__(self)
@ -166,22 +194,24 @@ class SJISDistributionAnalysis(CharDistributionAnalysis):
self._mTableSize = JIS_TABLE_SIZE
self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aStr):
def get_order(self, aBuf):
# for sjis encoding, we are interested
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
# second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
# no validation needed here. State machine has done that
if (aStr[0] >= '\x81') and (aStr[0] <= '\x9F'):
order = 188 * (ord(aStr[0]) - 0x81)
elif (aStr[0] >= '\xE0') and (aStr[0] <= '\xEF'):
order = 188 * (ord(aStr[0]) - 0xE0 + 31)
first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
if (first_char >= 0x81) and (first_char <= 0x9F):
order = 188 * (first_char - 0x81)
elif (first_char >= 0xE0) and (first_char <= 0xEF):
order = 188 * (first_char - 0xE0 + 31)
else:
return -1;
order = order + ord(aStr[1]) - 0x40
if aStr[1] > '\x7F':
order =- 1
return -1
order = order + second_char - 0x40
if second_char > 0x7F:
order = -1
return order
class EUCJPDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
CharDistributionAnalysis.__init__(self)
@ -189,12 +219,13 @@ class EUCJPDistributionAnalysis(CharDistributionAnalysis):
self._mTableSize = JIS_TABLE_SIZE
self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aStr):
def get_order(self, aBuf):
# for euc-JP encoding, we are interested
# first byte range: 0xa0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
if aStr[0] >= '\xA0':
return 94 * (ord(aStr[0]) - 0xA1) + ord(aStr[1]) - 0xa1
char = wrap_ord(aBuf[0])
if char >= 0xA0:
return 94 * (char - 0xA1) + wrap_ord(aBuf[1]) - 0xa1
else:
return -1

View File

@ -25,8 +25,10 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants, sys
from charsetprober import CharSetProber
from . import constants
import sys
from .charsetprober import CharSetProber
class CharSetGroupProber(CharSetProber):
def __init__(self):
@ -41,28 +43,32 @@ class CharSetGroupProber(CharSetProber):
for prober in self._mProbers:
if prober:
prober.reset()
prober.active = constants.True
prober.active = True
self._mActiveNum += 1
self._mBestGuessProber = None
def get_charset_name(self):
if not self._mBestGuessProber:
self.get_confidence()
if not self._mBestGuessProber: return None
if not self._mBestGuessProber:
return None
# self._mBestGuessProber = self._mProbers[0]
return self._mBestGuessProber.get_charset_name()
def feed(self, aBuf):
for prober in self._mProbers:
if not prober: continue
if not prober.active: continue
if not prober:
continue
if not prober.active:
continue
st = prober.feed(aBuf)
if not st: continue
if not st:
continue
if st == constants.eFoundIt:
self._mBestGuessProber = prober
return self.get_state()
elif st == constants.eNotMe:
prober.active = constants.False
prober.active = False
self._mActiveNum -= 1
if self._mActiveNum <= 0:
self._mState = constants.eNotMe
@ -78,18 +84,22 @@ class CharSetGroupProber(CharSetProber):
bestConf = 0.0
self._mBestGuessProber = None
for prober in self._mProbers:
if not prober: continue
if not prober:
continue
if not prober.active:
if constants._debug:
sys.stderr.write(prober.get_charset_name() + ' not active\n')
sys.stderr.write(prober.get_charset_name()
+ ' not active\n')
continue
cf = prober.get_confidence()
if constants._debug:
sys.stderr.write('%s confidence = %s\n' % (prober.get_charset_name(), cf))
sys.stderr.write('%s confidence = %s\n' %
(prober.get_charset_name(), cf))
if bestConf < cf:
bestConf = cf
self._mBestGuessProber = prober
if not self._mBestGuessProber: return 0.0
if not self._mBestGuessProber:
return 0.0
return bestConf
# else:
# self._mBestGuessProber = self._mProbers[0]

View File

@ -26,7 +26,9 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants, re
from . import constants
import re
class CharSetProber:
def __init__(self):
@ -48,11 +50,11 @@ class CharSetProber:
return 0.0
def filter_high_bit_only(self, aBuf):
aBuf = re.sub(r'([\x00-\x7F])+', ' ', aBuf)
aBuf = re.sub(b'([\x00-\x7F])+', b' ', aBuf)
return aBuf
def filter_without_english_letters(self, aBuf):
aBuf = re.sub(r'([A-Za-z])+', ' ', aBuf)
aBuf = re.sub(b'([A-Za-z])+', b' ', aBuf)
return aBuf
def filter_with_english_letters(self, aBuf):

View File

@ -25,7 +25,9 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from constants import eStart, eError, eItsMe
from .constants import eStart
from .compat import wrap_ord
class CodingStateMachine:
def __init__(self, sm):
@ -40,12 +42,15 @@ class CodingStateMachine:
def next_state(self, c):
# for each byte we get its class
# if it is first byte, we also get byte length
byteCls = self._mModel['classTable'][ord(c)]
# PY3K: aBuf is a byte stream, so c is an int, not a byte
byteCls = self._mModel['classTable'][wrap_ord(c)]
if self._mCurrentState == eStart:
self._mCurrentBytePos = 0
self._mCurrentCharLen = self._mModel['charLenTable'][byteCls]
# from byte's class and stateTable, we get its next state
self._mCurrentState = self._mModel['stateTable'][self._mCurrentState * self._mModel['classFactor'] + byteCls]
curr_state = (self._mCurrentState * self._mModel['classFactor']
+ byteCls)
self._mCurrentState = self._mModel['stateTable'][curr_state]
self._mCurrentBytePos += 1
return self._mCurrentState

34
thirdparty/chardet/compat.py vendored Normal file
View File

@ -0,0 +1,34 @@
######################## BEGIN LICENSE BLOCK ########################
# Contributor(s):
# Ian Cordasco - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import sys
if sys.version_info < (3, 0):
base_str = (str, unicode)
else:
base_str = (bytes, str)
def wrap_ord(a):
if sys.version_info < (3, 0) and isinstance(a, base_str):
return ord(a)
else:
return a

View File

@ -37,11 +37,3 @@ eError = 1
eItsMe = 2
SHORTCUT_THRESHOLD = 0.95
import __builtin__
if not hasattr(__builtin__, 'False'):
False = 0
True = 1
else:
False = __builtin__.False
True = __builtin__.True

44
thirdparty/chardet/cp949prober.py vendored Normal file
View File

@ -0,0 +1,44 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import EUCKRDistributionAnalysis
from .mbcssm import CP949SMModel
class CP949Prober(MultiByteCharSetProber):
def __init__(self):
MultiByteCharSetProber.__init__(self)
self._mCodingSM = CodingStateMachine(CP949SMModel)
# NOTE: CP949 is a superset of EUC-KR, so the distribution should be
# not different.
self._mDistributionAnalyzer = EUCKRDistributionAnalysis()
self.reset()
def get_charset_name(self):
return "CP949"

View File

@ -25,15 +25,18 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants, sys
from escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel
from charsetprober import CharSetProber
from codingstatemachine import CodingStateMachine
from . import constants
from .escsm import (HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel,
ISO2022KRSMModel)
from .charsetprober import CharSetProber
from .codingstatemachine import CodingStateMachine
from .compat import wrap_ord
class EscCharSetProber(CharSetProber):
def __init__(self):
CharSetProber.__init__(self)
self._mCodingSM = [ \
self._mCodingSM = [
CodingStateMachine(HZSMModel),
CodingStateMachine(ISO2022CNSMModel),
CodingStateMachine(ISO2022JPSMModel),
@ -44,8 +47,9 @@ class EscCharSetProber(CharSetProber):
def reset(self):
CharSetProber.reset(self)
for codingSM in self._mCodingSM:
if not codingSM: continue
codingSM.active = constants.True
if not codingSM:
continue
codingSM.active = True
codingSM.reset()
self._mActiveSM = len(self._mCodingSM)
self._mDetectedCharset = None
@ -61,19 +65,22 @@ class EscCharSetProber(CharSetProber):
def feed(self, aBuf):
for c in aBuf:
# PY3K: aBuf is a byte array, so c is an int, not a byte
for codingSM in self._mCodingSM:
if not codingSM: continue
if not codingSM.active: continue
codingState = codingSM.next_state(c)
if not codingSM:
continue
if not codingSM.active:
continue
codingState = codingSM.next_state(wrap_ord(c))
if codingState == constants.eError:
codingSM.active = constants.False
codingSM.active = False
self._mActiveSM -= 1
if self._mActiveSM <= 0:
self._mState = constants.eNotMe
return self.get_state()
elif codingState == constants.eItsMe:
self._mState = constants.eFoundIt
self._mDetectedCharset = codingSM.get_coding_state_machine()
self._mDetectedCharset = codingSM.get_coding_state_machine() # nopep8
return self.get_state()
return self.get_state()

View File

@ -25,9 +25,9 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from constants import eStart, eError, eItsMe
from .constants import eStart, eError, eItsMe
HZ_cls = ( \
HZ_cls = (
1,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
@ -62,7 +62,7 @@ HZ_cls = ( \
1,1,1,1,1,1,1,1, # f8 - ff
)
HZ_st = ( \
HZ_st = (
eStart,eError, 3,eStart,eStart,eStart,eError,eError,# 00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f
eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError,# 10-17
@ -79,7 +79,7 @@ HZSMModel = {'classTable': HZ_cls,
'charLenTable': HZCharLenTable,
'name': "HZ-GB-2312"}
ISO2022CN_cls = ( \
ISO2022CN_cls = (
2,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
@ -114,7 +114,7 @@ ISO2022CN_cls = ( \
2,2,2,2,2,2,2,2, # f8 - ff
)
ISO2022CN_st = ( \
ISO2022CN_st = (
eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07
eStart,eError,eError,eError,eError,eError,eError,eError,# 08-0f
eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17
@ -133,7 +133,7 @@ ISO2022CNSMModel = {'classTable': ISO2022CN_cls,
'charLenTable': ISO2022CNCharLenTable,
'name': "ISO-2022-CN"}
ISO2022JP_cls = ( \
ISO2022JP_cls = (
2,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,2,2, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
@ -168,7 +168,7 @@ ISO2022JP_cls = ( \
2,2,2,2,2,2,2,2, # f8 - ff
)
ISO2022JP_st = ( \
ISO2022JP_st = (
eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07
eStart,eStart,eError,eError,eError,eError,eError,eError,# 08-0f
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17
@ -188,7 +188,7 @@ ISO2022JPSMModel = {'classTable': ISO2022JP_cls,
'charLenTable': ISO2022JPCharLenTable,
'name': "ISO-2022-JP"}
ISO2022KR_cls = ( \
ISO2022KR_cls = (
2,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
@ -223,7 +223,7 @@ ISO2022KR_cls = ( \
2,2,2,2,2,2,2,2, # f8 - ff
)
ISO2022KR_st = ( \
ISO2022KR_st = (
eStart, 3,eError,eStart,eStart,eStart,eError,eError,# 00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f
eItsMe,eItsMe,eError,eError,eError, 4,eError,eError,# 10-17
@ -238,3 +238,5 @@ ISO2022KRSMModel = {'classTable': ISO2022KR_cls,
'stateTable': ISO2022KR_st,
'charLenTable': ISO2022KRCharLenTable,
'name': "ISO-2022-KR"}
# flake8: noqa

View File

@ -25,13 +25,14 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants, sys
from constants import eStart, eError, eItsMe
from mbcharsetprober import MultiByteCharSetProber
from codingstatemachine import CodingStateMachine
from chardistribution import EUCJPDistributionAnalysis
from jpcntx import EUCJPContextAnalysis
from mbcssm import EUCJPSMModel
import sys
from . import constants
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import EUCJPDistributionAnalysis
from .jpcntx import EUCJPContextAnalysis
from .mbcssm import EUCJPSMModel
class EUCJPProber(MultiByteCharSetProber):
def __init__(self):
@ -50,31 +51,35 @@ class EUCJPProber(MultiByteCharSetProber):
def feed(self, aBuf):
aLen = len(aBuf)
for i in xrange(0, aLen):
for i in range(0, aLen):
# PY3K: aBuf is a byte array, so aBuf[i] is an int, not a byte
codingState = self._mCodingSM.next_state(aBuf[i])
if codingState == eError:
if codingState == constants.eError:
if constants._debug:
sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n')
sys.stderr.write(self.get_charset_name()
+ ' prober hit error at byte ' + str(i)
+ '\n')
self._mState = constants.eNotMe
break
elif codingState == eItsMe:
elif codingState == constants.eItsMe:
self._mState = constants.eFoundIt
break
elif codingState == eStart:
elif codingState == constants.eStart:
charLen = self._mCodingSM.get_current_charlen()
if i == 0:
self._mLastChar[1] = aBuf[0]
self._mContextAnalyzer.feed(self._mLastChar, charLen)
self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
else:
self._mContextAnalyzer.feed(aBuf[i-1:i+1], charLen)
self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen)
self._mContextAnalyzer.feed(aBuf[i - 1:i + 1], charLen)
self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
charLen)
self._mLastChar[0] = aBuf[aLen - 1]
if self.get_state() == constants.eDetecting:
if self._mContextAnalyzer.got_enough_data() and \
(self.get_confidence() > constants.SHORTCUT_THRESHOLD):
if (self._mContextAnalyzer.got_enough_data() and
(self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
self._mState = constants.eFoundIt
return self.get_state()

View File

@ -592,3 +592,5 @@ EUCKRCharToFreqOrder = ( \
8704,8705,8706,8707,8708,8709,8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,
8720,8721,8722,8723,8724,8725,8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,
8736,8737,8738,8739,8740,8741)
# flake8: noqa

View File

@ -25,10 +25,11 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from mbcharsetprober import MultiByteCharSetProber
from codingstatemachine import CodingStateMachine
from chardistribution import EUCKRDistributionAnalysis
from mbcssm import EUCKRSMModel
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import EUCKRDistributionAnalysis
from .mbcssm import EUCKRSMModel
class EUCKRProber(MultiByteCharSetProber):
def __init__(self):

View File

@ -46,7 +46,7 @@ EUCTW_TYPICAL_DISTRIBUTION_RATIO = 0.75
# Char to FreqOrder table ,
EUCTW_TABLE_SIZE = 8102
EUCTWCharToFreqOrder = ( \
EUCTWCharToFreqOrder = (
1,1800,1506, 255,1431, 198, 9, 82, 6,7310, 177, 202,3615,1256,2808, 110, # 2742
3735, 33,3241, 261, 76, 44,2113, 16,2931,2184,1176, 659,3868, 26,3404,2643, # 2758
1198,3869,3313,4060, 410,2211, 302, 590, 361,1963, 8, 204, 58,4296,7311,1931, # 2774
@ -424,3 +424,5 @@ EUCTWCharToFreqOrder = ( \
8694,8695,8696,8697,8698,8699,8700,8701,8702,8703,8704,8705,8706,8707,8708,8709, # 8710
8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,8720,8721,8722,8723,8724,8725, # 8726
8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,8736,8737,8738,8739,8740,8741) # 8742
# flake8: noqa

View File

@ -25,10 +25,10 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from mbcharsetprober import MultiByteCharSetProber
from codingstatemachine import CodingStateMachine
from chardistribution import EUCTWDistributionAnalysis
from mbcssm import EUCTWSMModel
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import EUCTWDistributionAnalysis
from .mbcssm import EUCTWSMModel
class EUCTWProber(MultiByteCharSetProber):
def __init__(self):

View File

@ -43,7 +43,7 @@ GB2312_TYPICAL_DISTRIBUTION_RATIO = 0.9
GB2312_TABLE_SIZE = 3760
GB2312CharToFreqOrder = ( \
GB2312CharToFreqOrder = (
1671, 749,1443,2364,3924,3807,2330,3921,1704,3463,2691,1511,1515, 572,3191,2205,
2361, 224,2558, 479,1711, 963,3162, 440,4060,1905,2966,2947,3580,2647,3961,3842,
2204, 869,4207, 970,2678,5626,2944,2956,1479,4048, 514,3595, 588,1346,2820,3409,
@ -469,3 +469,4 @@ GB2312CharToFreqOrder = ( \
5867,5507,6273,4206,6274,4789,6098,6764,3619,3646,3833,3804,2394,3788,4936,3978,
4866,4899,6099,6100,5559,6478,6765,3599,5868,6101,5869,5870,6275,6766,4527,6767)
# flake8: noqa

View File

@ -25,10 +25,10 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from mbcharsetprober import MultiByteCharSetProber
from codingstatemachine import CodingStateMachine
from chardistribution import GB2312DistributionAnalysis
from mbcssm import GB2312SMModel
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import GB2312DistributionAnalysis
from .mbcssm import GB2312SMModel
class GB2312Prober(MultiByteCharSetProber):
def __init__(self):

View File

@ -25,8 +25,9 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from charsetprober import CharSetProber
import constants
from .charsetprober import CharSetProber
from .constants import eNotMe, eDetecting
from .compat import wrap_ord
# This prober doesn't actually recognize a language or a charset.
# It is a helper prober for the use of the Hebrew model probers
@ -126,28 +127,31 @@ import constants
# charset identified, either "windows-1255" or "ISO-8859-8".
# windows-1255 / ISO-8859-8 code points of interest
FINAL_KAF = '\xea'
NORMAL_KAF = '\xeb'
FINAL_MEM = '\xed'
NORMAL_MEM = '\xee'
FINAL_NUN = '\xef'
NORMAL_NUN = '\xf0'
FINAL_PE = '\xf3'
NORMAL_PE = '\xf4'
FINAL_TSADI = '\xf5'
NORMAL_TSADI = '\xf6'
FINAL_KAF = 0xea
NORMAL_KAF = 0xeb
FINAL_MEM = 0xed
NORMAL_MEM = 0xee
FINAL_NUN = 0xef
NORMAL_NUN = 0xf0
FINAL_PE = 0xf3
NORMAL_PE = 0xf4
FINAL_TSADI = 0xf5
NORMAL_TSADI = 0xf6
# Minimum Visual vs Logical final letter score difference.
# If the difference is below this, don't rely solely on the final letter score distance.
# If the difference is below this, don't rely solely on the final letter score
# distance.
MIN_FINAL_CHAR_DISTANCE = 5
# Minimum Visual vs Logical model score difference.
# If the difference is below this, don't rely at all on the model score distance.
# If the difference is below this, don't rely at all on the model score
# distance.
MIN_MODEL_DISTANCE = 0.01
VISUAL_HEBREW_NAME = "ISO-8859-8"
LOGICAL_HEBREW_NAME = "windows-1255"
class HebrewProber(CharSetProber):
def __init__(self):
CharSetProber.__init__(self)
@ -159,8 +163,8 @@ class HebrewProber(CharSetProber):
self._mFinalCharLogicalScore = 0
self._mFinalCharVisualScore = 0
# The two last characters seen in the previous buffer,
# mPrev and mBeforePrev are initialized to space in order to simulate a word
# delimiter at the beginning of the data
# mPrev and mBeforePrev are initialized to space in order to simulate
# a word delimiter at the beginning of the data
self._mPrev = ' '
self._mBeforePrev = ' '
# These probers are owned by the group prober.
@ -170,49 +174,52 @@ class HebrewProber(CharSetProber):
self._mVisualProber = visualProber
def is_final(self, c):
return c in [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI]
return wrap_ord(c) in [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE,
FINAL_TSADI]
def is_non_final(self, c):
# The normal Tsadi is not a good Non-Final letter due to words like
# 'lechotet' (to chat) containing an apostrophe after the tsadi. This
# apostrophe is converted to a space in FilterWithoutEnglishLetters causing
# the Non-Final tsadi to appear at an end of a word even though this is not
# the case in the original text.
# The letters Pe and Kaf rarely display a related behavior of not being a
# good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for
# example legally end with a Non-Final Pe or Kaf. However, the benefit of
# these letters as Non-Final letters outweighs the damage since these words
# are quite rare.
return c in [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
# apostrophe is converted to a space in FilterWithoutEnglishLetters
# causing the Non-Final tsadi to appear at an end of a word even
# though this is not the case in the original text.
# The letters Pe and Kaf rarely display a related behavior of not being
# a good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak'
# for example legally end with a Non-Final Pe or Kaf. However, the
# benefit of these letters as Non-Final letters outweighs the damage
# since these words are quite rare.
return wrap_ord(c) in [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
def feed(self, aBuf):
# Final letter analysis for logical-visual decision.
# Look for evidence that the received buffer is either logical Hebrew or
# visual Hebrew.
# Look for evidence that the received buffer is either logical Hebrew
# or visual Hebrew.
# The following cases are checked:
# 1) A word longer than 1 letter, ending with a final letter. This is an
# indication that the text is laid out "naturally" since the final letter
# really appears at the end. +1 for logical score.
# 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
# Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with
# the Non-Final form of that letter. Exceptions to this rule are mentioned
# above in isNonFinal(). This is an indication that the text is laid out
# backwards. +1 for visual score
# 3) A word longer than 1 letter, starting with a final letter. Final letters
# should not appear at the beginning of a word. This is an indication that
# the text is laid out backwards. +1 for visual score.
# 1) A word longer than 1 letter, ending with a final letter. This is
# an indication that the text is laid out "naturally" since the
# final letter really appears at the end. +1 for logical score.
# 2) A word longer than 1 letter, ending with a Non-Final letter. In
# normal Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi,
# should not end with the Non-Final form of that letter. Exceptions
# to this rule are mentioned above in isNonFinal(). This is an
# indication that the text is laid out backwards. +1 for visual
# score
# 3) A word longer than 1 letter, starting with a final letter. Final
# letters should not appear at the beginning of a word. This is an
# indication that the text is laid out backwards. +1 for visual
# score.
#
# The visual score and logical score are accumulated throughout the text and
# are finally checked against each other in GetCharSetName().
# No checking for final letters in the middle of words is done since that case
# is not an indication for either Logical or Visual text.
# The visual score and logical score are accumulated throughout the
# text and are finally checked against each other in GetCharSetName().
# No checking for final letters in the middle of words is done since
# that case is not an indication for either Logical or Visual text.
#
# We automatically filter out all 7-bit characters (replace them with spaces)
# so the word boundary detection works properly. [MAP]
# We automatically filter out all 7-bit characters (replace them with
# spaces) so the word boundary detection works properly. [MAP]
if self.get_state() == constants.eNotMe:
if self.get_state() == eNotMe:
# Both model probers say it's not them. No reason to continue.
return constants.eNotMe
return eNotMe
aBuf = self.filter_high_bit_only(aBuf)
@ -220,23 +227,27 @@ class HebrewProber(CharSetProber):
if cur == ' ':
# We stand on a space - a word just ended
if self._mBeforePrev != ' ':
# next-to-last char was not a space so self._mPrev is not a 1 letter word
# next-to-last char was not a space so self._mPrev is not a
# 1 letter word
if self.is_final(self._mPrev):
# case (1) [-2:not space][-1:final letter][cur:space]
self._mFinalCharLogicalScore += 1
elif self.is_non_final(self._mPrev):
# case (2) [-2:not space][-1:Non-Final letter][cur:space]
# case (2) [-2:not space][-1:Non-Final letter][
# cur:space]
self._mFinalCharVisualScore += 1
else:
# Not standing on a space
if (self._mBeforePrev == ' ') and (self.is_final(self._mPrev)) and (cur != ' '):
if ((self._mBeforePrev == ' ') and
(self.is_final(self._mPrev)) and (cur != ' ')):
# case (3) [-2:space][-1:final letter][cur:not space]
self._mFinalCharVisualScore += 1
self._mBeforePrev = self._mPrev
self._mPrev = cur
# Forever detecting, till the end or until both model probers return eNotMe (handled above)
return constants.eDetecting
# Forever detecting, till the end or until both model probers return
# eNotMe (handled above)
return eDetecting
def get_charset_name(self):
# Make the decision: is it Logical or Visual?
@ -248,22 +259,25 @@ class HebrewProber(CharSetProber):
return VISUAL_HEBREW_NAME
# It's not dominant enough, try to rely on the model scores instead.
modelsub = self._mLogicalProber.get_confidence() - self._mVisualProber.get_confidence()
modelsub = (self._mLogicalProber.get_confidence()
- self._mVisualProber.get_confidence())
if modelsub > MIN_MODEL_DISTANCE:
return LOGICAL_HEBREW_NAME
if modelsub < -MIN_MODEL_DISTANCE:
return VISUAL_HEBREW_NAME
# Still no good, back to final letter distance, maybe it'll save the day.
# Still no good, back to final letter distance, maybe it'll save the
# day.
if finalsub < 0.0:
return VISUAL_HEBREW_NAME
# (finalsub > 0 - Logical) or (don't know what to do) default to Logical.
# (finalsub > 0 - Logical) or (don't know what to do) default to
# Logical.
return LOGICAL_HEBREW_NAME
def get_state(self):
# Remain active as long as any of the model probers are active.
if (self._mLogicalProber.get_state() == constants.eNotMe) and \
(self._mVisualProber.get_state() == constants.eNotMe):
return constants.eNotMe
return constants.eDetecting
if (self._mLogicalProber.get_state() == eNotMe) and \
(self._mVisualProber.get_state() == eNotMe):
return eNotMe
return eDetecting

View File

@ -46,7 +46,7 @@ JIS_TYPICAL_DISTRIBUTION_RATIO = 3.0
# Char to FreqOrder table ,
JIS_TABLE_SIZE = 4368
JISCharToFreqOrder = ( \
JISCharToFreqOrder = (
40, 1, 6, 182, 152, 180, 295,2127, 285, 381,3295,4304,3068,4606,3165,3510, # 16
3511,1822,2785,4607,1193,2226,5070,4608, 171,2996,1247, 18, 179,5071, 856,1661, # 32
1262,5072, 619, 127,3431,3512,3230,1899,1700, 232, 228,1294,1298, 284, 283,2041, # 48
@ -565,3 +565,5 @@ JISCharToFreqOrder = ( \
8224,8225,8226,8227,8228,8229,8230,8231,8232,8233,8234,8235,8236,8237,8238,8239, # 8240
8240,8241,8242,8243,8244,8245,8246,8247,8248,8249,8250,8251,8252,8253,8254,8255, # 8256
8256,8257,8258,8259,8260,8261,8262,8263,8264,8265,8266,8267,8268,8269,8270,8271) # 8272
# flake8: noqa

View File

@ -25,7 +25,7 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants
from .compat import wrap_ord
NUM_OF_CATEGORY = 6
DONT_KNOW = -1
@ -34,7 +34,7 @@ MAX_REL_THRESHOLD = 1000
MINIMUM_DATA_THRESHOLD = 4
# This is hiragana 2-char sequence table, the number in each cell represents its frequency category
jp2CharContext = ( \
jp2CharContext = (
(0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1),
(2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4),
(0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2),
@ -126,23 +126,30 @@ class JapaneseContextAnalysis:
def reset(self):
self._mTotalRel = 0 # total sequence received
self._mRelSample = [0] * NUM_OF_CATEGORY # category counters, each interger counts sequence in its category
self._mNeedToSkipCharNum = 0 # if last byte in current buffer is not the last byte of a character, we need to know how many bytes to skip in next buffer
# category counters, each interger counts sequence in its category
self._mRelSample = [0] * NUM_OF_CATEGORY
# if last byte in current buffer is not the last byte of a character,
# we need to know how many bytes to skip in next buffer
self._mNeedToSkipCharNum = 0
self._mLastCharOrder = -1 # The order of previous char
self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made
# If this flag is set to True, detection is done and conclusion has
# been made
self._mDone = False
def feed(self, aBuf, aLen):
if self._mDone: return
if self._mDone:
return
# The buffer we got is byte oriented, and a character may span in more than one
# buffers. In case the last one or two byte in last buffer is not complete, we
# record how many byte needed to complete that character and skip these bytes here.
# We can choose to record those bytes as well and analyse the character once it
# is complete, but since a character will not make much difference, by simply skipping
# buffers. In case the last one or two byte in last buffer is not
# complete, we record how many byte needed to complete that character
# and skip these bytes here. We can choose to record those bytes as
# well and analyse the character once it is complete, but since a
# character will not make much difference, by simply skipping
# this character will simply our logic and improve performance.
i = self._mNeedToSkipCharNum
while i < aLen:
order, charLen = self.get_order(aBuf[i:i+2])
order, charLen = self.get_order(aBuf[i:i + 2])
i += charLen
if i > aLen:
self._mNeedToSkipCharNum = i - aLen
@ -151,7 +158,7 @@ class JapaneseContextAnalysis:
if (order != -1) and (self._mLastCharOrder != -1):
self._mTotalRel += 1
if self._mTotalRel > MAX_REL_THRESHOLD:
self._mDone = constants.True
self._mDone = True
break
self._mRelSample[jp2CharContext[self._mLastCharOrder][order]] += 1
self._mLastCharOrder = order
@ -166,45 +173,55 @@ class JapaneseContextAnalysis:
else:
return DONT_KNOW
def get_order(self, aStr):
def get_order(self, aBuf):
return -1, 1
class SJISContextAnalysis(JapaneseContextAnalysis):
def get_order(self, aStr):
if not aStr: return -1, 1
def __init__(self):
self.charset_name = "SHIFT_JIS"
def get_charset_name(self):
return self.charset_name
def get_order(self, aBuf):
if not aBuf:
return -1, 1
# find out current char's byte length
if ((aStr[0] >= '\x81') and (aStr[0] <= '\x9F')) or \
((aStr[0] >= '\xE0') and (aStr[0] <= '\xFC')):
first_char = wrap_ord(aBuf[0])
if ((0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC)):
charLen = 2
if (first_char == 0x87) or (0xFA <= first_char <= 0xFC):
self.charset_name = "CP932"
else:
charLen = 1
# return its order if it is hiragana
if len(aStr) > 1:
if (aStr[0] == '\202') and \
(aStr[1] >= '\x9F') and \
(aStr[1] <= '\xF1'):
return ord(aStr[1]) - 0x9F, charLen
if len(aBuf) > 1:
second_char = wrap_ord(aBuf[1])
if (first_char == 202) and (0x9F <= second_char <= 0xF1):
return second_char - 0x9F, charLen
return -1, charLen
class EUCJPContextAnalysis(JapaneseContextAnalysis):
def get_order(self, aStr):
if not aStr: return -1, 1
def get_order(self, aBuf):
if not aBuf:
return -1, 1
# find out current char's byte length
if (aStr[0] == '\x8E') or \
((aStr[0] >= '\xA1') and (aStr[0] <= '\xFE')):
first_char = wrap_ord(aBuf[0])
if (first_char == 0x8E) or (0xA1 <= first_char <= 0xFE):
charLen = 2
elif aStr[0] == '\x8F':
elif first_char == 0x8F:
charLen = 3
else:
charLen = 1
# return its order if it is hiragana
if len(aStr) > 1:
if (aStr[0] == '\xA4') and \
(aStr[1] >= '\xA1') and \
(aStr[1] <= '\xF3'):
return ord(aStr[1]) - 0xA1, charLen
if len(aBuf) > 1:
second_char = wrap_ord(aBuf[1])
if (first_char == 0xA4) and (0xA1 <= second_char <= 0xF3):
return second_char - 0xA1, charLen
return -1, charLen
# flake8: noqa

View File

@ -25,8 +25,6 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants
# 255: Control characters that usually does not exist in any text
# 254: Carriage/Return
# 253: symbol (punctuation) that does not belong to word
@ -36,7 +34,7 @@ import constants
# this table is modified base on win1251BulgarianCharToOrderMap, so
# only number <64 is sure valid
Latin5_BulgarianCharToOrderMap = ( \
Latin5_BulgarianCharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -55,7 +53,7 @@ Latin5_BulgarianCharToOrderMap = ( \
62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, # f0
)
win1251BulgarianCharToOrderMap = ( \
win1251BulgarianCharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -80,7 +78,7 @@ win1251BulgarianCharToOrderMap = ( \
# first 1024 sequences:3.0618%
# rest sequences: 0.2992%
# negative sequences: 0.0020%
BulgarianLangModel = ( \
BulgarianLangModel = (
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2,
3,1,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,0,1,
@ -211,18 +209,21 @@ BulgarianLangModel = ( \
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
)
Latin5BulgarianModel = { \
Latin5BulgarianModel = {
'charToOrderMap': Latin5_BulgarianCharToOrderMap,
'precedenceMatrix': BulgarianLangModel,
'mTypicalPositiveRatio': 0.969392,
'keepEnglishLetter': constants.False,
'keepEnglishLetter': False,
'charsetName': "ISO-8859-5"
}
Win1251BulgarianModel = { \
Win1251BulgarianModel = {
'charToOrderMap': win1251BulgarianCharToOrderMap,
'precedenceMatrix': BulgarianLangModel,
'mTypicalPositiveRatio': 0.969392,
'keepEnglishLetter': constants.False,
'keepEnglishLetter': False,
'charsetName': "windows-1251"
}
# flake8: noqa

View File

@ -25,11 +25,9 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants
# KOI8-R language model
# Character Mapping Table:
KOI8R_CharToOrderMap = ( \
KOI8R_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -48,7 +46,7 @@ KOI8R_CharToOrderMap = ( \
35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, # f0
)
win1251_CharToOrderMap = ( \
win1251_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -67,7 +65,7 @@ win1251_CharToOrderMap = ( \
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
)
latin5_CharToOrderMap = ( \
latin5_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -86,7 +84,7 @@ latin5_CharToOrderMap = ( \
239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255,
)
macCyrillic_CharToOrderMap = ( \
macCyrillic_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -105,7 +103,7 @@ macCyrillic_CharToOrderMap = ( \
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255,
)
IBM855_CharToOrderMap = ( \
IBM855_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -124,7 +122,7 @@ IBM855_CharToOrderMap = ( \
250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255,
)
IBM866_CharToOrderMap = ( \
IBM866_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -149,7 +147,7 @@ IBM866_CharToOrderMap = ( \
# first 1024 sequences: 2.3389%
# rest sequences: 0.1237%
# negative sequences: 0.0009%
RussianLangModel = ( \
RussianLangModel = (
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2,
3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,3,3,3,3,2,3,2,0,
@ -280,50 +278,52 @@ RussianLangModel = ( \
0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
)
Koi8rModel = { \
Koi8rModel = {
'charToOrderMap': KOI8R_CharToOrderMap,
'precedenceMatrix': RussianLangModel,
'mTypicalPositiveRatio': 0.976601,
'keepEnglishLetter': constants.False,
'keepEnglishLetter': False,
'charsetName': "KOI8-R"
}
Win1251CyrillicModel = { \
Win1251CyrillicModel = {
'charToOrderMap': win1251_CharToOrderMap,
'precedenceMatrix': RussianLangModel,
'mTypicalPositiveRatio': 0.976601,
'keepEnglishLetter': constants.False,
'keepEnglishLetter': False,
'charsetName': "windows-1251"
}
Latin5CyrillicModel = { \
Latin5CyrillicModel = {
'charToOrderMap': latin5_CharToOrderMap,
'precedenceMatrix': RussianLangModel,
'mTypicalPositiveRatio': 0.976601,
'keepEnglishLetter': constants.False,
'keepEnglishLetter': False,
'charsetName': "ISO-8859-5"
}
MacCyrillicModel = { \
MacCyrillicModel = {
'charToOrderMap': macCyrillic_CharToOrderMap,
'precedenceMatrix': RussianLangModel,
'mTypicalPositiveRatio': 0.976601,
'keepEnglishLetter': constants.False,
'keepEnglishLetter': False,
'charsetName': "MacCyrillic"
};
Ibm866Model = { \
Ibm866Model = {
'charToOrderMap': IBM866_CharToOrderMap,
'precedenceMatrix': RussianLangModel,
'mTypicalPositiveRatio': 0.976601,
'keepEnglishLetter': constants.False,
'keepEnglishLetter': False,
'charsetName': "IBM866"
}
Ibm855Model = { \
Ibm855Model = {
'charToOrderMap': IBM855_CharToOrderMap,
'precedenceMatrix': RussianLangModel,
'mTypicalPositiveRatio': 0.976601,
'keepEnglishLetter': constants.False,
'keepEnglishLetter': False,
'charsetName': "IBM855"
}
# flake8: noqa

View File

@ -25,15 +25,13 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants
# 255: Control characters that usually does not exist in any text
# 254: Carriage/Return
# 253: symbol (punctuation) that does not belong to word
# 252: 0 - 9
# Character Mapping Table:
Latin7_CharToOrderMap = ( \
Latin7_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -52,7 +50,7 @@ Latin7_CharToOrderMap = ( \
9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, # f0
)
win1253_CharToOrderMap = ( \
win1253_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -77,7 +75,7 @@ win1253_CharToOrderMap = ( \
# first 1024 sequences:1.7001%
# rest sequences: 0.0359%
# negative sequences: 0.0148%
GreekLangModel = ( \
GreekLangModel = (
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,3,2,2,3,3,3,3,3,3,3,3,1,3,3,3,0,2,2,3,3,0,3,0,3,2,0,3,3,3,0,
@ -208,18 +206,20 @@ GreekLangModel = ( \
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
)
Latin7GreekModel = { \
Latin7GreekModel = {
'charToOrderMap': Latin7_CharToOrderMap,
'precedenceMatrix': GreekLangModel,
'mTypicalPositiveRatio': 0.982851,
'keepEnglishLetter': constants.False,
'keepEnglishLetter': False,
'charsetName': "ISO-8859-7"
}
Win1253GreekModel = { \
Win1253GreekModel = {
'charToOrderMap': win1253_CharToOrderMap,
'precedenceMatrix': GreekLangModel,
'mTypicalPositiveRatio': 0.982851,
'keepEnglishLetter': constants.False,
'keepEnglishLetter': False,
'charsetName': "windows-1253"
}
# flake8: noqa

View File

@ -27,8 +27,6 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants
# 255: Control characters that usually does not exist in any text
# 254: Carriage/Return
# 253: symbol (punctuation) that does not belong to word
@ -36,7 +34,7 @@ import constants
# Windows-1255 language model
# Character Mapping Table:
win1255_CharToOrderMap = ( \
win1255_CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -61,7 +59,7 @@ win1255_CharToOrderMap = ( \
# first 1024 sequences: 1.5981%
# rest sequences: 0.087%
# negative sequences: 0.0015%
HebrewLangModel = ( \
HebrewLangModel = (
0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0,
3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,
@ -192,10 +190,12 @@ HebrewLangModel = ( \
0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0,
)
Win1255HebrewModel = { \
Win1255HebrewModel = {
'charToOrderMap': win1255_CharToOrderMap,
'precedenceMatrix': HebrewLangModel,
'mTypicalPositiveRatio': 0.984004,
'keepEnglishLetter': constants.False,
'keepEnglishLetter': False,
'charsetName': "windows-1255"
}
# flake8: noqa

View File

@ -25,15 +25,13 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants
# 255: Control characters that usually does not exist in any text
# 254: Carriage/Return
# 253: symbol (punctuation) that does not belong to word
# 252: 0 - 9
# Character Mapping Table:
Latin2_HungarianCharToOrderMap = ( \
Latin2_HungarianCharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -52,7 +50,7 @@ Latin2_HungarianCharToOrderMap = ( \
245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253,
)
win1250HungarianCharToOrderMap = ( \
win1250HungarianCharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -77,7 +75,7 @@ win1250HungarianCharToOrderMap = ( \
# first 1024 sequences:5.2623%
# rest sequences: 0.8894%
# negative sequences: 0.0009%
HungarianLangModel = ( \
HungarianLangModel = (
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2,
3,2,2,3,3,3,3,3,2,3,3,3,3,3,3,1,2,3,3,3,3,2,3,3,1,1,3,3,0,1,1,1,
@ -208,18 +206,20 @@ HungarianLangModel = ( \
0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
)
Latin2HungarianModel = { \
Latin2HungarianModel = {
'charToOrderMap': Latin2_HungarianCharToOrderMap,
'precedenceMatrix': HungarianLangModel,
'mTypicalPositiveRatio': 0.947368,
'keepEnglishLetter': constants.True,
'keepEnglishLetter': True,
'charsetName': "ISO-8859-2"
}
Win1250HungarianModel = { \
Win1250HungarianModel = {
'charToOrderMap': win1250HungarianCharToOrderMap,
'precedenceMatrix': HungarianLangModel,
'mTypicalPositiveRatio': 0.947368,
'keepEnglishLetter': constants.True,
'keepEnglishLetter': True,
'charsetName': "windows-1250"
}
# flake8: noqa

View File

@ -25,8 +25,6 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants
# 255: Control characters that usually does not exist in any text
# 254: Carriage/Return
# 253: symbol (punctuation) that does not belong to word
@ -35,7 +33,7 @@ import constants
# The following result for thai was collected from a limited sample (1M).
# Character Mapping Table:
TIS620CharToOrderMap = ( \
TIS620CharToOrderMap = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
@ -60,7 +58,7 @@ TIS620CharToOrderMap = ( \
# first 1024 sequences:7.3177%
# rest sequences: 1.0230%
# negative sequences: 0.0436%
ThaiLangModel = ( \
ThaiLangModel = (
0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3,
0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2,
3,0,3,3,2,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,3,0,3,2,3,0,2,2,2,3,
@ -191,10 +189,12 @@ ThaiLangModel = ( \
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
)
TIS620ThaiModel = { \
TIS620ThaiModel = {
'charToOrderMap': TIS620CharToOrderMap,
'precedenceMatrix': ThaiLangModel,
'mTypicalPositiveRatio': 0.926386,
'keepEnglishLetter': constants.False,
'keepEnglishLetter': False,
'charsetName': "TIS-620"
}
# flake8: noqa

View File

@ -26,9 +26,9 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from charsetprober import CharSetProber
import constants
import operator
from .charsetprober import CharSetProber
from .constants import eNotMe
from .compat import wrap_ord
FREQ_CAT_NUM = 4
@ -42,7 +42,7 @@ ASV = 6 # accent small vowel
ASO = 7 # accent small other
CLASS_NUM = 8 # total classes
Latin1_CharToClass = ( \
Latin1_CharToClass = (
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17
@ -81,8 +81,8 @@ Latin1_CharToClass = ( \
# 1 : very unlikely
# 2 : normal
# 3 : very likely
Latin1ClassModel = ( \
# UDF OTH ASC ASS ACV ACO ASV ASO
Latin1ClassModel = (
# UDF OTH ASC ASS ACV ACO ASV ASO
0, 0, 0, 0, 0, 0, 0, 0, # UDF
0, 3, 3, 3, 3, 3, 3, 3, # OTH
0, 3, 3, 3, 3, 3, 3, 3, # ASC
@ -93,6 +93,7 @@ Latin1ClassModel = ( \
0, 3, 1, 3, 1, 1, 3, 3, # ASO
)
class Latin1Prober(CharSetProber):
def __init__(self):
CharSetProber.__init__(self)
@ -109,10 +110,11 @@ class Latin1Prober(CharSetProber):
def feed(self, aBuf):
aBuf = self.filter_with_english_letters(aBuf)
for c in aBuf:
charClass = Latin1_CharToClass[ord(c)]
freq = Latin1ClassModel[(self._mLastCharClass * CLASS_NUM) + charClass]
charClass = Latin1_CharToClass[wrap_ord(c)]
freq = Latin1ClassModel[(self._mLastCharClass * CLASS_NUM)
+ charClass]
if freq == 0:
self._mState = constants.eNotMe
self._mState = eNotMe
break
self._mFreqCounter[freq] += 1
self._mLastCharClass = charClass
@ -120,17 +122,18 @@ class Latin1Prober(CharSetProber):
return self.get_state()
def get_confidence(self):
if self.get_state() == constants.eNotMe:
if self.get_state() == eNotMe:
return 0.01
total = reduce(operator.add, self._mFreqCounter)
total = sum(self._mFreqCounter)
if total < 0.01:
confidence = 0.0
else:
confidence = (self._mFreqCounter[3] / total) - (self._mFreqCounter[1] * 20.0 / total)
confidence = ((self._mFreqCounter[3] - self._mFreqCounter[1] * 20.0)
/ total)
if confidence < 0.0:
confidence = 0.0
# lower the confidence of latin1 so that other more accurate detector
# can take priority.
confidence = confidence * 0.5
# lower the confidence of latin1 so that other more accurate
# detector can take priority.
confidence = confidence * 0.73
return confidence

View File

@ -27,16 +27,17 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants, sys
from constants import eStart, eError, eItsMe
from charsetprober import CharSetProber
import sys
from . import constants
from .charsetprober import CharSetProber
class MultiByteCharSetProber(CharSetProber):
def __init__(self):
CharSetProber.__init__(self)
self._mDistributionAnalyzer = None
self._mCodingSM = None
self._mLastChar = ['\x00', '\x00']
self._mLastChar = [0, 0]
def reset(self):
CharSetProber.reset(self)
@ -44,36 +45,39 @@ class MultiByteCharSetProber(CharSetProber):
self._mCodingSM.reset()
if self._mDistributionAnalyzer:
self._mDistributionAnalyzer.reset()
self._mLastChar = ['\x00', '\x00']
self._mLastChar = [0, 0]
def get_charset_name(self):
pass
def feed(self, aBuf):
aLen = len(aBuf)
for i in xrange(0, aLen):
for i in range(0, aLen):
codingState = self._mCodingSM.next_state(aBuf[i])
if codingState == eError:
if codingState == constants.eError:
if constants._debug:
sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n')
sys.stderr.write(self.get_charset_name()
+ ' prober hit error at byte ' + str(i)
+ '\n')
self._mState = constants.eNotMe
break
elif codingState == eItsMe:
elif codingState == constants.eItsMe:
self._mState = constants.eFoundIt
break
elif codingState == eStart:
elif codingState == constants.eStart:
charLen = self._mCodingSM.get_current_charlen()
if i == 0:
self._mLastChar[1] = aBuf[0]
self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
else:
self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen)
self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
charLen)
self._mLastChar[0] = aBuf[aLen - 1]
if self.get_state() == constants.eDetecting:
if self._mDistributionAnalyzer.got_enough_data() and \
(self.get_confidence() > constants.SHORTCUT_THRESHOLD):
if (self._mDistributionAnalyzer.got_enough_data() and
(self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
self._mState = constants.eFoundIt
return self.get_state()

View File

@ -27,24 +27,28 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from charsetgroupprober import CharSetGroupProber
from utf8prober import UTF8Prober
from sjisprober import SJISProber
from eucjpprober import EUCJPProber
from gb2312prober import GB2312Prober
from euckrprober import EUCKRProber
from big5prober import Big5Prober
from euctwprober import EUCTWProber
from .charsetgroupprober import CharSetGroupProber
from .utf8prober import UTF8Prober
from .sjisprober import SJISProber
from .eucjpprober import EUCJPProber
from .gb2312prober import GB2312Prober
from .euckrprober import EUCKRProber
from .cp949prober import CP949Prober
from .big5prober import Big5Prober
from .euctwprober import EUCTWProber
class MBCSGroupProber(CharSetGroupProber):
def __init__(self):
CharSetGroupProber.__init__(self)
self._mProbers = [ \
self._mProbers = [
UTF8Prober(),
SJISProber(),
EUCJPProber(),
GB2312Prober(),
EUCKRProber(),
CP949Prober(),
Big5Prober(),
EUCTWProber()]
EUCTWProber()
]
self.reset()

View File

@ -25,11 +25,11 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from constants import eStart, eError, eItsMe
from .constants import eStart, eError, eItsMe
# BIG5
BIG5_cls = ( \
BIG5_cls = (
1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as legal value
1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17
@ -61,12 +61,14 @@ BIG5_cls = ( \
3,3,3,3,3,3,3,3, # e0 - e7
3,3,3,3,3,3,3,3, # e8 - ef
3,3,3,3,3,3,3,3, # f0 - f7
3,3,3,3,3,3,3,0) # f8 - ff
3,3,3,3,3,3,3,0 # f8 - ff
)
BIG5_st = ( \
BIG5_st = (
eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07
eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,#08-0f
eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart)#10-17
eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart#10-17
)
Big5CharLenTable = (0, 1, 1, 2, 0)
@ -76,9 +78,49 @@ Big5SMModel = {'classTable': BIG5_cls,
'charLenTable': Big5CharLenTable,
'name': 'Big5'}
# CP949
CP949_cls = (
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,0,0, # 00 - 0f
1,1,1,1,1,1,1,1, 1,1,1,0,1,1,1,1, # 10 - 1f
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 20 - 2f
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 30 - 3f
1,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, # 40 - 4f
4,4,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 50 - 5f
1,5,5,5,5,5,5,5, 5,5,5,5,5,5,5,5, # 60 - 6f
5,5,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 70 - 7f
0,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 80 - 8f
6,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 90 - 9f
6,7,7,7,7,7,7,7, 7,7,7,7,7,8,8,8, # a0 - af
7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7, # b0 - bf
7,7,7,7,7,7,9,2, 2,3,2,2,2,2,2,2, # c0 - cf
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # d0 - df
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # e0 - ef
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,0, # f0 - ff
)
CP949_st = (
#cls= 0 1 2 3 4 5 6 7 8 9 # previous state =
eError,eStart, 3,eError,eStart,eStart, 4, 5,eError, 6, # eStart
eError,eError,eError,eError,eError,eError,eError,eError,eError,eError, # eError
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe, # eItsMe
eError,eError,eStart,eStart,eError,eError,eError,eStart,eStart,eStart, # 3
eError,eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart, # 4
eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart, # 5
eError,eStart,eStart,eStart,eStart,eError,eError,eStart,eStart,eStart, # 6
)
CP949CharLenTable = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2)
CP949SMModel = {'classTable': CP949_cls,
'classFactor': 10,
'stateTable': CP949_st,
'charLenTable': CP949CharLenTable,
'name': 'CP949'}
# EUC-JP
EUCJP_cls = ( \
EUCJP_cls = (
4,4,4,4,4,4,4,4, # 00 - 07
4,4,4,4,4,4,5,5, # 08 - 0f
4,4,4,4,4,4,4,4, # 10 - 17
@ -110,14 +152,16 @@ EUCJP_cls = ( \
0,0,0,0,0,0,0,0, # e0 - e7
0,0,0,0,0,0,0,0, # e8 - ef
0,0,0,0,0,0,0,0, # f0 - f7
0,0,0,0,0,0,0,5) # f8 - ff
0,0,0,0,0,0,0,5 # f8 - ff
)
EUCJP_st = ( \
EUCJP_st = (
3, 4, 3, 5,eStart,eError,eError,eError,#00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError,#10-17
eError,eError,eStart,eError,eError,eError, 3,eError,#18-1f
3,eError,eError,eError,eStart,eStart,eStart,eStart)#20-27
3,eError,eError,eError,eStart,eStart,eStart,eStart#20-27
)
EUCJPCharLenTable = (2, 2, 2, 3, 1, 0)
@ -129,7 +173,7 @@ EUCJPSMModel = {'classTable': EUCJP_cls,
# EUC-KR
EUCKR_cls = ( \
EUCKR_cls = (
1,1,1,1,1,1,1,1, # 00 - 07
1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17
@ -161,11 +205,13 @@ EUCKR_cls = ( \
2,2,2,2,2,2,2,2, # e0 - e7
2,2,2,2,2,2,2,2, # e8 - ef
2,2,2,2,2,2,2,2, # f0 - f7
2,2,2,2,2,2,2,0) # f8 - ff
2,2,2,2,2,2,2,0 # f8 - ff
)
EUCKR_st = (
eError,eStart, 3,eError,eError,eError,eError,eError,#00-07
eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart)#08-0f
eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart #08-0f
)
EUCKRCharLenTable = (0, 1, 2, 0)
@ -177,7 +223,7 @@ EUCKRSMModel = {'classTable': EUCKR_cls,
# EUC-TW
EUCTW_cls = ( \
EUCTW_cls = (
2,2,2,2,2,2,2,2, # 00 - 07
2,2,2,2,2,2,0,0, # 08 - 0f
2,2,2,2,2,2,2,2, # 10 - 17
@ -209,15 +255,17 @@ EUCTW_cls = ( \
3,3,3,3,3,3,3,3, # e0 - e7
3,3,3,3,3,3,3,3, # e8 - ef
3,3,3,3,3,3,3,3, # f0 - f7
3,3,3,3,3,3,3,0) # f8 - ff
3,3,3,3,3,3,3,0 # f8 - ff
)
EUCTW_st = ( \
EUCTW_st = (
eError,eError,eStart, 3, 3, 3, 4,eError,#00-07
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError,#10-17
eStart,eStart,eStart,eError,eError,eError,eError,eError,#18-1f
5,eError,eError,eError,eStart,eError,eStart,eStart,#20-27
eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart)#28-2f
eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart #28-2f
)
EUCTWCharLenTable = (0, 0, 1, 2, 2, 2, 3)
@ -229,7 +277,7 @@ EUCTWSMModel = {'classTable': EUCTW_cls,
# GB2312
GB2312_cls = ( \
GB2312_cls = (
1,1,1,1,1,1,1,1, # 00 - 07
1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17
@ -261,15 +309,17 @@ GB2312_cls = ( \
6,6,6,6,6,6,6,6, # e0 - e7
6,6,6,6,6,6,6,6, # e8 - ef
6,6,6,6,6,6,6,6, # f0 - f7
6,6,6,6,6,6,6,0) # f8 - ff
6,6,6,6,6,6,6,0 # f8 - ff
)
GB2312_st = ( \
GB2312_st = (
eError,eStart,eStart,eStart,eStart,eStart, 3,eError,#00-07
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,#10-17
4,eError,eStart,eStart,eError,eError,eError,eError,#18-1f
eError,eError, 5,eError,eError,eError,eItsMe,eError,#20-27
eError,eError,eStart,eStart,eStart,eStart,eStart,eStart)#28-2f
eError,eError,eStart,eStart,eStart,eStart,eStart,eStart #28-2f
)
# To be accurate, the length of class 6 can be either 2 or 4.
# But it is not necessary to discriminate between the two since
@ -286,7 +336,7 @@ GB2312SMModel = {'classTable': GB2312_cls,
# Shift_JIS
SJIS_cls = ( \
SJIS_cls = (
1,1,1,1,1,1,1,1, # 00 - 07
1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17
@ -303,7 +353,7 @@ SJIS_cls = ( \
2,2,2,2,2,2,2,2, # 68 - 6f
2,2,2,2,2,2,2,2, # 70 - 77
2,2,2,2,2,2,2,1, # 78 - 7f
3,3,3,3,3,3,3,3, # 80 - 87
3,3,3,3,3,2,2,3, # 80 - 87
3,3,3,3,3,3,3,3, # 88 - 8f
3,3,3,3,3,3,3,3, # 90 - 97
3,3,3,3,3,3,3,3, # 98 - 9f
@ -319,13 +369,15 @@ SJIS_cls = ( \
2,2,2,2,2,2,2,2, # d8 - df
3,3,3,3,3,3,3,3, # e0 - e7
3,3,3,3,3,4,4,4, # e8 - ef
4,4,4,4,4,4,4,4, # f0 - f7
4,4,4,4,4,0,0,0) # f8 - ff
3,3,3,3,3,3,3,3, # f0 - f7
3,3,3,3,3,0,0,0) # f8 - ff
SJIS_st = ( \
SJIS_st = (
eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart)#10-17
eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart #10-17
)
SJISCharLenTable = (0, 1, 1, 2, 0, 0)
@ -337,7 +389,7 @@ SJISSMModel = {'classTable': SJIS_cls,
# UCS2-BE
UCS2BE_cls = ( \
UCS2BE_cls = (
0,0,0,0,0,0,0,0, # 00 - 07
0,0,1,0,0,2,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
@ -369,16 +421,18 @@ UCS2BE_cls = ( \
0,0,0,0,0,0,0,0, # e0 - e7
0,0,0,0,0,0,0,0, # e8 - ef
0,0,0,0,0,0,0,0, # f0 - f7
0,0,0,0,0,0,4,5) # f8 - ff
0,0,0,0,0,0,4,5 # f8 - ff
)
UCS2BE_st = ( \
UCS2BE_st = (
5, 7, 7,eError, 4, 3,eError,eError,#00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe, 6, 6, 6, 6,eError,eError,#10-17
6, 6, 6, 6, 6,eItsMe, 6, 6,#18-1f
6, 6, 6, 6, 5, 7, 7,eError,#20-27
5, 8, 6, 6,eError, 6, 6, 6,#28-2f
6, 6, 6, 6,eError,eError,eStart,eStart)#30-37
6, 6, 6, 6,eError,eError,eStart,eStart #30-37
)
UCS2BECharLenTable = (2, 2, 2, 0, 2, 2)
@ -390,7 +444,7 @@ UCS2BESMModel = {'classTable': UCS2BE_cls,
# UCS2-LE
UCS2LE_cls = ( \
UCS2LE_cls = (
0,0,0,0,0,0,0,0, # 00 - 07
0,0,1,0,0,2,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
@ -422,16 +476,18 @@ UCS2LE_cls = ( \
0,0,0,0,0,0,0,0, # e0 - e7
0,0,0,0,0,0,0,0, # e8 - ef
0,0,0,0,0,0,0,0, # f0 - f7
0,0,0,0,0,0,4,5) # f8 - ff
0,0,0,0,0,0,4,5 # f8 - ff
)
UCS2LE_st = ( \
UCS2LE_st = (
6, 6, 7, 6, 4, 3,eError,eError,#00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f
eItsMe,eItsMe, 5, 5, 5,eError,eItsMe,eError,#10-17
5, 5, 5,eError, 5,eError, 6, 6,#18-1f
7, 6, 8, 8, 5, 5, 5,eError,#20-27
5, 5, 5,eError,eError,eError, 5, 5,#28-2f
5, 5, 5,eError, 5,eError,eStart,eStart)#30-37
5, 5, 5,eError, 5,eError,eStart,eStart #30-37
)
UCS2LECharLenTable = (2, 2, 2, 2, 2, 2)
@ -443,7 +499,7 @@ UCS2LESMModel = {'classTable': UCS2LE_cls,
# UTF-8
UTF8_cls = ( \
UTF8_cls = (
1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as a legal value
1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17
@ -475,9 +531,10 @@ UTF8_cls = ( \
7,8,8,8,8,8,8,8, # e0 - e7
8,8,8,8,8,9,8,8, # e8 - ef
10,11,11,11,11,11,11,11, # f0 - f7
12,13,13,13,14,15,0,0) # f8 - ff
12,13,13,13,14,15,0,0 # f8 - ff
)
UTF8_st = ( \
UTF8_st = (
eError,eStart,eError,eError,eError,eError, 12, 10,#00-07
9, 11, 8, 7, 6, 5, 4, 3,#08-0f
eError,eError,eError,eError,eError,eError,eError,eError,#10-17
@ -503,7 +560,8 @@ UTF8_st = ( \
eError,eError, 12, 12, 12,eError,eError,eError,#b0-b7
eError,eError,eError,eError,eError,eError,eError,eError,#b8-bf
eError,eError,eStart,eStart,eStart,eStart,eError,eError,#c0-c7
eError,eError,eError,eError,eError,eError,eError,eError)#c8-cf
eError,eError,eError,eError,eError,eError,eError,eError #c8-cf
)
UTF8CharLenTable = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6)

View File

@ -26,8 +26,10 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants, sys
from charsetprober import CharSetProber
import sys
from . import constants
from .charsetprober import CharSetProber
from .compat import wrap_ord
SAMPLE_SIZE = 64
SB_ENOUGH_REL_THRESHOLD = 1024
@ -38,21 +40,26 @@ NUMBER_OF_SEQ_CAT = 4
POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1
#NEGATIVE_CAT = 0
class SingleByteCharSetProber(CharSetProber):
def __init__(self, model, reversed=constants.False, nameProber=None):
def __init__(self, model, reversed=False, nameProber=None):
CharSetProber.__init__(self)
self._mModel = model
self._mReversed = reversed # TRUE if we need to reverse every pair in the model lookup
self._mNameProber = nameProber # Optional auxiliary prober for name decision
# TRUE if we need to reverse every pair in the model lookup
self._mReversed = reversed
# Optional auxiliary prober for name decision
self._mNameProber = nameProber
self.reset()
def reset(self):
CharSetProber.reset(self)
self._mLastOrder = 255 # char order of last character
# char order of last character
self._mLastOrder = 255
self._mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
self._mTotalSeqs = 0
self._mTotalChar = 0
self._mFreqChar = 0 # characters that fall in our sampling range
# characters that fall in our sampling range
self._mFreqChar = 0
def get_charset_name(self):
if self._mNameProber:
@ -67,7 +74,7 @@ class SingleByteCharSetProber(CharSetProber):
if not aLen:
return self.get_state()
for c in aBuf:
order = self._mModel['charToOrderMap'][ord(c)]
order = self._mModel['charToOrderMap'][wrap_ord(c)]
if order < SYMBOL_CAT_ORDER:
self._mTotalChar += 1
if order < SAMPLE_SIZE:
@ -75,9 +82,12 @@ class SingleByteCharSetProber(CharSetProber):
if self._mLastOrder < SAMPLE_SIZE:
self._mTotalSeqs += 1
if not self._mReversed:
self._mSeqCounters[self._mModel['precedenceMatrix'][(self._mLastOrder * SAMPLE_SIZE) + order]] += 1
i = (self._mLastOrder * SAMPLE_SIZE) + order
model = self._mModel['precedenceMatrix'][i]
else: # reverse the order of the letters in the lookup
self._mSeqCounters[self._mModel['precedenceMatrix'][(order * SAMPLE_SIZE) + self._mLastOrder]] += 1
i = (order * SAMPLE_SIZE) + self._mLastOrder
model = self._mModel['precedenceMatrix'][i]
self._mSeqCounters[model] += 1
self._mLastOrder = order
if self.get_state() == constants.eDetecting:
@ -85,11 +95,16 @@ class SingleByteCharSetProber(CharSetProber):
cf = self.get_confidence()
if cf > POSITIVE_SHORTCUT_THRESHOLD:
if constants._debug:
sys.stderr.write('%s confidence = %s, we have a winner\n' % (self._mModel['charsetName'], cf))
sys.stderr.write('%s confidence = %s, we have a'
'winner\n' %
(self._mModel['charsetName'], cf))
self._mState = constants.eFoundIt
elif cf < NEGATIVE_SHORTCUT_THRESHOLD:
if constants._debug:
sys.stderr.write('%s confidence = %s, below negative shortcut threshhold %s\n' % (self._mModel['charsetName'], cf, NEGATIVE_SHORTCUT_THRESHOLD))
sys.stderr.write('%s confidence = %s, below negative'
'shortcut threshhold %s\n' %
(self._mModel['charsetName'], cf,
NEGATIVE_SHORTCUT_THRESHOLD))
self._mState = constants.eNotMe
return self.get_state()
@ -97,9 +112,8 @@ class SingleByteCharSetProber(CharSetProber):
def get_confidence(self):
r = 0.01
if self._mTotalSeqs > 0:
# print self._mSeqCounters[POSITIVE_CAT], self._mTotalSeqs, self._mModel['mTypicalPositiveRatio']
r = (1.0 * self._mSeqCounters[POSITIVE_CAT]) / self._mTotalSeqs / self._mModel['mTypicalPositiveRatio']
# print r, self._mFreqChar, self._mTotalChar
r = ((1.0 * self._mSeqCounters[POSITIVE_CAT]) / self._mTotalSeqs
/ self._mModel['mTypicalPositiveRatio'])
r = r * self._mFreqChar / self._mTotalChar
if r >= 1.0:
r = 0.99

View File

@ -26,21 +26,23 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants, sys
from charsetgroupprober import CharSetGroupProber
from sbcharsetprober import SingleByteCharSetProber
from langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model
from langgreekmodel import Latin7GreekModel, Win1253GreekModel
from langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
from langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
from langthaimodel import TIS620ThaiModel
from langhebrewmodel import Win1255HebrewModel
from hebrewprober import HebrewProber
from .charsetgroupprober import CharSetGroupProber
from .sbcharsetprober import SingleByteCharSetProber
from .langcyrillicmodel import (Win1251CyrillicModel, Koi8rModel,
Latin5CyrillicModel, MacCyrillicModel,
Ibm866Model, Ibm855Model)
from .langgreekmodel import Latin7GreekModel, Win1253GreekModel
from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
from .langthaimodel import TIS620ThaiModel
from .langhebrewmodel import Win1255HebrewModel
from .hebrewprober import HebrewProber
class SBCSGroupProber(CharSetGroupProber):
def __init__(self):
CharSetGroupProber.__init__(self)
self._mProbers = [ \
self._mProbers = [
SingleByteCharSetProber(Win1251CyrillicModel),
SingleByteCharSetProber(Koi8rModel),
SingleByteCharSetProber(Latin5CyrillicModel),
@ -56,9 +58,12 @@ class SBCSGroupProber(CharSetGroupProber):
SingleByteCharSetProber(TIS620ThaiModel),
]
hebrewProber = HebrewProber()
logicalHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, constants.False, hebrewProber)
visualHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, constants.True, hebrewProber)
logicalHebrewProber = SingleByteCharSetProber(Win1255HebrewModel,
False, hebrewProber)
visualHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, True,
hebrewProber)
hebrewProber.set_model_probers(logicalHebrewProber, visualHebrewProber)
self._mProbers.extend([hebrewProber, logicalHebrewProber, visualHebrewProber])
self._mProbers.extend([hebrewProber, logicalHebrewProber,
visualHebrewProber])
self.reset()

View File

@ -25,13 +25,14 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from mbcharsetprober import MultiByteCharSetProber
from codingstatemachine import CodingStateMachine
from chardistribution import SJISDistributionAnalysis
from jpcntx import SJISContextAnalysis
from mbcssm import SJISSMModel
import constants, sys
from constants import eStart, eError, eItsMe
import sys
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import SJISDistributionAnalysis
from .jpcntx import SJISContextAnalysis
from .mbcssm import SJISSMModel
from . import constants
class SJISProber(MultiByteCharSetProber):
def __init__(self):
@ -46,35 +47,40 @@ class SJISProber(MultiByteCharSetProber):
self._mContextAnalyzer.reset()
def get_charset_name(self):
return "SHIFT_JIS"
return self._mContextAnalyzer.get_charset_name()
def feed(self, aBuf):
aLen = len(aBuf)
for i in xrange(0, aLen):
for i in range(0, aLen):
codingState = self._mCodingSM.next_state(aBuf[i])
if codingState == eError:
if codingState == constants.eError:
if constants._debug:
sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n')
sys.stderr.write(self.get_charset_name()
+ ' prober hit error at byte ' + str(i)
+ '\n')
self._mState = constants.eNotMe
break
elif codingState == eItsMe:
elif codingState == constants.eItsMe:
self._mState = constants.eFoundIt
break
elif codingState == eStart:
elif codingState == constants.eStart:
charLen = self._mCodingSM.get_current_charlen()
if i == 0:
self._mLastChar[1] = aBuf[0]
self._mContextAnalyzer.feed(self._mLastChar[2 - charLen :], charLen)
self._mContextAnalyzer.feed(self._mLastChar[2 - charLen:],
charLen)
self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
else:
self._mContextAnalyzer.feed(aBuf[i + 1 - charLen : i + 3 - charLen], charLen)
self._mDistributionAnalyzer.feed(aBuf[i - 1 : i + 1], charLen)
self._mContextAnalyzer.feed(aBuf[i + 1 - charLen:i + 3
- charLen], charLen)
self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
charLen)
self._mLastChar[0] = aBuf[aLen - 1]
if self.get_state() == constants.eDetecting:
if self._mContextAnalyzer.got_enough_data() and \
(self.get_confidence() > constants.SHORTCUT_THRESHOLD):
if (self._mContextAnalyzer.got_enough_data() and
(self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
self._mState = constants.eFoundIt
return self.get_state()

View File

@ -1,20 +0,0 @@
import sys, glob
sys.path.insert(0, '..')
from chardet.universaldetector import UniversalDetector
count = 0
u = UniversalDetector()
for f in glob.glob(sys.argv[1]):
print f.ljust(60),
u.reset()
for line in file(f, 'rb'):
u.feed(line)
if u.done: break
u.close()
result = u.result
if result['encoding']:
print result['encoding'], 'with confidence', result['confidence']
else:
print '******** no result'
count += 1
print count, 'tests'

View File

@ -26,11 +26,13 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants, sys
from latin1prober import Latin1Prober # windows-1252
from mbcsgroupprober import MBCSGroupProber # multi-byte character sets
from sbcsgroupprober import SBCSGroupProber # single-byte character sets
from escprober import EscCharSetProber # ISO-2122, etc.
from . import constants
import sys
import codecs
from .latin1prober import Latin1Prober # windows-1252
from .mbcsgroupprober import MBCSGroupProber # multi-byte character sets
from .sbcsgroupprober import SBCSGroupProber # single-byte character sets
from .escprober import EscCharSetProber # ISO-2122, etc.
import re
MINIMUM_THRESHOLD = 0.20
@ -38,68 +40,78 @@ ePureAscii = 0
eEscAscii = 1
eHighbyte = 2
class UniversalDetector:
def __init__(self):
self._highBitDetector = re.compile(r'[\x80-\xFF]')
self._escDetector = re.compile(r'(\033|~{)')
self._highBitDetector = re.compile(b'[\x80-\xFF]')
self._escDetector = re.compile(b'(\033|~{)')
self._mEscCharSetProber = None
self._mCharSetProbers = []
self.reset()
def reset(self):
self.result = {'encoding': None, 'confidence': 0.0}
self.done = constants.False
self._mStart = constants.True
self._mGotData = constants.False
self.done = False
self._mStart = True
self._mGotData = False
self._mInputState = ePureAscii
self._mLastChar = ''
self._mLastChar = b''
if self._mEscCharSetProber:
self._mEscCharSetProber.reset()
for prober in self._mCharSetProbers:
prober.reset()
def feed(self, aBuf):
if self.done: return
if self.done:
return
aLen = len(aBuf)
if not aLen: return
if not aLen:
return
if not self._mGotData:
# If the data starts with BOM, we know it is UTF
if aBuf[:3] == '\xEF\xBB\xBF':
if aBuf[:3] == codecs.BOM_UTF8:
# EF BB BF UTF-8 with BOM
self.result = {'encoding': "UTF-8", 'confidence': 1.0}
elif aBuf[:4] == '\xFF\xFE\x00\x00':
self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0}
elif aBuf[:4] == codecs.BOM_UTF32_LE:
# FF FE 00 00 UTF-32, little-endian BOM
self.result = {'encoding': "UTF-32LE", 'confidence': 1.0}
elif aBuf[:4] == '\x00\x00\xFE\xFF':
elif aBuf[:4] == codecs.BOM_UTF32_BE:
# 00 00 FE FF UTF-32, big-endian BOM
self.result = {'encoding': "UTF-32BE", 'confidence': 1.0}
elif aBuf[:4] == '\xFE\xFF\x00\x00':
elif aBuf[:4] == b'\xFE\xFF\x00\x00':
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
self.result = {'encoding': "X-ISO-10646-UCS-4-3412", 'confidence': 1.0}
elif aBuf[:4] == '\x00\x00\xFF\xFE':
self.result = {
'encoding': "X-ISO-10646-UCS-4-3412",
'confidence': 1.0
}
elif aBuf[:4] == b'\x00\x00\xFF\xFE':
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
self.result = {'encoding': "X-ISO-10646-UCS-4-2143", 'confidence': 1.0}
elif aBuf[:2] == '\xFF\xFE':
self.result = {
'encoding': "X-ISO-10646-UCS-4-2143",
'confidence': 1.0
}
elif aBuf[:2] == codecs.BOM_LE:
# FF FE UTF-16, little endian BOM
self.result = {'encoding': "UTF-16LE", 'confidence': 1.0}
elif aBuf[:2] == '\xFE\xFF':
elif aBuf[:2] == codecs.BOM_BE:
# FE FF UTF-16, big endian BOM
self.result = {'encoding': "UTF-16BE", 'confidence': 1.0}
self._mGotData = constants.True
self._mGotData = True
if self.result['encoding'] and (self.result['confidence'] > 0.0):
self.done = constants.True
self.done = True
return
if self._mInputState == ePureAscii:
if self._highBitDetector.search(aBuf):
self._mInputState = eHighbyte
elif (self._mInputState == ePureAscii) and self._escDetector.search(self._mLastChar + aBuf):
elif ((self._mInputState == ePureAscii) and
self._escDetector.search(self._mLastChar + aBuf)):
self._mInputState = eEscAscii
self._mLastChar = aBuf[-1]
self._mLastChar = aBuf[-1:]
if self._mInputState == eEscAscii:
if not self._mEscCharSetProber:
@ -107,24 +119,26 @@ class UniversalDetector:
if self._mEscCharSetProber.feed(aBuf) == constants.eFoundIt:
self.result = {'encoding': self._mEscCharSetProber.get_charset_name(),
'confidence': self._mEscCharSetProber.get_confidence()}
self.done = constants.True
self.done = True
elif self._mInputState == eHighbyte:
if not self._mCharSetProbers:
self._mCharSetProbers = [MBCSGroupProber(), SBCSGroupProber(), Latin1Prober()]
self._mCharSetProbers = [MBCSGroupProber(), SBCSGroupProber(),
Latin1Prober()]
for prober in self._mCharSetProbers:
if prober.feed(aBuf) == constants.eFoundIt:
self.result = {'encoding': prober.get_charset_name(),
'confidence': prober.get_confidence()}
self.done = constants.True
self.done = True
break
def close(self):
if self.done: return
if self.done:
return
if not self._mGotData:
if constants._debug:
sys.stderr.write('no data received!\n')
return
self.done = constants.True
self.done = True
if self._mInputState == ePureAscii:
self.result = {'encoding': 'ascii', 'confidence': 1.0}
@ -135,7 +149,8 @@ class UniversalDetector:
maxProberConfidence = 0.0
maxProber = None
for prober in self._mCharSetProbers:
if not prober: continue
if not prober:
continue
proberConfidence = prober.get_confidence()
if proberConfidence > maxProberConfidence:
maxProberConfidence = proberConfidence
@ -148,7 +163,8 @@ class UniversalDetector:
if constants._debug:
sys.stderr.write('no probers hit minimum threshhold\n')
for prober in self._mCharSetProbers[0].mProbers:
if not prober: continue
sys.stderr.write('%s confidence = %s\n' % \
(prober.get_charset_name(), \
if not prober:
continue
sys.stderr.write('%s confidence = %s\n' %
(prober.get_charset_name(),
prober.get_confidence()))

View File

@ -25,14 +25,14 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants, sys
from constants import eStart, eError, eItsMe
from charsetprober import CharSetProber
from codingstatemachine import CodingStateMachine
from mbcssm import UTF8SMModel
from . import constants
from .charsetprober import CharSetProber
from .codingstatemachine import CodingStateMachine
from .mbcssm import UTF8SMModel
ONE_CHAR_PROB = 0.5
class UTF8Prober(CharSetProber):
def __init__(self):
CharSetProber.__init__(self)
@ -50,13 +50,13 @@ class UTF8Prober(CharSetProber):
def feed(self, aBuf):
for c in aBuf:
codingState = self._mCodingSM.next_state(c)
if codingState == eError:
if codingState == constants.eError:
self._mState = constants.eNotMe
break
elif codingState == eItsMe:
elif codingState == constants.eItsMe:
self._mState = constants.eFoundIt
break
elif codingState == eStart:
elif codingState == constants.eStart:
if self._mCodingSM.get_current_charlen() >= 2:
self._mNumOfMBChar += 1
@ -69,7 +69,7 @@ class UTF8Prober(CharSetProber):
def get_confidence(self):
unlike = 0.99
if self._mNumOfMBChar < 6:
for i in xrange(0, self._mNumOfMBChar):
for i in range(0, self._mNumOfMBChar):
unlike = unlike * ONE_CHAR_PROB
return 1.0 - unlike
else: