diff --git a/thirdparty/chardet/__init__.py b/thirdparty/chardet/__init__.py index 953b39942..82c2a48d2 100644 --- a/thirdparty/chardet/__init__.py +++ b/thirdparty/chardet/__init__.py @@ -3,22 +3,28 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -__version__ = "2.0.1" +__version__ = "2.3.0" +from sys import version_info + def detect(aBuf): - import universaldetector + if ((version_info < (3, 0) and isinstance(aBuf, unicode)) or + (version_info >= (3, 0) and not isinstance(aBuf, bytes))): + raise ValueError('Expected a bytes object, not a unicode object') + + from . import universaldetector u = universaldetector.UniversalDetector() u.reset() u.feed(aBuf) diff --git a/thirdparty/chardet/big5freq.py b/thirdparty/chardet/big5freq.py index c1b0f3cec..65bffc04b 100644 --- a/thirdparty/chardet/big5freq.py +++ b/thirdparty/chardet/big5freq.py @@ -1,11 +1,11 @@ ######################## BEGIN LICENSE BLOCK ######################## # The Original Code is Mozilla Communicator client code. -# +# # The Initial Developer of the Original Code is # Netscape Communications Corporation. # Portions created by the Initial Developer are Copyright (C) 1998 # the Initial Developer. All Rights Reserved. -# +# # Contributor(s): # Mark Pilgrim - port to Python # @@ -13,12 +13,12 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA @@ -26,18 +26,18 @@ ######################### END LICENSE BLOCK ######################### # Big5 frequency table -# by Taiwan's Mandarin Promotion Council +# by Taiwan's Mandarin Promotion Council # -# +# # 128 --> 0.42261 # 256 --> 0.57851 # 512 --> 0.74851 # 1024 --> 0.89384 # 2048 --> 0.97583 -# +# # Ideal Distribution Ratio = 0.74851/(1-0.74851) =2.98 # Random Distribution Ration = 512/(5401-512)=0.105 -# +# # Typical Distribution Ratio about 25% of Ideal one, still much higher than RDR BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75 @@ -45,7 +45,7 @@ BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75 #Char to FreqOrder table BIG5_TABLE_SIZE = 5376 -Big5CharToFreqOrder = ( \ +Big5CharToFreqOrder = ( 1,1801,1506, 255,1431, 198, 9, 82, 6,5008, 177, 202,3681,1256,2821, 110, # 16 3814, 33,3274, 261, 76, 44,2114, 16,2946,2187,1176, 659,3971, 26,3451,2653, # 32 1198,3972,3350,4202, 410,2215, 302, 590, 361,1964, 8, 204, 58,4510,5009,1932, # 48 @@ -921,3 +921,5 @@ Big5CharToFreqOrder = ( \ 13936,13937,13938,13939,13940,13941,13942,13943,13944,13945,13946,13947,13948,13949,13950,13951, #13952 13952,13953,13954,13955,13956,13957,13958,13959,13960,13961,13962,13963,13964,13965,13966,13967, #13968 13968,13969,13970,13971,13972) #13973 + +# flake8: noqa diff --git a/thirdparty/chardet/big5prober.py b/thirdparty/chardet/big5prober.py index e6b52aadb..becce81e5 100644 --- a/thirdparty/chardet/big5prober.py +++ b/thirdparty/chardet/big5prober.py @@ -1,11 +1,11 @@ ######################## BEGIN LICENSE BLOCK ######################## # The Original Code is Mozilla Communicator client code. -# +# # The Initial Developer of the Original Code is # Netscape Communications Corporation. # Portions created by the Initial Developer are Copyright (C) 1998 # the Initial Developer. All Rights Reserved. -# +# # Contributor(s): # Mark Pilgrim - port to Python # @@ -13,22 +13,23 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from mbcharsetprober import MultiByteCharSetProber -from codingstatemachine import CodingStateMachine -from chardistribution import Big5DistributionAnalysis -from mbcssm import Big5SMModel +from .mbcharsetprober import MultiByteCharSetProber +from .codingstatemachine import CodingStateMachine +from .chardistribution import Big5DistributionAnalysis +from .mbcssm import Big5SMModel + class Big5Prober(MultiByteCharSetProber): def __init__(self): diff --git a/thirdparty/chardet/chardetect.py b/thirdparty/chardet/chardetect.py new file mode 100644 index 000000000..ffe892f25 --- /dev/null +++ b/thirdparty/chardet/chardetect.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python +""" +Script which takes one or more file paths and reports on their detected +encodings + +Example:: + + % chardetect somefile someotherfile + somefile: windows-1252 with confidence 0.5 + someotherfile: ascii with confidence 1.0 + +If no paths are provided, it takes its input from stdin. + +""" + +from __future__ import absolute_import, print_function, unicode_literals + +import argparse +import sys +from io import open + +from chardet import __version__ +from chardet.universaldetector import UniversalDetector + + +def description_of(lines, name='stdin'): + """ + Return a string describing the probable encoding of a file or + list of strings. + + :param lines: The lines to get the encoding of. + :type lines: Iterable of bytes + :param name: Name of file or collection of lines + :type name: str + """ + u = UniversalDetector() + for line in lines: + u.feed(line) + u.close() + result = u.result + if result['encoding']: + return '{0}: {1} with confidence {2}'.format(name, result['encoding'], + result['confidence']) + else: + return '{0}: no result'.format(name) + + +def main(argv=None): + ''' + Handles command line arguments and gets things started. + + :param argv: List of arguments, as if specified on the command-line. + If None, ``sys.argv[1:]`` is used instead. + :type argv: list of str + ''' + # Get command line arguments + parser = argparse.ArgumentParser( + description="Takes one or more file paths and reports their detected \ + encodings", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + conflict_handler='resolve') + parser.add_argument('input', + help='File whose encoding we would like to determine.', + type=argparse.FileType('rb'), nargs='*', + default=[sys.stdin]) + parser.add_argument('--version', action='version', + version='%(prog)s {0}'.format(__version__)) + args = parser.parse_args(argv) + + for f in args.input: + if f.isatty(): + print("You are running chardetect interactively. Press " + + "CTRL-D twice at the start of a blank line to signal the " + + "end of your input. If you want help, run chardetect " + + "--help\n", file=sys.stderr) + print(description_of(f, f.name)) + + +if __name__ == '__main__': + main() diff --git a/thirdparty/chardet/chardistribution.py b/thirdparty/chardet/chardistribution.py index 1f95fc848..4e64a00be 100644 --- a/thirdparty/chardet/chardistribution.py +++ b/thirdparty/chardet/chardistribution.py @@ -1,11 +1,11 @@ ######################## BEGIN LICENSE BLOCK ######################## # The Original Code is Mozilla Communicator client code. -# +# # The Initial Developer of the Original Code is # Netscape Communications Corporation. # Portions created by the Initial Developer are Copyright (C) 1998 # the Initial Developer. All Rights Reserved. -# +# # Contributor(s): # Mark Pilgrim - port to Python # @@ -13,47 +13,63 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants -from euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO -from euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO -from gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO -from big5freq import Big5CharToFreqOrder, BIG5_TABLE_SIZE, BIG5_TYPICAL_DISTRIBUTION_RATIO -from jisfreq import JISCharToFreqOrder, JIS_TABLE_SIZE, JIS_TYPICAL_DISTRIBUTION_RATIO +from .euctwfreq import (EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, + EUCTW_TYPICAL_DISTRIBUTION_RATIO) +from .euckrfreq import (EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, + EUCKR_TYPICAL_DISTRIBUTION_RATIO) +from .gb2312freq import (GB2312CharToFreqOrder, GB2312_TABLE_SIZE, + GB2312_TYPICAL_DISTRIBUTION_RATIO) +from .big5freq import (Big5CharToFreqOrder, BIG5_TABLE_SIZE, + BIG5_TYPICAL_DISTRIBUTION_RATIO) +from .jisfreq import (JISCharToFreqOrder, JIS_TABLE_SIZE, + JIS_TYPICAL_DISTRIBUTION_RATIO) +from .compat import wrap_ord ENOUGH_DATA_THRESHOLD = 1024 SURE_YES = 0.99 SURE_NO = 0.01 +MINIMUM_DATA_THRESHOLD = 3 + class CharDistributionAnalysis: def __init__(self): - self._mCharToFreqOrder = None # Mapping table to get frequency order from char order (get from GetOrder()) - self._mTableSize = None # Size of above table - self._mTypicalDistributionRatio = None # This is a constant value which varies from language to language, used in calculating confidence. See http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html for further detail. + # Mapping table to get frequency order from char order (get from + # GetOrder()) + self._mCharToFreqOrder = None + self._mTableSize = None # Size of above table + # This is a constant value which varies from language to language, + # used in calculating confidence. See + # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html + # for further detail. + self._mTypicalDistributionRatio = None self.reset() def reset(self): """reset analyser, clear any state""" - self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made - self._mTotalChars = 0 # Total characters encountered - self._mFreqChars = 0 # The number of characters whose frequency order is less than 512 + # If this flag is set to True, detection is done and conclusion has + # been made + self._mDone = False + self._mTotalChars = 0 # Total characters encountered + # The number of characters whose frequency order is less than 512 + self._mFreqChars = 0 - def feed(self, aStr, aCharLen): + def feed(self, aBuf, aCharLen): """feed a character with known length""" if aCharLen == 2: # we only care about 2-bytes character in our distribution analysis - order = self.get_order(aStr) + order = self.get_order(aBuf) else: order = -1 if order >= 0: @@ -65,12 +81,14 @@ class CharDistributionAnalysis: def get_confidence(self): """return confidence based on existing data""" - # if we didn't receive any character in our consideration range, return negative answer - if self._mTotalChars <= 0: + # if we didn't receive any character in our consideration range, + # return negative answer + if self._mTotalChars <= 0 or self._mFreqChars <= MINIMUM_DATA_THRESHOLD: return SURE_NO if self._mTotalChars != self._mFreqChars: - r = self._mFreqChars / ((self._mTotalChars - self._mFreqChars) * self._mTypicalDistributionRatio) + r = (self._mFreqChars / ((self._mTotalChars - self._mFreqChars) + * self._mTypicalDistributionRatio)) if r < SURE_YES: return r @@ -78,16 +96,18 @@ class CharDistributionAnalysis: return SURE_YES def got_enough_data(self): - # It is not necessary to receive all data to draw conclusion. For charset detection, - # certain amount of data is enough + # It is not necessary to receive all data to draw conclusion. + # For charset detection, certain amount of data is enough return self._mTotalChars > ENOUGH_DATA_THRESHOLD - def get_order(self, aStr): - # We do not handle characters based on the original encoding string, but - # convert this encoding string to a number, here called order. - # This allows multiple encodings of a language to share one frequency table. + def get_order(self, aBuf): + # We do not handle characters based on the original encoding string, + # but convert this encoding string to a number, here called order. + # This allows multiple encodings of a language to share one frequency + # table. return -1 + class EUCTWDistributionAnalysis(CharDistributionAnalysis): def __init__(self): CharDistributionAnalysis.__init__(self) @@ -95,16 +115,18 @@ class EUCTWDistributionAnalysis(CharDistributionAnalysis): self._mTableSize = EUCTW_TABLE_SIZE self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, aStr): - # for euc-TW encoding, we are interested + def get_order(self, aBuf): + # for euc-TW encoding, we are interested # first byte range: 0xc4 -- 0xfe # second byte range: 0xa1 -- 0xfe # no validation needed here. State machine has done that - if aStr[0] >= '\xC4': - return 94 * (ord(aStr[0]) - 0xC4) + ord(aStr[1]) - 0xA1 + first_char = wrap_ord(aBuf[0]) + if first_char >= 0xC4: + return 94 * (first_char - 0xC4) + wrap_ord(aBuf[1]) - 0xA1 else: return -1 + class EUCKRDistributionAnalysis(CharDistributionAnalysis): def __init__(self): CharDistributionAnalysis.__init__(self) @@ -112,15 +134,17 @@ class EUCKRDistributionAnalysis(CharDistributionAnalysis): self._mTableSize = EUCKR_TABLE_SIZE self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, aStr): - # for euc-KR encoding, we are interested + def get_order(self, aBuf): + # for euc-KR encoding, we are interested # first byte range: 0xb0 -- 0xfe # second byte range: 0xa1 -- 0xfe # no validation needed here. State machine has done that - if aStr[0] >= '\xB0': - return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1 + first_char = wrap_ord(aBuf[0]) + if first_char >= 0xB0: + return 94 * (first_char - 0xB0) + wrap_ord(aBuf[1]) - 0xA1 else: - return -1; + return -1 + class GB2312DistributionAnalysis(CharDistributionAnalysis): def __init__(self): @@ -129,15 +153,17 @@ class GB2312DistributionAnalysis(CharDistributionAnalysis): self._mTableSize = GB2312_TABLE_SIZE self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, aStr): - # for GB2312 encoding, we are interested + def get_order(self, aBuf): + # for GB2312 encoding, we are interested # first byte range: 0xb0 -- 0xfe # second byte range: 0xa1 -- 0xfe # no validation needed here. State machine has done that - if (aStr[0] >= '\xB0') and (aStr[1] >= '\xA1'): - return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1 + first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1]) + if (first_char >= 0xB0) and (second_char >= 0xA1): + return 94 * (first_char - 0xB0) + second_char - 0xA1 else: - return -1; + return -1 + class Big5DistributionAnalysis(CharDistributionAnalysis): def __init__(self): @@ -146,19 +172,21 @@ class Big5DistributionAnalysis(CharDistributionAnalysis): self._mTableSize = BIG5_TABLE_SIZE self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, aStr): - # for big5 encoding, we are interested + def get_order(self, aBuf): + # for big5 encoding, we are interested # first byte range: 0xa4 -- 0xfe # second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe # no validation needed here. State machine has done that - if aStr[0] >= '\xA4': - if aStr[1] >= '\xA1': - return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0xA1 + 63 + first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1]) + if first_char >= 0xA4: + if second_char >= 0xA1: + return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63 else: - return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0x40 + return 157 * (first_char - 0xA4) + second_char - 0x40 else: return -1 + class SJISDistributionAnalysis(CharDistributionAnalysis): def __init__(self): CharDistributionAnalysis.__init__(self) @@ -166,22 +194,24 @@ class SJISDistributionAnalysis(CharDistributionAnalysis): self._mTableSize = JIS_TABLE_SIZE self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, aStr): - # for sjis encoding, we are interested + def get_order(self, aBuf): + # for sjis encoding, we are interested # first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe # second byte range: 0x40 -- 0x7e, 0x81 -- oxfe # no validation needed here. State machine has done that - if (aStr[0] >= '\x81') and (aStr[0] <= '\x9F'): - order = 188 * (ord(aStr[0]) - 0x81) - elif (aStr[0] >= '\xE0') and (aStr[0] <= '\xEF'): - order = 188 * (ord(aStr[0]) - 0xE0 + 31) + first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1]) + if (first_char >= 0x81) and (first_char <= 0x9F): + order = 188 * (first_char - 0x81) + elif (first_char >= 0xE0) and (first_char <= 0xEF): + order = 188 * (first_char - 0xE0 + 31) else: - return -1; - order = order + ord(aStr[1]) - 0x40 - if aStr[1] > '\x7F': - order =- 1 + return -1 + order = order + second_char - 0x40 + if second_char > 0x7F: + order = -1 return order + class EUCJPDistributionAnalysis(CharDistributionAnalysis): def __init__(self): CharDistributionAnalysis.__init__(self) @@ -189,12 +219,13 @@ class EUCJPDistributionAnalysis(CharDistributionAnalysis): self._mTableSize = JIS_TABLE_SIZE self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, aStr): - # for euc-JP encoding, we are interested + def get_order(self, aBuf): + # for euc-JP encoding, we are interested # first byte range: 0xa0 -- 0xfe # second byte range: 0xa1 -- 0xfe # no validation needed here. State machine has done that - if aStr[0] >= '\xA0': - return 94 * (ord(aStr[0]) - 0xA1) + ord(aStr[1]) - 0xa1 + char = wrap_ord(aBuf[0]) + if char >= 0xA0: + return 94 * (char - 0xA1) + wrap_ord(aBuf[1]) - 0xa1 else: return -1 diff --git a/thirdparty/chardet/charsetgroupprober.py b/thirdparty/chardet/charsetgroupprober.py index 9037af480..85e7a1c67 100644 --- a/thirdparty/chardet/charsetgroupprober.py +++ b/thirdparty/chardet/charsetgroupprober.py @@ -25,8 +25,10 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants, sys -from charsetprober import CharSetProber +from . import constants +import sys +from .charsetprober import CharSetProber + class CharSetGroupProber(CharSetProber): def __init__(self): @@ -41,28 +43,32 @@ class CharSetGroupProber(CharSetProber): for prober in self._mProbers: if prober: prober.reset() - prober.active = constants.True + prober.active = True self._mActiveNum += 1 self._mBestGuessProber = None def get_charset_name(self): if not self._mBestGuessProber: self.get_confidence() - if not self._mBestGuessProber: return None + if not self._mBestGuessProber: + return None # self._mBestGuessProber = self._mProbers[0] return self._mBestGuessProber.get_charset_name() def feed(self, aBuf): for prober in self._mProbers: - if not prober: continue - if not prober.active: continue + if not prober: + continue + if not prober.active: + continue st = prober.feed(aBuf) - if not st: continue + if not st: + continue if st == constants.eFoundIt: self._mBestGuessProber = prober return self.get_state() elif st == constants.eNotMe: - prober.active = constants.False + prober.active = False self._mActiveNum -= 1 if self._mActiveNum <= 0: self._mState = constants.eNotMe @@ -78,18 +84,22 @@ class CharSetGroupProber(CharSetProber): bestConf = 0.0 self._mBestGuessProber = None for prober in self._mProbers: - if not prober: continue + if not prober: + continue if not prober.active: if constants._debug: - sys.stderr.write(prober.get_charset_name() + ' not active\n') + sys.stderr.write(prober.get_charset_name() + + ' not active\n') continue cf = prober.get_confidence() if constants._debug: - sys.stderr.write('%s confidence = %s\n' % (prober.get_charset_name(), cf)) + sys.stderr.write('%s confidence = %s\n' % + (prober.get_charset_name(), cf)) if bestConf < cf: bestConf = cf self._mBestGuessProber = prober - if not self._mBestGuessProber: return 0.0 + if not self._mBestGuessProber: + return 0.0 return bestConf # else: # self._mBestGuessProber = self._mProbers[0] diff --git a/thirdparty/chardet/charsetprober.py b/thirdparty/chardet/charsetprober.py index 6ad198cd4..97581712c 100644 --- a/thirdparty/chardet/charsetprober.py +++ b/thirdparty/chardet/charsetprober.py @@ -1,11 +1,11 @@ ######################## BEGIN LICENSE BLOCK ######################## # The Original Code is Mozilla Universal charset detector code. -# +# # The Initial Developer of the Original Code is # Netscape Communications Corporation. # Portions created by the Initial Developer are Copyright (C) 2001 # the Initial Developer. All Rights Reserved. -# +# # Contributor(s): # Mark Pilgrim - port to Python # Shy Shalom - original C code @@ -14,19 +14,21 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants, re +from . import constants +import re + class CharSetProber: def __init__(self): @@ -48,11 +50,11 @@ class CharSetProber: return 0.0 def filter_high_bit_only(self, aBuf): - aBuf = re.sub(r'([\x00-\x7F])+', ' ', aBuf) + aBuf = re.sub(b'([\x00-\x7F])+', b' ', aBuf) return aBuf def filter_without_english_letters(self, aBuf): - aBuf = re.sub(r'([A-Za-z])+', ' ', aBuf) + aBuf = re.sub(b'([A-Za-z])+', b' ', aBuf) return aBuf def filter_with_english_letters(self, aBuf): diff --git a/thirdparty/chardet/codingstatemachine.py b/thirdparty/chardet/codingstatemachine.py index 452d3b0a0..8dd8c9179 100644 --- a/thirdparty/chardet/codingstatemachine.py +++ b/thirdparty/chardet/codingstatemachine.py @@ -13,19 +13,21 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from constants import eStart, eError, eItsMe +from .constants import eStart +from .compat import wrap_ord + class CodingStateMachine: def __init__(self, sm): @@ -40,12 +42,15 @@ class CodingStateMachine: def next_state(self, c): # for each byte we get its class # if it is first byte, we also get byte length - byteCls = self._mModel['classTable'][ord(c)] + # PY3K: aBuf is a byte stream, so c is an int, not a byte + byteCls = self._mModel['classTable'][wrap_ord(c)] if self._mCurrentState == eStart: self._mCurrentBytePos = 0 self._mCurrentCharLen = self._mModel['charLenTable'][byteCls] # from byte's class and stateTable, we get its next state - self._mCurrentState = self._mModel['stateTable'][self._mCurrentState * self._mModel['classFactor'] + byteCls] + curr_state = (self._mCurrentState * self._mModel['classFactor'] + + byteCls) + self._mCurrentState = self._mModel['stateTable'][curr_state] self._mCurrentBytePos += 1 return self._mCurrentState diff --git a/thirdparty/chardet/compat.py b/thirdparty/chardet/compat.py new file mode 100644 index 000000000..d9e30addf --- /dev/null +++ b/thirdparty/chardet/compat.py @@ -0,0 +1,34 @@ +######################## BEGIN LICENSE BLOCK ######################## +# Contributor(s): +# Ian Cordasco - port to Python +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA +# 02110-1301 USA +######################### END LICENSE BLOCK ######################### + +import sys + + +if sys.version_info < (3, 0): + base_str = (str, unicode) +else: + base_str = (bytes, str) + + +def wrap_ord(a): + if sys.version_info < (3, 0) and isinstance(a, base_str): + return ord(a) + else: + return a diff --git a/thirdparty/chardet/constants.py b/thirdparty/chardet/constants.py index e94e226b0..e4d148b3c 100644 --- a/thirdparty/chardet/constants.py +++ b/thirdparty/chardet/constants.py @@ -37,11 +37,3 @@ eError = 1 eItsMe = 2 SHORTCUT_THRESHOLD = 0.95 - -import __builtin__ -if not hasattr(__builtin__, 'False'): - False = 0 - True = 1 -else: - False = __builtin__.False - True = __builtin__.True diff --git a/thirdparty/chardet/cp949prober.py b/thirdparty/chardet/cp949prober.py new file mode 100644 index 000000000..ff4272f82 --- /dev/null +++ b/thirdparty/chardet/cp949prober.py @@ -0,0 +1,44 @@ +######################## BEGIN LICENSE BLOCK ######################## +# The Original Code is mozilla.org code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 1998 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Mark Pilgrim - port to Python +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA +# 02110-1301 USA +######################### END LICENSE BLOCK ######################### + +from .mbcharsetprober import MultiByteCharSetProber +from .codingstatemachine import CodingStateMachine +from .chardistribution import EUCKRDistributionAnalysis +from .mbcssm import CP949SMModel + + +class CP949Prober(MultiByteCharSetProber): + def __init__(self): + MultiByteCharSetProber.__init__(self) + self._mCodingSM = CodingStateMachine(CP949SMModel) + # NOTE: CP949 is a superset of EUC-KR, so the distribution should be + # not different. + self._mDistributionAnalyzer = EUCKRDistributionAnalysis() + self.reset() + + def get_charset_name(self): + return "CP949" diff --git a/thirdparty/chardet/escprober.py b/thirdparty/chardet/escprober.py index c2e979e7b..80a844ff3 100644 --- a/thirdparty/chardet/escprober.py +++ b/thirdparty/chardet/escprober.py @@ -13,39 +13,43 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants, sys -from escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel -from charsetprober import CharSetProber -from codingstatemachine import CodingStateMachine +from . import constants +from .escsm import (HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, + ISO2022KRSMModel) +from .charsetprober import CharSetProber +from .codingstatemachine import CodingStateMachine +from .compat import wrap_ord + class EscCharSetProber(CharSetProber): def __init__(self): CharSetProber.__init__(self) - self._mCodingSM = [ \ + self._mCodingSM = [ CodingStateMachine(HZSMModel), CodingStateMachine(ISO2022CNSMModel), CodingStateMachine(ISO2022JPSMModel), CodingStateMachine(ISO2022KRSMModel) - ] + ] self.reset() def reset(self): CharSetProber.reset(self) for codingSM in self._mCodingSM: - if not codingSM: continue - codingSM.active = constants.True + if not codingSM: + continue + codingSM.active = True codingSM.reset() self._mActiveSM = len(self._mCodingSM) self._mDetectedCharset = None @@ -61,19 +65,22 @@ class EscCharSetProber(CharSetProber): def feed(self, aBuf): for c in aBuf: + # PY3K: aBuf is a byte array, so c is an int, not a byte for codingSM in self._mCodingSM: - if not codingSM: continue - if not codingSM.active: continue - codingState = codingSM.next_state(c) + if not codingSM: + continue + if not codingSM.active: + continue + codingState = codingSM.next_state(wrap_ord(c)) if codingState == constants.eError: - codingSM.active = constants.False + codingSM.active = False self._mActiveSM -= 1 if self._mActiveSM <= 0: self._mState = constants.eNotMe return self.get_state() elif codingState == constants.eItsMe: self._mState = constants.eFoundIt - self._mDetectedCharset = codingSM.get_coding_state_machine() + self._mDetectedCharset = codingSM.get_coding_state_machine() # nopep8 return self.get_state() return self.get_state() diff --git a/thirdparty/chardet/escsm.py b/thirdparty/chardet/escsm.py index 9fa22952e..bd302b4c6 100644 --- a/thirdparty/chardet/escsm.py +++ b/thirdparty/chardet/escsm.py @@ -13,62 +13,62 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from constants import eStart, eError, eItsMe +from .constants import eStart, eError, eItsMe -HZ_cls = ( \ -1,0,0,0,0,0,0,0, # 00 - 07 -0,0,0,0,0,0,0,0, # 08 - 0f -0,0,0,0,0,0,0,0, # 10 - 17 -0,0,0,1,0,0,0,0, # 18 - 1f -0,0,0,0,0,0,0,0, # 20 - 27 -0,0,0,0,0,0,0,0, # 28 - 2f -0,0,0,0,0,0,0,0, # 30 - 37 -0,0,0,0,0,0,0,0, # 38 - 3f -0,0,0,0,0,0,0,0, # 40 - 47 -0,0,0,0,0,0,0,0, # 48 - 4f -0,0,0,0,0,0,0,0, # 50 - 57 -0,0,0,0,0,0,0,0, # 58 - 5f -0,0,0,0,0,0,0,0, # 60 - 67 -0,0,0,0,0,0,0,0, # 68 - 6f -0,0,0,0,0,0,0,0, # 70 - 77 -0,0,0,4,0,5,2,0, # 78 - 7f -1,1,1,1,1,1,1,1, # 80 - 87 -1,1,1,1,1,1,1,1, # 88 - 8f -1,1,1,1,1,1,1,1, # 90 - 97 -1,1,1,1,1,1,1,1, # 98 - 9f -1,1,1,1,1,1,1,1, # a0 - a7 -1,1,1,1,1,1,1,1, # a8 - af -1,1,1,1,1,1,1,1, # b0 - b7 -1,1,1,1,1,1,1,1, # b8 - bf -1,1,1,1,1,1,1,1, # c0 - c7 -1,1,1,1,1,1,1,1, # c8 - cf -1,1,1,1,1,1,1,1, # d0 - d7 -1,1,1,1,1,1,1,1, # d8 - df -1,1,1,1,1,1,1,1, # e0 - e7 -1,1,1,1,1,1,1,1, # e8 - ef -1,1,1,1,1,1,1,1, # f0 - f7 -1,1,1,1,1,1,1,1, # f8 - ff +HZ_cls = ( +1,0,0,0,0,0,0,0, # 00 - 07 +0,0,0,0,0,0,0,0, # 08 - 0f +0,0,0,0,0,0,0,0, # 10 - 17 +0,0,0,1,0,0,0,0, # 18 - 1f +0,0,0,0,0,0,0,0, # 20 - 27 +0,0,0,0,0,0,0,0, # 28 - 2f +0,0,0,0,0,0,0,0, # 30 - 37 +0,0,0,0,0,0,0,0, # 38 - 3f +0,0,0,0,0,0,0,0, # 40 - 47 +0,0,0,0,0,0,0,0, # 48 - 4f +0,0,0,0,0,0,0,0, # 50 - 57 +0,0,0,0,0,0,0,0, # 58 - 5f +0,0,0,0,0,0,0,0, # 60 - 67 +0,0,0,0,0,0,0,0, # 68 - 6f +0,0,0,0,0,0,0,0, # 70 - 77 +0,0,0,4,0,5,2,0, # 78 - 7f +1,1,1,1,1,1,1,1, # 80 - 87 +1,1,1,1,1,1,1,1, # 88 - 8f +1,1,1,1,1,1,1,1, # 90 - 97 +1,1,1,1,1,1,1,1, # 98 - 9f +1,1,1,1,1,1,1,1, # a0 - a7 +1,1,1,1,1,1,1,1, # a8 - af +1,1,1,1,1,1,1,1, # b0 - b7 +1,1,1,1,1,1,1,1, # b8 - bf +1,1,1,1,1,1,1,1, # c0 - c7 +1,1,1,1,1,1,1,1, # c8 - cf +1,1,1,1,1,1,1,1, # d0 - d7 +1,1,1,1,1,1,1,1, # d8 - df +1,1,1,1,1,1,1,1, # e0 - e7 +1,1,1,1,1,1,1,1, # e8 - ef +1,1,1,1,1,1,1,1, # f0 - f7 +1,1,1,1,1,1,1,1, # f8 - ff ) -HZ_st = ( \ -eStart,eError, 3,eStart,eStart,eStart,eError,eError,# 00-07 -eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f -eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError,# 10-17 - 5,eError, 6,eError, 5, 5, 4,eError,# 18-1f - 4,eError, 4, 4, 4,eError, 4,eError,# 20-27 - 4,eItsMe,eStart,eStart,eStart,eStart,eStart,eStart,# 28-2f +HZ_st = ( +eStart,eError, 3,eStart,eStart,eStart,eError,eError,# 00-07 +eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f +eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError,# 10-17 + 5,eError, 6,eError, 5, 5, 4,eError,# 18-1f + 4,eError, 4, 4, 4,eError, 4,eError,# 20-27 + 4,eItsMe,eStart,eStart,eStart,eStart,eStart,eStart,# 28-2f ) HZCharLenTable = (0, 0, 0, 0, 0, 0) @@ -79,50 +79,50 @@ HZSMModel = {'classTable': HZ_cls, 'charLenTable': HZCharLenTable, 'name': "HZ-GB-2312"} -ISO2022CN_cls = ( \ -2,0,0,0,0,0,0,0, # 00 - 07 -0,0,0,0,0,0,0,0, # 08 - 0f -0,0,0,0,0,0,0,0, # 10 - 17 -0,0,0,1,0,0,0,0, # 18 - 1f -0,0,0,0,0,0,0,0, # 20 - 27 -0,3,0,0,0,0,0,0, # 28 - 2f -0,0,0,0,0,0,0,0, # 30 - 37 -0,0,0,0,0,0,0,0, # 38 - 3f -0,0,0,4,0,0,0,0, # 40 - 47 -0,0,0,0,0,0,0,0, # 48 - 4f -0,0,0,0,0,0,0,0, # 50 - 57 -0,0,0,0,0,0,0,0, # 58 - 5f -0,0,0,0,0,0,0,0, # 60 - 67 -0,0,0,0,0,0,0,0, # 68 - 6f -0,0,0,0,0,0,0,0, # 70 - 77 -0,0,0,0,0,0,0,0, # 78 - 7f -2,2,2,2,2,2,2,2, # 80 - 87 -2,2,2,2,2,2,2,2, # 88 - 8f -2,2,2,2,2,2,2,2, # 90 - 97 -2,2,2,2,2,2,2,2, # 98 - 9f -2,2,2,2,2,2,2,2, # a0 - a7 -2,2,2,2,2,2,2,2, # a8 - af -2,2,2,2,2,2,2,2, # b0 - b7 -2,2,2,2,2,2,2,2, # b8 - bf -2,2,2,2,2,2,2,2, # c0 - c7 -2,2,2,2,2,2,2,2, # c8 - cf -2,2,2,2,2,2,2,2, # d0 - d7 -2,2,2,2,2,2,2,2, # d8 - df -2,2,2,2,2,2,2,2, # e0 - e7 -2,2,2,2,2,2,2,2, # e8 - ef -2,2,2,2,2,2,2,2, # f0 - f7 -2,2,2,2,2,2,2,2, # f8 - ff +ISO2022CN_cls = ( +2,0,0,0,0,0,0,0, # 00 - 07 +0,0,0,0,0,0,0,0, # 08 - 0f +0,0,0,0,0,0,0,0, # 10 - 17 +0,0,0,1,0,0,0,0, # 18 - 1f +0,0,0,0,0,0,0,0, # 20 - 27 +0,3,0,0,0,0,0,0, # 28 - 2f +0,0,0,0,0,0,0,0, # 30 - 37 +0,0,0,0,0,0,0,0, # 38 - 3f +0,0,0,4,0,0,0,0, # 40 - 47 +0,0,0,0,0,0,0,0, # 48 - 4f +0,0,0,0,0,0,0,0, # 50 - 57 +0,0,0,0,0,0,0,0, # 58 - 5f +0,0,0,0,0,0,0,0, # 60 - 67 +0,0,0,0,0,0,0,0, # 68 - 6f +0,0,0,0,0,0,0,0, # 70 - 77 +0,0,0,0,0,0,0,0, # 78 - 7f +2,2,2,2,2,2,2,2, # 80 - 87 +2,2,2,2,2,2,2,2, # 88 - 8f +2,2,2,2,2,2,2,2, # 90 - 97 +2,2,2,2,2,2,2,2, # 98 - 9f +2,2,2,2,2,2,2,2, # a0 - a7 +2,2,2,2,2,2,2,2, # a8 - af +2,2,2,2,2,2,2,2, # b0 - b7 +2,2,2,2,2,2,2,2, # b8 - bf +2,2,2,2,2,2,2,2, # c0 - c7 +2,2,2,2,2,2,2,2, # c8 - cf +2,2,2,2,2,2,2,2, # d0 - d7 +2,2,2,2,2,2,2,2, # d8 - df +2,2,2,2,2,2,2,2, # e0 - e7 +2,2,2,2,2,2,2,2, # e8 - ef +2,2,2,2,2,2,2,2, # f0 - f7 +2,2,2,2,2,2,2,2, # f8 - ff ) -ISO2022CN_st = ( \ -eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07 -eStart,eError,eError,eError,eError,eError,eError,eError,# 08-0f -eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17 -eItsMe,eItsMe,eItsMe,eError,eError,eError, 4,eError,# 18-1f -eError,eError,eError,eItsMe,eError,eError,eError,eError,# 20-27 - 5, 6,eError,eError,eError,eError,eError,eError,# 28-2f -eError,eError,eError,eItsMe,eError,eError,eError,eError,# 30-37 -eError,eError,eError,eError,eError,eItsMe,eError,eStart,# 38-3f +ISO2022CN_st = ( +eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07 +eStart,eError,eError,eError,eError,eError,eError,eError,# 08-0f +eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17 +eItsMe,eItsMe,eItsMe,eError,eError,eError, 4,eError,# 18-1f +eError,eError,eError,eItsMe,eError,eError,eError,eError,# 20-27 + 5, 6,eError,eError,eError,eError,eError,eError,# 28-2f +eError,eError,eError,eItsMe,eError,eError,eError,eError,# 30-37 +eError,eError,eError,eError,eError,eItsMe,eError,eStart,# 38-3f ) ISO2022CNCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0, 0) @@ -133,51 +133,51 @@ ISO2022CNSMModel = {'classTable': ISO2022CN_cls, 'charLenTable': ISO2022CNCharLenTable, 'name': "ISO-2022-CN"} -ISO2022JP_cls = ( \ -2,0,0,0,0,0,0,0, # 00 - 07 -0,0,0,0,0,0,2,2, # 08 - 0f -0,0,0,0,0,0,0,0, # 10 - 17 -0,0,0,1,0,0,0,0, # 18 - 1f -0,0,0,0,7,0,0,0, # 20 - 27 -3,0,0,0,0,0,0,0, # 28 - 2f -0,0,0,0,0,0,0,0, # 30 - 37 -0,0,0,0,0,0,0,0, # 38 - 3f -6,0,4,0,8,0,0,0, # 40 - 47 -0,9,5,0,0,0,0,0, # 48 - 4f -0,0,0,0,0,0,0,0, # 50 - 57 -0,0,0,0,0,0,0,0, # 58 - 5f -0,0,0,0,0,0,0,0, # 60 - 67 -0,0,0,0,0,0,0,0, # 68 - 6f -0,0,0,0,0,0,0,0, # 70 - 77 -0,0,0,0,0,0,0,0, # 78 - 7f -2,2,2,2,2,2,2,2, # 80 - 87 -2,2,2,2,2,2,2,2, # 88 - 8f -2,2,2,2,2,2,2,2, # 90 - 97 -2,2,2,2,2,2,2,2, # 98 - 9f -2,2,2,2,2,2,2,2, # a0 - a7 -2,2,2,2,2,2,2,2, # a8 - af -2,2,2,2,2,2,2,2, # b0 - b7 -2,2,2,2,2,2,2,2, # b8 - bf -2,2,2,2,2,2,2,2, # c0 - c7 -2,2,2,2,2,2,2,2, # c8 - cf -2,2,2,2,2,2,2,2, # d0 - d7 -2,2,2,2,2,2,2,2, # d8 - df -2,2,2,2,2,2,2,2, # e0 - e7 -2,2,2,2,2,2,2,2, # e8 - ef -2,2,2,2,2,2,2,2, # f0 - f7 -2,2,2,2,2,2,2,2, # f8 - ff +ISO2022JP_cls = ( +2,0,0,0,0,0,0,0, # 00 - 07 +0,0,0,0,0,0,2,2, # 08 - 0f +0,0,0,0,0,0,0,0, # 10 - 17 +0,0,0,1,0,0,0,0, # 18 - 1f +0,0,0,0,7,0,0,0, # 20 - 27 +3,0,0,0,0,0,0,0, # 28 - 2f +0,0,0,0,0,0,0,0, # 30 - 37 +0,0,0,0,0,0,0,0, # 38 - 3f +6,0,4,0,8,0,0,0, # 40 - 47 +0,9,5,0,0,0,0,0, # 48 - 4f +0,0,0,0,0,0,0,0, # 50 - 57 +0,0,0,0,0,0,0,0, # 58 - 5f +0,0,0,0,0,0,0,0, # 60 - 67 +0,0,0,0,0,0,0,0, # 68 - 6f +0,0,0,0,0,0,0,0, # 70 - 77 +0,0,0,0,0,0,0,0, # 78 - 7f +2,2,2,2,2,2,2,2, # 80 - 87 +2,2,2,2,2,2,2,2, # 88 - 8f +2,2,2,2,2,2,2,2, # 90 - 97 +2,2,2,2,2,2,2,2, # 98 - 9f +2,2,2,2,2,2,2,2, # a0 - a7 +2,2,2,2,2,2,2,2, # a8 - af +2,2,2,2,2,2,2,2, # b0 - b7 +2,2,2,2,2,2,2,2, # b8 - bf +2,2,2,2,2,2,2,2, # c0 - c7 +2,2,2,2,2,2,2,2, # c8 - cf +2,2,2,2,2,2,2,2, # d0 - d7 +2,2,2,2,2,2,2,2, # d8 - df +2,2,2,2,2,2,2,2, # e0 - e7 +2,2,2,2,2,2,2,2, # e8 - ef +2,2,2,2,2,2,2,2, # f0 - f7 +2,2,2,2,2,2,2,2, # f8 - ff ) -ISO2022JP_st = ( \ -eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07 -eStart,eStart,eError,eError,eError,eError,eError,eError,# 08-0f -eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17 -eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,# 18-1f -eError, 5,eError,eError,eError, 4,eError,eError,# 20-27 -eError,eError,eError, 6,eItsMe,eError,eItsMe,eError,# 28-2f -eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,# 30-37 -eError,eError,eError,eItsMe,eError,eError,eError,eError,# 38-3f -eError,eError,eError,eError,eItsMe,eError,eStart,eStart,# 40-47 +ISO2022JP_st = ( +eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07 +eStart,eStart,eError,eError,eError,eError,eError,eError,# 08-0f +eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17 +eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,# 18-1f +eError, 5,eError,eError,eError, 4,eError,eError,# 20-27 +eError,eError,eError, 6,eItsMe,eError,eItsMe,eError,# 28-2f +eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,# 30-37 +eError,eError,eError,eItsMe,eError,eError,eError,eError,# 38-3f +eError,eError,eError,eError,eItsMe,eError,eStart,eStart,# 40-47 ) ISO2022JPCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0) @@ -188,47 +188,47 @@ ISO2022JPSMModel = {'classTable': ISO2022JP_cls, 'charLenTable': ISO2022JPCharLenTable, 'name': "ISO-2022-JP"} -ISO2022KR_cls = ( \ -2,0,0,0,0,0,0,0, # 00 - 07 -0,0,0,0,0,0,0,0, # 08 - 0f -0,0,0,0,0,0,0,0, # 10 - 17 -0,0,0,1,0,0,0,0, # 18 - 1f -0,0,0,0,3,0,0,0, # 20 - 27 -0,4,0,0,0,0,0,0, # 28 - 2f -0,0,0,0,0,0,0,0, # 30 - 37 -0,0,0,0,0,0,0,0, # 38 - 3f -0,0,0,5,0,0,0,0, # 40 - 47 -0,0,0,0,0,0,0,0, # 48 - 4f -0,0,0,0,0,0,0,0, # 50 - 57 -0,0,0,0,0,0,0,0, # 58 - 5f -0,0,0,0,0,0,0,0, # 60 - 67 -0,0,0,0,0,0,0,0, # 68 - 6f -0,0,0,0,0,0,0,0, # 70 - 77 -0,0,0,0,0,0,0,0, # 78 - 7f -2,2,2,2,2,2,2,2, # 80 - 87 -2,2,2,2,2,2,2,2, # 88 - 8f -2,2,2,2,2,2,2,2, # 90 - 97 -2,2,2,2,2,2,2,2, # 98 - 9f -2,2,2,2,2,2,2,2, # a0 - a7 -2,2,2,2,2,2,2,2, # a8 - af -2,2,2,2,2,2,2,2, # b0 - b7 -2,2,2,2,2,2,2,2, # b8 - bf -2,2,2,2,2,2,2,2, # c0 - c7 -2,2,2,2,2,2,2,2, # c8 - cf -2,2,2,2,2,2,2,2, # d0 - d7 -2,2,2,2,2,2,2,2, # d8 - df -2,2,2,2,2,2,2,2, # e0 - e7 -2,2,2,2,2,2,2,2, # e8 - ef -2,2,2,2,2,2,2,2, # f0 - f7 -2,2,2,2,2,2,2,2, # f8 - ff +ISO2022KR_cls = ( +2,0,0,0,0,0,0,0, # 00 - 07 +0,0,0,0,0,0,0,0, # 08 - 0f +0,0,0,0,0,0,0,0, # 10 - 17 +0,0,0,1,0,0,0,0, # 18 - 1f +0,0,0,0,3,0,0,0, # 20 - 27 +0,4,0,0,0,0,0,0, # 28 - 2f +0,0,0,0,0,0,0,0, # 30 - 37 +0,0,0,0,0,0,0,0, # 38 - 3f +0,0,0,5,0,0,0,0, # 40 - 47 +0,0,0,0,0,0,0,0, # 48 - 4f +0,0,0,0,0,0,0,0, # 50 - 57 +0,0,0,0,0,0,0,0, # 58 - 5f +0,0,0,0,0,0,0,0, # 60 - 67 +0,0,0,0,0,0,0,0, # 68 - 6f +0,0,0,0,0,0,0,0, # 70 - 77 +0,0,0,0,0,0,0,0, # 78 - 7f +2,2,2,2,2,2,2,2, # 80 - 87 +2,2,2,2,2,2,2,2, # 88 - 8f +2,2,2,2,2,2,2,2, # 90 - 97 +2,2,2,2,2,2,2,2, # 98 - 9f +2,2,2,2,2,2,2,2, # a0 - a7 +2,2,2,2,2,2,2,2, # a8 - af +2,2,2,2,2,2,2,2, # b0 - b7 +2,2,2,2,2,2,2,2, # b8 - bf +2,2,2,2,2,2,2,2, # c0 - c7 +2,2,2,2,2,2,2,2, # c8 - cf +2,2,2,2,2,2,2,2, # d0 - d7 +2,2,2,2,2,2,2,2, # d8 - df +2,2,2,2,2,2,2,2, # e0 - e7 +2,2,2,2,2,2,2,2, # e8 - ef +2,2,2,2,2,2,2,2, # f0 - f7 +2,2,2,2,2,2,2,2, # f8 - ff ) -ISO2022KR_st = ( \ -eStart, 3,eError,eStart,eStart,eStart,eError,eError,# 00-07 -eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f -eItsMe,eItsMe,eError,eError,eError, 4,eError,eError,# 10-17 -eError,eError,eError,eError, 5,eError,eError,eError,# 18-1f -eError,eError,eError,eItsMe,eStart,eStart,eStart,eStart,# 20-27 +ISO2022KR_st = ( +eStart, 3,eError,eStart,eStart,eStart,eError,eError,# 00-07 +eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f +eItsMe,eItsMe,eError,eError,eError, 4,eError,eError,# 10-17 +eError,eError,eError,eError, 5,eError,eError,eError,# 18-1f +eError,eError,eError,eItsMe,eStart,eStart,eStart,eStart,# 20-27 ) ISO2022KRCharLenTable = (0, 0, 0, 0, 0, 0) @@ -238,3 +238,5 @@ ISO2022KRSMModel = {'classTable': ISO2022KR_cls, 'stateTable': ISO2022KR_st, 'charLenTable': ISO2022KRCharLenTable, 'name': "ISO-2022-KR"} + +# flake8: noqa diff --git a/thirdparty/chardet/eucjpprober.py b/thirdparty/chardet/eucjpprober.py index faa5cb58d..8e64fdcc2 100644 --- a/thirdparty/chardet/eucjpprober.py +++ b/thirdparty/chardet/eucjpprober.py @@ -13,25 +13,26 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants, sys -from constants import eStart, eError, eItsMe -from mbcharsetprober import MultiByteCharSetProber -from codingstatemachine import CodingStateMachine -from chardistribution import EUCJPDistributionAnalysis -from jpcntx import EUCJPContextAnalysis -from mbcssm import EUCJPSMModel +import sys +from . import constants +from .mbcharsetprober import MultiByteCharSetProber +from .codingstatemachine import CodingStateMachine +from .chardistribution import EUCJPDistributionAnalysis +from .jpcntx import EUCJPContextAnalysis +from .mbcssm import EUCJPSMModel + class EUCJPProber(MultiByteCharSetProber): def __init__(self): @@ -50,31 +51,35 @@ class EUCJPProber(MultiByteCharSetProber): def feed(self, aBuf): aLen = len(aBuf) - for i in xrange(0, aLen): + for i in range(0, aLen): + # PY3K: aBuf is a byte array, so aBuf[i] is an int, not a byte codingState = self._mCodingSM.next_state(aBuf[i]) - if codingState == eError: + if codingState == constants.eError: if constants._debug: - sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n') + sys.stderr.write(self.get_charset_name() + + ' prober hit error at byte ' + str(i) + + '\n') self._mState = constants.eNotMe break - elif codingState == eItsMe: + elif codingState == constants.eItsMe: self._mState = constants.eFoundIt break - elif codingState == eStart: + elif codingState == constants.eStart: charLen = self._mCodingSM.get_current_charlen() if i == 0: self._mLastChar[1] = aBuf[0] self._mContextAnalyzer.feed(self._mLastChar, charLen) self._mDistributionAnalyzer.feed(self._mLastChar, charLen) else: - self._mContextAnalyzer.feed(aBuf[i-1:i+1], charLen) - self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen) + self._mContextAnalyzer.feed(aBuf[i - 1:i + 1], charLen) + self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1], + charLen) self._mLastChar[0] = aBuf[aLen - 1] if self.get_state() == constants.eDetecting: - if self._mContextAnalyzer.got_enough_data() and \ - (self.get_confidence() > constants.SHORTCUT_THRESHOLD): + if (self._mContextAnalyzer.got_enough_data() and + (self.get_confidence() > constants.SHORTCUT_THRESHOLD)): self._mState = constants.eFoundIt return self.get_state() diff --git a/thirdparty/chardet/euckrfreq.py b/thirdparty/chardet/euckrfreq.py index 1463fa1d8..a179e4c21 100644 --- a/thirdparty/chardet/euckrfreq.py +++ b/thirdparty/chardet/euckrfreq.py @@ -592,3 +592,5 @@ EUCKRCharToFreqOrder = ( \ 8704,8705,8706,8707,8708,8709,8710,8711,8712,8713,8714,8715,8716,8717,8718,8719, 8720,8721,8722,8723,8724,8725,8726,8727,8728,8729,8730,8731,8732,8733,8734,8735, 8736,8737,8738,8739,8740,8741) + +# flake8: noqa diff --git a/thirdparty/chardet/euckrprober.py b/thirdparty/chardet/euckrprober.py index bd697ebf3..5982a46b6 100644 --- a/thirdparty/chardet/euckrprober.py +++ b/thirdparty/chardet/euckrprober.py @@ -13,22 +13,23 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from mbcharsetprober import MultiByteCharSetProber -from codingstatemachine import CodingStateMachine -from chardistribution import EUCKRDistributionAnalysis -from mbcssm import EUCKRSMModel +from .mbcharsetprober import MultiByteCharSetProber +from .codingstatemachine import CodingStateMachine +from .chardistribution import EUCKRDistributionAnalysis +from .mbcssm import EUCKRSMModel + class EUCKRProber(MultiByteCharSetProber): def __init__(self): diff --git a/thirdparty/chardet/euctwfreq.py b/thirdparty/chardet/euctwfreq.py index c05720950..576e7504d 100644 --- a/thirdparty/chardet/euctwfreq.py +++ b/thirdparty/chardet/euctwfreq.py @@ -13,12 +13,12 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA @@ -26,8 +26,8 @@ ######################### END LICENSE BLOCK ######################### # EUCTW frequency table -# Converted from big5 work -# by Taiwan's Mandarin Promotion Council +# Converted from big5 work +# by Taiwan's Mandarin Promotion Council # # 128 --> 0.42261 @@ -38,15 +38,15 @@ # # Idea Distribution Ratio = 0.74851/(1-0.74851) =2.98 # Random Distribution Ration = 512/(5401-512)=0.105 -# +# # Typical Distribution Ratio about 25% of Ideal one, still much higher than RDR EUCTW_TYPICAL_DISTRIBUTION_RATIO = 0.75 -# Char to FreqOrder table , +# Char to FreqOrder table , EUCTW_TABLE_SIZE = 8102 -EUCTWCharToFreqOrder = ( \ +EUCTWCharToFreqOrder = ( 1,1800,1506, 255,1431, 198, 9, 82, 6,7310, 177, 202,3615,1256,2808, 110, # 2742 3735, 33,3241, 261, 76, 44,2113, 16,2931,2184,1176, 659,3868, 26,3404,2643, # 2758 1198,3869,3313,4060, 410,2211, 302, 590, 361,1963, 8, 204, 58,4296,7311,1931, # 2774 @@ -424,3 +424,5 @@ EUCTWCharToFreqOrder = ( \ 8694,8695,8696,8697,8698,8699,8700,8701,8702,8703,8704,8705,8706,8707,8708,8709, # 8710 8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,8720,8721,8722,8723,8724,8725, # 8726 8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,8736,8737,8738,8739,8740,8741) # 8742 + +# flake8: noqa diff --git a/thirdparty/chardet/euctwprober.py b/thirdparty/chardet/euctwprober.py index b073f134f..fe652fe37 100644 --- a/thirdparty/chardet/euctwprober.py +++ b/thirdparty/chardet/euctwprober.py @@ -25,10 +25,10 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from mbcharsetprober import MultiByteCharSetProber -from codingstatemachine import CodingStateMachine -from chardistribution import EUCTWDistributionAnalysis -from mbcssm import EUCTWSMModel +from .mbcharsetprober import MultiByteCharSetProber +from .codingstatemachine import CodingStateMachine +from .chardistribution import EUCTWDistributionAnalysis +from .mbcssm import EUCTWSMModel class EUCTWProber(MultiByteCharSetProber): def __init__(self): diff --git a/thirdparty/chardet/gb2312freq.py b/thirdparty/chardet/gb2312freq.py index 7a4d5a1b3..1238f510f 100644 --- a/thirdparty/chardet/gb2312freq.py +++ b/thirdparty/chardet/gb2312freq.py @@ -13,12 +13,12 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA @@ -36,14 +36,14 @@ # # Ideal Distribution Ratio = 0.79135/(1-0.79135) = 3.79 # Random Distribution Ration = 512 / (3755 - 512) = 0.157 -# +# # Typical Distribution Ratio about 25% of Ideal one, still much higher that RDR GB2312_TYPICAL_DISTRIBUTION_RATIO = 0.9 GB2312_TABLE_SIZE = 3760 -GB2312CharToFreqOrder = ( \ +GB2312CharToFreqOrder = ( 1671, 749,1443,2364,3924,3807,2330,3921,1704,3463,2691,1511,1515, 572,3191,2205, 2361, 224,2558, 479,1711, 963,3162, 440,4060,1905,2966,2947,3580,2647,3961,3842, 2204, 869,4207, 970,2678,5626,2944,2956,1479,4048, 514,3595, 588,1346,2820,3409, @@ -469,3 +469,4 @@ GB2312CharToFreqOrder = ( \ 5867,5507,6273,4206,6274,4789,6098,6764,3619,3646,3833,3804,2394,3788,4936,3978, 4866,4899,6099,6100,5559,6478,6765,3599,5868,6101,5869,5870,6275,6766,4527,6767) +# flake8: noqa diff --git a/thirdparty/chardet/gb2312prober.py b/thirdparty/chardet/gb2312prober.py index 91eb3925a..0325a2d86 100644 --- a/thirdparty/chardet/gb2312prober.py +++ b/thirdparty/chardet/gb2312prober.py @@ -25,10 +25,10 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from mbcharsetprober import MultiByteCharSetProber -from codingstatemachine import CodingStateMachine -from chardistribution import GB2312DistributionAnalysis -from mbcssm import GB2312SMModel +from .mbcharsetprober import MultiByteCharSetProber +from .codingstatemachine import CodingStateMachine +from .chardistribution import GB2312DistributionAnalysis +from .mbcssm import GB2312SMModel class GB2312Prober(MultiByteCharSetProber): def __init__(self): diff --git a/thirdparty/chardet/hebrewprober.py b/thirdparty/chardet/hebrewprober.py index 442c0bf2b..ba225c5ef 100644 --- a/thirdparty/chardet/hebrewprober.py +++ b/thirdparty/chardet/hebrewprober.py @@ -13,20 +13,21 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from charsetprober import CharSetProber -import constants +from .charsetprober import CharSetProber +from .constants import eNotMe, eDetecting +from .compat import wrap_ord # This prober doesn't actually recognize a language or a charset. # It is a helper prober for the use of the Hebrew model probers @@ -35,40 +36,40 @@ import constants # # Four main charsets exist in Hebrew: # "ISO-8859-8" - Visual Hebrew -# "windows-1255" - Logical Hebrew +# "windows-1255" - Logical Hebrew # "ISO-8859-8-I" - Logical Hebrew # "x-mac-hebrew" - ?? Logical Hebrew ?? # # Both "ISO" charsets use a completely identical set of code points, whereas -# "windows-1255" and "x-mac-hebrew" are two different proper supersets of +# "windows-1255" and "x-mac-hebrew" are two different proper supersets of # these code points. windows-1255 defines additional characters in the range -# 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific +# 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific # diacritics and additional 'Yiddish' ligature letters in the range 0xc0-0xd6. -# x-mac-hebrew defines similar additional code points but with a different +# x-mac-hebrew defines similar additional code points but with a different # mapping. # -# As far as an average Hebrew text with no diacritics is concerned, all four -# charsets are identical with respect to code points. Meaning that for the -# main Hebrew alphabet, all four map the same values to all 27 Hebrew letters +# As far as an average Hebrew text with no diacritics is concerned, all four +# charsets are identical with respect to code points. Meaning that for the +# main Hebrew alphabet, all four map the same values to all 27 Hebrew letters # (including final letters). # # The dominant difference between these charsets is their directionality. # "Visual" directionality means that the text is ordered as if the renderer is -# not aware of a BIDI rendering algorithm. The renderer sees the text and -# draws it from left to right. The text itself when ordered naturally is read +# not aware of a BIDI rendering algorithm. The renderer sees the text and +# draws it from left to right. The text itself when ordered naturally is read # backwards. A buffer of Visual Hebrew generally looks like so: # "[last word of first line spelled backwards] [whole line ordered backwards -# and spelled backwards] [first word of first line spelled backwards] +# and spelled backwards] [first word of first line spelled backwards] # [end of line] [last word of second line] ... etc' " # adding punctuation marks, numbers and English text to visual text is # naturally also "visual" and from left to right. -# +# # "Logical" directionality means the text is ordered "naturally" according to -# the order it is read. It is the responsibility of the renderer to display -# the text from right to left. A BIDI algorithm is used to place general +# the order it is read. It is the responsibility of the renderer to display +# the text from right to left. A BIDI algorithm is used to place general # punctuation marks, numbers and English text in the text. # -# Texts in x-mac-hebrew are almost impossible to find on the Internet. From +# Texts in x-mac-hebrew are almost impossible to find on the Internet. From # what little evidence I could find, it seems that its general directionality # is Logical. # @@ -76,17 +77,17 @@ import constants # charsets: # Visual Hebrew - "ISO-8859-8" - backwards text - Words and sentences are # backwards while line order is natural. For charset recognition purposes -# the line order is unimportant (In fact, for this implementation, even +# the line order is unimportant (In fact, for this implementation, even # word order is unimportant). # Logical Hebrew - "windows-1255" - normal, naturally ordered text. # -# "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be +# "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be # specifically identified. # "x-mac-hebrew" is also identified as windows-1255. A text in x-mac-hebrew # that contain special punctuation marks or diacritics is displayed with # some unconverted characters showing as question marks. This problem might # be corrected using another model prober for x-mac-hebrew. Due to the fact -# that x-mac-hebrew texts are so rare, writing another model prober isn't +# that x-mac-hebrew texts are so rare, writing another model prober isn't # worth the effort and performance hit. # #### The Prober #### @@ -126,28 +127,31 @@ import constants # charset identified, either "windows-1255" or "ISO-8859-8". # windows-1255 / ISO-8859-8 code points of interest -FINAL_KAF = '\xea' -NORMAL_KAF = '\xeb' -FINAL_MEM = '\xed' -NORMAL_MEM = '\xee' -FINAL_NUN = '\xef' -NORMAL_NUN = '\xf0' -FINAL_PE = '\xf3' -NORMAL_PE = '\xf4' -FINAL_TSADI = '\xf5' -NORMAL_TSADI = '\xf6' +FINAL_KAF = 0xea +NORMAL_KAF = 0xeb +FINAL_MEM = 0xed +NORMAL_MEM = 0xee +FINAL_NUN = 0xef +NORMAL_NUN = 0xf0 +FINAL_PE = 0xf3 +NORMAL_PE = 0xf4 +FINAL_TSADI = 0xf5 +NORMAL_TSADI = 0xf6 # Minimum Visual vs Logical final letter score difference. -# If the difference is below this, don't rely solely on the final letter score distance. +# If the difference is below this, don't rely solely on the final letter score +# distance. MIN_FINAL_CHAR_DISTANCE = 5 # Minimum Visual vs Logical model score difference. -# If the difference is below this, don't rely at all on the model score distance. +# If the difference is below this, don't rely at all on the model score +# distance. MIN_MODEL_DISTANCE = 0.01 VISUAL_HEBREW_NAME = "ISO-8859-8" LOGICAL_HEBREW_NAME = "windows-1255" + class HebrewProber(CharSetProber): def __init__(self): CharSetProber.__init__(self) @@ -159,8 +163,8 @@ class HebrewProber(CharSetProber): self._mFinalCharLogicalScore = 0 self._mFinalCharVisualScore = 0 # The two last characters seen in the previous buffer, - # mPrev and mBeforePrev are initialized to space in order to simulate a word - # delimiter at the beginning of the data + # mPrev and mBeforePrev are initialized to space in order to simulate + # a word delimiter at the beginning of the data self._mPrev = ' ' self._mBeforePrev = ' ' # These probers are owned by the group prober. @@ -170,49 +174,52 @@ class HebrewProber(CharSetProber): self._mVisualProber = visualProber def is_final(self, c): - return c in [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI] + return wrap_ord(c) in [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, + FINAL_TSADI] def is_non_final(self, c): - # The normal Tsadi is not a good Non-Final letter due to words like - # 'lechotet' (to chat) containing an apostrophe after the tsadi. This - # apostrophe is converted to a space in FilterWithoutEnglishLetters causing - # the Non-Final tsadi to appear at an end of a word even though this is not - # the case in the original text. - # The letters Pe and Kaf rarely display a related behavior of not being a - # good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for - # example legally end with a Non-Final Pe or Kaf. However, the benefit of - # these letters as Non-Final letters outweighs the damage since these words - # are quite rare. - return c in [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE] + # The normal Tsadi is not a good Non-Final letter due to words like + # 'lechotet' (to chat) containing an apostrophe after the tsadi. This + # apostrophe is converted to a space in FilterWithoutEnglishLetters + # causing the Non-Final tsadi to appear at an end of a word even + # though this is not the case in the original text. + # The letters Pe and Kaf rarely display a related behavior of not being + # a good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' + # for example legally end with a Non-Final Pe or Kaf. However, the + # benefit of these letters as Non-Final letters outweighs the damage + # since these words are quite rare. + return wrap_ord(c) in [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE] def feed(self, aBuf): # Final letter analysis for logical-visual decision. - # Look for evidence that the received buffer is either logical Hebrew or - # visual Hebrew. + # Look for evidence that the received buffer is either logical Hebrew + # or visual Hebrew. # The following cases are checked: - # 1) A word longer than 1 letter, ending with a final letter. This is an - # indication that the text is laid out "naturally" since the final letter - # really appears at the end. +1 for logical score. - # 2) A word longer than 1 letter, ending with a Non-Final letter. In normal - # Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with - # the Non-Final form of that letter. Exceptions to this rule are mentioned - # above in isNonFinal(). This is an indication that the text is laid out - # backwards. +1 for visual score - # 3) A word longer than 1 letter, starting with a final letter. Final letters - # should not appear at the beginning of a word. This is an indication that - # the text is laid out backwards. +1 for visual score. - # - # The visual score and logical score are accumulated throughout the text and - # are finally checked against each other in GetCharSetName(). - # No checking for final letters in the middle of words is done since that case - # is not an indication for either Logical or Visual text. - # - # We automatically filter out all 7-bit characters (replace them with spaces) - # so the word boundary detection works properly. [MAP] + # 1) A word longer than 1 letter, ending with a final letter. This is + # an indication that the text is laid out "naturally" since the + # final letter really appears at the end. +1 for logical score. + # 2) A word longer than 1 letter, ending with a Non-Final letter. In + # normal Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, + # should not end with the Non-Final form of that letter. Exceptions + # to this rule are mentioned above in isNonFinal(). This is an + # indication that the text is laid out backwards. +1 for visual + # score + # 3) A word longer than 1 letter, starting with a final letter. Final + # letters should not appear at the beginning of a word. This is an + # indication that the text is laid out backwards. +1 for visual + # score. + # + # The visual score and logical score are accumulated throughout the + # text and are finally checked against each other in GetCharSetName(). + # No checking for final letters in the middle of words is done since + # that case is not an indication for either Logical or Visual text. + # + # We automatically filter out all 7-bit characters (replace them with + # spaces) so the word boundary detection works properly. [MAP] - if self.get_state() == constants.eNotMe: + if self.get_state() == eNotMe: # Both model probers say it's not them. No reason to continue. - return constants.eNotMe + return eNotMe aBuf = self.filter_high_bit_only(aBuf) @@ -220,23 +227,27 @@ class HebrewProber(CharSetProber): if cur == ' ': # We stand on a space - a word just ended if self._mBeforePrev != ' ': - # next-to-last char was not a space so self._mPrev is not a 1 letter word + # next-to-last char was not a space so self._mPrev is not a + # 1 letter word if self.is_final(self._mPrev): # case (1) [-2:not space][-1:final letter][cur:space] self._mFinalCharLogicalScore += 1 elif self.is_non_final(self._mPrev): - # case (2) [-2:not space][-1:Non-Final letter][cur:space] + # case (2) [-2:not space][-1:Non-Final letter][ + # cur:space] self._mFinalCharVisualScore += 1 else: # Not standing on a space - if (self._mBeforePrev == ' ') and (self.is_final(self._mPrev)) and (cur != ' '): + if ((self._mBeforePrev == ' ') and + (self.is_final(self._mPrev)) and (cur != ' ')): # case (3) [-2:space][-1:final letter][cur:not space] self._mFinalCharVisualScore += 1 self._mBeforePrev = self._mPrev self._mPrev = cur - # Forever detecting, till the end or until both model probers return eNotMe (handled above) - return constants.eDetecting + # Forever detecting, till the end or until both model probers return + # eNotMe (handled above) + return eDetecting def get_charset_name(self): # Make the decision: is it Logical or Visual? @@ -248,22 +259,25 @@ class HebrewProber(CharSetProber): return VISUAL_HEBREW_NAME # It's not dominant enough, try to rely on the model scores instead. - modelsub = self._mLogicalProber.get_confidence() - self._mVisualProber.get_confidence() + modelsub = (self._mLogicalProber.get_confidence() + - self._mVisualProber.get_confidence()) if modelsub > MIN_MODEL_DISTANCE: return LOGICAL_HEBREW_NAME if modelsub < -MIN_MODEL_DISTANCE: return VISUAL_HEBREW_NAME - # Still no good, back to final letter distance, maybe it'll save the day. + # Still no good, back to final letter distance, maybe it'll save the + # day. if finalsub < 0.0: return VISUAL_HEBREW_NAME - # (finalsub > 0 - Logical) or (don't know what to do) default to Logical. + # (finalsub > 0 - Logical) or (don't know what to do) default to + # Logical. return LOGICAL_HEBREW_NAME def get_state(self): # Remain active as long as any of the model probers are active. - if (self._mLogicalProber.get_state() == constants.eNotMe) and \ - (self._mVisualProber.get_state() == constants.eNotMe): - return constants.eNotMe - return constants.eDetecting + if (self._mLogicalProber.get_state() == eNotMe) and \ + (self._mVisualProber.get_state() == eNotMe): + return eNotMe + return eDetecting diff --git a/thirdparty/chardet/jisfreq.py b/thirdparty/chardet/jisfreq.py index 5fe4a5c3f..064345b08 100644 --- a/thirdparty/chardet/jisfreq.py +++ b/thirdparty/chardet/jisfreq.py @@ -13,12 +13,12 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA @@ -28,7 +28,7 @@ # Sampling from about 20M text materials include literature and computer technology # # Japanese frequency table, applied to both S-JIS and EUC-JP -# They are sorted in order. +# They are sorted in order. # 128 --> 0.77094 # 256 --> 0.85710 @@ -38,15 +38,15 @@ # # Ideal Distribution Ratio = 0.92635 / (1-0.92635) = 12.58 # Random Distribution Ration = 512 / (2965+62+83+86-512) = 0.191 -# -# Typical Distribution Ratio, 25% of IDR +# +# Typical Distribution Ratio, 25% of IDR JIS_TYPICAL_DISTRIBUTION_RATIO = 3.0 -# Char to FreqOrder table , +# Char to FreqOrder table , JIS_TABLE_SIZE = 4368 -JISCharToFreqOrder = ( \ +JISCharToFreqOrder = ( 40, 1, 6, 182, 152, 180, 295,2127, 285, 381,3295,4304,3068,4606,3165,3510, # 16 3511,1822,2785,4607,1193,2226,5070,4608, 171,2996,1247, 18, 179,5071, 856,1661, # 32 1262,5072, 619, 127,3431,3512,3230,1899,1700, 232, 228,1294,1298, 284, 283,2041, # 48 @@ -565,3 +565,5 @@ JISCharToFreqOrder = ( \ 8224,8225,8226,8227,8228,8229,8230,8231,8232,8233,8234,8235,8236,8237,8238,8239, # 8240 8240,8241,8242,8243,8244,8245,8246,8247,8248,8249,8250,8251,8252,8253,8254,8255, # 8256 8256,8257,8258,8259,8260,8261,8262,8263,8264,8265,8266,8267,8268,8269,8270,8271) # 8272 + +# flake8: noqa diff --git a/thirdparty/chardet/jpcntx.py b/thirdparty/chardet/jpcntx.py index 06d396e5b..59aeb6a87 100644 --- a/thirdparty/chardet/jpcntx.py +++ b/thirdparty/chardet/jpcntx.py @@ -13,19 +13,19 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants +from .compat import wrap_ord NUM_OF_CATEGORY = 6 DONT_KNOW = -1 @@ -34,7 +34,7 @@ MAX_REL_THRESHOLD = 1000 MINIMUM_DATA_THRESHOLD = 4 # This is hiragana 2-char sequence table, the number in each cell represents its frequency category -jp2CharContext = ( \ +jp2CharContext = ( (0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1), (2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4), (0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2), @@ -125,24 +125,31 @@ class JapaneseContextAnalysis: self.reset() def reset(self): - self._mTotalRel = 0 # total sequence received - self._mRelSample = [0] * NUM_OF_CATEGORY # category counters, each interger counts sequence in its category - self._mNeedToSkipCharNum = 0 # if last byte in current buffer is not the last byte of a character, we need to know how many bytes to skip in next buffer - self._mLastCharOrder = -1 # The order of previous char - self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made + self._mTotalRel = 0 # total sequence received + # category counters, each interger counts sequence in its category + self._mRelSample = [0] * NUM_OF_CATEGORY + # if last byte in current buffer is not the last byte of a character, + # we need to know how many bytes to skip in next buffer + self._mNeedToSkipCharNum = 0 + self._mLastCharOrder = -1 # The order of previous char + # If this flag is set to True, detection is done and conclusion has + # been made + self._mDone = False def feed(self, aBuf, aLen): - if self._mDone: return + if self._mDone: + return # The buffer we got is byte oriented, and a character may span in more than one - # buffers. In case the last one or two byte in last buffer is not complete, we - # record how many byte needed to complete that character and skip these bytes here. - # We can choose to record those bytes as well and analyse the character once it - # is complete, but since a character will not make much difference, by simply skipping + # buffers. In case the last one or two byte in last buffer is not + # complete, we record how many byte needed to complete that character + # and skip these bytes here. We can choose to record those bytes as + # well and analyse the character once it is complete, but since a + # character will not make much difference, by simply skipping # this character will simply our logic and improve performance. i = self._mNeedToSkipCharNum while i < aLen: - order, charLen = self.get_order(aBuf[i:i+2]) + order, charLen = self.get_order(aBuf[i:i + 2]) i += charLen if i > aLen: self._mNeedToSkipCharNum = i - aLen @@ -151,7 +158,7 @@ class JapaneseContextAnalysis: if (order != -1) and (self._mLastCharOrder != -1): self._mTotalRel += 1 if self._mTotalRel > MAX_REL_THRESHOLD: - self._mDone = constants.True + self._mDone = True break self._mRelSample[jp2CharContext[self._mLastCharOrder][order]] += 1 self._mLastCharOrder = order @@ -166,45 +173,55 @@ class JapaneseContextAnalysis: else: return DONT_KNOW - def get_order(self, aStr): + def get_order(self, aBuf): return -1, 1 class SJISContextAnalysis(JapaneseContextAnalysis): - def get_order(self, aStr): - if not aStr: return -1, 1 + def __init__(self): + self.charset_name = "SHIFT_JIS" + + def get_charset_name(self): + return self.charset_name + + def get_order(self, aBuf): + if not aBuf: + return -1, 1 # find out current char's byte length - if ((aStr[0] >= '\x81') and (aStr[0] <= '\x9F')) or \ - ((aStr[0] >= '\xE0') and (aStr[0] <= '\xFC')): + first_char = wrap_ord(aBuf[0]) + if ((0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC)): charLen = 2 + if (first_char == 0x87) or (0xFA <= first_char <= 0xFC): + self.charset_name = "CP932" else: charLen = 1 # return its order if it is hiragana - if len(aStr) > 1: - if (aStr[0] == '\202') and \ - (aStr[1] >= '\x9F') and \ - (aStr[1] <= '\xF1'): - return ord(aStr[1]) - 0x9F, charLen + if len(aBuf) > 1: + second_char = wrap_ord(aBuf[1]) + if (first_char == 202) and (0x9F <= second_char <= 0xF1): + return second_char - 0x9F, charLen return -1, charLen class EUCJPContextAnalysis(JapaneseContextAnalysis): - def get_order(self, aStr): - if not aStr: return -1, 1 + def get_order(self, aBuf): + if not aBuf: + return -1, 1 # find out current char's byte length - if (aStr[0] == '\x8E') or \ - ((aStr[0] >= '\xA1') and (aStr[0] <= '\xFE')): + first_char = wrap_ord(aBuf[0]) + if (first_char == 0x8E) or (0xA1 <= first_char <= 0xFE): charLen = 2 - elif aStr[0] == '\x8F': + elif first_char == 0x8F: charLen = 3 else: charLen = 1 # return its order if it is hiragana - if len(aStr) > 1: - if (aStr[0] == '\xA4') and \ - (aStr[1] >= '\xA1') and \ - (aStr[1] <= '\xF3'): - return ord(aStr[1]) - 0xA1, charLen + if len(aBuf) > 1: + second_char = wrap_ord(aBuf[1]) + if (first_char == 0xA4) and (0xA1 <= second_char <= 0xF3): + return second_char - 0xA1, charLen return -1, charLen + +# flake8: noqa diff --git a/thirdparty/chardet/langbulgarianmodel.py b/thirdparty/chardet/langbulgarianmodel.py index bf5641e7b..e5788fc64 100644 --- a/thirdparty/chardet/langbulgarianmodel.py +++ b/thirdparty/chardet/langbulgarianmodel.py @@ -13,30 +13,28 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants - # 255: Control characters that usually does not exist in any text # 254: Carriage/Return # 253: symbol (punctuation) that does not belong to word # 252: 0 - 9 # Character Mapping Table: -# this table is modified base on win1251BulgarianCharToOrderMap, so +# this table is modified base on win1251BulgarianCharToOrderMap, so # only number <64 is sure valid -Latin5_BulgarianCharToOrderMap = ( \ +Latin5_BulgarianCharToOrderMap = ( 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -55,7 +53,7 @@ Latin5_BulgarianCharToOrderMap = ( \ 62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, # f0 ) -win1251BulgarianCharToOrderMap = ( \ +win1251BulgarianCharToOrderMap = ( 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -74,13 +72,13 @@ win1251BulgarianCharToOrderMap = ( \ 7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,253, 42, 16, # f0 ) -# Model Table: +# Model Table: # total sequences: 100% # first 512 sequences: 96.9392% # first 1024 sequences:3.0618% # rest sequences: 0.2992% -# negative sequences: 0.0020% -BulgarianLangModel = ( \ +# negative sequences: 0.0020% +BulgarianLangModel = ( 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2, 3,1,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,0,1, @@ -211,18 +209,21 @@ BulgarianLangModel = ( \ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, ) -Latin5BulgarianModel = { \ +Latin5BulgarianModel = { 'charToOrderMap': Latin5_BulgarianCharToOrderMap, 'precedenceMatrix': BulgarianLangModel, 'mTypicalPositiveRatio': 0.969392, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "ISO-8859-5" } -Win1251BulgarianModel = { \ +Win1251BulgarianModel = { 'charToOrderMap': win1251BulgarianCharToOrderMap, 'precedenceMatrix': BulgarianLangModel, 'mTypicalPositiveRatio': 0.969392, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "windows-1251" } + + +# flake8: noqa diff --git a/thirdparty/chardet/langcyrillicmodel.py b/thirdparty/chardet/langcyrillicmodel.py index e604cc73d..a86f54bd5 100644 --- a/thirdparty/chardet/langcyrillicmodel.py +++ b/thirdparty/chardet/langcyrillicmodel.py @@ -13,23 +13,21 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants - # KOI8-R language model # Character Mapping Table: -KOI8R_CharToOrderMap = ( \ +KOI8R_CharToOrderMap = ( 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -48,7 +46,7 @@ KOI8R_CharToOrderMap = ( \ 35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, # f0 ) -win1251_CharToOrderMap = ( \ +win1251_CharToOrderMap = ( 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -67,7 +65,7 @@ win1251_CharToOrderMap = ( \ 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, ) -latin5_CharToOrderMap = ( \ +latin5_CharToOrderMap = ( 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -86,7 +84,7 @@ latin5_CharToOrderMap = ( \ 239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255, ) -macCyrillic_CharToOrderMap = ( \ +macCyrillic_CharToOrderMap = ( 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -105,7 +103,7 @@ macCyrillic_CharToOrderMap = ( \ 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255, ) -IBM855_CharToOrderMap = ( \ +IBM855_CharToOrderMap = ( 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -124,7 +122,7 @@ IBM855_CharToOrderMap = ( \ 250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255, ) -IBM866_CharToOrderMap = ( \ +IBM866_CharToOrderMap = ( 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -143,13 +141,13 @@ IBM866_CharToOrderMap = ( \ 239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255, ) -# Model Table: +# Model Table: # total sequences: 100% # first 512 sequences: 97.6601% # first 1024 sequences: 2.3389% # rest sequences: 0.1237% -# negative sequences: 0.0009% -RussianLangModel = ( \ +# negative sequences: 0.0009% +RussianLangModel = ( 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2, 3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,3,3,3,3,2,3,2,0, @@ -280,50 +278,52 @@ RussianLangModel = ( \ 0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0, ) -Koi8rModel = { \ +Koi8rModel = { 'charToOrderMap': KOI8R_CharToOrderMap, 'precedenceMatrix': RussianLangModel, 'mTypicalPositiveRatio': 0.976601, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "KOI8-R" } -Win1251CyrillicModel = { \ +Win1251CyrillicModel = { 'charToOrderMap': win1251_CharToOrderMap, 'precedenceMatrix': RussianLangModel, 'mTypicalPositiveRatio': 0.976601, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "windows-1251" } -Latin5CyrillicModel = { \ +Latin5CyrillicModel = { 'charToOrderMap': latin5_CharToOrderMap, 'precedenceMatrix': RussianLangModel, 'mTypicalPositiveRatio': 0.976601, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "ISO-8859-5" } -MacCyrillicModel = { \ +MacCyrillicModel = { 'charToOrderMap': macCyrillic_CharToOrderMap, 'precedenceMatrix': RussianLangModel, 'mTypicalPositiveRatio': 0.976601, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "MacCyrillic" }; -Ibm866Model = { \ +Ibm866Model = { 'charToOrderMap': IBM866_CharToOrderMap, 'precedenceMatrix': RussianLangModel, 'mTypicalPositiveRatio': 0.976601, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "IBM866" } -Ibm855Model = { \ +Ibm855Model = { 'charToOrderMap': IBM855_CharToOrderMap, 'precedenceMatrix': RussianLangModel, 'mTypicalPositiveRatio': 0.976601, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "IBM855" } + +# flake8: noqa diff --git a/thirdparty/chardet/langgreekmodel.py b/thirdparty/chardet/langgreekmodel.py index ec6d49e80..ddb583765 100644 --- a/thirdparty/chardet/langgreekmodel.py +++ b/thirdparty/chardet/langgreekmodel.py @@ -13,27 +13,25 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants - # 255: Control characters that usually does not exist in any text # 254: Carriage/Return # 253: symbol (punctuation) that does not belong to word # 252: 0 - 9 # Character Mapping Table: -Latin7_CharToOrderMap = ( \ +Latin7_CharToOrderMap = ( 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -52,7 +50,7 @@ Latin7_CharToOrderMap = ( \ 9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, # f0 ) -win1253_CharToOrderMap = ( \ +win1253_CharToOrderMap = ( 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -71,13 +69,13 @@ win1253_CharToOrderMap = ( \ 9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, # f0 ) -# Model Table: +# Model Table: # total sequences: 100% # first 512 sequences: 98.2851% # first 1024 sequences:1.7001% # rest sequences: 0.0359% -# negative sequences: 0.0148% -GreekLangModel = ( \ +# negative sequences: 0.0148% +GreekLangModel = ( 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,3,2,2,3,3,3,3,3,3,3,3,1,3,3,3,0,2,2,3,3,0,3,0,3,2,0,3,3,3,0, @@ -208,18 +206,20 @@ GreekLangModel = ( \ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, ) -Latin7GreekModel = { \ +Latin7GreekModel = { 'charToOrderMap': Latin7_CharToOrderMap, 'precedenceMatrix': GreekLangModel, 'mTypicalPositiveRatio': 0.982851, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "ISO-8859-7" } -Win1253GreekModel = { \ +Win1253GreekModel = { 'charToOrderMap': win1253_CharToOrderMap, 'precedenceMatrix': GreekLangModel, 'mTypicalPositiveRatio': 0.982851, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "windows-1253" } + +# flake8: noqa diff --git a/thirdparty/chardet/langhebrewmodel.py b/thirdparty/chardet/langhebrewmodel.py index a8bcc65bf..75f2bc7fe 100644 --- a/thirdparty/chardet/langhebrewmodel.py +++ b/thirdparty/chardet/langhebrewmodel.py @@ -15,20 +15,18 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants - # 255: Control characters that usually does not exist in any text # 254: Carriage/Return # 253: symbol (punctuation) that does not belong to word @@ -36,7 +34,7 @@ import constants # Windows-1255 language model # Character Mapping Table: -win1255_CharToOrderMap = ( \ +win1255_CharToOrderMap = ( 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -55,13 +53,13 @@ win1255_CharToOrderMap = ( \ 12, 19, 13, 26, 18, 27, 21, 17, 7, 10, 5,251,252,128, 96,253, ) -# Model Table: +# Model Table: # total sequences: 100% # first 512 sequences: 98.4004% # first 1024 sequences: 1.5981% # rest sequences: 0.087% -# negative sequences: 0.0015% -HebrewLangModel = ( \ +# negative sequences: 0.0015% +HebrewLangModel = ( 0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0, 3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2, @@ -192,10 +190,12 @@ HebrewLangModel = ( \ 0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0, ) -Win1255HebrewModel = { \ +Win1255HebrewModel = { 'charToOrderMap': win1255_CharToOrderMap, 'precedenceMatrix': HebrewLangModel, 'mTypicalPositiveRatio': 0.984004, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "windows-1255" } + +# flake8: noqa diff --git a/thirdparty/chardet/langhungarianmodel.py b/thirdparty/chardet/langhungarianmodel.py index d635f03c2..49d2f0fe7 100644 --- a/thirdparty/chardet/langhungarianmodel.py +++ b/thirdparty/chardet/langhungarianmodel.py @@ -13,27 +13,25 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants - # 255: Control characters that usually does not exist in any text # 254: Carriage/Return # 253: symbol (punctuation) that does not belong to word # 252: 0 - 9 # Character Mapping Table: -Latin2_HungarianCharToOrderMap = ( \ +Latin2_HungarianCharToOrderMap = ( 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -52,7 +50,7 @@ Latin2_HungarianCharToOrderMap = ( \ 245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253, ) -win1250HungarianCharToOrderMap = ( \ +win1250HungarianCharToOrderMap = ( 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -71,13 +69,13 @@ win1250HungarianCharToOrderMap = ( \ 245,246,247, 25, 74, 42, 24,248,249,250, 31, 56, 29,251,252,253, ) -# Model Table: +# Model Table: # total sequences: 100% # first 512 sequences: 94.7368% # first 1024 sequences:5.2623% # rest sequences: 0.8894% -# negative sequences: 0.0009% -HungarianLangModel = ( \ +# negative sequences: 0.0009% +HungarianLangModel = ( 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2, 3,2,2,3,3,3,3,3,2,3,3,3,3,3,3,1,2,3,3,3,3,2,3,3,1,1,3,3,0,1,1,1, @@ -208,18 +206,20 @@ HungarianLangModel = ( \ 0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0, ) -Latin2HungarianModel = { \ +Latin2HungarianModel = { 'charToOrderMap': Latin2_HungarianCharToOrderMap, 'precedenceMatrix': HungarianLangModel, 'mTypicalPositiveRatio': 0.947368, - 'keepEnglishLetter': constants.True, + 'keepEnglishLetter': True, 'charsetName': "ISO-8859-2" } -Win1250HungarianModel = { \ +Win1250HungarianModel = { 'charToOrderMap': win1250HungarianCharToOrderMap, 'precedenceMatrix': HungarianLangModel, 'mTypicalPositiveRatio': 0.947368, - 'keepEnglishLetter': constants.True, + 'keepEnglishLetter': True, 'charsetName': "windows-1250" } + +# flake8: noqa diff --git a/thirdparty/chardet/langthaimodel.py b/thirdparty/chardet/langthaimodel.py index 96ec054f2..0508b1b1a 100644 --- a/thirdparty/chardet/langthaimodel.py +++ b/thirdparty/chardet/langthaimodel.py @@ -13,29 +13,27 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants - # 255: Control characters that usually does not exist in any text # 254: Carriage/Return # 253: symbol (punctuation) that does not belong to word # 252: 0 - 9 -# The following result for thai was collected from a limited sample (1M). +# The following result for thai was collected from a limited sample (1M). # Character Mapping Table: -TIS620CharToOrderMap = ( \ +TIS620CharToOrderMap = ( 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 @@ -54,13 +52,13 @@ TIS620CharToOrderMap = ( \ 68, 56, 59, 65, 69, 60, 70, 80, 71, 87,248,249,250,251,252,253, ) -# Model Table: +# Model Table: # total sequences: 100% # first 512 sequences: 92.6386% # first 1024 sequences:7.3177% # rest sequences: 1.0230% -# negative sequences: 0.0436% -ThaiLangModel = ( \ +# negative sequences: 0.0436% +ThaiLangModel = ( 0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3, 0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2, 3,0,3,3,2,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,3,0,3,2,3,0,2,2,2,3, @@ -191,10 +189,12 @@ ThaiLangModel = ( \ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, ) -TIS620ThaiModel = { \ +TIS620ThaiModel = { 'charToOrderMap': TIS620CharToOrderMap, 'precedenceMatrix': ThaiLangModel, 'mTypicalPositiveRatio': 0.926386, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "TIS-620" } + +# flake8: noqa diff --git a/thirdparty/chardet/latin1prober.py b/thirdparty/chardet/latin1prober.py index ae4527c75..eef357354 100644 --- a/thirdparty/chardet/latin1prober.py +++ b/thirdparty/chardet/latin1prober.py @@ -14,85 +14,86 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from charsetprober import CharSetProber -import constants -import operator +from .charsetprober import CharSetProber +from .constants import eNotMe +from .compat import wrap_ord FREQ_CAT_NUM = 4 -UDF = 0 # undefined -OTH = 1 # other -ASC = 2 # ascii capital letter -ASS = 3 # ascii small letter -ACV = 4 # accent capital vowel -ACO = 5 # accent capital other -ASV = 6 # accent small vowel -ASO = 7 # accent small other -CLASS_NUM = 8 # total classes +UDF = 0 # undefined +OTH = 1 # other +ASC = 2 # ascii capital letter +ASS = 3 # ascii small letter +ACV = 4 # accent capital vowel +ACO = 5 # accent capital other +ASV = 6 # accent small vowel +ASO = 7 # accent small other +CLASS_NUM = 8 # total classes -Latin1_CharToClass = ( \ - OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07 - OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F - OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17 - OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F - OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27 - OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F - OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37 - OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F - OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47 - ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F - ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57 - ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F - OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67 - ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F - ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77 - ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F - OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, # 80 - 87 - OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, # 88 - 8F - UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 90 - 97 - OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, # 98 - 9F - OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A0 - A7 - OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A8 - AF - OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7 - OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B8 - BF - ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, # C0 - C7 - ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # C8 - CF - ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, # D0 - D7 - ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, # D8 - DF - ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, # E0 - E7 - ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # E8 - EF - ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, # F0 - F7 - ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, # F8 - FF +Latin1_CharToClass = ( + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07 + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17 + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27 + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37 + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F + OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47 + ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F + ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57 + ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F + OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67 + ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F + ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77 + ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F + OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, # 80 - 87 + OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, # 88 - 8F + UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 90 - 97 + OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, # 98 - 9F + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A0 - A7 + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A8 - AF + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7 + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B8 - BF + ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, # C0 - C7 + ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # C8 - CF + ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, # D0 - D7 + ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, # D8 - DF + ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, # E0 - E7 + ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # E8 - EF + ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, # F0 - F7 + ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, # F8 - FF ) -# 0 : illegal -# 1 : very unlikely -# 2 : normal +# 0 : illegal +# 1 : very unlikely +# 2 : normal # 3 : very likely -Latin1ClassModel = ( \ -# UDF OTH ASC ASS ACV ACO ASV ASO - 0, 0, 0, 0, 0, 0, 0, 0, # UDF - 0, 3, 3, 3, 3, 3, 3, 3, # OTH - 0, 3, 3, 3, 3, 3, 3, 3, # ASC - 0, 3, 3, 3, 1, 1, 3, 3, # ASS - 0, 3, 3, 3, 1, 2, 1, 2, # ACV - 0, 3, 3, 3, 3, 3, 3, 3, # ACO - 0, 3, 1, 3, 1, 1, 1, 3, # ASV - 0, 3, 1, 3, 1, 1, 3, 3, # ASO +Latin1ClassModel = ( + # UDF OTH ASC ASS ACV ACO ASV ASO + 0, 0, 0, 0, 0, 0, 0, 0, # UDF + 0, 3, 3, 3, 3, 3, 3, 3, # OTH + 0, 3, 3, 3, 3, 3, 3, 3, # ASC + 0, 3, 3, 3, 1, 1, 3, 3, # ASS + 0, 3, 3, 3, 1, 2, 1, 2, # ACV + 0, 3, 3, 3, 3, 3, 3, 3, # ACO + 0, 3, 1, 3, 1, 1, 1, 3, # ASV + 0, 3, 1, 3, 1, 1, 3, 3, # ASO ) + class Latin1Prober(CharSetProber): def __init__(self): CharSetProber.__init__(self) @@ -109,10 +110,11 @@ class Latin1Prober(CharSetProber): def feed(self, aBuf): aBuf = self.filter_with_english_letters(aBuf) for c in aBuf: - charClass = Latin1_CharToClass[ord(c)] - freq = Latin1ClassModel[(self._mLastCharClass * CLASS_NUM) + charClass] + charClass = Latin1_CharToClass[wrap_ord(c)] + freq = Latin1ClassModel[(self._mLastCharClass * CLASS_NUM) + + charClass] if freq == 0: - self._mState = constants.eNotMe + self._mState = eNotMe break self._mFreqCounter[freq] += 1 self._mLastCharClass = charClass @@ -120,17 +122,18 @@ class Latin1Prober(CharSetProber): return self.get_state() def get_confidence(self): - if self.get_state() == constants.eNotMe: + if self.get_state() == eNotMe: return 0.01 - total = reduce(operator.add, self._mFreqCounter) + total = sum(self._mFreqCounter) if total < 0.01: confidence = 0.0 else: - confidence = (self._mFreqCounter[3] / total) - (self._mFreqCounter[1] * 20.0 / total) + confidence = ((self._mFreqCounter[3] - self._mFreqCounter[1] * 20.0) + / total) if confidence < 0.0: confidence = 0.0 - # lower the confidence of latin1 so that other more accurate detector - # can take priority. - confidence = confidence * 0.5 + # lower the confidence of latin1 so that other more accurate + # detector can take priority. + confidence = confidence * 0.73 return confidence diff --git a/thirdparty/chardet/mbcharsetprober.py b/thirdparty/chardet/mbcharsetprober.py index 09b035e02..bb42f2fb5 100644 --- a/thirdparty/chardet/mbcharsetprober.py +++ b/thirdparty/chardet/mbcharsetprober.py @@ -15,28 +15,29 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants, sys -from constants import eStart, eError, eItsMe -from charsetprober import CharSetProber +import sys +from . import constants +from .charsetprober import CharSetProber + class MultiByteCharSetProber(CharSetProber): def __init__(self): CharSetProber.__init__(self) self._mDistributionAnalyzer = None self._mCodingSM = None - self._mLastChar = ['\x00', '\x00'] + self._mLastChar = [0, 0] def reset(self): CharSetProber.reset(self) @@ -44,36 +45,39 @@ class MultiByteCharSetProber(CharSetProber): self._mCodingSM.reset() if self._mDistributionAnalyzer: self._mDistributionAnalyzer.reset() - self._mLastChar = ['\x00', '\x00'] + self._mLastChar = [0, 0] def get_charset_name(self): pass def feed(self, aBuf): aLen = len(aBuf) - for i in xrange(0, aLen): + for i in range(0, aLen): codingState = self._mCodingSM.next_state(aBuf[i]) - if codingState == eError: + if codingState == constants.eError: if constants._debug: - sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n') + sys.stderr.write(self.get_charset_name() + + ' prober hit error at byte ' + str(i) + + '\n') self._mState = constants.eNotMe break - elif codingState == eItsMe: + elif codingState == constants.eItsMe: self._mState = constants.eFoundIt break - elif codingState == eStart: + elif codingState == constants.eStart: charLen = self._mCodingSM.get_current_charlen() if i == 0: self._mLastChar[1] = aBuf[0] self._mDistributionAnalyzer.feed(self._mLastChar, charLen) else: - self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen) + self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1], + charLen) self._mLastChar[0] = aBuf[aLen - 1] if self.get_state() == constants.eDetecting: - if self._mDistributionAnalyzer.got_enough_data() and \ - (self.get_confidence() > constants.SHORTCUT_THRESHOLD): + if (self._mDistributionAnalyzer.got_enough_data() and + (self.get_confidence() > constants.SHORTCUT_THRESHOLD)): self._mState = constants.eFoundIt return self.get_state() diff --git a/thirdparty/chardet/mbcsgroupprober.py b/thirdparty/chardet/mbcsgroupprober.py index 941cc3e37..03c9dcf3e 100644 --- a/thirdparty/chardet/mbcsgroupprober.py +++ b/thirdparty/chardet/mbcsgroupprober.py @@ -15,36 +15,40 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from charsetgroupprober import CharSetGroupProber -from utf8prober import UTF8Prober -from sjisprober import SJISProber -from eucjpprober import EUCJPProber -from gb2312prober import GB2312Prober -from euckrprober import EUCKRProber -from big5prober import Big5Prober -from euctwprober import EUCTWProber +from .charsetgroupprober import CharSetGroupProber +from .utf8prober import UTF8Prober +from .sjisprober import SJISProber +from .eucjpprober import EUCJPProber +from .gb2312prober import GB2312Prober +from .euckrprober import EUCKRProber +from .cp949prober import CP949Prober +from .big5prober import Big5Prober +from .euctwprober import EUCTWProber + class MBCSGroupProber(CharSetGroupProber): def __init__(self): CharSetGroupProber.__init__(self) - self._mProbers = [ \ + self._mProbers = [ UTF8Prober(), SJISProber(), EUCJPProber(), GB2312Prober(), EUCKRProber(), + CP949Prober(), Big5Prober(), - EUCTWProber()] + EUCTWProber() + ] self.reset() diff --git a/thirdparty/chardet/mbcssm.py b/thirdparty/chardet/mbcssm.py index 2b68306b0..efe678ca0 100644 --- a/thirdparty/chardet/mbcssm.py +++ b/thirdparty/chardet/mbcssm.py @@ -13,60 +13,62 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from constants import eStart, eError, eItsMe +from .constants import eStart, eError, eItsMe -# BIG5 +# BIG5 -BIG5_cls = ( \ +BIG5_cls = ( 1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as legal value - 1,1,1,1,1,1,0,0, # 08 - 0f - 1,1,1,1,1,1,1,1, # 10 - 17 - 1,1,1,0,1,1,1,1, # 18 - 1f - 1,1,1,1,1,1,1,1, # 20 - 27 - 1,1,1,1,1,1,1,1, # 28 - 2f - 1,1,1,1,1,1,1,1, # 30 - 37 - 1,1,1,1,1,1,1,1, # 38 - 3f - 2,2,2,2,2,2,2,2, # 40 - 47 - 2,2,2,2,2,2,2,2, # 48 - 4f - 2,2,2,2,2,2,2,2, # 50 - 57 - 2,2,2,2,2,2,2,2, # 58 - 5f - 2,2,2,2,2,2,2,2, # 60 - 67 - 2,2,2,2,2,2,2,2, # 68 - 6f - 2,2,2,2,2,2,2,2, # 70 - 77 - 2,2,2,2,2,2,2,1, # 78 - 7f - 4,4,4,4,4,4,4,4, # 80 - 87 - 4,4,4,4,4,4,4,4, # 88 - 8f - 4,4,4,4,4,4,4,4, # 90 - 97 - 4,4,4,4,4,4,4,4, # 98 - 9f - 4,3,3,3,3,3,3,3, # a0 - a7 - 3,3,3,3,3,3,3,3, # a8 - af - 3,3,3,3,3,3,3,3, # b0 - b7 - 3,3,3,3,3,3,3,3, # b8 - bf - 3,3,3,3,3,3,3,3, # c0 - c7 - 3,3,3,3,3,3,3,3, # c8 - cf - 3,3,3,3,3,3,3,3, # d0 - d7 - 3,3,3,3,3,3,3,3, # d8 - df - 3,3,3,3,3,3,3,3, # e0 - e7 - 3,3,3,3,3,3,3,3, # e8 - ef - 3,3,3,3,3,3,3,3, # f0 - f7 - 3,3,3,3,3,3,3,0) # f8 - ff + 1,1,1,1,1,1,0,0, # 08 - 0f + 1,1,1,1,1,1,1,1, # 10 - 17 + 1,1,1,0,1,1,1,1, # 18 - 1f + 1,1,1,1,1,1,1,1, # 20 - 27 + 1,1,1,1,1,1,1,1, # 28 - 2f + 1,1,1,1,1,1,1,1, # 30 - 37 + 1,1,1,1,1,1,1,1, # 38 - 3f + 2,2,2,2,2,2,2,2, # 40 - 47 + 2,2,2,2,2,2,2,2, # 48 - 4f + 2,2,2,2,2,2,2,2, # 50 - 57 + 2,2,2,2,2,2,2,2, # 58 - 5f + 2,2,2,2,2,2,2,2, # 60 - 67 + 2,2,2,2,2,2,2,2, # 68 - 6f + 2,2,2,2,2,2,2,2, # 70 - 77 + 2,2,2,2,2,2,2,1, # 78 - 7f + 4,4,4,4,4,4,4,4, # 80 - 87 + 4,4,4,4,4,4,4,4, # 88 - 8f + 4,4,4,4,4,4,4,4, # 90 - 97 + 4,4,4,4,4,4,4,4, # 98 - 9f + 4,3,3,3,3,3,3,3, # a0 - a7 + 3,3,3,3,3,3,3,3, # a8 - af + 3,3,3,3,3,3,3,3, # b0 - b7 + 3,3,3,3,3,3,3,3, # b8 - bf + 3,3,3,3,3,3,3,3, # c0 - c7 + 3,3,3,3,3,3,3,3, # c8 - cf + 3,3,3,3,3,3,3,3, # d0 - d7 + 3,3,3,3,3,3,3,3, # d8 - df + 3,3,3,3,3,3,3,3, # e0 - e7 + 3,3,3,3,3,3,3,3, # e8 - ef + 3,3,3,3,3,3,3,3, # f0 - f7 + 3,3,3,3,3,3,3,0 # f8 - ff +) -BIG5_st = ( \ - eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07 - eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,#08-0f - eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart)#10-17 +BIG5_st = ( + eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07 + eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,#08-0f + eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart#10-17 +) Big5CharLenTable = (0, 1, 1, 2, 0) @@ -76,48 +78,90 @@ Big5SMModel = {'classTable': BIG5_cls, 'charLenTable': Big5CharLenTable, 'name': 'Big5'} +# CP949 + +CP949_cls = ( + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,0,0, # 00 - 0f + 1,1,1,1,1,1,1,1, 1,1,1,0,1,1,1,1, # 10 - 1f + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 20 - 2f + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 30 - 3f + 1,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, # 40 - 4f + 4,4,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 50 - 5f + 1,5,5,5,5,5,5,5, 5,5,5,5,5,5,5,5, # 60 - 6f + 5,5,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 70 - 7f + 0,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 80 - 8f + 6,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 90 - 9f + 6,7,7,7,7,7,7,7, 7,7,7,7,7,8,8,8, # a0 - af + 7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7, # b0 - bf + 7,7,7,7,7,7,9,2, 2,3,2,2,2,2,2,2, # c0 - cf + 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # d0 - df + 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # e0 - ef + 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,0, # f0 - ff +) + +CP949_st = ( +#cls= 0 1 2 3 4 5 6 7 8 9 # previous state = + eError,eStart, 3,eError,eStart,eStart, 4, 5,eError, 6, # eStart + eError,eError,eError,eError,eError,eError,eError,eError,eError,eError, # eError + eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe, # eItsMe + eError,eError,eStart,eStart,eError,eError,eError,eStart,eStart,eStart, # 3 + eError,eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart, # 4 + eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart, # 5 + eError,eStart,eStart,eStart,eStart,eError,eError,eStart,eStart,eStart, # 6 +) + +CP949CharLenTable = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2) + +CP949SMModel = {'classTable': CP949_cls, + 'classFactor': 10, + 'stateTable': CP949_st, + 'charLenTable': CP949CharLenTable, + 'name': 'CP949'} + # EUC-JP -EUCJP_cls = ( \ - 4,4,4,4,4,4,4,4, # 00 - 07 - 4,4,4,4,4,4,5,5, # 08 - 0f - 4,4,4,4,4,4,4,4, # 10 - 17 - 4,4,4,5,4,4,4,4, # 18 - 1f - 4,4,4,4,4,4,4,4, # 20 - 27 - 4,4,4,4,4,4,4,4, # 28 - 2f - 4,4,4,4,4,4,4,4, # 30 - 37 - 4,4,4,4,4,4,4,4, # 38 - 3f - 4,4,4,4,4,4,4,4, # 40 - 47 - 4,4,4,4,4,4,4,4, # 48 - 4f - 4,4,4,4,4,4,4,4, # 50 - 57 - 4,4,4,4,4,4,4,4, # 58 - 5f - 4,4,4,4,4,4,4,4, # 60 - 67 - 4,4,4,4,4,4,4,4, # 68 - 6f - 4,4,4,4,4,4,4,4, # 70 - 77 - 4,4,4,4,4,4,4,4, # 78 - 7f - 5,5,5,5,5,5,5,5, # 80 - 87 - 5,5,5,5,5,5,1,3, # 88 - 8f - 5,5,5,5,5,5,5,5, # 90 - 97 - 5,5,5,5,5,5,5,5, # 98 - 9f - 5,2,2,2,2,2,2,2, # a0 - a7 - 2,2,2,2,2,2,2,2, # a8 - af - 2,2,2,2,2,2,2,2, # b0 - b7 - 2,2,2,2,2,2,2,2, # b8 - bf - 2,2,2,2,2,2,2,2, # c0 - c7 - 2,2,2,2,2,2,2,2, # c8 - cf - 2,2,2,2,2,2,2,2, # d0 - d7 - 2,2,2,2,2,2,2,2, # d8 - df - 0,0,0,0,0,0,0,0, # e0 - e7 - 0,0,0,0,0,0,0,0, # e8 - ef - 0,0,0,0,0,0,0,0, # f0 - f7 - 0,0,0,0,0,0,0,5) # f8 - ff +EUCJP_cls = ( + 4,4,4,4,4,4,4,4, # 00 - 07 + 4,4,4,4,4,4,5,5, # 08 - 0f + 4,4,4,4,4,4,4,4, # 10 - 17 + 4,4,4,5,4,4,4,4, # 18 - 1f + 4,4,4,4,4,4,4,4, # 20 - 27 + 4,4,4,4,4,4,4,4, # 28 - 2f + 4,4,4,4,4,4,4,4, # 30 - 37 + 4,4,4,4,4,4,4,4, # 38 - 3f + 4,4,4,4,4,4,4,4, # 40 - 47 + 4,4,4,4,4,4,4,4, # 48 - 4f + 4,4,4,4,4,4,4,4, # 50 - 57 + 4,4,4,4,4,4,4,4, # 58 - 5f + 4,4,4,4,4,4,4,4, # 60 - 67 + 4,4,4,4,4,4,4,4, # 68 - 6f + 4,4,4,4,4,4,4,4, # 70 - 77 + 4,4,4,4,4,4,4,4, # 78 - 7f + 5,5,5,5,5,5,5,5, # 80 - 87 + 5,5,5,5,5,5,1,3, # 88 - 8f + 5,5,5,5,5,5,5,5, # 90 - 97 + 5,5,5,5,5,5,5,5, # 98 - 9f + 5,2,2,2,2,2,2,2, # a0 - a7 + 2,2,2,2,2,2,2,2, # a8 - af + 2,2,2,2,2,2,2,2, # b0 - b7 + 2,2,2,2,2,2,2,2, # b8 - bf + 2,2,2,2,2,2,2,2, # c0 - c7 + 2,2,2,2,2,2,2,2, # c8 - cf + 2,2,2,2,2,2,2,2, # d0 - d7 + 2,2,2,2,2,2,2,2, # d8 - df + 0,0,0,0,0,0,0,0, # e0 - e7 + 0,0,0,0,0,0,0,0, # e8 - ef + 0,0,0,0,0,0,0,0, # f0 - f7 + 0,0,0,0,0,0,0,5 # f8 - ff +) -EUCJP_st = ( \ - 3, 4, 3, 5,eStart,eError,eError,eError,#00-07 - eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f - eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError,#10-17 - eError,eError,eStart,eError,eError,eError, 3,eError,#18-1f - 3,eError,eError,eError,eStart,eStart,eStart,eStart)#20-27 +EUCJP_st = ( + 3, 4, 3, 5,eStart,eError,eError,eError,#00-07 + eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f + eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError,#10-17 + eError,eError,eStart,eError,eError,eError, 3,eError,#18-1f + 3,eError,eError,eError,eStart,eStart,eStart,eStart#20-27 +) EUCJPCharLenTable = (2, 2, 2, 3, 1, 0) @@ -129,43 +173,45 @@ EUCJPSMModel = {'classTable': EUCJP_cls, # EUC-KR -EUCKR_cls = ( \ - 1,1,1,1,1,1,1,1, # 00 - 07 - 1,1,1,1,1,1,0,0, # 08 - 0f - 1,1,1,1,1,1,1,1, # 10 - 17 - 1,1,1,0,1,1,1,1, # 18 - 1f - 1,1,1,1,1,1,1,1, # 20 - 27 - 1,1,1,1,1,1,1,1, # 28 - 2f - 1,1,1,1,1,1,1,1, # 30 - 37 - 1,1,1,1,1,1,1,1, # 38 - 3f - 1,1,1,1,1,1,1,1, # 40 - 47 - 1,1,1,1,1,1,1,1, # 48 - 4f - 1,1,1,1,1,1,1,1, # 50 - 57 - 1,1,1,1,1,1,1,1, # 58 - 5f - 1,1,1,1,1,1,1,1, # 60 - 67 - 1,1,1,1,1,1,1,1, # 68 - 6f - 1,1,1,1,1,1,1,1, # 70 - 77 - 1,1,1,1,1,1,1,1, # 78 - 7f - 0,0,0,0,0,0,0,0, # 80 - 87 - 0,0,0,0,0,0,0,0, # 88 - 8f - 0,0,0,0,0,0,0,0, # 90 - 97 - 0,0,0,0,0,0,0,0, # 98 - 9f - 0,2,2,2,2,2,2,2, # a0 - a7 - 2,2,2,2,2,3,3,3, # a8 - af - 2,2,2,2,2,2,2,2, # b0 - b7 - 2,2,2,2,2,2,2,2, # b8 - bf - 2,2,2,2,2,2,2,2, # c0 - c7 - 2,3,2,2,2,2,2,2, # c8 - cf - 2,2,2,2,2,2,2,2, # d0 - d7 - 2,2,2,2,2,2,2,2, # d8 - df - 2,2,2,2,2,2,2,2, # e0 - e7 - 2,2,2,2,2,2,2,2, # e8 - ef - 2,2,2,2,2,2,2,2, # f0 - f7 - 2,2,2,2,2,2,2,0) # f8 - ff +EUCKR_cls = ( + 1,1,1,1,1,1,1,1, # 00 - 07 + 1,1,1,1,1,1,0,0, # 08 - 0f + 1,1,1,1,1,1,1,1, # 10 - 17 + 1,1,1,0,1,1,1,1, # 18 - 1f + 1,1,1,1,1,1,1,1, # 20 - 27 + 1,1,1,1,1,1,1,1, # 28 - 2f + 1,1,1,1,1,1,1,1, # 30 - 37 + 1,1,1,1,1,1,1,1, # 38 - 3f + 1,1,1,1,1,1,1,1, # 40 - 47 + 1,1,1,1,1,1,1,1, # 48 - 4f + 1,1,1,1,1,1,1,1, # 50 - 57 + 1,1,1,1,1,1,1,1, # 58 - 5f + 1,1,1,1,1,1,1,1, # 60 - 67 + 1,1,1,1,1,1,1,1, # 68 - 6f + 1,1,1,1,1,1,1,1, # 70 - 77 + 1,1,1,1,1,1,1,1, # 78 - 7f + 0,0,0,0,0,0,0,0, # 80 - 87 + 0,0,0,0,0,0,0,0, # 88 - 8f + 0,0,0,0,0,0,0,0, # 90 - 97 + 0,0,0,0,0,0,0,0, # 98 - 9f + 0,2,2,2,2,2,2,2, # a0 - a7 + 2,2,2,2,2,3,3,3, # a8 - af + 2,2,2,2,2,2,2,2, # b0 - b7 + 2,2,2,2,2,2,2,2, # b8 - bf + 2,2,2,2,2,2,2,2, # c0 - c7 + 2,3,2,2,2,2,2,2, # c8 - cf + 2,2,2,2,2,2,2,2, # d0 - d7 + 2,2,2,2,2,2,2,2, # d8 - df + 2,2,2,2,2,2,2,2, # e0 - e7 + 2,2,2,2,2,2,2,2, # e8 - ef + 2,2,2,2,2,2,2,2, # f0 - f7 + 2,2,2,2,2,2,2,0 # f8 - ff +) EUCKR_st = ( - eError,eStart, 3,eError,eError,eError,eError,eError,#00-07 - eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart)#08-0f + eError,eStart, 3,eError,eError,eError,eError,eError,#00-07 + eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart #08-0f +) EUCKRCharLenTable = (0, 1, 2, 0) @@ -177,47 +223,49 @@ EUCKRSMModel = {'classTable': EUCKR_cls, # EUC-TW -EUCTW_cls = ( \ - 2,2,2,2,2,2,2,2, # 00 - 07 - 2,2,2,2,2,2,0,0, # 08 - 0f - 2,2,2,2,2,2,2,2, # 10 - 17 - 2,2,2,0,2,2,2,2, # 18 - 1f - 2,2,2,2,2,2,2,2, # 20 - 27 - 2,2,2,2,2,2,2,2, # 28 - 2f - 2,2,2,2,2,2,2,2, # 30 - 37 - 2,2,2,2,2,2,2,2, # 38 - 3f - 2,2,2,2,2,2,2,2, # 40 - 47 - 2,2,2,2,2,2,2,2, # 48 - 4f - 2,2,2,2,2,2,2,2, # 50 - 57 - 2,2,2,2,2,2,2,2, # 58 - 5f - 2,2,2,2,2,2,2,2, # 60 - 67 - 2,2,2,2,2,2,2,2, # 68 - 6f - 2,2,2,2,2,2,2,2, # 70 - 77 - 2,2,2,2,2,2,2,2, # 78 - 7f - 0,0,0,0,0,0,0,0, # 80 - 87 - 0,0,0,0,0,0,6,0, # 88 - 8f - 0,0,0,0,0,0,0,0, # 90 - 97 - 0,0,0,0,0,0,0,0, # 98 - 9f - 0,3,4,4,4,4,4,4, # a0 - a7 - 5,5,1,1,1,1,1,1, # a8 - af - 1,1,1,1,1,1,1,1, # b0 - b7 - 1,1,1,1,1,1,1,1, # b8 - bf - 1,1,3,1,3,3,3,3, # c0 - c7 - 3,3,3,3,3,3,3,3, # c8 - cf - 3,3,3,3,3,3,3,3, # d0 - d7 - 3,3,3,3,3,3,3,3, # d8 - df - 3,3,3,3,3,3,3,3, # e0 - e7 - 3,3,3,3,3,3,3,3, # e8 - ef - 3,3,3,3,3,3,3,3, # f0 - f7 - 3,3,3,3,3,3,3,0) # f8 - ff +EUCTW_cls = ( + 2,2,2,2,2,2,2,2, # 00 - 07 + 2,2,2,2,2,2,0,0, # 08 - 0f + 2,2,2,2,2,2,2,2, # 10 - 17 + 2,2,2,0,2,2,2,2, # 18 - 1f + 2,2,2,2,2,2,2,2, # 20 - 27 + 2,2,2,2,2,2,2,2, # 28 - 2f + 2,2,2,2,2,2,2,2, # 30 - 37 + 2,2,2,2,2,2,2,2, # 38 - 3f + 2,2,2,2,2,2,2,2, # 40 - 47 + 2,2,2,2,2,2,2,2, # 48 - 4f + 2,2,2,2,2,2,2,2, # 50 - 57 + 2,2,2,2,2,2,2,2, # 58 - 5f + 2,2,2,2,2,2,2,2, # 60 - 67 + 2,2,2,2,2,2,2,2, # 68 - 6f + 2,2,2,2,2,2,2,2, # 70 - 77 + 2,2,2,2,2,2,2,2, # 78 - 7f + 0,0,0,0,0,0,0,0, # 80 - 87 + 0,0,0,0,0,0,6,0, # 88 - 8f + 0,0,0,0,0,0,0,0, # 90 - 97 + 0,0,0,0,0,0,0,0, # 98 - 9f + 0,3,4,4,4,4,4,4, # a0 - a7 + 5,5,1,1,1,1,1,1, # a8 - af + 1,1,1,1,1,1,1,1, # b0 - b7 + 1,1,1,1,1,1,1,1, # b8 - bf + 1,1,3,1,3,3,3,3, # c0 - c7 + 3,3,3,3,3,3,3,3, # c8 - cf + 3,3,3,3,3,3,3,3, # d0 - d7 + 3,3,3,3,3,3,3,3, # d8 - df + 3,3,3,3,3,3,3,3, # e0 - e7 + 3,3,3,3,3,3,3,3, # e8 - ef + 3,3,3,3,3,3,3,3, # f0 - f7 + 3,3,3,3,3,3,3,0 # f8 - ff +) -EUCTW_st = ( \ - eError,eError,eStart, 3, 3, 3, 4,eError,#00-07 - eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f - eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError,#10-17 - eStart,eStart,eStart,eError,eError,eError,eError,eError,#18-1f - 5,eError,eError,eError,eStart,eError,eStart,eStart,#20-27 - eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart)#28-2f +EUCTW_st = ( + eError,eError,eStart, 3, 3, 3, 4,eError,#00-07 + eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f + eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError,#10-17 + eStart,eStart,eStart,eError,eError,eError,eError,eError,#18-1f + 5,eError,eError,eError,eStart,eError,eStart,eStart,#20-27 + eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart #28-2f +) EUCTWCharLenTable = (0, 0, 1, 2, 2, 2, 3) @@ -229,53 +277,55 @@ EUCTWSMModel = {'classTable': EUCTW_cls, # GB2312 -GB2312_cls = ( \ - 1,1,1,1,1,1,1,1, # 00 - 07 - 1,1,1,1,1,1,0,0, # 08 - 0f - 1,1,1,1,1,1,1,1, # 10 - 17 - 1,1,1,0,1,1,1,1, # 18 - 1f - 1,1,1,1,1,1,1,1, # 20 - 27 - 1,1,1,1,1,1,1,1, # 28 - 2f - 3,3,3,3,3,3,3,3, # 30 - 37 - 3,3,1,1,1,1,1,1, # 38 - 3f - 2,2,2,2,2,2,2,2, # 40 - 47 - 2,2,2,2,2,2,2,2, # 48 - 4f - 2,2,2,2,2,2,2,2, # 50 - 57 - 2,2,2,2,2,2,2,2, # 58 - 5f - 2,2,2,2,2,2,2,2, # 60 - 67 - 2,2,2,2,2,2,2,2, # 68 - 6f - 2,2,2,2,2,2,2,2, # 70 - 77 - 2,2,2,2,2,2,2,4, # 78 - 7f - 5,6,6,6,6,6,6,6, # 80 - 87 - 6,6,6,6,6,6,6,6, # 88 - 8f - 6,6,6,6,6,6,6,6, # 90 - 97 - 6,6,6,6,6,6,6,6, # 98 - 9f - 6,6,6,6,6,6,6,6, # a0 - a7 - 6,6,6,6,6,6,6,6, # a8 - af - 6,6,6,6,6,6,6,6, # b0 - b7 - 6,6,6,6,6,6,6,6, # b8 - bf - 6,6,6,6,6,6,6,6, # c0 - c7 - 6,6,6,6,6,6,6,6, # c8 - cf - 6,6,6,6,6,6,6,6, # d0 - d7 - 6,6,6,6,6,6,6,6, # d8 - df - 6,6,6,6,6,6,6,6, # e0 - e7 - 6,6,6,6,6,6,6,6, # e8 - ef - 6,6,6,6,6,6,6,6, # f0 - f7 - 6,6,6,6,6,6,6,0) # f8 - ff +GB2312_cls = ( + 1,1,1,1,1,1,1,1, # 00 - 07 + 1,1,1,1,1,1,0,0, # 08 - 0f + 1,1,1,1,1,1,1,1, # 10 - 17 + 1,1,1,0,1,1,1,1, # 18 - 1f + 1,1,1,1,1,1,1,1, # 20 - 27 + 1,1,1,1,1,1,1,1, # 28 - 2f + 3,3,3,3,3,3,3,3, # 30 - 37 + 3,3,1,1,1,1,1,1, # 38 - 3f + 2,2,2,2,2,2,2,2, # 40 - 47 + 2,2,2,2,2,2,2,2, # 48 - 4f + 2,2,2,2,2,2,2,2, # 50 - 57 + 2,2,2,2,2,2,2,2, # 58 - 5f + 2,2,2,2,2,2,2,2, # 60 - 67 + 2,2,2,2,2,2,2,2, # 68 - 6f + 2,2,2,2,2,2,2,2, # 70 - 77 + 2,2,2,2,2,2,2,4, # 78 - 7f + 5,6,6,6,6,6,6,6, # 80 - 87 + 6,6,6,6,6,6,6,6, # 88 - 8f + 6,6,6,6,6,6,6,6, # 90 - 97 + 6,6,6,6,6,6,6,6, # 98 - 9f + 6,6,6,6,6,6,6,6, # a0 - a7 + 6,6,6,6,6,6,6,6, # a8 - af + 6,6,6,6,6,6,6,6, # b0 - b7 + 6,6,6,6,6,6,6,6, # b8 - bf + 6,6,6,6,6,6,6,6, # c0 - c7 + 6,6,6,6,6,6,6,6, # c8 - cf + 6,6,6,6,6,6,6,6, # d0 - d7 + 6,6,6,6,6,6,6,6, # d8 - df + 6,6,6,6,6,6,6,6, # e0 - e7 + 6,6,6,6,6,6,6,6, # e8 - ef + 6,6,6,6,6,6,6,6, # f0 - f7 + 6,6,6,6,6,6,6,0 # f8 - ff +) -GB2312_st = ( \ - eError,eStart,eStart,eStart,eStart,eStart, 3,eError,#00-07 - eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f - eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,#10-17 - 4,eError,eStart,eStart,eError,eError,eError,eError,#18-1f - eError,eError, 5,eError,eError,eError,eItsMe,eError,#20-27 - eError,eError,eStart,eStart,eStart,eStart,eStart,eStart)#28-2f +GB2312_st = ( + eError,eStart,eStart,eStart,eStart,eStart, 3,eError,#00-07 + eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,#08-0f + eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,#10-17 + 4,eError,eStart,eStart,eError,eError,eError,eError,#18-1f + eError,eError, 5,eError,eError,eError,eItsMe,eError,#20-27 + eError,eError,eStart,eStart,eStart,eStart,eStart,eStart #28-2f +) -# To be accurate, the length of class 6 can be either 2 or 4. -# But it is not necessary to discriminate between the two since -# it is used for frequency analysis only, and we are validing -# each code range there as well. So it is safe to set it to be -# 2 here. +# To be accurate, the length of class 6 can be either 2 or 4. +# But it is not necessary to discriminate between the two since +# it is used for frequency analysis only, and we are validing +# each code range there as well. So it is safe to set it to be +# 2 here. GB2312CharLenTable = (0, 1, 1, 1, 1, 1, 2) GB2312SMModel = {'classTable': GB2312_cls, @@ -286,46 +336,48 @@ GB2312SMModel = {'classTable': GB2312_cls, # Shift_JIS -SJIS_cls = ( \ - 1,1,1,1,1,1,1,1, # 00 - 07 - 1,1,1,1,1,1,0,0, # 08 - 0f - 1,1,1,1,1,1,1,1, # 10 - 17 - 1,1,1,0,1,1,1,1, # 18 - 1f - 1,1,1,1,1,1,1,1, # 20 - 27 - 1,1,1,1,1,1,1,1, # 28 - 2f - 1,1,1,1,1,1,1,1, # 30 - 37 - 1,1,1,1,1,1,1,1, # 38 - 3f - 2,2,2,2,2,2,2,2, # 40 - 47 - 2,2,2,2,2,2,2,2, # 48 - 4f - 2,2,2,2,2,2,2,2, # 50 - 57 - 2,2,2,2,2,2,2,2, # 58 - 5f - 2,2,2,2,2,2,2,2, # 60 - 67 - 2,2,2,2,2,2,2,2, # 68 - 6f - 2,2,2,2,2,2,2,2, # 70 - 77 - 2,2,2,2,2,2,2,1, # 78 - 7f - 3,3,3,3,3,3,3,3, # 80 - 87 - 3,3,3,3,3,3,3,3, # 88 - 8f - 3,3,3,3,3,3,3,3, # 90 - 97 - 3,3,3,3,3,3,3,3, # 98 - 9f - #0xa0 is illegal in sjis encoding, but some pages does +SJIS_cls = ( + 1,1,1,1,1,1,1,1, # 00 - 07 + 1,1,1,1,1,1,0,0, # 08 - 0f + 1,1,1,1,1,1,1,1, # 10 - 17 + 1,1,1,0,1,1,1,1, # 18 - 1f + 1,1,1,1,1,1,1,1, # 20 - 27 + 1,1,1,1,1,1,1,1, # 28 - 2f + 1,1,1,1,1,1,1,1, # 30 - 37 + 1,1,1,1,1,1,1,1, # 38 - 3f + 2,2,2,2,2,2,2,2, # 40 - 47 + 2,2,2,2,2,2,2,2, # 48 - 4f + 2,2,2,2,2,2,2,2, # 50 - 57 + 2,2,2,2,2,2,2,2, # 58 - 5f + 2,2,2,2,2,2,2,2, # 60 - 67 + 2,2,2,2,2,2,2,2, # 68 - 6f + 2,2,2,2,2,2,2,2, # 70 - 77 + 2,2,2,2,2,2,2,1, # 78 - 7f + 3,3,3,3,3,2,2,3, # 80 - 87 + 3,3,3,3,3,3,3,3, # 88 - 8f + 3,3,3,3,3,3,3,3, # 90 - 97 + 3,3,3,3,3,3,3,3, # 98 - 9f + #0xa0 is illegal in sjis encoding, but some pages does #contain such byte. We need to be more error forgiven. - 2,2,2,2,2,2,2,2, # a0 - a7 - 2,2,2,2,2,2,2,2, # a8 - af - 2,2,2,2,2,2,2,2, # b0 - b7 - 2,2,2,2,2,2,2,2, # b8 - bf - 2,2,2,2,2,2,2,2, # c0 - c7 - 2,2,2,2,2,2,2,2, # c8 - cf - 2,2,2,2,2,2,2,2, # d0 - d7 - 2,2,2,2,2,2,2,2, # d8 - df - 3,3,3,3,3,3,3,3, # e0 - e7 - 3,3,3,3,3,4,4,4, # e8 - ef - 4,4,4,4,4,4,4,4, # f0 - f7 - 4,4,4,4,4,0,0,0) # f8 - ff + 2,2,2,2,2,2,2,2, # a0 - a7 + 2,2,2,2,2,2,2,2, # a8 - af + 2,2,2,2,2,2,2,2, # b0 - b7 + 2,2,2,2,2,2,2,2, # b8 - bf + 2,2,2,2,2,2,2,2, # c0 - c7 + 2,2,2,2,2,2,2,2, # c8 - cf + 2,2,2,2,2,2,2,2, # d0 - d7 + 2,2,2,2,2,2,2,2, # d8 - df + 3,3,3,3,3,3,3,3, # e0 - e7 + 3,3,3,3,3,4,4,4, # e8 - ef + 3,3,3,3,3,3,3,3, # f0 - f7 + 3,3,3,3,3,0,0,0) # f8 - ff -SJIS_st = ( \ - eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07 - eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f - eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart)#10-17 + +SJIS_st = ( + eError,eStart,eStart, 3,eError,eError,eError,eError,#00-07 + eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f + eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart #10-17 +) SJISCharLenTable = (0, 1, 1, 2, 0, 0) @@ -337,48 +389,50 @@ SJISSMModel = {'classTable': SJIS_cls, # UCS2-BE -UCS2BE_cls = ( \ - 0,0,0,0,0,0,0,0, # 00 - 07 - 0,0,1,0,0,2,0,0, # 08 - 0f - 0,0,0,0,0,0,0,0, # 10 - 17 - 0,0,0,3,0,0,0,0, # 18 - 1f - 0,0,0,0,0,0,0,0, # 20 - 27 - 0,3,3,3,3,3,0,0, # 28 - 2f - 0,0,0,0,0,0,0,0, # 30 - 37 - 0,0,0,0,0,0,0,0, # 38 - 3f - 0,0,0,0,0,0,0,0, # 40 - 47 - 0,0,0,0,0,0,0,0, # 48 - 4f - 0,0,0,0,0,0,0,0, # 50 - 57 - 0,0,0,0,0,0,0,0, # 58 - 5f - 0,0,0,0,0,0,0,0, # 60 - 67 - 0,0,0,0,0,0,0,0, # 68 - 6f - 0,0,0,0,0,0,0,0, # 70 - 77 - 0,0,0,0,0,0,0,0, # 78 - 7f - 0,0,0,0,0,0,0,0, # 80 - 87 - 0,0,0,0,0,0,0,0, # 88 - 8f - 0,0,0,0,0,0,0,0, # 90 - 97 - 0,0,0,0,0,0,0,0, # 98 - 9f - 0,0,0,0,0,0,0,0, # a0 - a7 - 0,0,0,0,0,0,0,0, # a8 - af - 0,0,0,0,0,0,0,0, # b0 - b7 - 0,0,0,0,0,0,0,0, # b8 - bf - 0,0,0,0,0,0,0,0, # c0 - c7 - 0,0,0,0,0,0,0,0, # c8 - cf - 0,0,0,0,0,0,0,0, # d0 - d7 - 0,0,0,0,0,0,0,0, # d8 - df - 0,0,0,0,0,0,0,0, # e0 - e7 - 0,0,0,0,0,0,0,0, # e8 - ef - 0,0,0,0,0,0,0,0, # f0 - f7 - 0,0,0,0,0,0,4,5) # f8 - ff +UCS2BE_cls = ( + 0,0,0,0,0,0,0,0, # 00 - 07 + 0,0,1,0,0,2,0,0, # 08 - 0f + 0,0,0,0,0,0,0,0, # 10 - 17 + 0,0,0,3,0,0,0,0, # 18 - 1f + 0,0,0,0,0,0,0,0, # 20 - 27 + 0,3,3,3,3,3,0,0, # 28 - 2f + 0,0,0,0,0,0,0,0, # 30 - 37 + 0,0,0,0,0,0,0,0, # 38 - 3f + 0,0,0,0,0,0,0,0, # 40 - 47 + 0,0,0,0,0,0,0,0, # 48 - 4f + 0,0,0,0,0,0,0,0, # 50 - 57 + 0,0,0,0,0,0,0,0, # 58 - 5f + 0,0,0,0,0,0,0,0, # 60 - 67 + 0,0,0,0,0,0,0,0, # 68 - 6f + 0,0,0,0,0,0,0,0, # 70 - 77 + 0,0,0,0,0,0,0,0, # 78 - 7f + 0,0,0,0,0,0,0,0, # 80 - 87 + 0,0,0,0,0,0,0,0, # 88 - 8f + 0,0,0,0,0,0,0,0, # 90 - 97 + 0,0,0,0,0,0,0,0, # 98 - 9f + 0,0,0,0,0,0,0,0, # a0 - a7 + 0,0,0,0,0,0,0,0, # a8 - af + 0,0,0,0,0,0,0,0, # b0 - b7 + 0,0,0,0,0,0,0,0, # b8 - bf + 0,0,0,0,0,0,0,0, # c0 - c7 + 0,0,0,0,0,0,0,0, # c8 - cf + 0,0,0,0,0,0,0,0, # d0 - d7 + 0,0,0,0,0,0,0,0, # d8 - df + 0,0,0,0,0,0,0,0, # e0 - e7 + 0,0,0,0,0,0,0,0, # e8 - ef + 0,0,0,0,0,0,0,0, # f0 - f7 + 0,0,0,0,0,0,4,5 # f8 - ff +) -UCS2BE_st = ( \ - 5, 7, 7,eError, 4, 3,eError,eError,#00-07 - eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f - eItsMe,eItsMe, 6, 6, 6, 6,eError,eError,#10-17 - 6, 6, 6, 6, 6,eItsMe, 6, 6,#18-1f - 6, 6, 6, 6, 5, 7, 7,eError,#20-27 - 5, 8, 6, 6,eError, 6, 6, 6,#28-2f - 6, 6, 6, 6,eError,eError,eStart,eStart)#30-37 +UCS2BE_st = ( + 5, 7, 7,eError, 4, 3,eError,eError,#00-07 + eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f + eItsMe,eItsMe, 6, 6, 6, 6,eError,eError,#10-17 + 6, 6, 6, 6, 6,eItsMe, 6, 6,#18-1f + 6, 6, 6, 6, 5, 7, 7,eError,#20-27 + 5, 8, 6, 6,eError, 6, 6, 6,#28-2f + 6, 6, 6, 6,eError,eError,eStart,eStart #30-37 +) UCS2BECharLenTable = (2, 2, 2, 0, 2, 2) @@ -390,48 +444,50 @@ UCS2BESMModel = {'classTable': UCS2BE_cls, # UCS2-LE -UCS2LE_cls = ( \ - 0,0,0,0,0,0,0,0, # 00 - 07 - 0,0,1,0,0,2,0,0, # 08 - 0f - 0,0,0,0,0,0,0,0, # 10 - 17 - 0,0,0,3,0,0,0,0, # 18 - 1f - 0,0,0,0,0,0,0,0, # 20 - 27 - 0,3,3,3,3,3,0,0, # 28 - 2f - 0,0,0,0,0,0,0,0, # 30 - 37 - 0,0,0,0,0,0,0,0, # 38 - 3f - 0,0,0,0,0,0,0,0, # 40 - 47 - 0,0,0,0,0,0,0,0, # 48 - 4f - 0,0,0,0,0,0,0,0, # 50 - 57 - 0,0,0,0,0,0,0,0, # 58 - 5f - 0,0,0,0,0,0,0,0, # 60 - 67 - 0,0,0,0,0,0,0,0, # 68 - 6f - 0,0,0,0,0,0,0,0, # 70 - 77 - 0,0,0,0,0,0,0,0, # 78 - 7f - 0,0,0,0,0,0,0,0, # 80 - 87 - 0,0,0,0,0,0,0,0, # 88 - 8f - 0,0,0,0,0,0,0,0, # 90 - 97 - 0,0,0,0,0,0,0,0, # 98 - 9f - 0,0,0,0,0,0,0,0, # a0 - a7 - 0,0,0,0,0,0,0,0, # a8 - af - 0,0,0,0,0,0,0,0, # b0 - b7 - 0,0,0,0,0,0,0,0, # b8 - bf - 0,0,0,0,0,0,0,0, # c0 - c7 - 0,0,0,0,0,0,0,0, # c8 - cf - 0,0,0,0,0,0,0,0, # d0 - d7 - 0,0,0,0,0,0,0,0, # d8 - df - 0,0,0,0,0,0,0,0, # e0 - e7 - 0,0,0,0,0,0,0,0, # e8 - ef - 0,0,0,0,0,0,0,0, # f0 - f7 - 0,0,0,0,0,0,4,5) # f8 - ff +UCS2LE_cls = ( + 0,0,0,0,0,0,0,0, # 00 - 07 + 0,0,1,0,0,2,0,0, # 08 - 0f + 0,0,0,0,0,0,0,0, # 10 - 17 + 0,0,0,3,0,0,0,0, # 18 - 1f + 0,0,0,0,0,0,0,0, # 20 - 27 + 0,3,3,3,3,3,0,0, # 28 - 2f + 0,0,0,0,0,0,0,0, # 30 - 37 + 0,0,0,0,0,0,0,0, # 38 - 3f + 0,0,0,0,0,0,0,0, # 40 - 47 + 0,0,0,0,0,0,0,0, # 48 - 4f + 0,0,0,0,0,0,0,0, # 50 - 57 + 0,0,0,0,0,0,0,0, # 58 - 5f + 0,0,0,0,0,0,0,0, # 60 - 67 + 0,0,0,0,0,0,0,0, # 68 - 6f + 0,0,0,0,0,0,0,0, # 70 - 77 + 0,0,0,0,0,0,0,0, # 78 - 7f + 0,0,0,0,0,0,0,0, # 80 - 87 + 0,0,0,0,0,0,0,0, # 88 - 8f + 0,0,0,0,0,0,0,0, # 90 - 97 + 0,0,0,0,0,0,0,0, # 98 - 9f + 0,0,0,0,0,0,0,0, # a0 - a7 + 0,0,0,0,0,0,0,0, # a8 - af + 0,0,0,0,0,0,0,0, # b0 - b7 + 0,0,0,0,0,0,0,0, # b8 - bf + 0,0,0,0,0,0,0,0, # c0 - c7 + 0,0,0,0,0,0,0,0, # c8 - cf + 0,0,0,0,0,0,0,0, # d0 - d7 + 0,0,0,0,0,0,0,0, # d8 - df + 0,0,0,0,0,0,0,0, # e0 - e7 + 0,0,0,0,0,0,0,0, # e8 - ef + 0,0,0,0,0,0,0,0, # f0 - f7 + 0,0,0,0,0,0,4,5 # f8 - ff +) -UCS2LE_st = ( \ - 6, 6, 7, 6, 4, 3,eError,eError,#00-07 - eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f - eItsMe,eItsMe, 5, 5, 5,eError,eItsMe,eError,#10-17 - 5, 5, 5,eError, 5,eError, 6, 6,#18-1f - 7, 6, 8, 8, 5, 5, 5,eError,#20-27 - 5, 5, 5,eError,eError,eError, 5, 5,#28-2f - 5, 5, 5,eError, 5,eError,eStart,eStart)#30-37 +UCS2LE_st = ( + 6, 6, 7, 6, 4, 3,eError,eError,#00-07 + eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,#08-0f + eItsMe,eItsMe, 5, 5, 5,eError,eItsMe,eError,#10-17 + 5, 5, 5,eError, 5,eError, 6, 6,#18-1f + 7, 6, 8, 8, 5, 5, 5,eError,#20-27 + 5, 5, 5,eError,eError,eError, 5, 5,#28-2f + 5, 5, 5,eError, 5,eError,eStart,eStart #30-37 +) UCS2LECharLenTable = (2, 2, 2, 2, 2, 2) @@ -443,67 +499,69 @@ UCS2LESMModel = {'classTable': UCS2LE_cls, # UTF-8 -UTF8_cls = ( \ +UTF8_cls = ( 1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as a legal value - 1,1,1,1,1,1,0,0, # 08 - 0f - 1,1,1,1,1,1,1,1, # 10 - 17 - 1,1,1,0,1,1,1,1, # 18 - 1f - 1,1,1,1,1,1,1,1, # 20 - 27 - 1,1,1,1,1,1,1,1, # 28 - 2f - 1,1,1,1,1,1,1,1, # 30 - 37 - 1,1,1,1,1,1,1,1, # 38 - 3f - 1,1,1,1,1,1,1,1, # 40 - 47 - 1,1,1,1,1,1,1,1, # 48 - 4f - 1,1,1,1,1,1,1,1, # 50 - 57 - 1,1,1,1,1,1,1,1, # 58 - 5f - 1,1,1,1,1,1,1,1, # 60 - 67 - 1,1,1,1,1,1,1,1, # 68 - 6f - 1,1,1,1,1,1,1,1, # 70 - 77 - 1,1,1,1,1,1,1,1, # 78 - 7f - 2,2,2,2,3,3,3,3, # 80 - 87 - 4,4,4,4,4,4,4,4, # 88 - 8f - 4,4,4,4,4,4,4,4, # 90 - 97 - 4,4,4,4,4,4,4,4, # 98 - 9f - 5,5,5,5,5,5,5,5, # a0 - a7 - 5,5,5,5,5,5,5,5, # a8 - af - 5,5,5,5,5,5,5,5, # b0 - b7 - 5,5,5,5,5,5,5,5, # b8 - bf - 0,0,6,6,6,6,6,6, # c0 - c7 - 6,6,6,6,6,6,6,6, # c8 - cf - 6,6,6,6,6,6,6,6, # d0 - d7 - 6,6,6,6,6,6,6,6, # d8 - df - 7,8,8,8,8,8,8,8, # e0 - e7 - 8,8,8,8,8,9,8,8, # e8 - ef - 10,11,11,11,11,11,11,11, # f0 - f7 - 12,13,13,13,14,15,0,0) # f8 - ff + 1,1,1,1,1,1,0,0, # 08 - 0f + 1,1,1,1,1,1,1,1, # 10 - 17 + 1,1,1,0,1,1,1,1, # 18 - 1f + 1,1,1,1,1,1,1,1, # 20 - 27 + 1,1,1,1,1,1,1,1, # 28 - 2f + 1,1,1,1,1,1,1,1, # 30 - 37 + 1,1,1,1,1,1,1,1, # 38 - 3f + 1,1,1,1,1,1,1,1, # 40 - 47 + 1,1,1,1,1,1,1,1, # 48 - 4f + 1,1,1,1,1,1,1,1, # 50 - 57 + 1,1,1,1,1,1,1,1, # 58 - 5f + 1,1,1,1,1,1,1,1, # 60 - 67 + 1,1,1,1,1,1,1,1, # 68 - 6f + 1,1,1,1,1,1,1,1, # 70 - 77 + 1,1,1,1,1,1,1,1, # 78 - 7f + 2,2,2,2,3,3,3,3, # 80 - 87 + 4,4,4,4,4,4,4,4, # 88 - 8f + 4,4,4,4,4,4,4,4, # 90 - 97 + 4,4,4,4,4,4,4,4, # 98 - 9f + 5,5,5,5,5,5,5,5, # a0 - a7 + 5,5,5,5,5,5,5,5, # a8 - af + 5,5,5,5,5,5,5,5, # b0 - b7 + 5,5,5,5,5,5,5,5, # b8 - bf + 0,0,6,6,6,6,6,6, # c0 - c7 + 6,6,6,6,6,6,6,6, # c8 - cf + 6,6,6,6,6,6,6,6, # d0 - d7 + 6,6,6,6,6,6,6,6, # d8 - df + 7,8,8,8,8,8,8,8, # e0 - e7 + 8,8,8,8,8,9,8,8, # e8 - ef + 10,11,11,11,11,11,11,11, # f0 - f7 + 12,13,13,13,14,15,0,0 # f8 - ff +) -UTF8_st = ( \ - eError,eStart,eError,eError,eError,eError, 12, 10,#00-07 - 9, 11, 8, 7, 6, 5, 4, 3,#08-0f - eError,eError,eError,eError,eError,eError,eError,eError,#10-17 - eError,eError,eError,eError,eError,eError,eError,eError,#18-1f - eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,#20-27 - eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,#28-2f - eError,eError, 5, 5, 5, 5,eError,eError,#30-37 - eError,eError,eError,eError,eError,eError,eError,eError,#38-3f - eError,eError,eError, 5, 5, 5,eError,eError,#40-47 - eError,eError,eError,eError,eError,eError,eError,eError,#48-4f - eError,eError, 7, 7, 7, 7,eError,eError,#50-57 - eError,eError,eError,eError,eError,eError,eError,eError,#58-5f - eError,eError,eError,eError, 7, 7,eError,eError,#60-67 - eError,eError,eError,eError,eError,eError,eError,eError,#68-6f - eError,eError, 9, 9, 9, 9,eError,eError,#70-77 - eError,eError,eError,eError,eError,eError,eError,eError,#78-7f - eError,eError,eError,eError,eError, 9,eError,eError,#80-87 - eError,eError,eError,eError,eError,eError,eError,eError,#88-8f - eError,eError, 12, 12, 12, 12,eError,eError,#90-97 - eError,eError,eError,eError,eError,eError,eError,eError,#98-9f - eError,eError,eError,eError,eError, 12,eError,eError,#a0-a7 - eError,eError,eError,eError,eError,eError,eError,eError,#a8-af - eError,eError, 12, 12, 12,eError,eError,eError,#b0-b7 - eError,eError,eError,eError,eError,eError,eError,eError,#b8-bf - eError,eError,eStart,eStart,eStart,eStart,eError,eError,#c0-c7 - eError,eError,eError,eError,eError,eError,eError,eError)#c8-cf +UTF8_st = ( + eError,eStart,eError,eError,eError,eError, 12, 10,#00-07 + 9, 11, 8, 7, 6, 5, 4, 3,#08-0f + eError,eError,eError,eError,eError,eError,eError,eError,#10-17 + eError,eError,eError,eError,eError,eError,eError,eError,#18-1f + eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,#20-27 + eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,#28-2f + eError,eError, 5, 5, 5, 5,eError,eError,#30-37 + eError,eError,eError,eError,eError,eError,eError,eError,#38-3f + eError,eError,eError, 5, 5, 5,eError,eError,#40-47 + eError,eError,eError,eError,eError,eError,eError,eError,#48-4f + eError,eError, 7, 7, 7, 7,eError,eError,#50-57 + eError,eError,eError,eError,eError,eError,eError,eError,#58-5f + eError,eError,eError,eError, 7, 7,eError,eError,#60-67 + eError,eError,eError,eError,eError,eError,eError,eError,#68-6f + eError,eError, 9, 9, 9, 9,eError,eError,#70-77 + eError,eError,eError,eError,eError,eError,eError,eError,#78-7f + eError,eError,eError,eError,eError, 9,eError,eError,#80-87 + eError,eError,eError,eError,eError,eError,eError,eError,#88-8f + eError,eError, 12, 12, 12, 12,eError,eError,#90-97 + eError,eError,eError,eError,eError,eError,eError,eError,#98-9f + eError,eError,eError,eError,eError, 12,eError,eError,#a0-a7 + eError,eError,eError,eError,eError,eError,eError,eError,#a8-af + eError,eError, 12, 12, 12,eError,eError,eError,#b0-b7 + eError,eError,eError,eError,eError,eError,eError,eError,#b8-bf + eError,eError,eStart,eStart,eStart,eStart,eError,eError,#c0-c7 + eError,eError,eError,eError,eError,eError,eError,eError #c8-cf +) UTF8CharLenTable = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6) diff --git a/thirdparty/chardet/sbcharsetprober.py b/thirdparty/chardet/sbcharsetprober.py index f92fc14c8..37291bd27 100644 --- a/thirdparty/chardet/sbcharsetprober.py +++ b/thirdparty/chardet/sbcharsetprober.py @@ -14,20 +14,22 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants, sys -from charsetprober import CharSetProber +import sys +from . import constants +from .charsetprober import CharSetProber +from .compat import wrap_ord SAMPLE_SIZE = 64 SB_ENOUGH_REL_THRESHOLD = 1024 @@ -38,21 +40,26 @@ NUMBER_OF_SEQ_CAT = 4 POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1 #NEGATIVE_CAT = 0 + class SingleByteCharSetProber(CharSetProber): - def __init__(self, model, reversed=constants.False, nameProber=None): + def __init__(self, model, reversed=False, nameProber=None): CharSetProber.__init__(self) self._mModel = model - self._mReversed = reversed # TRUE if we need to reverse every pair in the model lookup - self._mNameProber = nameProber # Optional auxiliary prober for name decision + # TRUE if we need to reverse every pair in the model lookup + self._mReversed = reversed + # Optional auxiliary prober for name decision + self._mNameProber = nameProber self.reset() def reset(self): CharSetProber.reset(self) - self._mLastOrder = 255 # char order of last character + # char order of last character + self._mLastOrder = 255 self._mSeqCounters = [0] * NUMBER_OF_SEQ_CAT self._mTotalSeqs = 0 self._mTotalChar = 0 - self._mFreqChar = 0 # characters that fall in our sampling range + # characters that fall in our sampling range + self._mFreqChar = 0 def get_charset_name(self): if self._mNameProber: @@ -67,7 +74,7 @@ class SingleByteCharSetProber(CharSetProber): if not aLen: return self.get_state() for c in aBuf: - order = self._mModel['charToOrderMap'][ord(c)] + order = self._mModel['charToOrderMap'][wrap_ord(c)] if order < SYMBOL_CAT_ORDER: self._mTotalChar += 1 if order < SAMPLE_SIZE: @@ -75,9 +82,12 @@ class SingleByteCharSetProber(CharSetProber): if self._mLastOrder < SAMPLE_SIZE: self._mTotalSeqs += 1 if not self._mReversed: - self._mSeqCounters[self._mModel['precedenceMatrix'][(self._mLastOrder * SAMPLE_SIZE) + order]] += 1 - else: # reverse the order of the letters in the lookup - self._mSeqCounters[self._mModel['precedenceMatrix'][(order * SAMPLE_SIZE) + self._mLastOrder]] += 1 + i = (self._mLastOrder * SAMPLE_SIZE) + order + model = self._mModel['precedenceMatrix'][i] + else: # reverse the order of the letters in the lookup + i = (order * SAMPLE_SIZE) + self._mLastOrder + model = self._mModel['precedenceMatrix'][i] + self._mSeqCounters[model] += 1 self._mLastOrder = order if self.get_state() == constants.eDetecting: @@ -85,11 +95,16 @@ class SingleByteCharSetProber(CharSetProber): cf = self.get_confidence() if cf > POSITIVE_SHORTCUT_THRESHOLD: if constants._debug: - sys.stderr.write('%s confidence = %s, we have a winner\n' % (self._mModel['charsetName'], cf)) + sys.stderr.write('%s confidence = %s, we have a' + 'winner\n' % + (self._mModel['charsetName'], cf)) self._mState = constants.eFoundIt elif cf < NEGATIVE_SHORTCUT_THRESHOLD: if constants._debug: - sys.stderr.write('%s confidence = %s, below negative shortcut threshhold %s\n' % (self._mModel['charsetName'], cf, NEGATIVE_SHORTCUT_THRESHOLD)) + sys.stderr.write('%s confidence = %s, below negative' + 'shortcut threshhold %s\n' % + (self._mModel['charsetName'], cf, + NEGATIVE_SHORTCUT_THRESHOLD)) self._mState = constants.eNotMe return self.get_state() @@ -97,9 +112,8 @@ class SingleByteCharSetProber(CharSetProber): def get_confidence(self): r = 0.01 if self._mTotalSeqs > 0: -# print self._mSeqCounters[POSITIVE_CAT], self._mTotalSeqs, self._mModel['mTypicalPositiveRatio'] - r = (1.0 * self._mSeqCounters[POSITIVE_CAT]) / self._mTotalSeqs / self._mModel['mTypicalPositiveRatio'] -# print r, self._mFreqChar, self._mTotalChar + r = ((1.0 * self._mSeqCounters[POSITIVE_CAT]) / self._mTotalSeqs + / self._mModel['mTypicalPositiveRatio']) r = r * self._mFreqChar / self._mTotalChar if r >= 1.0: r = 0.99 diff --git a/thirdparty/chardet/sbcsgroupprober.py b/thirdparty/chardet/sbcsgroupprober.py index d19160c86..1b6196cd1 100644 --- a/thirdparty/chardet/sbcsgroupprober.py +++ b/thirdparty/chardet/sbcsgroupprober.py @@ -14,33 +14,35 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants, sys -from charsetgroupprober import CharSetGroupProber -from sbcharsetprober import SingleByteCharSetProber -from langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model -from langgreekmodel import Latin7GreekModel, Win1253GreekModel -from langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel -from langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel -from langthaimodel import TIS620ThaiModel -from langhebrewmodel import Win1255HebrewModel -from hebrewprober import HebrewProber +from .charsetgroupprober import CharSetGroupProber +from .sbcharsetprober import SingleByteCharSetProber +from .langcyrillicmodel import (Win1251CyrillicModel, Koi8rModel, + Latin5CyrillicModel, MacCyrillicModel, + Ibm866Model, Ibm855Model) +from .langgreekmodel import Latin7GreekModel, Win1253GreekModel +from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel +from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel +from .langthaimodel import TIS620ThaiModel +from .langhebrewmodel import Win1255HebrewModel +from .hebrewprober import HebrewProber + class SBCSGroupProber(CharSetGroupProber): def __init__(self): CharSetGroupProber.__init__(self) - self._mProbers = [ \ + self._mProbers = [ SingleByteCharSetProber(Win1251CyrillicModel), SingleByteCharSetProber(Koi8rModel), SingleByteCharSetProber(Latin5CyrillicModel), @@ -54,11 +56,14 @@ class SBCSGroupProber(CharSetGroupProber): SingleByteCharSetProber(Latin2HungarianModel), SingleByteCharSetProber(Win1250HungarianModel), SingleByteCharSetProber(TIS620ThaiModel), - ] + ] hebrewProber = HebrewProber() - logicalHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, constants.False, hebrewProber) - visualHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, constants.True, hebrewProber) + logicalHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, + False, hebrewProber) + visualHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, True, + hebrewProber) hebrewProber.set_model_probers(logicalHebrewProber, visualHebrewProber) - self._mProbers.extend([hebrewProber, logicalHebrewProber, visualHebrewProber]) + self._mProbers.extend([hebrewProber, logicalHebrewProber, + visualHebrewProber]) self.reset() diff --git a/thirdparty/chardet/sjisprober.py b/thirdparty/chardet/sjisprober.py index 8f69f60be..cd0e9e707 100644 --- a/thirdparty/chardet/sjisprober.py +++ b/thirdparty/chardet/sjisprober.py @@ -13,25 +13,26 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from mbcharsetprober import MultiByteCharSetProber -from codingstatemachine import CodingStateMachine -from chardistribution import SJISDistributionAnalysis -from jpcntx import SJISContextAnalysis -from mbcssm import SJISSMModel -import constants, sys -from constants import eStart, eError, eItsMe +import sys +from .mbcharsetprober import MultiByteCharSetProber +from .codingstatemachine import CodingStateMachine +from .chardistribution import SJISDistributionAnalysis +from .jpcntx import SJISContextAnalysis +from .mbcssm import SJISSMModel +from . import constants + class SJISProber(MultiByteCharSetProber): def __init__(self): @@ -46,35 +47,40 @@ class SJISProber(MultiByteCharSetProber): self._mContextAnalyzer.reset() def get_charset_name(self): - return "SHIFT_JIS" + return self._mContextAnalyzer.get_charset_name() def feed(self, aBuf): aLen = len(aBuf) - for i in xrange(0, aLen): + for i in range(0, aLen): codingState = self._mCodingSM.next_state(aBuf[i]) - if codingState == eError: + if codingState == constants.eError: if constants._debug: - sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n') + sys.stderr.write(self.get_charset_name() + + ' prober hit error at byte ' + str(i) + + '\n') self._mState = constants.eNotMe break - elif codingState == eItsMe: + elif codingState == constants.eItsMe: self._mState = constants.eFoundIt break - elif codingState == eStart: + elif codingState == constants.eStart: charLen = self._mCodingSM.get_current_charlen() if i == 0: self._mLastChar[1] = aBuf[0] - self._mContextAnalyzer.feed(self._mLastChar[2 - charLen :], charLen) + self._mContextAnalyzer.feed(self._mLastChar[2 - charLen:], + charLen) self._mDistributionAnalyzer.feed(self._mLastChar, charLen) else: - self._mContextAnalyzer.feed(aBuf[i + 1 - charLen : i + 3 - charLen], charLen) - self._mDistributionAnalyzer.feed(aBuf[i - 1 : i + 1], charLen) + self._mContextAnalyzer.feed(aBuf[i + 1 - charLen:i + 3 + - charLen], charLen) + self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1], + charLen) self._mLastChar[0] = aBuf[aLen - 1] if self.get_state() == constants.eDetecting: - if self._mContextAnalyzer.got_enough_data() and \ - (self.get_confidence() > constants.SHORTCUT_THRESHOLD): + if (self._mContextAnalyzer.got_enough_data() and + (self.get_confidence() > constants.SHORTCUT_THRESHOLD)): self._mState = constants.eFoundIt return self.get_state() diff --git a/thirdparty/chardet/test.py b/thirdparty/chardet/test.py deleted file mode 100644 index 2ebf3a4dc..000000000 --- a/thirdparty/chardet/test.py +++ /dev/null @@ -1,20 +0,0 @@ -import sys, glob -sys.path.insert(0, '..') -from chardet.universaldetector import UniversalDetector - -count = 0 -u = UniversalDetector() -for f in glob.glob(sys.argv[1]): - print f.ljust(60), - u.reset() - for line in file(f, 'rb'): - u.feed(line) - if u.done: break - u.close() - result = u.result - if result['encoding']: - print result['encoding'], 'with confidence', result['confidence'] - else: - print '******** no result' - count += 1 -print count, 'tests' diff --git a/thirdparty/chardet/universaldetector.py b/thirdparty/chardet/universaldetector.py index a08425f87..476522b99 100644 --- a/thirdparty/chardet/universaldetector.py +++ b/thirdparty/chardet/universaldetector.py @@ -14,23 +14,25 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants, sys -from latin1prober import Latin1Prober # windows-1252 -from mbcsgroupprober import MBCSGroupProber # multi-byte character sets -from sbcsgroupprober import SBCSGroupProber # single-byte character sets -from escprober import EscCharSetProber # ISO-2122, etc. +from . import constants +import sys +import codecs +from .latin1prober import Latin1Prober # windows-1252 +from .mbcsgroupprober import MBCSGroupProber # multi-byte character sets +from .sbcsgroupprober import SBCSGroupProber # single-byte character sets +from .escprober import EscCharSetProber # ISO-2122, etc. import re MINIMUM_THRESHOLD = 0.20 @@ -38,68 +40,78 @@ ePureAscii = 0 eEscAscii = 1 eHighbyte = 2 + class UniversalDetector: def __init__(self): - self._highBitDetector = re.compile(r'[\x80-\xFF]') - self._escDetector = re.compile(r'(\033|~{)') + self._highBitDetector = re.compile(b'[\x80-\xFF]') + self._escDetector = re.compile(b'(\033|~{)') self._mEscCharSetProber = None self._mCharSetProbers = [] self.reset() def reset(self): self.result = {'encoding': None, 'confidence': 0.0} - self.done = constants.False - self._mStart = constants.True - self._mGotData = constants.False + self.done = False + self._mStart = True + self._mGotData = False self._mInputState = ePureAscii - self._mLastChar = '' + self._mLastChar = b'' if self._mEscCharSetProber: self._mEscCharSetProber.reset() for prober in self._mCharSetProbers: prober.reset() def feed(self, aBuf): - if self.done: return + if self.done: + return aLen = len(aBuf) - if not aLen: return + if not aLen: + return if not self._mGotData: # If the data starts with BOM, we know it is UTF - if aBuf[:3] == '\xEF\xBB\xBF': + if aBuf[:3] == codecs.BOM_UTF8: # EF BB BF UTF-8 with BOM - self.result = {'encoding': "UTF-8", 'confidence': 1.0} - elif aBuf[:4] == '\xFF\xFE\x00\x00': + self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0} + elif aBuf[:4] == codecs.BOM_UTF32_LE: # FF FE 00 00 UTF-32, little-endian BOM self.result = {'encoding': "UTF-32LE", 'confidence': 1.0} - elif aBuf[:4] == '\x00\x00\xFE\xFF': + elif aBuf[:4] == codecs.BOM_UTF32_BE: # 00 00 FE FF UTF-32, big-endian BOM self.result = {'encoding': "UTF-32BE", 'confidence': 1.0} - elif aBuf[:4] == '\xFE\xFF\x00\x00': + elif aBuf[:4] == b'\xFE\xFF\x00\x00': # FE FF 00 00 UCS-4, unusual octet order BOM (3412) - self.result = {'encoding': "X-ISO-10646-UCS-4-3412", 'confidence': 1.0} - elif aBuf[:4] == '\x00\x00\xFF\xFE': + self.result = { + 'encoding': "X-ISO-10646-UCS-4-3412", + 'confidence': 1.0 + } + elif aBuf[:4] == b'\x00\x00\xFF\xFE': # 00 00 FF FE UCS-4, unusual octet order BOM (2143) - self.result = {'encoding': "X-ISO-10646-UCS-4-2143", 'confidence': 1.0} - elif aBuf[:2] == '\xFF\xFE': + self.result = { + 'encoding': "X-ISO-10646-UCS-4-2143", + 'confidence': 1.0 + } + elif aBuf[:2] == codecs.BOM_LE: # FF FE UTF-16, little endian BOM self.result = {'encoding': "UTF-16LE", 'confidence': 1.0} - elif aBuf[:2] == '\xFE\xFF': + elif aBuf[:2] == codecs.BOM_BE: # FE FF UTF-16, big endian BOM self.result = {'encoding': "UTF-16BE", 'confidence': 1.0} - self._mGotData = constants.True + self._mGotData = True if self.result['encoding'] and (self.result['confidence'] > 0.0): - self.done = constants.True + self.done = True return if self._mInputState == ePureAscii: if self._highBitDetector.search(aBuf): self._mInputState = eHighbyte - elif (self._mInputState == ePureAscii) and self._escDetector.search(self._mLastChar + aBuf): + elif ((self._mInputState == ePureAscii) and + self._escDetector.search(self._mLastChar + aBuf)): self._mInputState = eEscAscii - self._mLastChar = aBuf[-1] + self._mLastChar = aBuf[-1:] if self._mInputState == eEscAscii: if not self._mEscCharSetProber: @@ -107,24 +119,26 @@ class UniversalDetector: if self._mEscCharSetProber.feed(aBuf) == constants.eFoundIt: self.result = {'encoding': self._mEscCharSetProber.get_charset_name(), 'confidence': self._mEscCharSetProber.get_confidence()} - self.done = constants.True + self.done = True elif self._mInputState == eHighbyte: if not self._mCharSetProbers: - self._mCharSetProbers = [MBCSGroupProber(), SBCSGroupProber(), Latin1Prober()] + self._mCharSetProbers = [MBCSGroupProber(), SBCSGroupProber(), + Latin1Prober()] for prober in self._mCharSetProbers: if prober.feed(aBuf) == constants.eFoundIt: self.result = {'encoding': prober.get_charset_name(), 'confidence': prober.get_confidence()} - self.done = constants.True + self.done = True break def close(self): - if self.done: return + if self.done: + return if not self._mGotData: if constants._debug: sys.stderr.write('no data received!\n') return - self.done = constants.True + self.done = True if self._mInputState == ePureAscii: self.result = {'encoding': 'ascii', 'confidence': 1.0} @@ -135,7 +149,8 @@ class UniversalDetector: maxProberConfidence = 0.0 maxProber = None for prober in self._mCharSetProbers: - if not prober: continue + if not prober: + continue proberConfidence = prober.get_confidence() if proberConfidence > maxProberConfidence: maxProberConfidence = proberConfidence @@ -148,7 +163,8 @@ class UniversalDetector: if constants._debug: sys.stderr.write('no probers hit minimum threshhold\n') for prober in self._mCharSetProbers[0].mProbers: - if not prober: continue - sys.stderr.write('%s confidence = %s\n' % \ - (prober.get_charset_name(), \ + if not prober: + continue + sys.stderr.write('%s confidence = %s\n' % + (prober.get_charset_name(), prober.get_confidence())) diff --git a/thirdparty/chardet/utf8prober.py b/thirdparty/chardet/utf8prober.py index fec8548c8..1c0bb5d8f 100644 --- a/thirdparty/chardet/utf8prober.py +++ b/thirdparty/chardet/utf8prober.py @@ -13,26 +13,26 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants, sys -from constants import eStart, eError, eItsMe -from charsetprober import CharSetProber -from codingstatemachine import CodingStateMachine -from mbcssm import UTF8SMModel +from . import constants +from .charsetprober import CharSetProber +from .codingstatemachine import CodingStateMachine +from .mbcssm import UTF8SMModel ONE_CHAR_PROB = 0.5 + class UTF8Prober(CharSetProber): def __init__(self): CharSetProber.__init__(self) @@ -50,13 +50,13 @@ class UTF8Prober(CharSetProber): def feed(self, aBuf): for c in aBuf: codingState = self._mCodingSM.next_state(c) - if codingState == eError: + if codingState == constants.eError: self._mState = constants.eNotMe break - elif codingState == eItsMe: + elif codingState == constants.eItsMe: self._mState = constants.eFoundIt break - elif codingState == eStart: + elif codingState == constants.eStart: if self._mCodingSM.get_current_charlen() >= 2: self._mNumOfMBChar += 1 @@ -69,7 +69,7 @@ class UTF8Prober(CharSetProber): def get_confidence(self): unlike = 0.99 if self._mNumOfMBChar < 6: - for i in xrange(0, self._mNumOfMBChar): + for i in range(0, self._mNumOfMBChar): unlike = unlike * ONE_CHAR_PROB return 1.0 - unlike else: