From eea96c5b8df04dc8570aabc84783d9a2a49c59b5 Mon Sep 17 00:00:00 2001
From: Bernardo Damele
Date: Thu, 5 May 2011 08:50:18 +0000
Subject: [PATCH] code cleanup

---
 extra/chardet/chardistribution.py   | 4 ++--
 extra/chardet/charsetgroupprober.py | 2 +-
 extra/chardet/charsetprober.py      | 8 ++++----
 extra/chardet/escprober.py          | 2 +-
 extra/chardet/eucjpprober.py        | 6 +++---
 extra/chardet/hebrewprober.py       | 6 +++---
 extra/chardet/jpcntx.py             | 8 ++++----
 extra/chardet/latin1prober.py       | 2 +-
 extra/chardet/mbcharsetprober.py    | 4 ++--
 extra/chardet/sbcharsetprober.py    | 2 +-
 extra/chardet/sjisprober.py         | 6 +++---
 extra/chardet/universaldetector.py  | 4 ++--
 12 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/extra/chardet/chardistribution.py b/extra/chardet/chardistribution.py
index b89334184..1f95fc848 100755
--- a/extra/chardet/chardistribution.py
+++ b/extra/chardet/chardistribution.py
@@ -42,7 +42,7 @@ class CharDistributionAnalysis:
         self._mTableSize = None # Size of above table
         self._mTypicalDistributionRatio = None # This is a constant value which varies from language to language, used in calculating confidence. See http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html for further detail.
         self.reset()
-        
+
     def reset(self):
         """reset analyser, clear any state"""
         self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made
@@ -87,7 +87,7 @@ class CharDistributionAnalysis:
         # convert this encoding string to a number, here called order.
         # This allows multiple encodings of a language to share one frequency table.
         return -1
-        
+
 class EUCTWDistributionAnalysis(CharDistributionAnalysis):
     def __init__(self):
         CharDistributionAnalysis.__init__(self)
diff --git a/extra/chardet/charsetgroupprober.py b/extra/chardet/charsetgroupprober.py
index 518806949..9037af480 100755
--- a/extra/chardet/charsetgroupprober.py
+++ b/extra/chardet/charsetgroupprober.py
@@ -34,7 +34,7 @@ class CharSetGroupProber(CharSetProber):
         self._mActiveNum = 0
         self._mProbers = []
         self._mBestGuessProber = None
-        
+
     def reset(self):
         CharSetProber.reset(self)
         self._mActiveNum = 0
diff --git a/extra/chardet/charsetprober.py b/extra/chardet/charsetprober.py
index 3ac1683c7..6ad198cd4 100755
--- a/extra/chardet/charsetprober.py
+++ b/extra/chardet/charsetprober.py
@@ -31,10 +31,10 @@ import constants, re
 class CharSetProber:
     def __init__(self):
         pass
-        
+
     def reset(self):
         self._mState = constants.eDetecting
-        
+
     def get_charset_name(self):
         return None
 
@@ -50,11 +50,11 @@ class CharSetProber:
     def filter_high_bit_only(self, aBuf):
         aBuf = re.sub(r'([\x00-\x7F])+', ' ', aBuf)
         return aBuf
-        
+
     def filter_without_english_letters(self, aBuf):
         aBuf = re.sub(r'([A-Za-z])+', ' ', aBuf)
         return aBuf
-        
+
     def filter_with_english_letters(self, aBuf):
         # TODO
         return aBuf
diff --git a/extra/chardet/escprober.py b/extra/chardet/escprober.py
index 572ed7be3..c2e979e7b 100755
--- a/extra/chardet/escprober.py
+++ b/extra/chardet/escprober.py
@@ -75,5 +75,5 @@ class EscCharSetProber(CharSetProber):
                     self._mState = constants.eFoundIt
                     self._mDetectedCharset = codingSM.get_coding_state_machine()
                     return self.get_state()
-        
+
         return self.get_state()
diff --git a/extra/chardet/eucjpprober.py b/extra/chardet/eucjpprober.py
index 46a8b38b7..1c20e8034 100755
--- a/extra/chardet/eucjpprober.py
+++ b/extra/chardet/eucjpprober.py
@@ -44,7 +44,7 @@ class EUCJPProber(MultiByteCharSetProber):
     def reset(self):
         MultiByteCharSetProber.reset(self)
         self._mContextAnalyzer.reset()
-        
+
     def get_charset_name(self):
         return "EUC-JP"
 
@@ -69,9 +69,9 @@ class EUCJPProber(MultiByteCharSetProber):
                 else:
                     self._mContextAnalyzer.feed(aBuf[i-1:i+1], charLen)
                     self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen)
-        
+
         self._mLastChar[0] = aBuf[aLen - 1]
-        
+
         if self.get_state() == constants.eDetecting:
             if self._mContextAnalyzer.got_enough_data() and \
                (self.get_confidence() > constants.SHORTCUT_THRESHOLD):
diff --git a/extra/chardet/hebrewprober.py b/extra/chardet/hebrewprober.py
index a2b1eaa99..442c0bf2b 100755
--- a/extra/chardet/hebrewprober.py
+++ b/extra/chardet/hebrewprober.py
@@ -164,7 +164,7 @@ class HebrewProber(CharSetProber):
         self._mPrev = ' '
         self._mBeforePrev = ' '
         # These probers are owned by the group prober.
-        
+
     def set_model_probers(self, logicalProber, visualProber):
         self._mLogicalProber = logicalProber
         self._mVisualProber = visualProber
@@ -184,7 +184,7 @@ class HebrewProber(CharSetProber):
         # these letters as Non-Final letters outweighs the damage since these words
        # are quite rare.
         return c in [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
-        
+
     def feed(self, aBuf):
         # Final letter analysis for logical-visual decision.
         # Look for evidence that the received buffer is either logical Hebrew or
@@ -215,7 +215,7 @@ class HebrewProber(CharSetProber):
             return constants.eNotMe
 
         aBuf = self.filter_high_bit_only(aBuf)
-        
+
         for cur in aBuf:
             if cur == ' ':
                 # We stand on a space - a word just ended
diff --git a/extra/chardet/jpcntx.py b/extra/chardet/jpcntx.py
index 93db4a9cb..06d396e5b 100755
--- a/extra/chardet/jpcntx.py
+++ b/extra/chardet/jpcntx.py
@@ -123,7 +123,7 @@ jp2CharContext = ( \
 class JapaneseContextAnalysis:
     def __init__(self):
         self.reset()
-        
+
     def reset(self):
         self._mTotalRel = 0 # total sequence received
         self._mRelSample = [0] * NUM_OF_CATEGORY # category counters, each interger counts sequence in its category
@@ -133,7 +133,7 @@ class JapaneseContextAnalysis:
 
     def feed(self, aBuf, aLen):
         if self._mDone: return
-        
+
         # The buffer we got is byte oriented, and a character may span in more than one
         # buffers. In case the last one or two byte in last buffer is not complete, we
         # record how many byte needed to complete that character and skip these bytes here.
@@ -158,7 +158,7 @@ class JapaneseContextAnalysis:
 
     def got_enough_data(self):
         return self._mTotalRel > ENOUGH_REL_THRESHOLD
-        
+
     def get_confidence(self):
         # This is just one way to calculate confidence. It works well for me.
         if self._mTotalRel > MINIMUM_DATA_THRESHOLD:
@@ -168,7 +168,7 @@ class JapaneseContextAnalysis:
 
     def get_order(self, aStr):
         return -1, 1
-        
+
 class SJISContextAnalysis(JapaneseContextAnalysis):
     def get_order(self, aStr):
         if not aStr: return -1, 1
diff --git a/extra/chardet/latin1prober.py b/extra/chardet/latin1prober.py
index b46129ba8..ae4527c75 100755
--- a/extra/chardet/latin1prober.py
+++ b/extra/chardet/latin1prober.py
@@ -122,7 +122,7 @@ class Latin1Prober(CharSetProber):
     def get_confidence(self):
         if self.get_state() == constants.eNotMe:
             return 0.01
-        
+
         total = reduce(operator.add, self._mFreqCounter)
         if total < 0.01:
             confidence = 0.0
diff --git a/extra/chardet/mbcharsetprober.py b/extra/chardet/mbcharsetprober.py
index a8131445a..4c0f928a4 100755
--- a/extra/chardet/mbcharsetprober.py
+++ b/extra/chardet/mbcharsetprober.py
@@ -68,9 +68,9 @@ class MultiByteCharSetProber(CharSetProber):
                     self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
                 else:
                     self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen)
-        
+
         self._mLastChar[0] = aBuf[aLen - 1]
-        
+
         if self.get_state() == constants.eDetecting:
             if self._mDistributionAnalyzer.got_enough_data() and \
                (self.get_confidence() > constants.SHORTCUT_THRESHOLD):
diff --git a/extra/chardet/sbcharsetprober.py b/extra/chardet/sbcharsetprober.py
index da0711632..f92fc14c8 100755
--- a/extra/chardet/sbcharsetprober.py
+++ b/extra/chardet/sbcharsetprober.py
@@ -37,7 +37,7 @@ SYMBOL_CAT_ORDER = 250
 NUMBER_OF_SEQ_CAT = 4
 POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1
 #NEGATIVE_CAT = 0
-    
+
 class SingleByteCharSetProber(CharSetProber):
     def __init__(self, model, reversed=constants.False, nameProber=None):
         CharSetProber.__init__(self)
diff --git a/extra/chardet/sjisprober.py b/extra/chardet/sjisprober.py
index fea2690c1..75d970525 100755
--- a/extra/chardet/sjisprober.py
+++ b/extra/chardet/sjisprober.py
@@ -44,7 +44,7 @@ class SJISProber(MultiByteCharSetProber):
     def reset(self):
         MultiByteCharSetProber.reset(self)
         self._mContextAnalyzer.reset()
-        
+
     def get_charset_name(self):
         return "SHIFT_JIS"
 
@@ -69,9 +69,9 @@ class SJISProber(MultiByteCharSetProber):
                 else:
                     self._mContextAnalyzer.feed(aBuf[i + 1 - charLen : i + 3 - charLen], charLen)
                     self._mDistributionAnalyzer.feed(aBuf[i - 1 : i + 1], charLen)
-        
+
         self._mLastChar[0] = aBuf[aLen - 1]
-        
+
         if self.get_state() == constants.eDetecting:
             if self._mContextAnalyzer.got_enough_data() and \
                (self.get_confidence() > constants.SHORTCUT_THRESHOLD):
diff --git a/extra/chardet/universaldetector.py b/extra/chardet/universaldetector.py
index 809df2276..a08425f87 100755
--- a/extra/chardet/universaldetector.py
+++ b/extra/chardet/universaldetector.py
@@ -63,7 +63,7 @@ class UniversalDetector:
         aLen = len(aBuf)
         if not aLen:
             return
-        
+
         if not self._mGotData:
             # If the data starts with BOM, we know it is UTF
             if aBuf[:3] == '\xEF\xBB\xBF':
@@ -125,7 +125,7 @@ class UniversalDetector:
             sys.stderr.write('no data received!\n')
             return
         self.done = constants.True
-        
+
         if self._mInputState == ePureAscii:
             self.result = {'encoding': 'ascii', 'confidence': 1.0}
             return self.result