code cleanup

This commit is contained in:
Bernardo Damele 2011-05-05 08:50:18 +00:00
parent b12aa8a56f
commit eea96c5b8d
12 changed files with 27 additions and 27 deletions

View File

@ -42,7 +42,7 @@ class CharDistributionAnalysis:
self._mTableSize = None # Size of above table
self._mTypicalDistributionRatio = None # This is a constant value which varies from language to language, used in calculating confidence. See http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html for further detail.
self.reset()
def reset(self):
"""reset analyser, clear any state"""
self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made
@ -87,7 +87,7 @@ class CharDistributionAnalysis:
# convert this encoding string to a number, here called order.
# This allows multiple encodings of a language to share one frequency table.
return -1
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
CharDistributionAnalysis.__init__(self)

View File

@ -34,7 +34,7 @@ class CharSetGroupProber(CharSetProber):
self._mActiveNum = 0
self._mProbers = []
self._mBestGuessProber = None
def reset(self):
CharSetProber.reset(self)
self._mActiveNum = 0

View File

@ -31,10 +31,10 @@ import constants, re
class CharSetProber:
def __init__(self):
    # The base prober keeps no construction-time state of its own;
    # concrete subclasses initialise their members and call reset().
    pass
def reset(self):
    # Return the prober to its initial state: detection is still in
    # progress and no conclusion has been reached yet.
    self._mState = constants.eDetecting
def get_charset_name(self):
    # The abstract base cannot name a charset; concrete probers
    # override this to return their encoding name.
    return None
@ -50,11 +50,11 @@ class CharSetProber:
def filter_high_bit_only(self, aBuf):
    """Collapse every run of 7-bit (ASCII, \\x00-\\x7F) characters into
    a single space, keeping only the high-bit characters that matter
    for multi-byte charset analysis.
    """
    return re.sub(r'([\x00-\x7F])+', ' ', aBuf)
def filter_without_english_letters(self, aBuf):
    """Collapse every run of English letters (A-Z, a-z) into a single
    space; digits, punctuation and high-bit characters pass through.
    """
    return re.sub(r'([A-Za-z])+', ' ', aBuf)
def filter_with_english_letters(self, aBuf):
    # TODO: not implemented yet -- currently a pass-through that
    # returns the buffer unchanged.
    return aBuf

View File

@ -75,5 +75,5 @@ class EscCharSetProber(CharSetProber):
self._mState = constants.eFoundIt
self._mDetectedCharset = codingSM.get_coding_state_machine()
return self.get_state()
return self.get_state()

View File

@ -44,7 +44,7 @@ class EUCJPProber(MultiByteCharSetProber):
def reset(self):
    # Reset the shared multi-byte prober state first, then the
    # EUC-JP-specific context analyser.
    MultiByteCharSetProber.reset(self)
    self._mContextAnalyzer.reset()
def get_charset_name(self):
    """Return the canonical name of the charset this prober detects."""
    return "EUC-JP"
@ -69,9 +69,9 @@ class EUCJPProber(MultiByteCharSetProber):
else:
self._mContextAnalyzer.feed(aBuf[i-1:i+1], charLen)
self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen)
self._mLastChar[0] = aBuf[aLen - 1]
if self.get_state() == constants.eDetecting:
if self._mContextAnalyzer.got_enough_data() and \
(self.get_confidence() > constants.SHORTCUT_THRESHOLD):

View File

@ -164,7 +164,7 @@ class HebrewProber(CharSetProber):
self._mPrev = ' '
self._mBeforePrev = ' '
# These probers are owned by the group prober.
def set_model_probers(self, logicalProber, visualProber):
    # Store references to the logical-Hebrew and visual-Hebrew probers.
    # NOTE: these probers are owned by the group prober, not by this
    # object -- this method only keeps handles to them.
    self._mLogicalProber = logicalProber
    self._mVisualProber = visualProber
@ -184,7 +184,7 @@ class HebrewProber(CharSetProber):
# these letters as Non-Final letters outweighs the damage since these words
# are quite rare.
return c in [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
def feed(self, aBuf):
# Final letter analysis for logical-visual decision.
# Look for evidence that the received buffer is either logical Hebrew or
@ -215,7 +215,7 @@ class HebrewProber(CharSetProber):
return constants.eNotMe
aBuf = self.filter_high_bit_only(aBuf)
for cur in aBuf:
if cur == ' ':
# We stand on a space - a word just ended

View File

@ -123,7 +123,7 @@ jp2CharContext = ( \
class JapaneseContextAnalysis:
def __init__(self):
    # All mutable state is initialised in reset() so that construction
    # and re-use share a single code path.
    self.reset()
def reset(self):
self._mTotalRel = 0 # total sequence received
self._mRelSample = [0] * NUM_OF_CATEGORY # category counters, each interger counts sequence in its category
@ -133,7 +133,7 @@ class JapaneseContextAnalysis:
def feed(self, aBuf, aLen):
if self._mDone: return
# The buffer we got is byte oriented, and a character may span in more than one
# buffers. In case the last one or two byte in last buffer is not complete, we
# record how many byte needed to complete that character and skip these bytes here.
@ -158,7 +158,7 @@ class JapaneseContextAnalysis:
def got_enough_data(self):
    # True once the number of relevant sequences seen exceeds the
    # module-level threshold, i.e. there is enough evidence to draw
    # a conclusion.
    return self._mTotalRel > ENOUGH_REL_THRESHOLD
def get_confidence(self):
# This is just one way to calculate confidence. It works well for me.
if self._mTotalRel > MINIMUM_DATA_THRESHOLD:
@ -168,7 +168,7 @@ class JapaneseContextAnalysis:
def get_order(self, aStr):
    """Base-class stub: report no frequency-table order for any input.

    Always returns the pair (-1, 1); -1 marks "not in the table"
    (see the comment in CharDistributionAnalysis.get_order), and the
    second element is a default character length. Subclasses override
    this with encoding-specific logic.
    """
    return (-1, 1)
class SJISContextAnalysis(JapaneseContextAnalysis):
def get_order(self, aStr):
if not aStr: return -1, 1

View File

@ -122,7 +122,7 @@ class Latin1Prober(CharSetProber):
def get_confidence(self):
if self.get_state() == constants.eNotMe:
return 0.01
total = reduce(operator.add, self._mFreqCounter)
if total < 0.01:
confidence = 0.0

View File

@ -68,9 +68,9 @@ class MultiByteCharSetProber(CharSetProber):
self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
else:
self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen)
self._mLastChar[0] = aBuf[aLen - 1]
if self.get_state() == constants.eDetecting:
if self._mDistributionAnalyzer.got_enough_data() and \
(self.get_confidence() > constants.SHORTCUT_THRESHOLD):

View File

@ -37,7 +37,7 @@ SYMBOL_CAT_ORDER = 250
NUMBER_OF_SEQ_CAT = 4
POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1
#NEGATIVE_CAT = 0
class SingleByteCharSetProber(CharSetProber):
def __init__(self, model, reversed=constants.False, nameProber=None):
CharSetProber.__init__(self)

View File

@ -44,7 +44,7 @@ class SJISProber(MultiByteCharSetProber):
def reset(self):
    # Reset the shared multi-byte prober state first, then the
    # SJIS-specific context analyser.
    MultiByteCharSetProber.reset(self)
    self._mContextAnalyzer.reset()
def get_charset_name(self):
    """Return the canonical name of the charset this prober detects."""
    return "SHIFT_JIS"
@ -69,9 +69,9 @@ class SJISProber(MultiByteCharSetProber):
else:
self._mContextAnalyzer.feed(aBuf[i + 1 - charLen : i + 3 - charLen], charLen)
self._mDistributionAnalyzer.feed(aBuf[i - 1 : i + 1], charLen)
self._mLastChar[0] = aBuf[aLen - 1]
if self.get_state() == constants.eDetecting:
if self._mContextAnalyzer.got_enough_data() and \
(self.get_confidence() > constants.SHORTCUT_THRESHOLD):

View File

@ -63,7 +63,7 @@ class UniversalDetector:
aLen = len(aBuf)
if not aLen: return
if not self._mGotData:
# If the data starts with BOM, we know it is UTF
if aBuf[:3] == '\xEF\xBB\xBF':
@ -125,7 +125,7 @@ class UniversalDetector:
sys.stderr.write('no data received!\n')
return
self.done = constants.True
if self._mInputState == ePureAscii:
self.result = {'encoding': 'ascii', 'confidence': 1.0}
return self.result