Fine-tune the dynamic content removal engine

This commit is contained in:
Miroslav Stampar 2010-12-04 13:39:35 +00:00
parent eeb199375b
commit 1f795622b3
2 changed files with 4 additions and 3 deletions

View File

@ -490,17 +490,19 @@ def checkDynamicContent(firstPage, secondPage):
blocks = SequenceMatcher(None, firstPage, secondPage).get_matching_blocks() blocks = SequenceMatcher(None, firstPage, secondPage).get_matching_blocks()
kb.dynamicMarkings = [] kb.dynamicMarkings = []
# Remove matching blocks that are too small
i = 0 i = 0
while i < len(blocks): while i < len(blocks):
block = blocks[i] block = blocks[i]
(_, _, length) = block (_, _, length) = block
if length <= conf.minMatchBlock: if length <= conf.dynMarkLength:
blocks.remove(block) blocks.remove(block)
else: else:
i += 1 i += 1
# Build the dynamic markings using the prefix/suffix principle
if len(blocks) > 0: if len(blocks) > 0:
blocks.insert(0, None) blocks.insert(0, None)
blocks.append(None) blocks.append(None)
@ -518,7 +520,7 @@ def checkDynamicContent(firstPage, secondPage):
prefix = trimAlphaNum(prefix) prefix = trimAlphaNum(prefix)
suffix = trimAlphaNum(suffix) suffix = trimAlphaNum(suffix)
kb.dynamicMarkings.append((re.escape(prefix[-conf.dynMarkLength:]) if prefix else None, re.escape(suffix[:conf.dynMarkLength]) if suffix else None)) kb.dynamicMarkings.append((re.escape(prefix[-conf.dynMarkLength/2:]) if prefix else None, re.escape(suffix[:conf.dynMarkLength/2]) if suffix else None))
if len(kb.dynamicMarkings) > 0: if len(kb.dynamicMarkings) > 0:
infoMsg = "dynamic content marked for removal (%d region%s)" % (len(kb.dynamicMarkings), 's' if len(kb.dynamicMarkings) > 1 else '') infoMsg = "dynamic content marked for removal (%d region%s)" % (len(kb.dynamicMarkings), 's' if len(kb.dynamicMarkings) > 1 else '')

View File

@ -1076,7 +1076,6 @@ def __setConfAttributes():
conf.dbmsConnector = None conf.dbmsConnector = None
conf.dbmsHandler = None conf.dbmsHandler = None
conf.dumpPath = None conf.dumpPath = None
conf.minMatchBlock = 8
conf.dynMarkLength = 32 conf.dynMarkLength = 32
conf.httpHeaders = [] conf.httpHeaders = []
conf.hostname = None conf.hostname = None