Improving comparison engine (removing shared prelude part to further sharpen detection of whether pages are identical - especially noticeable in small test pages)

This commit is contained in:
Miroslav Stampar 2012-11-13 15:22:59 +01:00
parent 6f7f9dd8eb
commit 5b3fe25211

View File

@ -22,6 +22,7 @@ from lib.core.settings import DIFF_TOLERANCE
from lib.core.settings import HTML_TITLE_REGEX from lib.core.settings import HTML_TITLE_REGEX
from lib.core.settings import MIN_RATIO from lib.core.settings import MIN_RATIO
from lib.core.settings import MAX_RATIO from lib.core.settings import MAX_RATIO
from lib.core.settings import REFLECTED_VALUE_MARKER
from lib.core.settings import LOWER_RATIO_BOUND from lib.core.settings import LOWER_RATIO_BOUND
from lib.core.settings import UPPER_RATIO_BOUND from lib.core.settings import UPPER_RATIO_BOUND
from lib.core.threads import getCurrentThreadData from lib.core.threads import getCurrentThreadData
@ -114,16 +115,26 @@ def _comparison(page, headers, code, getRatioValue, pageLength):
seq1 = getFilteredPageContent(seqMatcher.a, True) if conf.textOnly else seqMatcher.a seq1 = getFilteredPageContent(seqMatcher.a, True) if conf.textOnly else seqMatcher.a
seq2 = getFilteredPageContent(page, True) if conf.textOnly else page seq2 = getFilteredPageContent(page, True) if conf.textOnly else page
if seq1 is not None:
seqMatcher.set_seq1(seq1)
if seq2 is not None:
seqMatcher.set_seq2(seq2)
if seq1 is None or seq2 is None: if seq1 is None or seq2 is None:
return None return None
else:
ratio = round(seqMatcher.quick_ratio(), 3) seq1 = seq1.replace(REFLECTED_VALUE_MARKER, "")
seq2 = seq2.replace(REFLECTED_VALUE_MARKER, "")
count = 0
while count < min(len(seq1), len(seq2)):
if seq1[count] == seq2[count]:
count += 1
else:
break
if count:
seq1 = seq1[count:]
seq2 = seq2[count:]
seqMatcher.set_seq1(seq1)
seqMatcher.set_seq2(seq2)
ratio = round(seqMatcher.quick_ratio(), 3)
# If the url is stable and we did not set yet the match ratio and the # If the url is stable and we did not set yet the match ratio and the
# current injected value changes the url page content # current injected value changes the url page content