From 7072b395c9e0705338c857e7b1ef7087cdb7b928 Mon Sep 17 00:00:00 2001 From: greg Date: Tue, 16 Jan 2018 15:46:13 -0500 Subject: [PATCH] Add greedy matcher tests --- spacy/tests/regression/test_issue1855.py | 63 ++++++++++++++++++++++++ spacy/tests/test_matcher_greedy.py | 63 ++++++++++++++++++++++++ 2 files changed, 126 insertions(+) create mode 100644 spacy/tests/regression/test_issue1855.py create mode 100644 spacy/tests/test_matcher_greedy.py diff --git a/spacy/tests/regression/test_issue1855.py b/spacy/tests/regression/test_issue1855.py new file mode 100644 index 000000000..882c356ca --- /dev/null +++ b/spacy/tests/regression/test_issue1855.py @@ -0,0 +1,63 @@ +# coding: utf-8 +from __future__ import unicode_literals +import re + +from ...matcher import Matcher + +import pytest + +pattern1 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'*'}] +pattern2 = [{'ORTH':'A','OP':'*'},{'ORTH':'A','OP':'1'}] +pattern3 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'1'}] +pattern4 = [{'ORTH':'B','OP':'1'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}] +pattern5 = [{'ORTH':'B','OP':'*'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}] + +re_pattern1 = 'AA*' +re_pattern2 = 'A*A' +re_pattern3 = 'AA' +re_pattern4 = 'BA*B' +re_pattern5 = 'B*A*B' + +@pytest.fixture +def text(): + return "(ABBAAAAAB)." 
+ +@pytest.fixture +def doc(en_tokenizer,text): + doc = en_tokenizer(' '.join(text)) + return doc + +@pytest.mark.parametrize('pattern,re_pattern',[ + (pattern1,re_pattern1), + (pattern2,re_pattern2), + (pattern3,re_pattern3), + (pattern4,re_pattern4), + (pattern5,re_pattern5)]) +def test_greedy_matching(doc,text,pattern,re_pattern): + """ + Test that the greedy matching behavior of the * op + is consistent with other re implementations + """ + matcher = Matcher(doc.vocab) + matcher.add(re_pattern,None,pattern) + matches = matcher(doc) + re_matches = [m.span() for m in re.finditer(re_pattern,text)] + for match,re_match in zip(matches,re_matches): + assert match[1:]==re_match + +@pytest.mark.parametrize('pattern,re_pattern',[ + (pattern1,re_pattern1), + (pattern2,re_pattern2), + (pattern3,re_pattern3), + (pattern4,re_pattern4), + (pattern5,re_pattern5)]) +def test_match_consuming(doc,text,pattern,re_pattern): + """ + Test that matcher.__call__ consumes tokens on a match + similar to re.findall + """ + matcher = Matcher(doc.vocab) + matcher.add(re_pattern,None,pattern) + matches = matcher(doc) + re_matches = [m.span() for m in re.finditer(re_pattern,text)] + assert len(matches)==len(re_matches) \ No newline at end of file diff --git a/spacy/tests/test_matcher_greedy.py b/spacy/tests/test_matcher_greedy.py new file mode 100644 index 000000000..882c356ca --- /dev/null +++ b/spacy/tests/test_matcher_greedy.py @@ -0,0 +1,63 @@ +# coding: utf-8 +from __future__ import unicode_literals +import re + +from ..matcher import Matcher + +import pytest + +pattern1 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'*'}] +pattern2 = [{'ORTH':'A','OP':'*'},{'ORTH':'A','OP':'1'}] +pattern3 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'1'}] +pattern4 = [{'ORTH':'B','OP':'1'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}] +pattern5 = [{'ORTH':'B','OP':'*'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}] + +re_pattern1 = 'AA*' +re_pattern2 = 'A*A' +re_pattern3 = 'AA' +re_pattern4 = 'BA*B' +re_pattern5 
= 'B*A*B' + +@pytest.fixture +def text(): + return "(ABBAAAAAB)." + +@pytest.fixture +def doc(en_tokenizer,text): + doc = en_tokenizer(' '.join(text)) + return doc + +@pytest.mark.parametrize('pattern,re_pattern',[ + (pattern1,re_pattern1), + (pattern2,re_pattern2), + (pattern3,re_pattern3), + (pattern4,re_pattern4), + (pattern5,re_pattern5)]) +def test_greedy_matching(doc,text,pattern,re_pattern): + """ + Test that the greedy matching behavior of the * op + is consistent with other re implementations + """ + matcher = Matcher(doc.vocab) + matcher.add(re_pattern,None,pattern) + matches = matcher(doc) + re_matches = [m.span() for m in re.finditer(re_pattern,text)] + for match,re_match in zip(matches,re_matches): + assert match[1:]==re_match + +@pytest.mark.parametrize('pattern,re_pattern',[ + (pattern1,re_pattern1), + (pattern2,re_pattern2), + (pattern3,re_pattern3), + (pattern4,re_pattern4), + (pattern5,re_pattern5)]) +def test_match_consuming(doc,text,pattern,re_pattern): + """ + Test that matcher.__call__ consumes tokens on a match + similar to re.findall + """ + matcher = Matcher(doc.vocab) + matcher.add(re_pattern,None,pattern) + matches = matcher(doc) + re_matches = [m.span() for m in re.finditer(re_pattern,text)] + assert len(matches)==len(re_matches) \ No newline at end of file