Mirror of https://github.com/explosion/spaCy.git
## Description

Related issues: #2379 (should be fixed by separating model tests)

* **total execution time down from > 300 seconds to under 60 seconds** 🎉
* removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure
* changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll then always test against the installed version)
* merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyway; a sketch of this layout follows the checklist below)
* tidied up and rewrote existing tests wherever possible

### Todo

- [ ] move tests to `/tests` and adjust CI commands accordingly
- [x] move model test suite from internal repo to `spacy-models`
- [x] ~~investigate why `pipeline/test_textcat.py` is flaky~~
- [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted
- [ ] update documentation on how to run tests

### Types of change

enhancement, tests

## Checklist

- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.
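For illustration, a minimal sketch of how one of the merged collection files might be laid out. The test body is adapted from the issue #1450 test in this file; the surrounding file structure is hypothetical, and `en_vocab` is assumed to be the suite's shared fixture:

```python
# test_issue1001-1500.py (sketch): short regression tests collected per range
from spacy.matcher import Matcher
from spacy.tokens import Doc


def test_issue1450(en_vocab):
    """Matcher patterns ending in the * operator should still match."""
    matcher = Matcher(en_vocab)
    matcher.add('TSTEND', None, [{'ORTH': 'a'}, {'ORTH': 'b', 'OP': '*'}])
    doc = Doc(en_vocab, words=['a', 'b'])
    # Two matches: 'a' on its own, and 'a b'
    assert len(matcher(doc)) == 2
```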
117 lines · 3.7 KiB · Python
# coding: utf-8
from __future__ import unicode_literals

import re

import pytest

from spacy.matcher import Matcher
from spacy.tokens import Doc
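
# Each Matcher pattern below mirrors the regular expression with the same
# index (pattern1 <-> re_pattern1, and so on). The doc fixture tokenizes its
# text one character per token, so token-level Matcher ops line up directly
# with the character-level regexes.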
pattern1 = [{'ORTH': 'A', 'OP': '1'}, {'ORTH': 'A', 'OP': '*'}]
pattern2 = [{'ORTH': 'A', 'OP': '*'}, {'ORTH': 'A', 'OP': '1'}]
pattern3 = [{'ORTH': 'A', 'OP': '1'}, {'ORTH': 'A', 'OP': '1'}]
pattern4 = [{'ORTH': 'B', 'OP': '1'}, {'ORTH': 'A', 'OP': '*'}, {'ORTH': 'B', 'OP': '1'}]
pattern5 = [{'ORTH': 'B', 'OP': '*'}, {'ORTH': 'A', 'OP': '*'}, {'ORTH': 'B', 'OP': '1'}]

re_pattern1 = 'AA*'
re_pattern2 = 'A*A'
re_pattern3 = 'AA'
re_pattern4 = 'BA*B'
re_pattern5 = 'B*A*B'


@pytest.fixture
def text():
    return "(ABBAAAAAB)."


@pytest.fixture
def doc(en_tokenizer, text):
    # Space-join the characters so that each one becomes its own token.
    doc = en_tokenizer(' '.join(text))
    return doc


@pytest.mark.xfail
@pytest.mark.parametrize('pattern,re_pattern', [
    (pattern1, re_pattern1),
    (pattern2, re_pattern2),
    (pattern3, re_pattern3),
    (pattern4, re_pattern4),
    (pattern5, re_pattern5)])
def test_greedy_matching(doc, text, pattern, re_pattern):
    """Test that the greedy matching behavior of the * op is consistent with
    other regex implementations."""
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    matches = matcher(doc)
    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
    for match, re_match in zip(matches, re_matches):
        # Token indices equal character offsets here: one char per token.
        assert match[1:] == re_match


@pytest.mark.xfail
@pytest.mark.parametrize('pattern,re_pattern', [
    (pattern1, re_pattern1),
    (pattern2, re_pattern2),
    (pattern3, re_pattern3),
    (pattern4, re_pattern4),
    (pattern5, re_pattern5)])
def test_match_consuming(doc, text, pattern, re_pattern):
    """Test that matcher.__call__ consumes tokens on a match similarly to
    re.findall."""
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    matches = matcher(doc)
    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
    assert len(matches) == len(re_matches)


def test_operator_combos(en_vocab):
    # (string, pattern spec, expected to match); a '+' suffix in the spec
    # translates to the one-or-more operator.
    cases = [
        ('aaab', 'a a a b', True),
        ('aaab', 'a+ b', True),
        ('aaab', 'a+ a+ b', True),
        ('aaab', 'a+ a+ a b', True),
        ('aaab', 'a+ a+ a+ b', True),
        ('aaab', 'a+ a a b', True),
        ('aaab', 'a+ a a', True),
        ('aaab', 'a+', True),
        ('aaab', 'a+ a b', True),
        ('aaa', 'a+ b', False),
        ('aaa', 'a+ a+ b', False),
        ('aaa', 'a+ a+ a+ b', False),
        ('aaa', 'a+ a b', False),
        ('aaa', 'a+ a a b', False),
    ]
    for string, pattern_str, result in cases:
        matcher = Matcher(en_vocab)
        doc = Doc(matcher.vocab, words=list(string))
        pattern = []
        for part in pattern_str.split():
            if part.endswith('+'):
                pattern.append({'ORTH': part[0], 'OP': '+'})
            else:
                pattern.append({'ORTH': part})
        matcher.add('PATTERN', None, pattern)
        matches = matcher(doc)
        if result:
            assert matches, (string, pattern_str)
        else:
            assert not matches, (string, pattern_str)


def test_matcher_end_zero_plus(en_vocab):
    """Test that the matcher works when patterns end with the * operator
    (issue #1450)."""
    matcher = Matcher(en_vocab)
    pattern = [{'ORTH': 'a'}, {'ORTH': 'b', 'OP': '*'}]
    matcher.add('TSTEND', None, pattern)
    nlp = lambda string: Doc(matcher.vocab, words=string.split())
    assert len(matcher(nlp('a'))) == 1
    assert len(matcher(nlp('a b'))) == 2
    assert len(matcher(nlp('a c'))) == 1
    assert len(matcher(nlp('a b c'))) == 2
    assert len(matcher(nlp('a b b c'))) == 3
    assert len(matcher(nlp('a b b'))) == 3
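
# Note: unlike re.finditer, the Matcher returns every candidate match,
# including overlapping spans. For 'a b b' above that means three matches
# ('a', 'a b' and 'a b b'), which is why the counts grow with each
# trailing 'b'.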