spaCy/spacy/tests/matcher/test_matcher_logic.py

# coding: utf-8
from __future__ import unicode_literals

import pytest
import re
from spacy.matcher import Matcher
from spacy.tokens import Doc


# Token patterns handed to Matcher.add by the parametrized tests in this
# module. Each pattern is a list of per-token dicts with an 'ORTH' entry and
# an 'OP' entry; patternN is paired with re_patternN in those tests.
# NOTE(review): the paired regexes suggest '1' means "exactly one" and '*'
# means "zero or more" -- confirm against the Matcher operator docs.
pattern1 = [
    {'ORTH': 'A', 'OP': '1'},
    {'ORTH': 'A', 'OP': '*'},
]
pattern2 = [
    {'ORTH': 'A', 'OP': '*'},
    {'ORTH': 'A', 'OP': '1'},
]
pattern3 = [
    {'ORTH': 'A', 'OP': '1'},
    {'ORTH': 'A', 'OP': '1'},
]
pattern4 = [
    {'ORTH': 'B', 'OP': '1'},
    {'ORTH': 'A', 'OP': '*'},
    {'ORTH': 'B', 'OP': '1'},
]
pattern5 = [
    {'ORTH': 'B', 'OP': '*'},
    {'ORTH': 'A', 'OP': '*'},
    {'ORTH': 'B', 'OP': '1'},
]

# Regular expressions used by the parametrized tests in this module, both as
# the `re.finditer` pattern and as the key under which the matching token
# pattern is registered with the Matcher. re_patternN pairs with patternN.
re_pattern1 = r'AA*'
re_pattern2 = r'A*A'
re_pattern3 = r'AA'
re_pattern4 = r'BA*B'
re_pattern5 = r'B*A*B'


@pytest.fixture
def text():
    """Supply the raw character string that the tests scan with ``re``."""
    sample = "(ABBAAAAAB)."
    return sample


@pytest.fixture
def doc(en_tokenizer, text):
    """Tokenize ``text`` with a single space inserted between its characters.

    NOTE(review): presumably this makes the tokenizer emit one token per
    character, so that token offsets line up with character offsets in
    ``text`` -- confirm against the tokenizer's behaviour.
    """
    spaced_out = ' '.join(text)
    return en_tokenizer(spaced_out)


@pytest.mark.xfail
@pytest.mark.parametrize(
    'pattern,re_pattern',
    [
        (pattern1, re_pattern1),
        (pattern2, re_pattern2),
        (pattern3, re_pattern3),
        (pattern4, re_pattern4),
        (pattern5, re_pattern5),
    ],
)
def test_greedy_matching(doc, text, pattern, re_pattern):
    """Check that the greedy behaviour of the * op is consistent with ``re``.

    The (start, end) part of every Matcher result is compared with the span
    of the corresponding ``re.finditer`` hit on ``text``. The test is marked
    as an expected failure.
    """
    expected_spans = [hit.span() for hit in re.finditer(re_pattern, text)]
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    found = matcher(doc)
    # zip() stops at the shorter sequence, so only leading pairs are compared
    # and a difference in the number of results is not detected here.
    for found_match, expected_span in zip(found, expected_spans):
        # Drop the first element of the Matcher result; the remainder is
        # compared against the regex span.
        assert found_match[1:] == expected_span


@pytest.mark.xfail
@pytest.mark.parametrize(
    'pattern,re_pattern',
    [
        (pattern1, re_pattern1),
        (pattern2, re_pattern2),
        (pattern3, re_pattern3),
        (pattern4, re_pattern4),
        (pattern5, re_pattern5),
    ],
)
def test_match_consuming(doc, text, pattern, re_pattern):
    """Check that the Matcher yields as many matches as ``re.finditer``.

    ``re.finditer`` returns non-overlapping hits, so an equal count means
    ``matcher.__call__`` consumed tokens on each match in the same way. The
    test is marked as an expected failure.
    """
    expected_count = len([hit.span() for hit in re.finditer(re_pattern, text)])
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    found = matcher(doc)
    assert len(found) == expected_count


def test_operator_combos(en_vocab):
    """Run a table of '+' operator combinations through the Matcher.

    Each row is ``(string, pattern_str, expected)``: ``string`` is split into
    one-character words to build the Doc, ``pattern_str`` is a
    space-separated pattern in which a trailing '+' marks a token with
    ``'OP': '+'``, and ``expected`` says whether any match must be found.
    """
    scenarios = [
        ('aaab', 'a a a b', True),
        ('aaab', 'a+ b', True),
        ('aaab', 'a+ a+ b', True),
        ('aaab', 'a+ a+ a b', True),
        ('aaab', 'a+ a+ a+ b', True),
        ('aaab', 'a+ a a b', True),
        ('aaab', 'a+ a a', True),
        ('aaab', 'a+', True),
        ('aaa', 'a+ b', False),
        ('aaa', 'a+ a+ b', False),
        ('aaa', 'a+ a+ a+ b', False),
        ('aaa', 'a+ a b', False),
        ('aaa', 'a+ a a b', False),
        # The next two rows repeat rows that already appear above.
        ('aaab', 'a+ a a', True),
        ('aaab', 'a+', True),
        ('aaab', 'a+ a b', True),
    ]
    for string, pattern_str, should_match in scenarios:
        # A fresh Matcher per row, so only this row's pattern is registered.
        matcher = Matcher(en_vocab)
        row_doc = Doc(matcher.vocab, words=list(string))
        token_specs = [
            {'ORTH': piece[0], 'OP': '+'} if piece.endswith('+') else {'ORTH': piece}
            for piece in pattern_str.split()
        ]
        matcher.add('PATTERN', None, token_specs)
        found = matcher(row_doc)
        # The tuple is the assertion message identifying the failing row.
        assert bool(found) == should_match, (string, pattern_str)


def test_matcher_end_zero_plus(en_vocab):
    """Check match counts when a pattern ends with the * operator (issue 1450)."""
    matcher = Matcher(en_vocab)
    matcher.add('TSTEND', None, [{'ORTH': "a"}, {'ORTH': "b", 'OP': "*"}])

    def make_doc(string):
        # Build a Doc straight from whitespace-separated words, sharing the
        # matcher's vocab.
        return Doc(matcher.vocab, words=string.split())

    # (input words, number of matches the Matcher must report)
    expected_counts = [
        ('a', 1),
        ('a b', 2),
        ('a c', 1),
        ('a b c', 2),
        ('a b b c', 3),
        ('a b b', 3),
    ]
    for string, count in expected_counts:
        assert len(matcher(make_doc(string))) == count
💫 Refactor test suite (#2568) ## Description Related issues: #2379 (should be fixed by separating model tests) * total execution time down from > 300 seconds to under 60 seconds 🎉 * removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure * changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version) * merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways) * tidied up and rewrote existing tests wherever possible ### Todo - [ ] move tests to `/tests` and adjust CI commands accordingly - [x] move model test suite from internal repo to `spacy-models` - [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~ - [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted - [ ] update documentation on how to run tests ### Types of change enhancement, tests ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information. 2018-07-25 00:38:44 +03:00			`# coding: utf-8`
			`from __future__ import unicode_literals`

			`import pytest`
			`import re`
			`from spacy.matcher import Matcher`
			`from spacy.tokens import Doc`


			`pattern1 = [{'ORTH':'A', 'OP':'1'}, {'ORTH':'A', 'OP':'*'}]`
			`pattern2 = [{'ORTH':'A', 'OP':'*'}, {'ORTH':'A', 'OP':'1'}]`
			`pattern3 = [{'ORTH':'A', 'OP':'1'}, {'ORTH':'A', 'OP':'1'}]`
			`pattern4 = [{'ORTH':'B', 'OP':'1'}, {'ORTH':'A', 'OP':'*'}, {'ORTH':'B', 'OP':'1'}]`
			`pattern5 = [{'ORTH':'B', 'OP':'*'}, {'ORTH':'A', 'OP':'*'}, {'ORTH':'B', 'OP':'1'}]`

			`re_pattern1 = 'AA*'`
			`re_pattern2 = 'A*A'`
			`re_pattern3 = 'AA'`
			`re_pattern4 = 'BA*B'`
			`re_pattern5 = 'B*A*B'`


			`@pytest.fixture`
			`def text():`
			`return "(ABBAAAAAB)."`


			`@pytest.fixture`
			`def doc(en_tokenizer, text):`
			`doc = en_tokenizer(' '.join(text))`
			`return doc`


			`@pytest.mark.xfail`
			`@pytest.mark.parametrize('pattern,re_pattern', [`
			`(pattern1, re_pattern1),`
			`(pattern2, re_pattern2),`
			`(pattern3, re_pattern3),`
			`(pattern4, re_pattern4),`
			`(pattern5, re_pattern5)])`
			`def test_greedy_matching(doc, text, pattern, re_pattern):`
			`"""Test that the greedy matching behavior of the * op is consistant with`
			`other re implementations."""`
			`matcher = Matcher(doc.vocab)`
			`matcher.add(re_pattern, None, pattern)`
			`matches = matcher(doc)`
			`re_matches = [m.span() for m in re.finditer(re_pattern, text)]`
			`for match, re_match in zip(matches, re_matches):`
			`assert match[1:] == re_match`


			`@pytest.mark.xfail`
			`@pytest.mark.parametrize('pattern,re_pattern', [`
			`(pattern1, re_pattern1),`
			`(pattern2, re_pattern2),`
			`(pattern3, re_pattern3),`
			`(pattern4, re_pattern4),`
			`(pattern5, re_pattern5)])`
			`def test_match_consuming(doc, text, pattern, re_pattern):`
			`"""Test that matcher.__call__ consumes tokens on a match similar to`
			`re.findall."""`
			`matcher = Matcher(doc.vocab)`
			`matcher.add(re_pattern, None, pattern)`
			`matches = matcher(doc)`
			`re_matches = [m.span() for m in re.finditer(re_pattern, text)]`
			`assert len(matches) == len(re_matches)`


			`def test_operator_combos(en_vocab):`
			`cases = [`
			`('aaab', 'a a a b', True),`
			`('aaab', 'a+ b', True),`
			`('aaab', 'a+ a+ b', True),`
			`('aaab', 'a+ a+ a b', True),`
			`('aaab', 'a+ a+ a+ b', True),`
			`('aaab', 'a+ a a b', True),`
			`('aaab', 'a+ a a', True),`
			`('aaab', 'a+', True),`
			`('aaa', 'a+ b', False),`
			`('aaa', 'a+ a+ b', False),`
			`('aaa', 'a+ a+ a+ b', False),`
			`('aaa', 'a+ a b', False),`
			`('aaa', 'a+ a a b', False),`
			`('aaab', 'a+ a a', True),`
			`('aaab', 'a+', True),`
			`('aaab', 'a+ a b', True)`
			`]`
			`for string, pattern_str, result in cases:`
			`matcher = Matcher(en_vocab)`
			`doc = Doc(matcher.vocab, words=list(string))`
			`pattern = []`
			`for part in pattern_str.split():`
			`if part.endswith('+'):`
			`pattern.append({'ORTH': part[0], 'OP': '+'})`
			`else:`
			`pattern.append({'ORTH': part})`
			`matcher.add('PATTERN', None, pattern)`
			`matches = matcher(doc)`
			`if result:`
			`assert matches, (string, pattern_str)`
			`else:`
			`assert not matches, (string, pattern_str)`


			`def test_matcher_end_zero_plus(en_vocab):`
			`"""Test matcher works when patterns end with * operator. (issue 1450)"""`
			`matcher = Matcher(en_vocab)`
			`pattern = [{'ORTH': "a"}, {'ORTH': "b", 'OP': "*"}]`
			`matcher.add('TSTEND', None, pattern)`
			`nlp = lambda string: Doc(matcher.vocab, words=string.split())`
			`assert len(matcher(nlp('a'))) == 1`
			`assert len(matcher(nlp('a b'))) == 2`
			`assert len(matcher(nlp('a c'))) == 1`
			`assert len(matcher(nlp('a b c'))) == 2`
			`assert len(matcher(nlp('a b b c'))) == 3`
			`assert len(matcher(nlp('a b b'))) == 3`