spaCy/spacy/tests/regression/test_issue1450.py

from __future__ import unicode_literals
import pytest

from ...matcher import Matcher
from ...tokens import Doc
from ...vocab import Vocab


@pytest.mark.parametrize(
    'string,start,end',
    [
        ('a', 0, 1),
        ('a b', 0, 2),
        ('a c', 0, 1),
        ('a b c', 0, 2),
        ('a b b c', 0, 3),
        ('a b b', 0, 3),
    ]
)
def test_issue1450_matcher_end_zero_plus(string, start, end):
    '''Test matcher works when patterns end with * operator.

    The pattern under test is ``a`` followed by zero or more ``b``.

    string: whitespace-separated words used to build the Doc.
    start: expected value of ``matches[0][1]`` for the first match,
        or None if no match is expected.
    end: expected value of ``matches[0][2]`` for the first match,
        or None if no match is expected.

    Original example (rewritten to avoid model usage)

    nlp = spacy.load('en_core_web_sm')
    matcher = Matcher(nlp.vocab)
    matcher.add(
        "TSTEND",
        on_match_1,
        [
            {TAG: "JJ", LOWER: "new"},
            {TAG: "NN", 'OP': "*"}
        ]
    )
    doc = nlp(u'Could you create a new ticket for me?')
    print([(w.tag_, w.text, w.lower_) for w in doc])
    matches = matcher(doc)
    print(matches)
    assert len(matches) == 1
    assert matches[0][1] == 4
    assert matches[0][2] == 5
    '''
    matcher = Matcher(Vocab())
    matcher.add(
        "TSTEND",
        None,
        [
            {'ORTH': "a"},
            {'ORTH': "b", 'OP': "*"}
        ]
    )
    # NOTE(review): the Doc is built on a fresh Vocab rather than the
    # matcher's own; kept as in the original test -- confirm this is intended.
    doc = Doc(Vocab(), words=string.split())
    matches = matcher(doc)
    if start is None or end is None:
        # Negative case: no match is expected, so there is nothing to index.
        # Return here instead of falling through to matches[0], which would
        # raise IndexError on the empty list.
        assert matches == []
        return

    assert matches[0][1] == start
    assert matches[0][2] == end
Fix Issue #1450: Off-by-1 in * and ? matches Patterns that end in variable-length operators e.g. * and ? now end on the correct token. Previously, they were off by 1: the next token was pulled into the match, even if that's where the pattern failed. 2017-10-24 15:26:27 +03:00			`from __future__ import unicode_literals`
			`import pytest`

			`from ...matcher import Matcher`
			`from ...tokens import Doc`
			`from ...vocab import Vocab`


			`@pytest.mark.parametrize(`
			`'string,start,end',`
			`[`
			`('a', 0, 1),`
			`('a b', 0, 2),`
			`('a c', 0, 1),`
			`('a b c', 0, 2),`
Correct test examples 2018-01-23 23:00:14 +03:00			`('a b b c', 0, 3),`
			`('a b b', 0, 3),`
Fix Issue #1450: Off-by-1 in * and ? matches Patterns that end in variable-length operators e.g. * and ? now end on the correct token. Previously, they were off by 1: the next token was pulled into the match, even if that's where the pattern failed. 2017-10-24 15:26:27 +03:00			`]`
			`)`
			`def test_issue1450_matcher_end_zero_plus(string, start, end):`
			`'''Test matcher works when patterns end with * operator.`

			`Original example (rewritten to avoid model usage)`

			`nlp = spacy.load('en_core_web_sm')`
			`matcher = Matcher(nlp.vocab)`
			`matcher.add(`
			`"TSTEND",`
			`on_match_1,`
			`[`
			`{TAG: "JJ", LOWER: "new"},`
			`{TAG: "NN", 'OP': "*"}`
			`]`
			`)`
			`doc = nlp(u'Could you create a new ticket for me?')`
			`print([(w.tag_, w.text, w.lower_) for w in doc])`
			`matches = matcher(doc)`
			`print(matches)`
			`assert len(matches) == 1`
			`assert matches[0][1] == 4`
			`assert matches[0][2] == 5`
			`'''`
			`matcher = Matcher(Vocab())`
			`matcher.add(`
			`"TSTEND",`
			`None,`
			`[`
			`{'ORTH': "a"},`
			`{'ORTH': "b", 'OP': "*"}`
			`]`
			`)`
			`doc = Doc(Vocab(), words=string.split())`
			`matches = matcher(doc)`
			`if start is None or end is None:`
			`assert matches == []`

			`assert matches[0][1] == start`
			`assert matches[0][2] == end`