# Mirror of https://github.com/explosion/spaCy.git
# (synced 2025-01-03 22:06:37 +03:00)
# 68 lines, 1.6 KiB, Python
# coding: utf-8
|
|
from __future__ import unicode_literals
|
|
|
|
import pytest
|
|
from spacy.matcher import Matcher
|
|
from spacy.tokens import Doc
|
|
|
|
|
|
# Parametrization data for test_issue3009: (name, token patterns) pairs.
# Every pattern looks for a "have" lemma followed by "to", "do" and an
# adposition. Patterns "2" and "3" additionally allow an ASCII,
# non-punctuation token after "have", quantified with "*" and "?"
# respectively, which is what exercises the matcher's quantifier handling.
PATTERNS = [
    ("1", [[{"LEMMA": "have"}, {"LOWER": "to"}, {"LOWER": "do"}, {"POS": "ADP"}]]),
    (
        "2",
        [
            [
                {"LEMMA": "have"},
                {"IS_ASCII": True, "IS_PUNCT": False, "OP": "*"},
                {"LOWER": "to"},
                {"LOWER": "do"},
                {"POS": "ADP"},
            ]
        ],
    ),
    (
        "3",
        [
            [
                {"LEMMA": "have"},
                {"IS_ASCII": True, "IS_PUNCT": False, "OP": "?"},
                {"LOWER": "to"},
                {"LOWER": "do"},
                {"POS": "ADP"},
            ]
        ],
    ),
]
|
|
|
|
|
|
@pytest.fixture
def doc(en_tokenizer):
    """Tokenize "also has to do with" and assign a tag to each of its tokens.

    en_tokenizer: Fixture (defined outside this file) used to tokenize the text.
    RETURNS: The tokenized doc with `tag_` set on tokens 0-4.
    """
    doc = en_tokenizer("also has to do with")
    # NOTE(review): the patterns in PATTERNS match on LEMMA/POS, while only
    # tag_ is set here - presumably those attributes are derived from the tag
    # by the vocab; confirm against the spaCy version under test.
    tags = ("RB", "VBZ", "TO", "VB", "IN")
    for index, tag in enumerate(tags):
        # Index explicitly so an unexpected token count still raises.
        doc[index].tag_ = tag
    return doc
|
|
|
|
|
|
@pytest.fixture
def matcher(en_tokenizer):
    """Create a new, empty Matcher that uses the `en_tokenizer` fixture's vocab.

    en_tokenizer: Fixture (defined outside this file) whose `vocab` is reused.
    RETURNS: A Matcher with no patterns added yet.
    """
    return Matcher(en_tokenizer.vocab)
|
|
|
|
|
|
@pytest.mark.parametrize("pattern", PATTERNS)
def test_issue3009(doc, matcher, pattern):
    """Test problem with matcher quantifiers.

    Each entry of PATTERNS is added to a fresh matcher and must produce at
    least one match on the `doc` fixture.
    """
    name, token_patterns = pattern
    # Second positional argument is passed as None, as in the original call.
    matcher.add(name, None, *token_patterns)
    matches = matcher(doc)
    assert matches
|
|
|
|
|
|
def test_issue2464(matcher):
    """Test problem with successive ?. This is the same bug, so putting it here.

    A pattern made of two optional wildcard tokens is run over a two-word doc
    and is expected to yield exactly three matches.
    """
    doc = Doc(matcher.vocab, words=["a", "b"])
    two_optional_tokens = [{"OP": "?"}, {"OP": "?"}]
    matcher.add("4", None, two_optional_tokens)
    matches = matcher(doc)
    # NOTE(review): presumably the three matches are the spans "a", "b" and
    # "a b" - confirm against the Matcher's handling of optional tokens.
    assert len(matches) == 3
|