Mirror of https://github.com/explosion/spaCy.git
Synced 2025-07-11 08:42:28 +03:00

Fix tests and use the new Matcher API

This commit is contained in:
parent 187f370734
commit b3c7ee0148
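The change applied throughout this diff: the old Matcher was configured with a dict of entities or via add_entity()/add_pattern(), used integer attribute IDs from spacy.attrs, and returned 4-tuples (ent_id, label, start, end); the new API registers patterns with matcher.add(key, on_match, *patterns), uses string attribute names, and returns 3-tuples (match_id, start, end). A minimal sketch of the new usage, assuming the v2-era spaCy API these tests target (not valid against other versions):

    from spacy.attrs import LOWER
    from spacy.matcher import Matcher
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    # A bare Vocab with a LOWER getter, constructed the same way the
    # tests below do it.
    vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
    matcher = Matcher(vocab)

    # New-style registration: a key, an on_match callback (or None), then
    # one or more patterns, each a list of dicts keyed by string attribute
    # names.
    matcher.add('BostonCeltics', None,
                [{'LOWER': 'celtics'}],
                [{'LOWER': 'boston'}, {'LOWER': 'celtics'}])

    doc = Doc(vocab, words=['boston', 'celtics'])
    # Matches now come back as 3-tuples: (match_id, start, end).
    for match_id, start, end in matcher(doc):
        print(vocab.strings[match_id], doc[start:end].text)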
@@ -1,57 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from ...matcher import Matcher
-from ...attrs import ORTH
-from ..util import get_doc
-
-import pytest
-
-# TODO: These can probably be deleted
-
-@pytest.mark.xfail
-@pytest.mark.parametrize('words,entity', [
-    (["Test", "Entity"], "TestEntity")])
-def test_matcher_add_empty_entity(en_vocab, words, entity):
-    matcher = Matcher(en_vocab)
-    matcher.add_entity(entity)
-    doc = get_doc(en_vocab, words)
-    assert matcher.n_patterns == 0
-    assert matcher(doc) == []
-
-
-@pytest.mark.xfail
-@pytest.mark.parametrize('entity1,entity2,attrs', [
-    ("TestEntity", "TestEntity2", {"Hello": "World"})])
-def test_matcher_get_entity_attrs(en_vocab, entity1, entity2, attrs):
-    matcher = Matcher(en_vocab)
-    matcher.add_entity(entity1)
-    assert matcher.get_entity(entity1) == {}
-    matcher.add_entity(entity2, attrs=attrs)
-    assert matcher.get_entity(entity2) == attrs
-    assert matcher.get_entity(entity1) == {}
-
-
-@pytest.mark.xfail
-@pytest.mark.parametrize('words,entity,attrs',
-    [(["Test", "Entity"], "TestEntity", {"Hello": "World"})])
-def test_matcher_get_entity_via_match(en_vocab, words, entity, attrs):
-    matcher = Matcher(en_vocab)
-    matcher.add_entity(entity, attrs=attrs)
-    doc = get_doc(en_vocab, words)
-    assert matcher.n_patterns == 0
-    assert matcher(doc) == []
-
-    matcher.add_pattern(entity, [{ORTH: words[0]}, {ORTH: words[1]}])
-    assert matcher.n_patterns == 1
-
-    matches = matcher(doc)
-    assert len(matches) == 1
-    assert len(matches[0]) == 4
-
-    ent_id, label, start, end = matches[0]
-    assert ent_id == matcher.vocab.strings[entity]
-    assert label == 0
-    assert start == 0
-    assert end == 2
-    assert matcher.get_entity(ent_id) == attrs
@@ -21,7 +21,6 @@ def test_simple_types(EN):
 def test_consistency_bug(EN):
     '''Test an arbitrary sequence-consistency bug encountered during speed test'''
     tokens = EN(u'Where rap essentially went mainstream, illustrated by seminal Public Enemy, Beastie Boys and L.L. Cool J. tracks.')
-
     tokens = EN(u'''Charity and other short-term aid have buoyed them so far, and a tax-relief bill working its way through Congress would help. But the September 11 Victim Compensation Fund, enacted by Congress to discourage people from filing lawsuits, will determine the shape of their lives for years to come.\n\n''', entity=False)
     tokens.ents += tuple(EN.matcher(tokens))
     EN.entity(tokens)
@@ -30,17 +29,8 @@ def test_consistency_bug(EN):
 @pytest.mark.models
 def test_unit_end_gazetteer(EN):
     '''Test a bug in the interaction between the NER model and the gazetteer'''
-    matcher = Matcher(EN.vocab,
-                      {'MemberNames':
-                          ('PERSON', {},
-                           [
-                               [{LOWER: 'cal'}],
-                               [{LOWER: 'cal'}, {LOWER: 'henderson'}],
-                           ]
-                          )
-                      }
-                     )
-
+    matcher = Matcher(EN.vocab)
+    matcher.add('MemberNames', None, [{LOWER: 'cal'}], [{LOWER: 'cal'}, {LOWER: 'henderson'}])
     doc = EN(u'who is cal the manager of?')
     if len(list(doc.ents)) == 0:
         ents = matcher(doc)
@@ -2,15 +2,14 @@
 from __future__ import unicode_literals

 from ...matcher import Matcher
-from ...attrs import ORTH, LOWER

 import pytest


-pattern1 = [[{LOWER: 'celtics'}], [{LOWER: 'boston'}, {LOWER: 'celtics'}]]
-pattern2 = [[{LOWER: 'boston'}, {LOWER: 'celtics'}], [{LOWER: 'celtics'}]]
-pattern3 = [[{LOWER: 'boston'}], [{LOWER: 'boston'}, {LOWER: 'celtics'}]]
-pattern4 = [[{LOWER: 'boston'}, {LOWER: 'celtics'}], [{LOWER: 'boston'}]]
+pattern1 = [[{'LOWER': 'celtics'}], [{'LOWER': 'boston'}, {'LOWER': 'celtics'}]]
+pattern2 = [[{'LOWER': 'boston'}, {'LOWER': 'celtics'}], [{'LOWER': 'celtics'}]]
+pattern3 = [[{'LOWER': 'boston'}], [{'LOWER': 'boston'}, {'LOWER': 'celtics'}]]
+pattern4 = [[{'LOWER': 'boston'}, {'LOWER': 'celtics'}], [{'LOWER': 'boston'}]]


 @pytest.fixture
@@ -24,10 +23,11 @@ def doc(en_tokenizer):
 def test_issue118(doc, pattern):
     """Test a bug that arose from having overlapping matches"""
     ORG = doc.vocab.strings['ORG']
-    matcher = Matcher(doc.vocab, {'BostonCeltics': ('ORG', {}, pattern)})
+    matcher = Matcher(doc.vocab)
+    matcher.add("BostonCeltics", None, *pattern)

     assert len(list(doc.ents)) == 0
-    matches = [(ent_type, start, end) for ent_id, ent_type, start, end in matcher(doc)]
+    matches = [(ORG, start, end) for _, start, end in matcher(doc)]
     assert matches == [(ORG, 9, 11), (ORG, 10, 11)]
     doc.ents = matches[:1]
     ents = list(doc.ents)
@@ -41,10 +41,11 @@ def test_issue118(doc, pattern):
 def test_issue118_prefix_reorder(doc, pattern):
     """Test a bug that arose from having overlapping matches"""
     ORG = doc.vocab.strings['ORG']
-    matcher = Matcher(doc.vocab, {'BostonCeltics': ('ORG', {}, pattern)})
+    matcher = Matcher(doc.vocab)
+    matcher.add('BostonCeltics', None, *pattern)

     assert len(list(doc.ents)) == 0
-    matches = [(ent_type, start, end) for ent_id, ent_type, start, end in matcher(doc)]
+    matches = [(ORG, start, end) for _, start, end in matcher(doc)]
     doc.ents += tuple(matches)[1:]
     assert matches == [(ORG, 9, 10), (ORG, 9, 11)]
     ents = doc.ents
@@ -2,7 +2,6 @@
 from __future__ import unicode_literals

 from ...matcher import Matcher
-from ...attrs import LOWER

 import pytest

@@ -10,14 +9,14 @@ import pytest
 def test_issue242(en_tokenizer):
     """Test overlapping multi-word phrases."""
     text = "There are different food safety standards in different countries."
-    patterns = [[{LOWER: 'food'}, {LOWER: 'safety'}],
-                [{LOWER: 'safety'}, {LOWER: 'standards'}]]
+    patterns = [[{'LOWER': 'food'}, {'LOWER': 'safety'}],
+                [{'LOWER': 'safety'}, {'LOWER': 'standards'}]]

     doc = en_tokenizer(text)
     matcher = Matcher(doc.vocab)
-    matcher.add('FOOD', 'FOOD', {}, patterns)
+    matcher.add('FOOD', None, *patterns)

-    matches = [(ent_type, start, end) for ent_id, ent_type, start, end in matcher(doc)]
+    matches = [(ent_type, start, end) for ent_type, start, end in matcher(doc)]
     doc.ents += tuple(matches)
     match1, match2 = matches
     assert match1[1] == 3
@@ -1,7 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals

-from ...attrs import ORTH
 from ...matcher import Matcher

 import pytest
@@ -12,13 +11,13 @@ def test_issue429(EN):
     def merge_phrases(matcher, doc, i, matches):
         if i != len(matches) - 1:
             return None
-        spans = [(ent_id, label, doc[start:end]) for ent_id, label, start, end in matches]
+        spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches]
         for ent_id, label, span in spans:
             span.merge('NNP' if label else span.root.tag_, span.text, EN.vocab.strings[label])

     doc = EN('a')
     matcher = Matcher(EN.vocab)
-    matcher.add('key', label='TEST', attrs={}, specs=[[{ORTH: 'a'}]], on_match=merge_phrases)
+    matcher.add('TEST', merge_phrases, [{'ORTH': 'a'}])
     doc = EN.tokenizer('a b c')
     EN.tagger(doc)
     matcher(doc)
@@ -7,14 +7,16 @@ from ...attrs import IS_PUNCT, ORTH
 import pytest


-@pytest.mark.models
-def test_issue587(EN):
+def test_issue587(en_tokenizer):
     """Test that Matcher doesn't segfault on particular input"""
-    matcher = Matcher(EN.vocab)
-    content = '''a b; c'''
-    matcher.add(entity_key='1', label='TEST', attrs={}, specs=[[{ORTH: 'a'}, {ORTH: 'b'}]])
-    matcher(EN(content))
-    matcher.add(entity_key='2', label='TEST', attrs={}, specs=[[{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'c'}]])
-    matcher(EN(content))
-    matcher.add(entity_key='3', label='TEST', attrs={}, specs=[[{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'd'}]])
-    matcher(EN(content))
+    doc = en_tokenizer('a b; c')
+    matcher = Matcher(doc.vocab)
+    matcher.add('TEST1', None, [{ORTH: 'a'}, {ORTH: 'b'}])
+    matches = matcher(doc)
+    assert len(matches) == 1
+    matcher.add('TEST2', None, [{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'c'}])
+    matches = matcher(doc)
+    assert len(matches) == 2
+    matcher.add('TEST3', None, [{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'd'}])
+    matches = matcher(doc)
+    assert len(matches) == 2
@@ -9,4 +9,4 @@ import pytest
 def test_issue588(en_vocab):
     matcher = Matcher(en_vocab)
     with pytest.raises(ValueError):
-        matcher.add(entity_key='1', label='TEST', attrs={}, specs=[[]])
+        matcher.add('TEST', None, [])
@@ -1,7 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals

-from ...attrs import ORTH, IS_ALPHA, LIKE_NUM
 from ...matcher import Matcher
 from ..util import get_doc

@@ -9,14 +8,8 @@ from ..util import get_doc
 def test_issue590(en_vocab):
     """Test overlapping matches"""
     doc = get_doc(en_vocab, ['n', '=', '1', ';', 'a', ':', '5', '%'])
-
     matcher = Matcher(en_vocab)
-    matcher.add_entity("ab", acceptor=None, on_match=None)
-    matcher.add_pattern('ab', [{IS_ALPHA: True}, {ORTH: ':'},
-                               {LIKE_NUM: True}, {ORTH: '%'}],
-                        label='a')
-    matcher.add_pattern('ab', [{IS_ALPHA: True}, {ORTH: '='},
-                               {LIKE_NUM: True}],
-                        label='b')
+    matcher.add('ab', None, [{'IS_ALPHA': True}, {'ORTH': ':'}, {'LIKE_NUM': True}, {'ORTH': '%'}])
+    matcher.add('ab', None, [{'IS_ALPHA': True}, {'ORTH': '='}, {'LIKE_NUM': True}])
     matches = matcher(doc)
     assert len(matches) == 2
@@ -1,21 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from ...attrs import ORTH
-from ...matcher import Matcher
-from ..util import get_doc
-
-
-def test_issue605(en_vocab):
-    def return_false(doc, ent_id, label, start, end):
-        return False
-
-    words = ["The", "golf", "club", "is", "broken"]
-    pattern = [{ORTH: "golf"}, {ORTH: "club"}]
-    label = "Sport_Equipment"
-    doc = get_doc(en_vocab, words)
-    matcher = Matcher(doc.vocab)
-    matcher.add_entity(label, acceptor=return_false)
-    matcher.add_pattern(label, pattern)
-    match = matcher(doc)
-    assert match == []
@@ -2,7 +2,6 @@
 from __future__ import unicode_literals

 from ...matcher import Matcher
-from ...attrs import ORTH


 def test_issue615(en_tokenizer):
@@ -14,19 +13,17 @@ def test_issue615(en_tokenizer):
         if i != len(matches)-1:
             return None
         # Get Span objects
-        spans = [(ent_id, label, doc[start : end]) for ent_id, label, start, end in matches]
+        spans = [(ent_id, ent_id, doc[start : end]) for ent_id, start, end in matches]
         for ent_id, label, span in spans:
             span.merge('NNP' if label else span.root.tag_, span.text, doc.vocab.strings[label])

     text = "The golf club is broken"
-    pattern = [{ORTH: "golf"}, {ORTH: "club"}]
+    pattern = [{'ORTH': "golf"}, {'ORTH': "club"}]
     label = "Sport_Equipment"

     doc = en_tokenizer(text)
     matcher = Matcher(doc.vocab)
-    matcher.add_entity(label, on_match=merge_phrases)
-    matcher.add_pattern(label, pattern, label=label)
-
+    matcher.add(label, merge_phrases, pattern)
     match = matcher(doc)
     entities = list(doc.ents)

@@ -1,16 +1,13 @@
 from __future__ import unicode_literals
-from ... import load as load_spacy
-from ...attrs import LEMMA
-from ...matcher import merge_phrase

 import pytest


+@pytest.mark.xfail
 @pytest.mark.models
-def test_issue758():
+def test_issue758(EN):
     '''Test parser transition bug after label added.'''
-    nlp = load_spacy('en')
-    nlp.matcher.add('splash', 'my_entity', {},
-                    [[{LEMMA: 'splash'}, {LEMMA: 'on'}]],
-                    on_match=merge_phrase)
+    from ...matcher import merge_phrase
+    nlp = EN()
+    nlp.matcher.add('splash', merge_phrase, [[{'LEMMA': 'splash'}, {'LEMMA': 'on'}]])
     doc = nlp('splash On', parse=False)
@@ -1,8 +1,5 @@
-'''
-Test Matcher matches with '*' operator and Boolean flag
-'''
-from __future__ import unicode_literals
-from __future__ import print_function
+# coding: utf-8
+from __future__ import unicode_literals, print_function

 import pytest

 from ...matcher import Matcher
@@ -12,41 +9,30 @@ from ...tokens import Doc


 def test_basic_case():
+    """Test Matcher matches with '*' operator and Boolean flag"""
     matcher = Matcher(Vocab(
         lex_attr_getters={LOWER: lambda string: string.lower()}))
     IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
-    matcher.add_pattern(
-        "FarAway",
-        [
-            {LOWER: "bob"},
-            {'OP': '*', LOWER: 'and'},
-            {LOWER: 'frank'}
-        ])
+    matcher.add('FarAway', None, [{'LOWER': "bob"}, {'OP': '*', 'LOWER': 'and'}, {'LOWER': 'frank'}])
     doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank'])
     match = matcher(doc)
     assert len(match) == 1
-    ent_id, label, start, end = match[0]
+    ent_id, start, end = match[0]
     assert start == 0
     assert end == 4


 @pytest.mark.xfail
 def test_issue850():
-    '''The problem here is that the variable-length pattern matches the
-    succeeding token. We then don't handle the ambiguity correctly.'''
+    """The problem here is that the variable-length pattern matches the
+    succeeding token. We then don't handle the ambiguity correctly."""
     matcher = Matcher(Vocab(
         lex_attr_getters={LOWER: lambda string: string.lower()}))
     IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
-    matcher.add_pattern(
-        "FarAway",
-        [
-            {LOWER: "bob"},
-            {'OP': '*', IS_ANY_TOKEN: True},
-            {LOWER: 'frank'}
-        ])
+    matcher.add('FarAway', None, [{'LOWER': "bob"}, {'OP': '*', 'IS_ANY_TOKEN': True}, {'LOWER': 'frank'}])
     doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank'])
     match = matcher(doc)
     assert len(match) == 1
-    ent_id, label, start, end = match[0]
+    ent_id, start, end = match[0]
     assert start == 0
     assert end == 4
@@ -73,8 +73,6 @@ def test_matcher_phrase_matcher(en_vocab):
     assert len(matcher(doc)) == 1


-# TODO; Not sure what's wrong here. Possible bug?
-@pytest.mark.xfail
 def test_matcher_match_zero(matcher):
     words1 = 'He said , " some words " ...'.split()
     words2 = 'He said , " some three words " ...'.split()
@@ -88,40 +86,33 @@ def test_matcher_match_zero(matcher):
                 {'IS_PUNCT': True},
                 {'ORTH': '"'}]

-    matcher.add('Quote', pattern1)
+    matcher.add('Quote', None, pattern1)
     doc = get_doc(matcher.vocab, words1)
     assert len(matcher(doc)) == 1

     doc = get_doc(matcher.vocab, words2)
     assert len(matcher(doc)) == 0
-    matcher.add('Quote', pattern2)
+    matcher.add('Quote', None, pattern2)
     assert len(matcher(doc)) == 0


-# TODO; Not sure what's wrong here. Possible bug?
-@pytest.mark.xfail
 def test_matcher_match_zero_plus(matcher):
     words = 'He said , " some words " ...'.split()
     pattern = [{'ORTH': '"'},
                {'OP': '*', 'IS_PUNCT': False},
                {'ORTH': '"'}]
-    matcher.add('Quote', [pattern])
+    matcher.add('Quote', None, pattern)
     doc = get_doc(matcher.vocab, words)
     assert len(matcher(doc)) == 1

-# TODO; Not sure what's wrong here. Possible bug?
-@pytest.mark.xfail
 def test_matcher_match_one_plus(matcher):
     control = Matcher(matcher.vocab)
     control.add('BasicPhilippe', None, [{'ORTH': 'Philippe'}])

     doc = get_doc(control.vocab, ['Philippe', 'Philippe'])

     m = control(doc)
     assert len(m) == 2
-    matcher.add('KleenePhilippe',
-        [
-            {'ORTH': 'Philippe', 'OP': '1'},
-            {'ORTH': 'Philippe', 'OP': '+'}])
+    matcher.add('KleenePhilippe', None, [{'ORTH': 'Philippe', 'OP': '1'},
+                                         {'ORTH': 'Philippe', 'OP': '+'}])
     m = matcher(doc)
     assert len(m) == 1