Fix tests and use the new Matcher API

This commit is contained in:
ines 2017-05-22 13:54:20 +02:00
parent 187f370734
commit b3c7ee0148
14 changed files with 57 additions and 180 deletions
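
For context, the migration this commit performs across all test files: the old Matcher API (add_entity/add_pattern, or a specs dict passed to the constructor) is replaced by a single matcher.add(key, on_match, *patterns) call, attribute IDs become plain string keys, and matches come back as (match_id, start, end) triples instead of (ent_id, label, start, end) tuples. A minimal sketch of the new usage, assuming the v2-era API this commit introduces (spacy.blank is a convenience helper not shown in the diff):

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank('en')
matcher = Matcher(nlp.vocab)

# One call registers a key, an optional on_match callback (None here)
# and any number of patterns; attribute names are plain strings.
matcher.add('HelloWorld', None, [{'LOWER': 'hello'}, {'LOWER': 'world'}])

doc = nlp('hello world')
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)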

View File

@@ -1,57 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
from ...matcher import Matcher
from ...attrs import ORTH
from ..util import get_doc
import pytest
# TODO: These can probably be deleted
@pytest.mark.xfail
@pytest.mark.parametrize('words,entity', [
(["Test", "Entity"], "TestEntity")])
def test_matcher_add_empty_entity(en_vocab, words, entity):
matcher = Matcher(en_vocab)
matcher.add_entity(entity)
doc = get_doc(en_vocab, words)
assert matcher.n_patterns == 0
assert matcher(doc) == []
@pytest.mark.xfail
@pytest.mark.parametrize('entity1,entity2,attrs', [
("TestEntity", "TestEntity2", {"Hello": "World"})])
def test_matcher_get_entity_attrs(en_vocab, entity1, entity2, attrs):
matcher = Matcher(en_vocab)
matcher.add_entity(entity1)
assert matcher.get_entity(entity1) == {}
matcher.add_entity(entity2, attrs=attrs)
assert matcher.get_entity(entity2) == attrs
assert matcher.get_entity(entity1) == {}
@pytest.mark.xfail
@pytest.mark.parametrize('words,entity,attrs',
[(["Test", "Entity"], "TestEntity", {"Hello": "World"})])
def test_matcher_get_entity_via_match(en_vocab, words, entity, attrs):
matcher = Matcher(en_vocab)
matcher.add_entity(entity, attrs=attrs)
doc = get_doc(en_vocab, words)
assert matcher.n_patterns == 0
assert matcher(doc) == []
matcher.add_pattern(entity, [{ORTH: words[0]}, {ORTH: words[1]}])
assert matcher.n_patterns == 1
matches = matcher(doc)
assert len(matches) == 1
assert len(matches[0]) == 4
ent_id, label, start, end = matches[0]
assert ent_id == matcher.vocab.strings[entity]
assert label == 0
assert start == 0
assert end == 2
assert matcher.get_entity(ent_id) == attrs
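
The deleted file covered add_entity/get_entity, which have no direct equivalent after this commit: per-entity attribute dicts are gone, and the ID returned with each match is simply the hash of the key string. A small sketch of that behavior, assuming the v2-era API:

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc

nlp = spacy.blank('en')
matcher = Matcher(nlp.vocab)
matcher.add('TestEntity', None, [{'ORTH': 'Test'}, {'ORTH': 'Entity'}])
doc = Doc(nlp.vocab, words=['Test', 'Entity'])
match_id, start, end = matcher(doc)[0]
# The match ID is the StringStore hash of the key passed to add().
assert match_id == nlp.vocab.strings['TestEntity']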

View File

@@ -21,7 +21,6 @@ def test_simple_types(EN):
def test_consistency_bug(EN):
'''Test an arbitrary sequence-consistency bug encountered during speed test'''
tokens = EN(u'Where rap essentially went mainstream, illustrated by seminal Public Enemy, Beastie Boys and L.L. Cool J. tracks.')
tokens = EN(u'''Charity and other short-term aid have buoyed them so far, and a tax-relief bill working its way through Congress would help. But the September 11 Victim Compensation Fund, enacted by Congress to discourage people from filing lawsuits, will determine the shape of their lives for years to come.\n\n''', entity=False)
tokens.ents += tuple(EN.matcher(tokens))
EN.entity(tokens)
@@ -30,17 +29,8 @@ def test_consistency_bug(EN):
@pytest.mark.models
def test_unit_end_gazetteer(EN):
'''Test a bug in the interaction between the NER model and the gazetteer'''
matcher = Matcher(EN.vocab,
{'MemberNames':
('PERSON', {},
[
[{LOWER: 'cal'}],
[{LOWER: 'cal'}, {LOWER: 'henderson'}],
]
)
}
)
matcher = Matcher(EN.vocab)
matcher.add('MemberNames', None, [{LOWER: 'cal'}], [{LOWER: 'cal'}, {LOWER: 'henderson'}])
doc = EN(u'who is cal the manager of?')
if len(list(doc.ents)) == 0:
ents = matcher(doc)
@@ -50,4 +40,4 @@ def test_unit_end_gazetteer(EN):
assert list(doc.ents)[0].text == 'cal'
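
As the rewritten gazetteer test shows, the new add() accepts any number of patterns for one key, replacing the old constructor-time specs dict. A standalone sketch (v2-era API assumed, no model needed):

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc

nlp = spacy.blank('en')
matcher = Matcher(nlp.vocab)
matcher.add('MemberNames', None,
            [{'LOWER': 'cal'}],
            [{'LOWER': 'cal'}, {'LOWER': 'henderson'}])
doc = Doc(nlp.vocab, words=['cal', 'henderson'])
# Both the one-token and the two-token alternative fire here.
assert len(matcher(doc)) == 2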

View File

@@ -2,15 +2,14 @@
from __future__ import unicode_literals
from ...matcher import Matcher
from ...attrs import ORTH, LOWER
import pytest
pattern1 = [[{LOWER: 'celtics'}], [{LOWER: 'boston'}, {LOWER: 'celtics'}]]
pattern2 = [[{LOWER: 'boston'}, {LOWER: 'celtics'}], [{LOWER: 'celtics'}]]
pattern3 = [[{LOWER: 'boston'}], [{LOWER: 'boston'}, {LOWER: 'celtics'}]]
pattern4 = [[{LOWER: 'boston'}, {LOWER: 'celtics'}], [{LOWER: 'boston'}]]
pattern1 = [[{'LOWER': 'celtics'}], [{'LOWER': 'boston'}, {'LOWER': 'celtics'}]]
pattern2 = [[{'LOWER': 'boston'}, {'LOWER': 'celtics'}], [{'LOWER': 'celtics'}]]
pattern3 = [[{'LOWER': 'boston'}], [{'LOWER': 'boston'}, {'LOWER': 'celtics'}]]
pattern4 = [[{'LOWER': 'boston'}, {'LOWER': 'celtics'}], [{'LOWER': 'boston'}]]
@pytest.fixture
@@ -24,10 +23,11 @@ def doc(en_tokenizer):
def test_issue118(doc, pattern):
"""Test a bug that arose from having overlapping matches"""
ORG = doc.vocab.strings['ORG']
matcher = Matcher(doc.vocab, {'BostonCeltics': ('ORG', {}, pattern)})
matcher = Matcher(doc.vocab)
matcher.add("BostonCeltics", None, *pattern)
assert len(list(doc.ents)) == 0
matches = [(ent_type, start, end) for ent_id, ent_type, start, end in matcher(doc)]
matches = [(ORG, start, end) for _, start, end in matcher(doc)]
assert matches == [(ORG, 9, 11), (ORG, 10, 11)]
doc.ents = matches[:1]
ents = list(doc.ents)
@@ -41,10 +41,11 @@ def test_issue118(doc, pattern):
def test_issue118_prefix_reorder(doc, pattern):
"""Test a bug that arose from having overlapping matches"""
ORG = doc.vocab.strings['ORG']
matcher = Matcher(doc.vocab, {'BostonCeltics': ('ORG', {}, pattern)})
matcher = Matcher(doc.vocab)
matcher.add('BostonCeltics', None, *pattern)
assert len(list(doc.ents)) == 0
matches = [(ent_type, start, end) for ent_id, ent_type, start, end in matcher(doc)]
matches = [(ORG, start, end) for _, start, end in matcher(doc)]
doc.ents += tuple(matches)[1:]
assert matches == [(ORG, 9, 10), (ORG, 9, 11)]
ents = doc.ents
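
Both issue118 tests exercise overlapping matches, which the matcher returns in full; picking a non-overlapping subset (for example, the longest span) is left to the caller before assigning doc.ents. A sketch of that filtering step, assuming the v2-era API:

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc, Span

nlp = spacy.blank('en')
matcher = Matcher(nlp.vocab)
matcher.add('BostonCeltics', None,
            [{'LOWER': 'celtics'}],
            [{'LOWER': 'boston'}, {'LOWER': 'celtics'}])
doc = Doc(nlp.vocab, words=['boston', 'celtics', 'win'])
matches = matcher(doc)  # overlapping candidates: (0, 2) and (1, 2)
# Keep only the longest span before assigning entities.
match_id, start, end = max(matches, key=lambda m: m[2] - m[1])
doc.ents = [Span(doc, start, end, label=nlp.vocab.strings.add('ORG'))]
assert [ent.text for ent in doc.ents] == ['boston celtics']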

View File

@@ -2,7 +2,6 @@
from __future__ import unicode_literals
from ...matcher import Matcher
from ...attrs import LOWER
import pytest
@@ -10,14 +9,14 @@ import pytest
def test_issue242(en_tokenizer):
"""Test overlapping multi-word phrases."""
text = "There are different food safety standards in different countries."
patterns = [[{LOWER: 'food'}, {LOWER: 'safety'}],
[{LOWER: 'safety'}, {LOWER: 'standards'}]]
patterns = [[{'LOWER': 'food'}, {'LOWER': 'safety'}],
[{'LOWER': 'safety'}, {'LOWER': 'standards'}]]
doc = en_tokenizer(text)
matcher = Matcher(doc.vocab)
matcher.add('FOOD', 'FOOD', {}, patterns)
matcher.add('FOOD', None, *patterns)
matches = [(ent_type, start, end) for ent_id, ent_type, start, end in matcher(doc)]
matches = [(ent_type, start, end) for ent_type, start, end in matcher(doc)]
doc.ents += tuple(matches)
match1, match2 = matches
assert match1[1] == 3

View File

@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
from ...attrs import ORTH
from ...matcher import Matcher
import pytest
@@ -12,13 +11,13 @@ def test_issue429(EN):
def merge_phrases(matcher, doc, i, matches):
if i != len(matches) - 1:
return None
spans = [(ent_id, label, doc[start:end]) for ent_id, label, start, end in matches]
spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches]
for ent_id, label, span in spans:
span.merge('NNP' if label else span.root.tag_, span.text, EN.vocab.strings[label])
doc = EN('a')
matcher = Matcher(EN.vocab)
matcher.add('key', label='TEST', attrs={}, specs=[[{ORTH: 'a'}]], on_match=merge_phrases)
matcher.add('TEST', merge_phrases, [{'ORTH': 'a'}])
doc = EN.tokenizer('a b c')
EN.tagger(doc)
matcher(doc)
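
The on_match callback, previously a keyword on add_entity, now slots into the second argument of add(). Under the new API it receives the matcher, the doc, the index of the current match, and the full match list; a minimal sketch (v2-era API assumed):

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc

def on_match(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    print('Matched:', doc[start:end].text)

nlp = spacy.blank('en')
matcher = Matcher(nlp.vocab)
matcher.add('TEST', on_match, [{'ORTH': 'a'}])
matcher(Doc(nlp.vocab, words=['a', 'b']))  # prints "Matched: a"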

View File

@@ -7,14 +7,16 @@ from ...attrs import IS_PUNCT, ORTH
import pytest
@pytest.mark.models
def test_issue587(EN):
def test_issue587(en_tokenizer):
"""Test that Matcher doesn't segfault on particular input"""
matcher = Matcher(EN.vocab)
content = '''a b; c'''
matcher.add(entity_key='1', label='TEST', attrs={}, specs=[[{ORTH: 'a'}, {ORTH: 'b'}]])
matcher(EN(content))
matcher.add(entity_key='2', label='TEST', attrs={}, specs=[[{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'c'}]])
matcher(EN(content))
matcher.add(entity_key='3', label='TEST', attrs={}, specs=[[{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'd'}]])
matcher(EN(content))
doc = en_tokenizer('a b; c')
matcher = Matcher(doc.vocab)
matcher.add('TEST1', None, [{ORTH: 'a'}, {ORTH: 'b'}])
matches = matcher(doc)
assert len(matches) == 1
matcher.add('TEST2', None, [{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'c'}])
matches = matcher(doc)
assert len(matches) == 2
matcher.add('TEST3', None, [{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'd'}])
matches = matcher(doc)
assert len(matches) == 2

View File

@@ -9,4 +9,4 @@ import pytest
def test_issue588(en_vocab):
matcher = Matcher(en_vocab)
with pytest.raises(ValueError):
matcher.add(entity_key='1', label='TEST', attrs={}, specs=[[]])
matcher.add('TEST', None, [])
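
The rewritten test keeps the old guarantee under the new signature: an empty pattern is rejected with a ValueError at add() time rather than failing silently during matching. A sketch (v2-era API assumed):

import spacy
from spacy.matcher import Matcher

matcher = Matcher(spacy.blank('en').vocab)
try:
    matcher.add('TEST', None, [])
except ValueError:
    print('empty patterns are rejected at add() time')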

View File

@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
from ...attrs import ORTH, IS_ALPHA, LIKE_NUM
from ...matcher import Matcher
from ..util import get_doc
@@ -9,14 +8,8 @@ from ..util import get_doc
def test_issue590(en_vocab):
"""Test overlapping matches"""
doc = get_doc(en_vocab, ['n', '=', '1', ';', 'a', ':', '5', '%'])
matcher = Matcher(en_vocab)
matcher.add_entity("ab", acceptor=None, on_match=None)
matcher.add_pattern('ab', [{IS_ALPHA: True}, {ORTH: ':'},
{LIKE_NUM: True}, {ORTH: '%'}],
label='a')
matcher.add_pattern('ab', [{IS_ALPHA: True}, {ORTH: '='},
{LIKE_NUM: True}],
label='b')
matcher.add('ab', None, [{'IS_ALPHA': True}, {'ORTH': ':'}, {'LIKE_NUM': True}, {'ORTH': '%'}])
matcher.add('ab', None, [{'IS_ALPHA': True}, {'ORTH': '='}, {'LIKE_NUM': True}])
matches = matcher(doc)
assert len(matches) == 2
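
Note that test_issue590 calls add() twice with the same key: under the new API the second call appends its pattern to the existing 'ab' entry instead of replacing it, which is why two overlapping matches are still expected. A standalone sketch (v2-era API assumed):

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc

nlp = spacy.blank('en')
matcher = Matcher(nlp.vocab)
matcher.add('ab', None, [{'IS_ALPHA': True}, {'ORTH': '='}, {'LIKE_NUM': True}])
matcher.add('ab', None, [{'IS_ALPHA': True}, {'ORTH': ':'}, {'LIKE_NUM': True}, {'ORTH': '%'}])
doc = Doc(nlp.vocab, words=['n', '=', '1', ';', 'a', ':', '5', '%'])
assert len(matcher(doc)) == 2  # one match per appended pattern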

View File

@@ -1,21 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
from ...attrs import ORTH
from ...matcher import Matcher
from ..util import get_doc
def test_issue605(en_vocab):
def return_false(doc, ent_id, label, start, end):
return False
words = ["The", "golf", "club", "is", "broken"]
pattern = [{ORTH: "golf"}, {ORTH: "club"}]
label = "Sport_Equipment"
doc = get_doc(en_vocab, words)
matcher = Matcher(doc.vocab)
matcher.add_entity(label, acceptor=return_false)
matcher.add_pattern(label, pattern)
match = matcher(doc)
assert match == []
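
The deleted test relied on the acceptor hook, which the new API drops entirely; the equivalent effect (rejecting particular matches) has to be done by filtering the returned match list instead. A hedged sketch of that replacement:

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc

nlp = spacy.blank('en')
matcher = Matcher(nlp.vocab)
matcher.add('Sport_Equipment', None, [{'ORTH': 'golf'}, {'ORTH': 'club'}])
doc = Doc(nlp.vocab, words=['The', 'golf', 'club', 'is', 'broken'])
# No acceptor argument any more: reject unwanted matches after the fact.
matches = [(mid, s, e) for mid, s, e in matcher(doc) if doc[s:e].text != 'golf club']
assert matches == []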

View File

@@ -2,7 +2,6 @@
from __future__ import unicode_literals
from ...matcher import Matcher
from ...attrs import ORTH
def test_issue615(en_tokenizer):
@@ -14,19 +13,17 @@ def test_issue615(en_tokenizer):
if i != len(matches)-1:
return None
# Get Span objects
spans = [(ent_id, label, doc[start : end]) for ent_id, label, start, end in matches]
spans = [(ent_id, ent_id, doc[start : end]) for ent_id, start, end in matches]
for ent_id, label, span in spans:
span.merge('NNP' if label else span.root.tag_, span.text, doc.vocab.strings[label])
text = "The golf club is broken"
pattern = [{ORTH: "golf"}, {ORTH: "club"}]
pattern = [{'ORTH': "golf"}, {'ORTH': "club"}]
label = "Sport_Equipment"
doc = en_tokenizer(text)
matcher = Matcher(doc.vocab)
matcher.add_entity(label, on_match=merge_phrases)
matcher.add_pattern(label, pattern, label=label)
matcher.add(label, merge_phrases, pattern)
match = matcher(doc)
entities = list(doc.ents)
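
test_issue615 merges matched spans from inside the callback; a self-contained sketch of that pattern, assuming the v2-era API (span.merge was later superseded by Doc.retokenize):

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc

def merge_matches(matcher, doc, i, matches):
    if i != len(matches) - 1:
        return  # only act once, on the last match, as the test does
    for match_id, start, end in matches:
        span = doc[start:end]
        # v2-era positional signature: merge(tag, lemma, ent_type)
        span.merge('NNP', span.text, 'Sport_Equipment')

nlp = spacy.blank('en')
matcher = Matcher(nlp.vocab)
matcher.add('Sport_Equipment', merge_matches, [{'ORTH': 'golf'}, {'ORTH': 'club'}])
doc = Doc(nlp.vocab, words=['The', 'golf', 'club', 'is', 'broken'])
matcher(doc)
assert len(doc) == 4  # 'golf club' is now a single token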

View File

@@ -1,16 +1,13 @@
from __future__ import unicode_literals
from ... import load as load_spacy
from ...attrs import LEMMA
from ...matcher import merge_phrase
import pytest
@pytest.mark.xfail
@pytest.mark.models
def test_issue758():
def test_issue758(EN):
'''Test parser transition bug after label added.'''
nlp = load_spacy('en')
nlp.matcher.add('splash', 'my_entity', {},
[[{LEMMA: 'splash'}, {LEMMA: 'on'}]],
on_match=merge_phrase)
from ...matcher import merge_phrase
nlp = EN()
nlp.matcher.add('splash', merge_phrase, [[{'LEMMA': 'splash'}, {'LEMMA': 'on'}]])
doc = nlp('splash On', parse=False)

View File

@@ -1,8 +1,5 @@
'''
Test Matcher matches with '*' operator and Boolean flag
'''
from __future__ import unicode_literals
from __future__ import print_function
# coding: utf-8
from __future__ import unicode_literals, print_function
import pytest
from ...matcher import Matcher
@@ -12,41 +9,30 @@ from ...tokens import Doc
def test_basic_case():
"""Test Matcher matches with '*' operator and Boolean flag"""
matcher = Matcher(Vocab(
lex_attr_getters={LOWER: lambda string: string.lower()}))
IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
matcher.add_pattern(
"FarAway",
[
{LOWER: "bob"},
{'OP': '*', LOWER: 'and'},
{LOWER: 'frank'}
])
matcher.add('FarAway', None, [{'LOWER': "bob"}, {'OP': '*', 'LOWER': 'and'}, {'LOWER': 'frank'}])
doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank'])
match = matcher(doc)
assert len(match) == 1
ent_id, label, start, end = match[0]
ent_id, start, end = match[0]
assert start == 0
assert end == 4
@pytest.mark.xfail
def test_issue850():
'''The problem here is that the variable-length pattern matches the
succeeding token. We then don't handle the ambiguity correctly.'''
"""The problem here is that the variable-length pattern matches the
succeeding token. We then don't handle the ambiguity correctly."""
matcher = Matcher(Vocab(
lex_attr_getters={LOWER: lambda string: string.lower()}))
IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
matcher.add_pattern(
"FarAway",
[
{LOWER: "bob"},
{'OP': '*', IS_ANY_TOKEN: True},
{LOWER: 'frank'}
])
matcher.add('FarAway', None, [{'LOWER': "bob"}, {'OP': '*', 'IS_ANY_TOKEN': True}, {'LOWER': 'frank'}])
doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank'])
match = matcher(doc)
assert len(match) == 1
ent_id, label, start, end = match[0]
ent_id, start, end = match[0]
assert start == 0
assert end == 4
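
test_basic_case shows the quantifier syntax carried over into the new API: 'OP': '*' on a token dict makes it match zero or more times. A standalone sketch (v2-era API assumed):

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc

nlp = spacy.blank('en')
matcher = Matcher(nlp.vocab)
matcher.add('FarAway', None,
            [{'LOWER': 'bob'}, {'LOWER': 'and', 'OP': '*'}, {'LOWER': 'frank'}])
doc = Doc(nlp.vocab, words=['bob', 'and', 'and', 'frank'])
match_id, start, end = matcher(doc)[0]
assert (start, end) == (0, 4)  # the '*' token absorbed both 'and's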

View File

@@ -73,8 +73,6 @@ def test_matcher_phrase_matcher(en_vocab):
assert len(matcher(doc)) == 1
# TODO: Not sure what's wrong here. Possible bug?
@pytest.mark.xfail
def test_matcher_match_zero(matcher):
words1 = 'He said , " some words " ...'.split()
words2 = 'He said , " some three words " ...'.split()
@@ -88,40 +86,33 @@ def test_matcher_match_zero(matcher):
{'IS_PUNCT': True},
{'ORTH': '"'}]
matcher.add('Quote', pattern1)
matcher.add('Quote', None, pattern1)
doc = get_doc(matcher.vocab, words1)
assert len(matcher(doc)) == 1
doc = get_doc(matcher.vocab, words2)
assert len(matcher(doc)) == 0
matcher.add('Quote', pattern2)
matcher.add('Quote', None, pattern2)
assert len(matcher(doc)) == 0
# TODO: Not sure what's wrong here. Possible bug?
@pytest.mark.xfail
def test_matcher_match_zero_plus(matcher):
words = 'He said , " some words " ...'.split()
pattern = [{'ORTH': '"'},
{'OP': '*', 'IS_PUNCT': False},
{'ORTH': '"'}]
matcher.add('Quote', [pattern])
matcher.add('Quote', None, pattern)
doc = get_doc(matcher.vocab, words)
assert len(matcher(doc)) == 1
# TODO: Not sure what's wrong here. Possible bug?
@pytest.mark.xfail
def test_matcher_match_one_plus(matcher):
control = Matcher(matcher.vocab)
control.add('BasicPhilippe', None, [{'ORTH': 'Philippe'}])
doc = get_doc(control.vocab, ['Philippe', 'Philippe'])
m = control(doc)
assert len(m) == 2
matcher.add('KleenePhilippe',
[
{'ORTH': 'Philippe', 'OP': '1'},
{'ORTH': 'Philippe', 'OP': '+'}])
matcher.add('KleenePhilippe', None, [{'ORTH': 'Philippe', 'OP': '1'},
{'ORTH': 'Philippe', 'OP': '+'}])
m = matcher(doc)
assert len(m) == 1
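
For reference, the quantifier values exercised by this last test: 'OP': '1' requires exactly one token and 'OP': '+' one or more. A sketch of the syntax only, assuming the v2-era API; note the test is marked xfail, so the matcher did not yet handle these patterns correctly at the time of this commit:

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc

nlp = spacy.blank('en')
matcher = Matcher(nlp.vocab)
matcher.add('KleenePhilippe', None,
            [{'ORTH': 'Philippe', 'OP': '1'},
             {'ORTH': 'Philippe', 'OP': '+'}])
doc = Doc(nlp.vocab, words=['Philippe', 'Philippe'])
print(matcher(doc))  # expected a single match spanning both tokens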