Tidy up and rename regression tests and remove unnecessary imports

Ines Montani 2017-01-12 22:00:37 +01:00
parent 5e1b6178e3
commit 9b4bea1df9
18 changed files with 97 additions and 142 deletions
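
Many of the diffs below drop per-file tokenizer and vocab fixtures and switch the renamed tests to shared fixtures (en_tokenizer, en_vocab, EN). Those shared fixtures are not part of this diff; as a rough sketch only, inferred from the local fixtures removed here, they presumably live in the test suite's conftest.py and look something like this (the fixture names are real, the exact definitions and scopes are assumptions):

# Sketch only: assumed shared fixtures in the tests' conftest.py, inferred
# from the per-file fixtures removed in this commit.
import pytest

from spacy.en import English


@pytest.fixture
def en_tokenizer():
    # Tokenizer without any statistical models loaded.
    return English.Defaults.create_tokenizer()


@pytest.fixture
def en_vocab():
    # Bare English vocab, enough for Matcher/Doc tests.
    return English.Defaults.create_vocab()


@pytest.fixture(scope='session')
def EN():
    # Full English pipeline; tests that use it carry @pytest.mark.models
    # so they can be skipped when the model data isn't installed.
    return English()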

View File

@@ -1,7 +1,4 @@
# coding: utf-8
"""Test a bug that arose from having overlapping matches"""
from __future__ import unicode_literals
from ...matcher import Matcher
@@ -25,6 +22,7 @@ def doc(en_tokenizer):
@pytest.mark.parametrize('pattern', [pattern1, pattern2])
def test_issue118(doc, pattern):
    """Test a bug that arose from having overlapping matches"""
    ORG = doc.vocab.strings['ORG']
    matcher = Matcher(doc.vocab, {'BostonCeltics': ('ORG', {}, pattern)})
@@ -41,6 +39,7 @@ def test_issue118(doc, pattern):
@pytest.mark.parametrize('pattern', [pattern3, pattern4])
def test_issue118_prefix_reorder(doc, pattern):
    """Test a bug that arose from having overlapping matches"""
    ORG = doc.vocab.strings['ORG']
    matcher = Matcher(doc.vocab, {'BostonCeltics': ('ORG', {}, pattern)})

View File

@@ -9,7 +9,6 @@ import pytest
def test_issue242(en_tokenizer):
    """Test overlapping multi-word phrases."""
    text = "There are different food safety standards in different countries."
    patterns = [[{LOWER: 'food'}, {LOWER: 'safety'}],
                [{LOWER: 'safety'}, {LOWER: 'standards'}]]

View File

@@ -4,7 +4,7 @@ from __future__ import unicode_literals
from ..util import get_doc
def test_sbd_empty_string(en_tokenizer):
def test_issue309(en_tokenizer):
    """Test Issue #309: SBD fails on empty string"""
    tokens = en_tokenizer(" ")
    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[0], deps=['ROOT'])
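
This is the first of several tests below that build their Doc through the get_doc helper imported from ..util instead of constructing Doc and Vocab by hand. The helper itself isn't shown in this commit; the following is a minimal sketch inferred from its call sites (a vocab, a list of words, optional heads and deps), so the real signature and annotation handling may differ:

# Sketch only: a hypothetical stand-in for the shared get_doc test helper.
from spacy.attrs import DEP, HEAD
from spacy.tokens import Doc


def get_doc(vocab, words=[], heads=None, deps=None):
    """Build a Doc from words, optionally attaching a dependency parse."""
    doc = Doc(vocab, words=words)
    if heads is not None and deps is not None:
        attrs = doc.to_array([HEAD, DEP])
        for i, (head, dep) in enumerate(zip(heads, deps)):
            attrs[i, 0] = head                    # head offset relative to token i
            attrs[i, 1] = doc.vocab.strings[dep]  # dependency label as a string ID
        doc.from_array([HEAD, DEP], attrs)
    return doc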

View File

@@ -1,16 +1,9 @@
# coding: utf-8
from __future__ import unicode_literals
from ...en import English
import pytest
@pytest.fixture
def en_tokenizer():
    return English.Defaults.create_tokenizer()
def test_issue351(en_tokenizer):
doc = en_tokenizer(" This is a cat.")
assert doc[0].idx == 0

View File

@@ -1,16 +1,10 @@
# coding: utf-8
from __future__ import unicode_literals
from ...en import English
import pytest
@pytest.fixture
def en_tokenizer():
    return English.Defaults.create_tokenizer()
def test_big_ellipsis(en_tokenizer):
def test_issue360(en_tokenizer):
"""Test tokenization of big ellipsis"""
tokens = en_tokenizer('$45...............Asking')
assert len(tokens) > 2

View File

@@ -1,31 +1,25 @@
# coding: utf-8
from __future__ import unicode_literals
import spacy
from spacy.attrs import ORTH
from ...attrs import ORTH
from ...matcher import Matcher
import pytest
@pytest.mark.models
def test_issue429():
    nlp = spacy.load('en', parser=False)
def test_issue429(EN):
    def merge_phrases(matcher, doc, i, matches):
        if i != len(matches) - 1:
            return None
        spans = [(ent_id, label, doc[start:end]) for ent_id, label, start, end in matches]
        for ent_id, label, span in spans:
            span.merge('NNP' if label else span.root.tag_, span.text, nlp.vocab.strings[label])
            span.merge('NNP' if label else span.root.tag_, span.text, EN.vocab.strings[label])
    doc = nlp('a')
    nlp.matcher.add('key', label='TEST', attrs={}, specs=[[{ORTH: 'a'}]], on_match=merge_phrases)
    doc = nlp.tokenizer('a b c')
    nlp.tagger(doc)
    nlp.matcher(doc)
    for word in doc:
        print(word.text, word.ent_iob_, word.ent_type_)
    nlp.entity(doc)
    doc = EN('a')
    matcher = Matcher(EN.vocab)
    matcher.add('key', label='TEST', attrs={}, specs=[[{ORTH: 'a'}]], on_match=merge_phrases)
    doc = EN.tokenizer('a b c')
    EN.tagger(doc)
    matcher(doc)
    EN.entity(doc)

View File

@@ -0,0 +1,21 @@
# coding: utf-8
from __future__ import unicode_literals
from ..util import get_doc
import pytest
@pytest.mark.models
def test_issue514(EN):
"""Test serializing after adding entity"""
text = ["This", "is", "a", "sentence", "about", "pasta", "."]
vocab = EN.entity.vocab
doc = get_doc(vocab, text)
EN.entity.add_label("Food")
EN.entity(doc)
label_id = vocab.strings[u'Food']
doc.ents = [(label_id, 5,6)]
assert [(ent.label_, ent.text) for ent in doc.ents] == [("Food", "pasta")]
doc2 = get_doc(EN.entity.vocab).from_bytes(doc.to_bytes())
assert [(ent.label_, ent.text) for ent in doc2.ents] == [("Food", "pasta")]

View File

@@ -6,5 +6,5 @@ import pytest
@pytest.mark.models
def test_issue54(EN):
    text = u'Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1).'
    text = "Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1)."
    tokens = EN(text)

View File

@@ -1,21 +1,20 @@
# coding: utf-8
from __future__ import unicode_literals
import spacy
import spacy.matcher
from spacy.attrs import IS_PUNCT, ORTH
from ...matcher import Matcher
from ...attrs import IS_PUNCT, ORTH
import pytest
@pytest.mark.models
def test_matcher_segfault():
    nlp = spacy.load('en', parser=False, entity=False)
    matcher = spacy.matcher.Matcher(nlp.vocab)
def test_issue587(EN):
    """Test that Matcher doesn't segfault on particular input"""
    matcher = Matcher(EN.vocab)
    content = '''a b; c'''
    matcher.add(entity_key='1', label='TEST', attrs={}, specs=[[{ORTH: 'a'}, {ORTH: 'b'}]])
    matcher(nlp(content))
    matcher(EN(content))
    matcher.add(entity_key='2', label='TEST', attrs={}, specs=[[{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'c'}]])
    matcher(nlp(content))
    matcher(EN(content))
    matcher.add(entity_key='3', label='TEST', attrs={}, specs=[[{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'd'}]])
    matcher(nlp(content))
    matcher(EN(content))

View File

@@ -1,14 +1,12 @@
# coding: utf-8
from __future__ import unicode_literals
from ...vocab import Vocab
from ...tokens import Doc
from ...matcher import Matcher
import pytest
def test_issue588():
    matcher = Matcher(Vocab())
def test_issue588(en_vocab):
    matcher = Matcher(en_vocab)
    with pytest.raises(ValueError):
        matcher.add(entity_key='1', label='TEST', attrs={}, specs=[[]])

View File

@@ -2,7 +2,7 @@
from __future__ import unicode_literals
from ...vocab import Vocab
from ...tokens import Doc
from ..util import get_doc
import pytest
@@ -10,4 +10,4 @@ import pytest
def test_issue589():
    vocab = Vocab()
    vocab.strings.set_frozen(True)
    doc = Doc(vocab, words=['whata'])
    doc = get_doc(vocab, ['whata'])

View File

@@ -1,37 +1,22 @@
# coding: utf-8
from __future__ import unicode_literals
from ...attrs import *
from ...attrs import ORTH, IS_ALPHA, LIKE_NUM
from ...matcher import Matcher
from ...tokens import Doc
from ...en import English
from ..util import get_doc
def test_overlapping_matches():
    vocab = English.Defaults.create_vocab()
    doc = Doc(vocab, words=['n', '=', '1', ';', 'a', ':', '5', '%'])
    matcher = Matcher(vocab)
    matcher.add_entity(
        "ab",
        acceptor=None,
        on_match=None
    )
    matcher.add_pattern(
        'ab',
        [
            {IS_ALPHA: True},
            {ORTH: ':'},
            {LIKE_NUM: True},
            {ORTH: '%'}
        ], label='a')
    matcher.add_pattern(
        'ab',
        [
            {IS_ALPHA: True},
            {ORTH: '='},
            {LIKE_NUM: True},
        ], label='b')
def test_issue590(en_vocab):
    """Test overlapping matches"""
    doc = get_doc(en_vocab, ['n', '=', '1', ';', 'a', ':', '5', '%'])
    matcher = Matcher(en_vocab)
    matcher.add_entity("ab", acceptor=None, on_match=None)
    matcher.add_pattern('ab', [{IS_ALPHA: True}, {ORTH: ':'},
                               {LIKE_NUM: True}, {ORTH: '%'}],
                        label='a')
    matcher.add_pattern('ab', [{IS_ALPHA: True}, {ORTH: '='},
                               {LIKE_NUM: True}],
                        label='b')
    matches = matcher(doc)
    assert len(matches) == 2

View File

@@ -2,43 +2,23 @@
from __future__ import unicode_literals
from ...symbols import POS, VERB, VerbForm_inf
from ...tokens import Doc
from ...vocab import Vocab
from ...lemmatizer import Lemmatizer
from ..util import get_doc
import pytest
@pytest.fixture
def index():
    return {'verb': {}}
def test_issue595():
    """Test lemmatization of base forms"""
    words = ["Do", "n't", "feed", "the", "dog"]
    tag_map = {'VB': {POS: VERB, 'morph': VerbForm_inf}}
    rules = {"verb": [["ed", "e"]]}
@pytest.fixture
def exceptions():
    return {'verb': {}}
    lemmatizer = Lemmatizer({'verb': {}}, {'verb': {}}, rules)
    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
    doc = get_doc(vocab, words)
@pytest.fixture
def rules():
    return {"verb": [["ed", "e"]]}
@pytest.fixture
def lemmatizer(index, exceptions, rules):
    return Lemmatizer(index, exceptions, rules)
@pytest.fixture
def tag_map():
    return {'VB': {POS: VERB, 'morph': VerbForm_inf}}
@pytest.fixture
def vocab(lemmatizer, tag_map):
    return Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
def test_not_lemmatize_base_forms(vocab):
    doc = Doc(vocab, words=["Do", "n't", "feed", "the", "dog"])
    feed = doc[2]
    feed.tag_ = 'VB'
    assert feed.text == 'feed'
    assert feed.lemma_ == 'feed'
    doc[2].tag_ = 'VB'
    assert doc[2].text == 'feed'
    assert doc[2].lemma_ == 'feed'

View File

@@ -1,15 +1,13 @@
# coding: utf-8
from __future__ import unicode_literals
from ...tokens import Doc
from ...vocab import Vocab
from ..util import get_doc
def test_issue599():
    doc = Doc(Vocab())
def test_issue599(en_vocab):
    doc = get_doc(en_vocab)
    doc.is_tagged = True
    doc.is_parsed = True
    bytes_ = doc.to_bytes()
    doc2 = Doc(doc.vocab)
    doc2.from_bytes(bytes_)
    doc2 = get_doc(doc.vocab)
    doc2.from_bytes(doc.to_bytes())
    assert doc2.is_parsed

View File

@@ -1,11 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
from ...tokens import Doc
from ...vocab import Vocab
from ...attrs import POS
from ..util import get_doc
def test_issue600():
    doc = Doc(Vocab(tag_map={'NN': {'pos': 'NOUN'}}), words=['hello'])
    vocab = Vocab(tag_map={'NN': {'pos': 'NOUN'}})
    doc = get_doc(vocab, ["hello"])
    doc[0].tag_ = 'NN'

View File

@@ -1,27 +1,21 @@
# coding: utf-8
from __future__ import unicode_literals
from ...attrs import LOWER, ORTH
from ...tokens import Doc
from ...vocab import Vocab
from ...attrs import ORTH
from ...matcher import Matcher
from ..util import get_doc
def return_false(doc, ent_id, label, start, end):
def test_issue605(en_vocab):
    def return_false(doc, ent_id, label, start, end):
        return False
def test_matcher_accept():
    doc = Doc(Vocab(), words=['The', 'golf', 'club', 'is', 'broken'])
    golf_pattern = [
        { ORTH: "golf"},
        { ORTH: "club"}
    ]
    words = ["The", "golf", "club", "is", "broken"]
    pattern = [{ORTH: "golf"}, {ORTH: "club"}]
    label = "Sport_Equipment"
    doc = get_doc(en_vocab, words)
    matcher = Matcher(doc.vocab)
    matcher.add_entity('Sport_Equipment', acceptor=return_false)
    matcher.add_pattern("Sport_Equipment", golf_pattern)
    matcher.add_entity(label, acceptor=return_false)
    matcher.add_pattern(label, pattern)
    match = matcher(doc)
    assert match == []

View File

@@ -19,7 +19,7 @@ def test_issue615(en_tokenizer):
            span.merge('NNP' if label else span.root.tag_, span.text, doc.vocab.strings[label])
    text = "The golf club is broken"
    pattern = [{ ORTH: "golf"}, { ORTH: "club"}]
    pattern = [{ORTH: "golf"}, {ORTH: "club"}]
    label = "Sport_Equipment"
    doc = en_tokenizer(text)

View File

@@ -4,7 +4,8 @@ from __future__ import unicode_literals
from ...vocab import Vocab
def test_load_vocab_with_string():
def test_issue617():
"""Test loading Vocab with string"""
try:
vocab = Vocab.load('/tmp/vocab')
except IOError: