Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-25 17:36:30 +03:00)
Tidy up and rename regression tests and remove unnecessary imports
This commit is contained in:
parent
5e1b6178e3
commit
9b4bea1df9
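
Note: most of the renamed tests below drop their file-local fixtures and instead rely on shared pytest fixtures such as en_tokenizer and en_vocab. Those shared fixtures are not part of this diff; the following is only a minimal sketch of what they presumably look like in the suite's conftest.py, reconstructed from the fixture bodies removed below (the exact location and contents are assumptions).

    # Assumed shared fixtures (e.g. in spacy/tests/conftest.py); not shown in this commit.
    import pytest

    from spacy.en import English


    @pytest.fixture
    def en_tokenizer():
        # English tokenizer shared by the regression tests below.
        return English.Defaults.create_tokenizer()


    @pytest.fixture
    def en_vocab():
        # English vocab shared by the Matcher/Doc regression tests below.
        return English.Defaults.create_vocab()

With fixtures like these in place, a test only needs to accept en_tokenizer or en_vocab as an argument, which is what the renamed tests below do.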
@@ -1,7 +1,4 @@
 # coding: utf-8
-"""Test a bug that arose from having overlapping matches"""
-
-
 from __future__ import unicode_literals
 
 from ...matcher import Matcher
@@ -25,6 +22,7 @@ def doc(en_tokenizer):
 
 @pytest.mark.parametrize('pattern', [pattern1, pattern2])
 def test_issue118(doc, pattern):
+    """Test a bug that arose from having overlapping matches"""
     ORG = doc.vocab.strings['ORG']
     matcher = Matcher(doc.vocab, {'BostonCeltics': ('ORG', {}, pattern)})
 
@@ -41,6 +39,7 @@ def test_issue118(doc, pattern):
 
 @pytest.mark.parametrize('pattern', [pattern3, pattern4])
 def test_issue118_prefix_reorder(doc, pattern):
+    """Test a bug that arose from having overlapping matches"""
     ORG = doc.vocab.strings['ORG']
     matcher = Matcher(doc.vocab, {'BostonCeltics': ('ORG', {}, pattern)})
 

@@ -9,7 +9,6 @@ import pytest
 
 def test_issue242(en_tokenizer):
     """Test overlapping multi-word phrases."""
-
     text = "There are different food safety standards in different countries."
     patterns = [[{LOWER: 'food'}, {LOWER: 'safety'}],
                 [{LOWER: 'safety'}, {LOWER: 'standards'}]]

@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 from ..util import get_doc
 
 
-def test_sbd_empty_string(en_tokenizer):
+def test_issue309(en_tokenizer):
     """Test Issue #309: SBD fails on empty string"""
     tokens = en_tokenizer(" ")
     doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[0], deps=['ROOT'])

@@ -1,16 +1,9 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...en import English
-
 import pytest
 
 
-@pytest.fixture
-def en_tokenizer():
-    return English.Defaults.create_tokenizer()
-
-
 def test_issue351(en_tokenizer):
     doc = en_tokenizer(" This is a cat.")
     assert doc[0].idx == 0

@@ -1,16 +1,10 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...en import English
-
 import pytest
 
 
-@pytest.fixture
-def en_tokenizer():
-    return English.Defaults.create_tokenizer()
-
-
-def test_big_ellipsis(en_tokenizer):
+def test_issue360(en_tokenizer):
+    """Test tokenization of big ellipsis"""
     tokens = en_tokenizer('$45...............Asking')
     assert len(tokens) > 2

@@ -1,31 +1,25 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import spacy
-from spacy.attrs import ORTH
+from ...attrs import ORTH
+from ...matcher import Matcher
 
 import pytest
 
 
 @pytest.mark.models
-def test_issue429():
-
-    nlp = spacy.load('en', parser=False)
-
-
+def test_issue429(EN):
     def merge_phrases(matcher, doc, i, matches):
         if i != len(matches) - 1:
             return None
         spans = [(ent_id, label, doc[start:end]) for ent_id, label, start, end in matches]
         for ent_id, label, span in spans:
-            span.merge('NNP' if label else span.root.tag_, span.text, nlp.vocab.strings[label])
+            span.merge('NNP' if label else span.root.tag_, span.text, EN.vocab.strings[label])
 
-    doc = nlp('a')
-    nlp.matcher.add('key', label='TEST', attrs={}, specs=[[{ORTH: 'a'}]], on_match=merge_phrases)
-    doc = nlp.tokenizer('a b c')
-    nlp.tagger(doc)
-    nlp.matcher(doc)
-
-    for word in doc:
-        print(word.text, word.ent_iob_, word.ent_type_)
-    nlp.entity(doc)
+    doc = EN('a')
+    matcher = Matcher(EN.vocab)
+    matcher.add('key', label='TEST', attrs={}, specs=[[{ORTH: 'a'}]], on_match=merge_phrases)
+    doc = EN.tokenizer('a b c')
+    EN.tagger(doc)
+    matcher(doc)
+    EN.entity(doc)

new file: spacy/tests/regression/test_issue514.py (+21)
@@ -0,0 +1,21 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from ..util import get_doc
+
+import pytest
+
+
+@pytest.mark.models
+def test_issue514(EN):
+    """Test serializing after adding entity"""
+    text = ["This", "is", "a", "sentence", "about", "pasta", "."]
+    vocab = EN.entity.vocab
+    doc = get_doc(vocab, text)
+    EN.entity.add_label("Food")
+    EN.entity(doc)
+    label_id = vocab.strings[u'Food']
+    doc.ents = [(label_id, 5, 6)]
+    assert [(ent.label_, ent.text) for ent in doc.ents] == [("Food", "pasta")]
+    doc2 = get_doc(EN.entity.vocab).from_bytes(doc.to_bytes())
+    assert [(ent.label_, ent.text) for ent in doc2.ents] == [("Food", "pasta")]

@@ -6,5 +6,5 @@ import pytest
 
 @pytest.mark.models
 def test_issue54(EN):
-    text = u'Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1).'
+    text = "Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1)."
     tokens = EN(text)

@@ -1,21 +1,20 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import spacy
-import spacy.matcher
-from spacy.attrs import IS_PUNCT, ORTH
+from ...matcher import Matcher
+from ...attrs import IS_PUNCT, ORTH
 
 import pytest
 
 
 @pytest.mark.models
-def test_matcher_segfault():
-    nlp = spacy.load('en', parser=False, entity=False)
-    matcher = spacy.matcher.Matcher(nlp.vocab)
+def test_issue587(EN):
+    """Test that Matcher doesn't segfault on particular input"""
+    matcher = Matcher(EN.vocab)
     content = '''a b; c'''
     matcher.add(entity_key='1', label='TEST', attrs={}, specs=[[{ORTH: 'a'}, {ORTH: 'b'}]])
-    matcher(nlp(content))
+    matcher(EN(content))
     matcher.add(entity_key='2', label='TEST', attrs={}, specs=[[{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'c'}]])
-    matcher(nlp(content))
+    matcher(EN(content))
     matcher.add(entity_key='3', label='TEST', attrs={}, specs=[[{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'd'}]])
-    matcher(nlp(content))
+    matcher(EN(content))

@@ -1,14 +1,12 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...vocab import Vocab
-from ...tokens import Doc
 from ...matcher import Matcher
 
 import pytest
 
 
-def test_issue588():
-    matcher = Matcher(Vocab())
+def test_issue588(en_vocab):
+    matcher = Matcher(en_vocab)
     with pytest.raises(ValueError):
         matcher.add(entity_key='1', label='TEST', attrs={}, specs=[[]])

@@ -2,7 +2,7 @@
 from __future__ import unicode_literals
 
 from ...vocab import Vocab
-from ...tokens import Doc
+from ..util import get_doc
 
 import pytest
 
@@ -10,4 +10,4 @@ import pytest
 def test_issue589():
     vocab = Vocab()
     vocab.strings.set_frozen(True)
-    doc = Doc(vocab, words=['whata'])
+    doc = get_doc(vocab, ['whata'])

@@ -1,37 +1,22 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...attrs import *
+from ...attrs import ORTH, IS_ALPHA, LIKE_NUM
 from ...matcher import Matcher
-from ...tokens import Doc
-from ...en import English
+from ..util import get_doc
 
 
-def test_overlapping_matches():
-    vocab = English.Defaults.create_vocab()
-    doc = Doc(vocab, words=['n', '=', '1', ';', 'a', ':', '5', '%'])
+def test_issue590(en_vocab):
+    """Test overlapping matches"""
+    doc = get_doc(en_vocab, ['n', '=', '1', ';', 'a', ':', '5', '%'])
 
-    matcher = Matcher(vocab)
-    matcher.add_entity(
-        "ab",
-        acceptor=None,
-        on_match=None
-    )
-    matcher.add_pattern(
-        'ab',
-        [
-            {IS_ALPHA: True},
-            {ORTH: ':'},
-            {LIKE_NUM: True},
-            {ORTH: '%'}
-        ], label='a')
-    matcher.add_pattern(
-        'ab',
-        [
-            {IS_ALPHA: True},
-            {ORTH: '='},
-            {LIKE_NUM: True},
-        ], label='b')
-
+    matcher = Matcher(en_vocab)
+    matcher.add_entity("ab", acceptor=None, on_match=None)
+    matcher.add_pattern('ab', [{IS_ALPHA: True}, {ORTH: ':'},
+                               {LIKE_NUM: True}, {ORTH: '%'}],
+                        label='a')
+    matcher.add_pattern('ab', [{IS_ALPHA: True}, {ORTH: '='},
+                               {LIKE_NUM: True}],
+                        label='b')
     matches = matcher(doc)
     assert len(matches) == 2

@@ -2,43 +2,23 @@
 from __future__ import unicode_literals
 
 from ...symbols import POS, VERB, VerbForm_inf
-from ...tokens import Doc
 from ...vocab import Vocab
 from ...lemmatizer import Lemmatizer
+from ..util import get_doc
 
 import pytest
 
 
-@pytest.fixture
-def index():
-    return {'verb': {}}
+def test_issue595():
+    """Test lemmatization of base forms"""
+    words = ["Do", "n't", "feed", "the", "dog"]
+    tag_map = {'VB': {POS: VERB, 'morph': VerbForm_inf}}
+    rules = {"verb": [["ed", "e"]]}
 
-
-@pytest.fixture
-def exceptions():
-    return {'verb': {}}
+    lemmatizer = Lemmatizer({'verb': {}}, {'verb': {}}, rules)
+    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
+    doc = get_doc(vocab, words)
 
-
-@pytest.fixture
-def rules():
-    return {"verb": [["ed", "e"]]}
-
-
-@pytest.fixture
-def lemmatizer(index, exceptions, rules):
-    return Lemmatizer(index, exceptions, rules)
-
-
-@pytest.fixture
-def tag_map():
-    return {'VB': {POS: VERB, 'morph': VerbForm_inf}}
-
-
-@pytest.fixture
-def vocab(lemmatizer, tag_map):
-    return Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
-
-
-def test_not_lemmatize_base_forms(vocab):
-    doc = Doc(vocab, words=["Do", "n't", "feed", "the", "dog"])
-    feed = doc[2]
-    feed.tag_ = 'VB'
-    assert feed.text == 'feed'
-    assert feed.lemma_ == 'feed'
+    doc[2].tag_ = 'VB'
+    assert doc[2].text == 'feed'
+    assert doc[2].lemma_ == 'feed'

@@ -1,15 +1,13 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...tokens import Doc
-from ...vocab import Vocab
+from ..util import get_doc
 
 
-def test_issue599():
-    doc = Doc(Vocab())
+def test_issue599(en_vocab):
+    doc = get_doc(en_vocab)
     doc.is_tagged = True
     doc.is_parsed = True
-    bytes_ = doc.to_bytes()
-    doc2 = Doc(doc.vocab)
-    doc2.from_bytes(bytes_)
+    doc2 = get_doc(doc.vocab)
+    doc2.from_bytes(doc.to_bytes())
     assert doc2.is_parsed

@@ -1,11 +1,11 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...tokens import Doc
 from ...vocab import Vocab
-from ...attrs import POS
+from ..util import get_doc
 
 
 def test_issue600():
-    doc = Doc(Vocab(tag_map={'NN': {'pos': 'NOUN'}}), words=['hello'])
+    vocab = Vocab(tag_map={'NN': {'pos': 'NOUN'}})
+    doc = get_doc(vocab, ["hello"])
     doc[0].tag_ = 'NN'

@@ -1,27 +1,21 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...attrs import LOWER, ORTH
-from ...tokens import Doc
-from ...vocab import Vocab
+from ...attrs import ORTH
 from ...matcher import Matcher
+from ..util import get_doc
 
 
-def return_false(doc, ent_id, label, start, end):
-    return False
+def test_issue605(en_vocab):
+    def return_false(doc, ent_id, label, start, end):
+        return False
 
-
-def test_matcher_accept():
-    doc = Doc(Vocab(), words=['The', 'golf', 'club', 'is', 'broken'])
-
-    golf_pattern = [
-        { ORTH: "golf"},
-        { ORTH: "club"}
-    ]
-
+    words = ["The", "golf", "club", "is", "broken"]
+    pattern = [{ORTH: "golf"}, {ORTH: "club"}]
+    label = "Sport_Equipment"
+    doc = get_doc(en_vocab, words)
     matcher = Matcher(doc.vocab)
 
-    matcher.add_entity('Sport_Equipment', acceptor=return_false)
-    matcher.add_pattern("Sport_Equipment", golf_pattern)
+    matcher.add_entity(label, acceptor=return_false)
+    matcher.add_pattern(label, pattern)
     match = matcher(doc)
 
     assert match == []

@@ -19,7 +19,7 @@ def test_issue615(en_tokenizer):
         span.merge('NNP' if label else span.root.tag_, span.text, doc.vocab.strings[label])
 
     text = "The golf club is broken"
-    pattern = [{ ORTH: "golf"}, { ORTH: "club"}]
+    pattern = [{ORTH: "golf"}, {ORTH: "club"}]
     label = "Sport_Equipment"
 
     doc = en_tokenizer(text)

@@ -4,7 +4,8 @@ from __future__ import unicode_literals
 from ...vocab import Vocab
 
 
-def test_load_vocab_with_string():
+def test_issue617():
+    """Test loading Vocab with string"""
     try:
         vocab = Vocab.load('/tmp/vocab')
     except IOError: