mirror of https://github.com/explosion/spaCy.git
Tidy up and rename regression tests and remove unnecessary imports
This commit is contained in:
parent 5e1b6178e3
commit 9b4bea1df9
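The hunks below rename the regression tests after their issue numbers and move them onto shared pytest fixtures (en_tokenizer, en_vocab, EN) and the get_doc helper, instead of loading spacy or building English tokenizers, Vocab and Doc objects inline in every file. The conftest that supplies those fixtures is not part of this diff; the sketch below is only an illustration of the assumed fixture pattern, reusing the English.Defaults calls that the removed per-file fixtures used.

# Hypothetical conftest.py sketch -- not shown in this commit, assumptions only.
import pytest

from ...en import English


@pytest.fixture
def en_tokenizer():
    # Same construction the removed per-file fixtures used below.
    return English.Defaults.create_tokenizer()


@pytest.fixture
def en_vocab():
    # Mirrors English.Defaults.create_vocab() from the old test code.
    return English.Defaults.create_vocab()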
@@ -1,7 +1,4 @@
 # coding: utf-8
-"""Test a bug that arose from having overlapping matches"""
-
-
 from __future__ import unicode_literals
 
 from ...matcher import Matcher
@@ -25,6 +22,7 @@ def doc(en_tokenizer):
 
 @pytest.mark.parametrize('pattern', [pattern1, pattern2])
 def test_issue118(doc, pattern):
+    """Test a bug that arose from having overlapping matches"""
     ORG = doc.vocab.strings['ORG']
     matcher = Matcher(doc.vocab, {'BostonCeltics': ('ORG', {}, pattern)})
 
@@ -41,6 +39,7 @@ def test_issue118(doc, pattern):
 
 @pytest.mark.parametrize('pattern', [pattern3, pattern4])
 def test_issue118_prefix_reorder(doc, pattern):
+    """Test a bug that arose from having overlapping matches"""
     ORG = doc.vocab.strings['ORG']
     matcher = Matcher(doc.vocab, {'BostonCeltics': ('ORG', {}, pattern)})
 
@@ -9,7 +9,6 @@ import pytest
 
 def test_issue242(en_tokenizer):
     """Test overlapping multi-word phrases."""
-
     text = "There are different food safety standards in different countries."
     patterns = [[{LOWER: 'food'}, {LOWER: 'safety'}],
                 [{LOWER: 'safety'}, {LOWER: 'standards'}]]
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 from ..util import get_doc
 
 
-def test_sbd_empty_string(en_tokenizer):
+def test_issue309(en_tokenizer):
     """Test Issue #309: SBD fails on empty string"""
     tokens = en_tokenizer(" ")
     doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[0], deps=['ROOT'])
@@ -1,16 +1,9 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...en import English
-
 import pytest
 
 
-@pytest.fixture
-def en_tokenizer():
-    return English.Defaults.create_tokenizer()
-
-
 def test_issue351(en_tokenizer):
     doc = en_tokenizer(" This is a cat.")
     assert doc[0].idx == 0
@@ -1,16 +1,10 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...en import English
-
 import pytest
 
 
-@pytest.fixture
-def en_tokenizer():
-    return English.Defaults.create_tokenizer()
-
-
-def test_big_ellipsis(en_tokenizer):
+def test_issue360(en_tokenizer):
+    """Test tokenization of big ellipsis"""
     tokens = en_tokenizer('$45...............Asking')
     assert len(tokens) > 2
@@ -1,31 +1,25 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import spacy
-from spacy.attrs import ORTH
+from ...attrs import ORTH
+from ...matcher import Matcher
 
 import pytest
 
 
 @pytest.mark.models
-def test_issue429():
-
-    nlp = spacy.load('en', parser=False)
-
-
+def test_issue429(EN):
     def merge_phrases(matcher, doc, i, matches):
         if i != len(matches) - 1:
             return None
         spans = [(ent_id, label, doc[start:end]) for ent_id, label, start, end in matches]
         for ent_id, label, span in spans:
-            span.merge('NNP' if label else span.root.tag_, span.text, nlp.vocab.strings[label])
+            span.merge('NNP' if label else span.root.tag_, span.text, EN.vocab.strings[label])
 
-    doc = nlp('a')
-    nlp.matcher.add('key', label='TEST', attrs={}, specs=[[{ORTH: 'a'}]], on_match=merge_phrases)
-    doc = nlp.tokenizer('a b c')
-    nlp.tagger(doc)
-    nlp.matcher(doc)
-
-    for word in doc:
-        print(word.text, word.ent_iob_, word.ent_type_)
-    nlp.entity(doc)
+    doc = EN('a')
+    matcher = Matcher(EN.vocab)
+    matcher.add('key', label='TEST', attrs={}, specs=[[{ORTH: 'a'}]], on_match=merge_phrases)
+    doc = EN.tokenizer('a b c')
+    EN.tagger(doc)
+    matcher(doc)
+    EN.entity(doc)
spacy/tests/regression/test_issue514.py (new file, 21 lines)
@@ -0,0 +1,21 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from ..util import get_doc
+
+import pytest
+
+
+@pytest.mark.models
+def test_issue514(EN):
+    """Test serializing after adding entity"""
+    text = ["This", "is", "a", "sentence", "about", "pasta", "."]
+    vocab = EN.entity.vocab
+    doc = get_doc(vocab, text)
+    EN.entity.add_label("Food")
+    EN.entity(doc)
+    label_id = vocab.strings[u'Food']
+    doc.ents = [(label_id, 5,6)]
+    assert [(ent.label_, ent.text) for ent in doc.ents] == [("Food", "pasta")]
+    doc2 = get_doc(EN.entity.vocab).from_bytes(doc.to_bytes())
+    assert [(ent.label_, ent.text) for ent in doc2.ents] == [("Food", "pasta")]
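Several of these hunks replace direct Doc(vocab, words=...) construction with the get_doc helper imported from ..util. Its real implementation in spacy/tests/util.py is not shown in this diff; the following is only a guessed, minimal stand-in consistent with the plain-words calls above (the actual helper also accepts heads, deps and tags, as used in test_issue309).

# Hypothetical minimal stand-in for the get_doc helper -- an assumption,
# not the actual spacy/tests/util.py implementation.
from ...tokens import Doc


def get_doc(vocab, words=None):
    # Only covers the plain-words case; the real helper also wires up
    # heads/deps/tags when they are passed.
    return Doc(vocab, words=words) if words else Doc(vocab)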
@@ -6,5 +6,5 @@ import pytest
 
 @pytest.mark.models
 def test_issue54(EN):
-    text = u'Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1).'
+    text = "Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1)."
     tokens = EN(text)
@@ -1,21 +1,20 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import spacy
-import spacy.matcher
-from spacy.attrs import IS_PUNCT, ORTH
+from ...matcher import Matcher
+from ...attrs import IS_PUNCT, ORTH
 
 import pytest
 
 
 @pytest.mark.models
-def test_matcher_segfault():
-    nlp = spacy.load('en', parser=False, entity=False)
-    matcher = spacy.matcher.Matcher(nlp.vocab)
+def test_issue587(EN):
+    """Test that Matcher doesn't segfault on particular input"""
+    matcher = Matcher(EN.vocab)
     content = '''a b; c'''
     matcher.add(entity_key='1', label='TEST', attrs={}, specs=[[{ORTH: 'a'}, {ORTH: 'b'}]])
-    matcher(nlp(content))
+    matcher(EN(content))
     matcher.add(entity_key='2', label='TEST', attrs={}, specs=[[{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'c'}]])
-    matcher(nlp(content))
+    matcher(EN(content))
     matcher.add(entity_key='3', label='TEST', attrs={}, specs=[[{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'd'}]])
-    matcher(nlp(content))
+    matcher(EN(content))
@@ -1,14 +1,12 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...vocab import Vocab
-from ...tokens import Doc
 from ...matcher import Matcher
 
 import pytest
 
 
-def test_issue588():
-    matcher = Matcher(Vocab())
+def test_issue588(en_vocab):
+    matcher = Matcher(en_vocab)
     with pytest.raises(ValueError):
         matcher.add(entity_key='1', label='TEST', attrs={}, specs=[[]])
@@ -2,7 +2,7 @@
 from __future__ import unicode_literals
 
 from ...vocab import Vocab
-from ...tokens import Doc
+from ..util import get_doc
 
 import pytest
 
@@ -10,4 +10,4 @@ import pytest
 def test_issue589():
     vocab = Vocab()
     vocab.strings.set_frozen(True)
-    doc = Doc(vocab, words=['whata'])
+    doc = get_doc(vocab, ['whata'])
@@ -1,37 +1,22 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...attrs import *
+from ...attrs import ORTH, IS_ALPHA, LIKE_NUM
 from ...matcher import Matcher
-from ...tokens import Doc
-from ...en import English
+from ..util import get_doc
 
 
-def test_overlapping_matches():
-    vocab = English.Defaults.create_vocab()
-    doc = Doc(vocab, words=['n', '=', '1', ';', 'a', ':', '5', '%'])
+def test_issue590(en_vocab):
+    """Test overlapping matches"""
+    doc = get_doc(en_vocab, ['n', '=', '1', ';', 'a', ':', '5', '%'])
 
-    matcher = Matcher(vocab)
-    matcher.add_entity(
-        "ab",
-        acceptor=None,
-        on_match=None
-    )
-    matcher.add_pattern(
-        'ab',
-        [
-            {IS_ALPHA: True},
-            {ORTH: ':'},
-            {LIKE_NUM: True},
-            {ORTH: '%'}
-        ], label='a')
-    matcher.add_pattern(
-        'ab',
-        [
-            {IS_ALPHA: True},
-            {ORTH: '='},
-            {LIKE_NUM: True},
-        ], label='b')
-
+    matcher = Matcher(en_vocab)
+    matcher.add_entity("ab", acceptor=None, on_match=None)
+    matcher.add_pattern('ab', [{IS_ALPHA: True}, {ORTH: ':'},
+                               {LIKE_NUM: True}, {ORTH: '%'}],
+                        label='a')
+    matcher.add_pattern('ab', [{IS_ALPHA: True}, {ORTH: '='},
+                               {LIKE_NUM: True}],
+                        label='b')
     matches = matcher(doc)
     assert len(matches) == 2
@@ -2,43 +2,23 @@
 from __future__ import unicode_literals
 
 from ...symbols import POS, VERB, VerbForm_inf
-from ...tokens import Doc
 from ...vocab import Vocab
 from ...lemmatizer import Lemmatizer
+from ..util import get_doc
 
 import pytest
 
 
-@pytest.fixture
-def index():
-    return {'verb': {}}
-
-@pytest.fixture
-def exceptions():
-    return {'verb': {}}
-
-@pytest.fixture
-def rules():
-    return {"verb": [["ed", "e"]]}
-
-@pytest.fixture
-def lemmatizer(index, exceptions, rules):
-    return Lemmatizer(index, exceptions, rules)
-
-
-@pytest.fixture
-def tag_map():
-    return {'VB': {POS: VERB, 'morph': VerbForm_inf}}
-
-
-@pytest.fixture
-def vocab(lemmatizer, tag_map):
-    return Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
-
-
-def test_not_lemmatize_base_forms(vocab):
-    doc = Doc(vocab, words=["Do", "n't", "feed", "the", "dog"])
-    feed = doc[2]
-    feed.tag_ = 'VB'
-    assert feed.text == 'feed'
-    assert feed.lemma_ == 'feed'
+def test_issue595():
+    """Test lemmatization of base forms"""
+    words = ["Do", "n't", "feed", "the", "dog"]
+    tag_map = {'VB': {POS: VERB, 'morph': VerbForm_inf}}
+    rules = {"verb": [["ed", "e"]]}
+
+    lemmatizer = Lemmatizer({'verb': {}}, {'verb': {}}, rules)
+    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
+    doc = get_doc(vocab, words)
+
+    doc[2].tag_ = 'VB'
+    assert doc[2].text == 'feed'
+    assert doc[2].lemma_ == 'feed'
@@ -1,15 +1,13 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...tokens import Doc
-from ...vocab import Vocab
+from ..util import get_doc
 
 
-def test_issue599():
-    doc = Doc(Vocab())
+def test_issue599(en_vocab):
+    doc = get_doc(en_vocab)
     doc.is_tagged = True
     doc.is_parsed = True
-    bytes_ = doc.to_bytes()
-    doc2 = Doc(doc.vocab)
-    doc2.from_bytes(bytes_)
+    doc2 = get_doc(doc.vocab)
+    doc2.from_bytes(doc.to_bytes())
     assert doc2.is_parsed
@@ -1,11 +1,11 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...tokens import Doc
 from ...vocab import Vocab
-from ...attrs import POS
+from ..util import get_doc
 
 
 def test_issue600():
-    doc = Doc(Vocab(tag_map={'NN': {'pos': 'NOUN'}}), words=['hello'])
+    vocab = Vocab(tag_map={'NN': {'pos': 'NOUN'}})
+    doc = get_doc(vocab, ["hello"])
     doc[0].tag_ = 'NN'
@@ -1,27 +1,21 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...attrs import LOWER, ORTH
-from ...tokens import Doc
-from ...vocab import Vocab
+from ...attrs import ORTH
 from ...matcher import Matcher
+from ..util import get_doc
 
 
-def return_false(doc, ent_id, label, start, end):
-    return False
-
-
-def test_matcher_accept():
-    doc = Doc(Vocab(), words=['The', 'golf', 'club', 'is', 'broken'])
-
-    golf_pattern = [
-        { ORTH: "golf"},
-        { ORTH: "club"}
-    ]
+def test_issue605(en_vocab):
+    def return_false(doc, ent_id, label, start, end):
+        return False
+
+    words = ["The", "golf", "club", "is", "broken"]
+    pattern = [{ORTH: "golf"}, {ORTH: "club"}]
+    label = "Sport_Equipment"
+    doc = get_doc(en_vocab, words)
     matcher = Matcher(doc.vocab)
-    matcher.add_entity('Sport_Equipment', acceptor=return_false)
-    matcher.add_pattern("Sport_Equipment", golf_pattern)
+    matcher.add_entity(label, acceptor=return_false)
+    matcher.add_pattern(label, pattern)
     match = matcher(doc)
-
     assert match == []
@@ -19,7 +19,7 @@ def test_issue615(en_tokenizer):
         span.merge('NNP' if label else span.root.tag_, span.text, doc.vocab.strings[label])
 
     text = "The golf club is broken"
-    pattern = [{ ORTH: "golf"}, { ORTH: "club"}]
+    pattern = [{ORTH: "golf"}, {ORTH: "club"}]
     label = "Sport_Equipment"
 
     doc = en_tokenizer(text)
@@ -4,7 +4,8 @@ from __future__ import unicode_literals
 from ...vocab import Vocab
 
 
-def test_load_vocab_with_string():
+def test_issue617():
+    """Test loading Vocab with string"""
     try:
         vocab = Vocab.load('/tmp/vocab')
     except IOError: