Mirror of https://github.com/explosion/spaCy.git

Update tests

parent 55f1f7edaf
commit 939a791a52
@@ -1,21 +1,27 @@
 # -*- coding: utf-8 -*-

 import pytest
 import numpy


 @pytest.mark.models
 class TestModelSanity:
     """
-    This is to make sure the model works as expected. The tests make sure that values are properly set.
-    Tests are not meant to evaluate the content of the output, only make sure the output is formally okay.
+    This is to make sure the model works as expected. The tests make sure that
+    values are properly set.
+    Tests are not meant to evaluate the content of the output, only make sure
+    the output is formally okay.
     """

     @pytest.fixture(scope='class', params=['en','de'])
     def example(self, request, EN, DE):
+        assert EN.entity != None
+        assert DE.entity != None
         if request.param == 'en':
-            return EN(u'There was a stranger standing at the big street talking to herself.')
+            doc = EN(u'There was a stranger standing at the big ' +
+                     u'street talking to herself.')
         elif request.param == 'de':
-            return DE(u'An der großen Straße stand eine merkwürdige Gestalt und führte Selbstgespräche.')
+            doc = DE(u'An der großen Straße stand eine merkwürdige ' +
+                     u'Gestalt und führte Selbstgespräche.')
+        return doc

     def test_tokenization(self, example):
         # tokenization should split the document into tokens
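The hunk above hinges on pytest's class-scoped parametrized fixture: `example` is built once per language and fed to every test method in the class. A minimal, self-contained sketch of that pattern (the stand-in texts below are illustrative, not spaCy output):

    import pytest


    class TestFixturePattern:

        @pytest.fixture(scope='class', params=['en', 'de'])
        def example(self, request):
            # request.param is 'en' or 'de'; TestModelSanity builds a
            # spaCy Doc here, this sketch substitutes a plain string.
            texts = {'en': u'a stranger at the big street',
                     'de': u'eine Gestalt an der großen Straße'}
            return texts[request.param]

        def test_runs_once_per_language(self, example):
            # pytest invokes this twice, once per fixture parameter
            assert len(example.split()) > 1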
@@ -37,11 +43,12 @@ class TestModelSanity:

     def test_ner(self, example):
         # if ner was done properly, ent_iob shouldn't be empty
-        assert all( t.ent_iob != 0 for t in example )
+        assert all([t.ent_iob != 0 for t in example])

     def test_vectors(self, example):
         # if vectors are available, they should differ on different words
-        # this isn't a perfect test since this could in principle fail in a sane model as well,
+        # this isn't a perfect test since this could in principle fail
+        # in a sane model as well,
         # but that's very unlikely and a good indicator if something is wrong
         vector0 = example[0].vector
         vector1 = example[1].vector
@@ -51,8 +58,10 @@ class TestModelSanity:
         assert not numpy.array_equal(vector1,vector2)

     def test_probs(self, example):
-        # if frequencies/probabilities are okay, they should differ for different words
-        # this isn't a perfect test since this could in principle fail in a sane model as well,
+        # if frequencies/probabilities are okay, they should differ for
+        # different words
+        # this isn't a perfect test since this could in principle fail
+        # in a sane model as well,
         # but that's very unlikely and a good indicator if something is wrong
         prob0 = example[0].prob
         prob1 = example[1].prob
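test_vectors and test_probs share one sanity idea: values a working model must differentiate across words should not compare equal. A tiny self-contained illustration of the numpy side of that check, with made-up stand-in vectors:

    import numpy

    # Stand-ins for example[0].vector and example[1].vector; in the real
    # test these come from two tokens of the parsed Doc.
    vector0 = numpy.asarray([0.1, 0.2, 0.3])
    vector1 = numpy.asarray([0.4, 0.5, 0.6])

    # array_equal is True only when shapes and all elements match, so a
    # model whose vectors collapsed to a single value would fail here.
    assert not numpy.array_equal(vector0, vector1)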
@@ -27,6 +27,7 @@ def test_overlap_issue118(EN):
     assert len(list(doc.ents)) == 0
     matches = [(ent_type, start, end) for ent_id, ent_type, start, end in matcher(doc)]
     assert matches == [(ORG, 9, 11), (ORG, 10, 11)]
+    doc.ents = matches[:1]
     ents = list(doc.ents)
     assert len(ents) == 1
     assert ents[0].label == ORG
@@ -54,6 +55,7 @@ def test_overlap_issue242():
     doc = nlp.tokenizer(u'There are different food safety standards in different countries.')

     matches = [(ent_type, start, end) for ent_id, ent_type, start, end in nlp.matcher(doc)]
+    doc.ents += tuple(matches)
     food_safety, safety_standards = matches
     assert food_safety[1] == 3
     assert food_safety[2] == 5
@@ -79,6 +81,7 @@ def test_overlap_reorder(EN):
     assert len(list(doc.ents)) == 0
     matches = [(ent_type, start, end) for ent_id, ent_type, start, end in matcher(doc)]
     assert matches == [(ORG, 9, 11), (ORG, 10, 11)]
+    doc.ents = matches[:1]
     ents = list(doc.ents)
     assert len(ents) == 1
     assert ents[0].label == ORG
@@ -103,6 +106,7 @@ def test_overlap_prefix(EN):

     assert len(list(doc.ents)) == 0
     matches = [(ent_type, start, end) for ent_id, ent_type, start, end in matcher(doc)]
+    doc.ents = matches[1:]
     assert matches == [(ORG, 9, 10), (ORG, 9, 11)]
     ents = list(doc.ents)
     assert len(ents) == 1
@@ -128,8 +132,9 @@ def test_overlap_prefix_reorder(EN):

     assert len(list(doc.ents)) == 0
     matches = [(ent_type, start, end) for ent_id, ent_type, start, end in matcher(doc)]
+    doc.ents += tuple(matches)[1:]
     assert matches == [(ORG, 9, 10), (ORG, 9, 11)]
-    ents = list(doc.ents)
+    ents = doc.ents
     assert len(ents) == 1
     assert ents[0].label == ORG
     assert ents[0].start == 9
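The common thread in the overlap hunks is that the tests now write the matcher output into `doc.ents` explicitly instead of relying on the matcher call to set entities as a side effect. A minimal sketch of the pattern, assuming the 0.x-era spaCy API these tests target, where `matcher(doc)` yields `(ent_id, ent_type, start, end)` tuples and the `Doc.ents` setter accepts `(ent_type, start, end)` tuples:

    # Sketch only: assumes the historical spacy.en.English API used in
    # these tests; it will not run against modern spaCy.
    from spacy.en import English

    nlp = English()
    doc = nlp.tokenizer(u'There are different food safety standards '
                        u'in different countries.')

    # the matcher returns match tuples but leaves doc.ents untouched
    matches = [(ent_type, start, end)
               for ent_id, ent_type, start, end in nlp.matcher(doc)]

    doc.ents += tuple(matches)   # append to whatever is already set
    doc.ents = matches[:1]       # or: replace the entities outright

Both variants appear above: `=` keeps only the chosen matches, while `+=` merges them into the existing entity set.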
@@ -23,7 +23,7 @@ def test_consistency_bug(EN):
     tokens = EN(u'Where rap essentially went mainstream, illustrated by seminal Public Enemy, Beastie Boys and L.L. Cool J. tracks.')

     tokens = EN(u'''Charity and other short-term aid have buoyed them so far, and a tax-relief bill working its way through Congress would help. But the September 11 Victim Compensation Fund, enacted by Congress to discourage people from filing lawsuits, will determine the shape of their lives for years to come.\n\n''', entity=False)
-    ents = EN.matcher(tokens)
+    tokens.ents += tuple(EN.matcher(tokens))
     EN.entity(tokens)


@@ -45,6 +45,7 @@ def test_unit_end_gazetteer(EN):
     if len(list(doc.ents)) == 0:
         ents = matcher(doc)
     assert len(ents) == 1
+    doc.ents += tuple(ents)
     EN.entity(doc)
     assert list(doc.ents)[0].text == 'cal'

@@ -93,7 +93,7 @@ def test_match_preserved(matcher, EN):
     EN.tagger(doc)
     assert len(doc.ents) == 0
     doc = EN.tokenizer('I like java')
-    matcher(doc)
+    doc.ents += tuple(matcher(doc))
     assert len(doc.ents) == 1
     EN.tagger(doc)
     EN.entity(doc)
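test_match_preserved exercises the same assignment end to end: entities seeded from the matcher should survive later pipeline passes. A hedged sketch of that flow under the same assumed historical API, with `EN` as the loaded English pipeline and `matcher` as the fixture-provided matcher:

    # Sketch only, same historical-API assumption as the previous block.
    doc = EN.tokenizer('I like java')
    doc.ents += tuple(matcher(doc))   # seed doc.ents from the matcher
    assert len(doc.ents) == 1

    EN.tagger(doc)                    # later pipeline passes run over doc...
    EN.entity(doc)                    # ...and, per the test's name, should
                                      # leave the matcher-derived span intact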