Update tests

Matthew Honnibal 2016-09-24 01:17:03 +02:00
parent 55f1f7edaf
commit 939a791a52
4 changed files with 68 additions and 53 deletions

View File

@@ -1,62 +1,71 @@
 # -*- coding: utf-8 -*-
 import pytest
 import numpy
 
 
 @pytest.mark.models
 class TestModelSanity:
     """
-    This is to make sure the model works as expected. The tests make sure that values are properly set.
-    Tests are not meant to evaluate the content of the output, only make sure the output is formally okay.
+    This is to make sure the model works as expected. The tests make sure that
+    values are properly set.
+    Tests are not meant to evaluate the content of the output, only make sure
+    the output is formally okay.
     """
 
     @pytest.fixture(scope='class', params=['en','de'])
     def example(self, request, EN, DE):
+        assert EN.entity != None
+        assert DE.entity != None
         if request.param == 'en':
-            return EN(u'There was a stranger standing at the big street talking to herself.')
+            doc = EN(u'There was a stranger standing at the big ' +
+                     u'street talking to herself.')
         elif request.param == 'de':
-            return DE(u'An der großen Straße stand eine merkwürdige Gestalt und führte Selbstgespräche.')
+            doc = DE(u'An der großen Straße stand eine merkwürdige ' +
+                     u'Gestalt und führte Selbstgespräche.')
+        return doc
 
     def test_tokenization(self, example):
         # tokenization should split the document into tokens
         assert len(example) > 1
 
     def test_tagging(self, example):
         # if tagging was done properly, pos tags shouldn't be empty
         assert example.is_tagged
         assert all( t.pos != 0 for t in example )
         assert all( t.tag != 0 for t in example )
 
     def test_parsing(self, example):
         # if parsing was done properly
         # - dependency labels shouldn't be empty
         # - the head of some tokens should not be root
         assert example.is_parsed
         assert all( t.dep != 0 for t in example )
         assert any( t.dep != i for i,t in enumerate(example) )
 
     def test_ner(self, example):
         # if ner was done properly, ent_iob shouldn't be empty
-        assert all( t.ent_iob != 0 for t in example )
+        assert all([t.ent_iob != 0 for t in example])
 
     def test_vectors(self, example):
         # if vectors are available, they should differ on different words
-        # this isn't a perfect test since this could in principle fail in a sane model as well,
+        # this isn't a perfect test since this could in principle fail
+        # in a sane model as well,
         # but that's very unlikely and a good indicator if something is wrong
         vector0 = example[0].vector
         vector1 = example[1].vector
         vector2 = example[2].vector
         assert not numpy.array_equal(vector0,vector1)
         assert not numpy.array_equal(vector0,vector2)
         assert not numpy.array_equal(vector1,vector2)
 
     def test_probs(self, example):
-        # if frequencies/probabilities are okay, they should differ for different words
-        # this isn't a perfect test since this could in principle fail in a sane model as well,
+        # if frequencies/probabilities are okay, they should differ for
+        # different words
+        # this isn't a perfect test since this could in principle fail
+        # in a sane model as well,
         # but that's very unlikely and a good indicator if something is wrong
         prob0 = example[0].prob
         prob1 = example[1].prob
         prob2 = example[2].prob
         assert not prob0 == prob1
         assert not prob0 == prob2
         assert not prob1 == prob2
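A note on the fixture pattern the rewritten test class relies on: a class-scoped, parametrized pytest fixture makes every test method in the class run once per parameter, while the expensive Doc is constructed only once per language. A minimal stand-alone sketch of that mechanic, with invented names (lang, TestPerLanguage) rather than anything from this commit:

    import pytest

    @pytest.fixture(scope='class', params=['en', 'de'])
    def lang(request):
        # Built once per parameter for the whole class, not once per test.
        return request.param

    class TestPerLanguage:
        def test_runs_once_per_param(self, lang):
            # Collected twice: once as [en], once as [de].
            assert lang in ('en', 'de')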

View File

@@ -27,6 +27,7 @@ def test_overlap_issue118(EN):
     assert len(list(doc.ents)) == 0
     matches = [(ent_type, start, end) for ent_id, ent_type, start, end in matcher(doc)]
     assert matches == [(ORG, 9, 11), (ORG, 10, 11)]
+    doc.ents = matches[:1]
     ents = list(doc.ents)
     assert len(ents) == 1
     assert ents[0].label == ORG
@@ -54,6 +55,7 @@ def test_overlap_issue242():
     doc = nlp.tokenizer(u'There are different food safety standards in different countries.')
     matches = [(ent_type, start, end) for ent_id, ent_type, start, end in nlp.matcher(doc)]
+    doc.ents += tuple(matches)
     food_safety, safety_standards = matches
     assert food_safety[1] == 3
     assert food_safety[2] == 5
@@ -79,6 +81,7 @@ def test_overlap_reorder(EN):
     assert len(list(doc.ents)) == 0
     matches = [(ent_type, start, end) for ent_id, ent_type, start, end in matcher(doc)]
     assert matches == [(ORG, 9, 11), (ORG, 10, 11)]
+    doc.ents = matches[:1]
     ents = list(doc.ents)
     assert len(ents) == 1
     assert ents[0].label == ORG
@@ -103,6 +106,7 @@ def test_overlap_prefix(EN):
     assert len(list(doc.ents)) == 0
     matches = [(ent_type, start, end) for ent_id, ent_type, start, end in matcher(doc)]
+    doc.ents = matches[1:]
     assert matches == [(ORG, 9, 10), (ORG, 9, 11)]
     ents = list(doc.ents)
     assert len(ents) == 1
@@ -128,8 +132,9 @@ def test_overlap_prefix_reorder(EN):
     assert len(list(doc.ents)) == 0
     matches = [(ent_type, start, end) for ent_id, ent_type, start, end in matcher(doc)]
+    doc.ents += tuple(matches)[1:]
     assert matches == [(ORG, 9, 10), (ORG, 9, 11)]
-    ents = list(doc.ents)
+    ents = doc.ents
     assert len(ents) == 1
     assert ents[0].label == ORG
     assert ents[0].start == 9
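Every hunk in this file makes the same move: matcher hits are no longer treated as entities on their own, so each test now writes them to doc.ents explicitly before inspecting the spans. A condensed sketch of the pattern, assuming the 2016-era spacy.en API these tests use and the English model data installed:

    from spacy.en import English

    nlp = English()
    doc = nlp.tokenizer(u'There are different food safety standards.')
    # nlp.matcher(doc) yields (ent_id, label, start, end) over token
    # indices; Doc.ents accepts (label, start, end) triples.
    matches = [(label, start, end)
               for ent_id, label, start, end in nlp.matcher(doc)]
    doc.ents = matches[:1]        # replace any existing entities
    # ...or extend instead of replacing:
    # doc.ents += tuple(matches)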

View File

@@ -23,7 +23,7 @@ def test_consistency_bug(EN):
     tokens = EN(u'Where rap essentially went mainstream, illustrated by seminal Public Enemy, Beastie Boys and L.L. Cool J. tracks.')
     tokens = EN(u'''Charity and other short-term aid have buoyed them so far, and a tax-relief bill working its way through Congress would help. But the September 11 Victim Compensation Fund, enacted by Congress to discourage people from filing lawsuits, will determine the shape of their lives for years to come.\n\n''', entity=False)
-    ents = EN.matcher(tokens)
+    tokens.ents += tuple(EN.matcher(tokens))
     EN.entity(tokens)
@@ -45,6 +45,7 @@ def test_unit_end_gazetteer(EN):
     if len(list(doc.ents)) == 0:
         ents = matcher(doc)
     assert len(ents) == 1
+    doc.ents += tuple(ents)
     EN.entity(doc)
     assert list(doc.ents)[0].text == 'cal'
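The tokens.ents += tuple(...) spelling works because augmented assignment on a property round-trips through the getter and setter: Python reads the current tuple, concatenates, and writes the result back. A self-contained illustration with an invented stand-in class, not spaCy's actual Doc:

    class Doc(object):
        def __init__(self):
            self._ents = ()

        @property
        def ents(self):
            return self._ents

        @ents.setter
        def ents(self, value):
            self._ents = tuple(value)

    doc = Doc()
    doc.ents += ((u'ORG', 9, 11),)   # getter + concatenation + setter
    assert doc.ents == ((u'ORG', 9, 11),)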

View File

@@ -93,7 +93,7 @@ def test_match_preserved(matcher, EN):
     EN.tagger(doc)
     assert len(doc.ents) == 0
     doc = EN.tokenizer('I like java')
-    matcher(doc)
+    doc.ents += tuple(matcher(doc))
     assert len(doc.ents) == 1
     EN.tagger(doc)
     EN.entity(doc)
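Taken with the other files, this hunk suggests the contract the suite now pins down: calling the matcher no longer mutates the document by itself, so tests seed doc.ents explicitly, and entities seeded that way should survive the statistical pipeline. A condensed, hypothetical restatement of what the updated test checks, reusing the file's own matcher and EN fixtures:

    def test_match_preserved_sketch(matcher, EN):
        doc = EN.tokenizer('I like java')
        doc.ents += tuple(matcher(doc))   # seed entities from the rule matcher
        assert len(doc.ents) == 1
        EN.tagger(doc)
        EN.entity(doc)                    # statistical NER runs afterwards
        # per the test's name, the seeded span should be preserved
        assert len(doc.ents) >= 1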