mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-15 20:16:23 +03:00
86 lines
2.6 KiB
Python
86 lines
2.6 KiB
Python
# coding: utf-8
|
|
from __future__ import unicode_literals
|
|
|
|
import numpy
|
|
import pytest
|
|
|
|
|
|
@pytest.fixture
def example(EN):
    """
    Return a processed sample sentence shared by the tests in this module.

    This is to make sure the model works as expected. The tests consuming
    this fixture make sure that values are properly set. They are not meant
    to evaluate the content of the output, only to make sure the output is
    formally okay.
    """
    # Fail early when the pipeline has no `entity` attribute set. Use an
    # identity check: `!= None` would go through the object's own comparison
    # methods, which may not behave like a plain None test.
    assert EN.entity is not None
    return EN('There was a stranger standing at the big street talking to herself.')
|
|
|
|
|
|
@pytest.mark.models('en')
def test_en_models_tokenization(example):
    """The sample document should have been split into more than one token."""
    n_tokens = len(example)
    assert n_tokens > 1
|
|
|
|
|
|
@pytest.mark.models('en')
def test_en_models_tagging(example):
    """If tagging was done properly, no token has an empty (0) pos or tag."""
    assert example.is_tagged
    # first pass: coarse-grained part-of-speech IDs
    for token in example:
        assert token.pos != 0
    # second pass: fine-grained tag IDs
    for token in example:
        assert token.tag != 0
|
|
|
|
|
|
@pytest.mark.models('en')
def test_en_models_parsing(example):
    """
    Check that a dependency parse was formally applied to the document.

    If parsing was done properly:
    - dependency labels shouldn't be empty
    - at least one token should have a head other than itself
    """
    assert example.is_parsed
    assert all(t.dep != 0 for t in example)
    # Compare the index of each token's head with the token's own index.
    # The label ID `t.dep` is unrelated to token positions, so comparing it
    # with the position would pass regardless of the parse.
    assert any(t.head.i != t.i for t in example)
|
|
|
|
|
|
@pytest.mark.models('en')
def test_en_models_ner(example):
    """If NER was done properly, ent_iob shouldn't be empty (0) on any token."""
    # A generator expression (as in the sibling tests) lets all() stop at the
    # first failing token instead of building a throwaway list first.
    assert all(t.ent_iob != 0 for t in example)
|
|
|
|
|
|
@pytest.mark.models('en')
def test_en_models_vectors(example):
    """
    The vectors of the first three tokens should be pairwise different.

    Only checked if vectors are available. This isn't a perfect test, since
    it could in principle fail in a sane model as well, but that's very
    unlikely and a failure is a good indicator that something is wrong.
    """
    if not example.vocab.vectors_length:
        # no vectors available: nothing to compare
        return
    vectors = [example[idx].vector for idx in range(3)]
    for left, right in ((0, 1), (0, 2), (1, 2)):
        assert not numpy.array_equal(vectors[left], vectors[right])
|
|
|
|
|
|
@pytest.mark.xfail
@pytest.mark.models('en')
def test_en_models_probs(example):
    """
    The `prob` values of the first three tokens should be pairwise different.

    If frequencies/probabilities are okay, they should differ for different
    words. This isn't a perfect test, since it could in principle fail in a
    sane model as well, but that's very unlikely and a failure is a good
    indicator that something is wrong.
    """
    prob0 = example[0].prob
    prob1 = example[1].prob
    prob2 = example[2].prob
    assert prob0 != prob1
    assert prob0 != prob2
    assert prob1 != prob2
|
|
|
|
|
|
@pytest.mark.models('en')
def test_no_vectors_similarity(EN):
    """Similarity between two short single-word documents should be positive."""
    # NOTE(review): the test name suggests these words are expected to lack
    # vectors in the model -- confirm against the model data.
    first_doc = EN(u'hallo')
    second_doc = EN(u'hi')
    score = first_doc.similarity(second_doc)
    assert score > 0
|
|
|