mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Update tests
This commit is contained in:
parent
66766c1454
commit
908809d488
|
@ -2,6 +2,8 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..util import get_doc
|
from ..util import get_doc
|
||||||
|
from ...tokens import Doc
|
||||||
|
from ...vocab import Vocab
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import numpy
|
import numpy
|
||||||
|
@ -204,17 +206,11 @@ def test_doc_api_right_edge(en_tokenizer):
|
||||||
assert doc[6].right_edge.text == ','
|
assert doc[6].right_edge.text == ','
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
def test_doc_api_has_vector():
|
||||||
@pytest.mark.parametrize('text,vectors', [
|
vocab = Vocab()
|
||||||
("apple orange pear", ["apple -1 -1 -1", "orange -1 -1 0", "pear -1 0 -1"])
|
vocab.clear_vectors(2)
|
||||||
])
|
vocab.vectors.add('kitten', numpy.asarray([0., 2.], dtype='f'))
|
||||||
def test_doc_api_has_vector(en_tokenizer, text_file, text, vectors):
|
doc = Doc(vocab, words=['kitten'])
|
||||||
text_file.write('\n'.join(vectors))
|
|
||||||
text_file.seek(0)
|
|
||||||
vector_length = en_tokenizer.vocab.load_vectors(text_file)
|
|
||||||
assert vector_length == 3
|
|
||||||
|
|
||||||
doc = en_tokenizer(text)
|
|
||||||
assert doc.has_vector
|
assert doc.has_vector
|
||||||
|
|
||||||
def test_lowest_common_ancestor(en_tokenizer):
|
def test_lowest_common_ancestor(en_tokenizer):
|
||||||
|
|
|
@ -3,6 +3,8 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STOP
|
from ...attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STOP
|
||||||
from ..util import get_doc
|
from ..util import get_doc
|
||||||
|
from ...vocab import Vocab
|
||||||
|
from ...tokens import Doc
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import numpy
|
import numpy
|
||||||
|
@ -68,26 +70,21 @@ def test_doc_token_api_is_properties(en_vocab):
|
||||||
assert doc[5].like_email
|
assert doc[5].like_email
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
def test_doc_token_api_vectors():
|
||||||
@pytest.mark.parametrize('text,vectors', [
|
vocab = Vocab()
|
||||||
("apples oranges ldskbjls", ["apples -1 -1 -1", "oranges -1 -1 0"])
|
vocab.clear_vectors(2)
|
||||||
])
|
vocab.vectors.add('apples', numpy.asarray([0., 2.], dtype='f'))
|
||||||
def test_doc_token_api_vectors(en_tokenizer, text_file, text, vectors):
|
vocab.vectors.add('oranges', numpy.asarray([0., 1.], dtype='f'))
|
||||||
text_file.write('\n'.join(vectors))
|
doc = Doc(vocab, words=['apples', 'oranges', 'oov'])
|
||||||
text_file.seek(0)
|
assert doc.has_vector
|
||||||
vector_length = en_tokenizer.vocab.load_vectors(text_file)
|
|
||||||
assert vector_length == 3
|
|
||||||
|
|
||||||
tokens = en_tokenizer(text)
|
assert doc[0].has_vector
|
||||||
assert tokens[0].has_vector
|
assert doc[1].has_vector
|
||||||
assert tokens[1].has_vector
|
assert not doc[2].has_vector
|
||||||
assert not tokens[2].has_vector
|
apples_norm = (0*0 + 2*2) ** 0.5
|
||||||
assert tokens[0].similarity(tokens[1]) > tokens[0].similarity(tokens[2])
|
oranges_norm = (0*0 + 1*1) ** 0.5
|
||||||
assert tokens[0].similarity(tokens[1]) == tokens[1].similarity(tokens[0])
|
cosine = ((0*0) + (2*1)) / (apples_norm * oranges_norm)
|
||||||
assert sum(tokens[0].vector) != sum(tokens[1].vector)
|
assert doc[0].similarity(doc[1]) == cosine
|
||||||
assert numpy.isclose(
|
|
||||||
tokens[0].vector_norm,
|
|
||||||
numpy.sqrt(numpy.dot(tokens[0].vector, tokens[0].vector)))
|
|
||||||
|
|
||||||
|
|
||||||
def test_doc_token_api_ancestors(en_tokenizer):
|
def test_doc_token_api_ancestors(en_tokenizer):
|
||||||
|
|
|
@ -1,8 +1,11 @@
|
||||||
import pytest
|
import pytest
|
||||||
|
import spacy
|
||||||
|
|
||||||
@pytest.mark.models('en')
|
#@pytest.mark.models('en')
|
||||||
def test_issue1305(EN):
|
def test_issue1305():
|
||||||
'''Test lemmatization of English VBZ'''
|
'''Test lemmatization of English VBZ'''
|
||||||
assert EN.vocab.morphology.lemmatizer('works', 'verb') == set(['work'])
|
nlp = spacy.load('en_core_web_sm')
|
||||||
doc = EN(u'This app works well')
|
assert nlp.vocab.morphology.lemmatizer('works', 'verb') == ['work']
|
||||||
|
doc = nlp(u'This app works well')
|
||||||
|
print([(w.text, w.tag_) for w in doc])
|
||||||
assert doc[2].lemma_ == 'work'
|
assert doc[2].lemma_ == 'work'
|
||||||
|
|
|
@ -9,4 +9,4 @@ import pytest
|
||||||
@pytest.mark.parametrize('word,lemmas', [("chromosomes", ["chromosome"]), ("endosomes", ["endosome"]), ("colocalizes", ["colocalize", "colocaliz"])])
|
@pytest.mark.parametrize('word,lemmas', [("chromosomes", ["chromosome"]), ("endosomes", ["endosome"]), ("colocalizes", ["colocalize", "colocaliz"])])
|
||||||
def test_issue781(EN, word, lemmas):
|
def test_issue781(EN, word, lemmas):
|
||||||
lemmatizer = EN.Defaults.create_lemmatizer()
|
lemmatizer = EN.Defaults.create_lemmatizer()
|
||||||
assert lemmatizer(word, 'noun', morphology={'number': 'plur'}) == set(lemmas)
|
assert lemmatizer(word, 'noun', morphology={'number': 'plur'}) == lemmas
|
||||||
|
|
Loading…
Reference in New Issue
Block a user