Start testing Vectors class

Matthew Honnibal 2017-06-05 12:32:49 +02:00
parent c811790095
commit 30369d580f


@@ -1,140 +1,166 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...tokenizer import Tokenizer
-from ..util import get_doc, add_vecs_to_vocab
+from ...vectors import Vectors
+
+import numpy
 
 import pytest
 
 
 @pytest.fixture
-def vectors():
-    return [("apple", [0.0, 1.0, 2.0]), ("orange", [3.0, -2.0, 4.0])]
+def strings():
+    return ["apple", "orange"]
+
+@pytest.fixture
+def data():
+    return numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype='f')
 
 
-@pytest.fixture()
-def vocab(en_vocab, vectors):
-    return add_vecs_to_vocab(en_vocab, vectors)
-
-
-@pytest.fixture()
-def tokenizer_v(vocab):
-    return Tokenizer(vocab, {}, None, None, None)
-
-
-@pytest.mark.xfail
-@pytest.mark.parametrize('text', ["apple and orange"])
-def test_vectors_token_vector(tokenizer_v, vectors, text):
-    doc = tokenizer_v(text)
-    assert vectors[0] == (doc[0].text, list(doc[0].vector))
-    assert vectors[1] == (doc[2].text, list(doc[2].vector))
-
-
-@pytest.mark.xfail
-@pytest.mark.parametrize('text', ["apple", "orange"])
-def test_vectors_lexeme_vector(vocab, text):
-    lex = vocab[text]
-    assert list(lex.vector)
-    assert lex.vector_norm
-
-
-@pytest.mark.xfail
-@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
-def test_vectors_doc_vector(vocab, text):
-    doc = get_doc(vocab, text)
-    assert list(doc.vector)
-    assert doc.vector_norm
-
-
-@pytest.mark.xfail
-@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
-def test_vectors_span_vector(vocab, text):
-    span = get_doc(vocab, text)[0:2]
-    assert list(span.vector)
-    assert span.vector_norm
-
-
-@pytest.mark.xfail
-@pytest.mark.parametrize('text', ["apple orange"])
-def test_vectors_token_token_similarity(tokenizer_v, text):
-    doc = tokenizer_v(text)
-    assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0])
-    assert 0.0 < doc[0].similarity(doc[1]) < 1.0
-
-
-@pytest.mark.xfail
-@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
-def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2):
-    token = tokenizer_v(text1)
-    lex = vocab[text2]
-    assert token.similarity(lex) == lex.similarity(token)
-    assert 0.0 < token.similarity(lex) < 1.0
-
-
-@pytest.mark.xfail
-@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
-def test_vectors_token_span_similarity(vocab, text):
-    doc = get_doc(vocab, text)
-    assert doc[0].similarity(doc[1:3]) == doc[1:3].similarity(doc[0])
-    assert 0.0 < doc[0].similarity(doc[1:3]) < 1.0
-
-
-@pytest.mark.xfail
-@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
-def test_vectors_token_doc_similarity(vocab, text):
-    doc = get_doc(vocab, text)
-    assert doc[0].similarity(doc) == doc.similarity(doc[0])
-    assert 0.0 < doc[0].similarity(doc) < 1.0
-
-
-@pytest.mark.xfail
-@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
-def test_vectors_lexeme_span_similarity(vocab, text):
-    doc = get_doc(vocab, text)
-    lex = vocab[text[0]]
-    assert lex.similarity(doc[1:3]) == doc[1:3].similarity(lex)
-    assert 0.0 < doc.similarity(doc[1:3]) < 1.0
-
-
-@pytest.mark.xfail
-@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
-def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2):
-    lex1 = vocab[text1]
-    lex2 = vocab[text2]
-    assert lex1.similarity(lex2) == lex2.similarity(lex1)
-    assert 0.0 < lex1.similarity(lex2) < 1.0
-
-
-@pytest.mark.xfail
-@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
-def test_vectors_lexeme_doc_similarity(vocab, text):
-    doc = get_doc(vocab, text)
-    lex = vocab[text[0]]
-    assert lex.similarity(doc) == doc.similarity(lex)
-    assert 0.0 < lex.similarity(doc) < 1.0
-
-
-@pytest.mark.xfail
-@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
-def test_vectors_span_span_similarity(vocab, text):
-    doc = get_doc(vocab, text)
-    assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2])
-    assert 0.0 < doc[0:2].similarity(doc[1:3]) < 1.0
-
-
-@pytest.mark.xfail
-@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
-def test_vectors_span_doc_similarity(vocab, text):
-    doc = get_doc(vocab, text)
-    assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
-    assert 0.0 < doc[0:2].similarity(doc) < 1.0
-
-
-@pytest.mark.xfail
-@pytest.mark.parametrize('text1,text2', [
-    (["apple", "and", "apple", "pie"], ["orange", "juice"])])
-def test_vectors_doc_doc_similarity(vocab, text1, text2):
-    doc1 = get_doc(vocab, text1)
-    doc2 = get_doc(vocab, text2)
-    assert doc1.similarity(doc2) == doc2.similarity(doc1)
-    assert 0.0 < doc1.similarity(doc2) < 1.0
+def test_init_vectors_with_data(strings, data):
+    v = Vectors(strings, data)
+    assert v.shape == data.shape
+
+def test_init_vectors_with_width(strings):
+    v = Vectors(strings, 3)
+    assert v.shape == (len(strings), 3)
+
+
+def test_get_vector(strings, data):
+    v = Vectors(strings, data)
+    assert list(v[strings[0]]) == list(data[0])
+    assert list(v[strings[0]]) != list(data[1])
+    assert list(v[strings[1]]) != list(data[0])
+
+
+def test_set_vector(strings, data):
+    orig = data.copy()
+    v = Vectors(strings, data)
+    assert list(v[strings[0]]) == list(orig[0])
+    assert list(v[strings[0]]) != list(orig[1])
+    v[strings[0]] = data[1]
+    assert list(v[strings[0]]) == list(orig[1])
+    assert list(v[strings[0]]) != list(orig[0])
+
+#
+#@pytest.fixture()
+#def tokenizer_v(vocab):
+#    return Tokenizer(vocab, {}, None, None, None)
+#
+#
+#@pytest.mark.xfail
+#@pytest.mark.parametrize('text', ["apple and orange"])
+#def test_vectors_token_vector(tokenizer_v, vectors, text):
+#    doc = tokenizer_v(text)
+#    assert vectors[0] == (doc[0].text, list(doc[0].vector))
+#    assert vectors[1] == (doc[2].text, list(doc[2].vector))
+#
+#
+#@pytest.mark.xfail
+#@pytest.mark.parametrize('text', ["apple", "orange"])
+#def test_vectors_lexeme_vector(vocab, text):
+#    lex = vocab[text]
+#    assert list(lex.vector)
+#    assert lex.vector_norm
+#
+#
+#@pytest.mark.xfail
+#@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
+#def test_vectors_doc_vector(vocab, text):
+#    doc = get_doc(vocab, text)
+#    assert list(doc.vector)
+#    assert doc.vector_norm
+#
+#
+#@pytest.mark.xfail
+#@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
+#def test_vectors_span_vector(vocab, text):
+#    span = get_doc(vocab, text)[0:2]
+#    assert list(span.vector)
+#    assert span.vector_norm
+#
+#
+#@pytest.mark.xfail
+#@pytest.mark.parametrize('text', ["apple orange"])
+#def test_vectors_token_token_similarity(tokenizer_v, text):
+#    doc = tokenizer_v(text)
+#    assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0])
+#    assert 0.0 < doc[0].similarity(doc[1]) < 1.0
+#
+#
+#@pytest.mark.xfail
+#@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
+#def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2):
+#    token = tokenizer_v(text1)
+#    lex = vocab[text2]
+#    assert token.similarity(lex) == lex.similarity(token)
+#    assert 0.0 < token.similarity(lex) < 1.0
+#
+#
+#@pytest.mark.xfail
+#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
+#def test_vectors_token_span_similarity(vocab, text):
+#    doc = get_doc(vocab, text)
+#    assert doc[0].similarity(doc[1:3]) == doc[1:3].similarity(doc[0])
+#    assert 0.0 < doc[0].similarity(doc[1:3]) < 1.0
+#
+#
+#@pytest.mark.xfail
+#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
+#def test_vectors_token_doc_similarity(vocab, text):
+#    doc = get_doc(vocab, text)
+#    assert doc[0].similarity(doc) == doc.similarity(doc[0])
+#    assert 0.0 < doc[0].similarity(doc) < 1.0
+#
+#
+#@pytest.mark.xfail
+#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
+#def test_vectors_lexeme_span_similarity(vocab, text):
+#    doc = get_doc(vocab, text)
+#    lex = vocab[text[0]]
+#    assert lex.similarity(doc[1:3]) == doc[1:3].similarity(lex)
+#    assert 0.0 < doc.similarity(doc[1:3]) < 1.0
+#
+#
+#@pytest.mark.xfail
+#@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
+#def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2):
+#    lex1 = vocab[text1]
+#    lex2 = vocab[text2]
+#    assert lex1.similarity(lex2) == lex2.similarity(lex1)
+#    assert 0.0 < lex1.similarity(lex2) < 1.0
+#
+#
+#@pytest.mark.xfail
+#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
+#def test_vectors_lexeme_doc_similarity(vocab, text):
+#    doc = get_doc(vocab, text)
+#    lex = vocab[text[0]]
+#    assert lex.similarity(doc) == doc.similarity(lex)
+#    assert 0.0 < lex.similarity(doc) < 1.0
+#
+#
+#@pytest.mark.xfail
+#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
+#def test_vectors_span_span_similarity(vocab, text):
+#    doc = get_doc(vocab, text)
+#    assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2])
+#    assert 0.0 < doc[0:2].similarity(doc[1:3]) < 1.0
+#
+#
+#@pytest.mark.xfail
+#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
+#def test_vectors_span_doc_similarity(vocab, text):
+#    doc = get_doc(vocab, text)
+#    assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
+#    assert 0.0 < doc[0:2].similarity(doc) < 1.0
+#
+#
+#@pytest.mark.xfail
+#@pytest.mark.parametrize('text1,text2', [
+#    (["apple", "and", "apple", "pie"], ["orange", "juice"])])
+#def test_vectors_doc_doc_similarity(vocab, text1, text2):
+#    doc1 = get_doc(vocab, text1)
+#    doc2 = get_doc(vocab, text2)
+#    assert doc1.similarity(doc2) == doc2.similarity(doc1)
+#    assert 0.0 < doc1.similarity(doc2) < 1.0
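
The four active tests above pin down the whole surface this commit gives the new Vectors class: construct it from a key list plus a data table or from a width alone, read its shape, and get or set rows by key. As a reading aid, here is a minimal usage sketch distilled from those tests. The spacy.vectors import path and the two-argument Vectors(strings, data) constructor are assumptions taken from this commit's test file only; the class was brand new here and its signature changed in later spaCy releases.

    # Usage sketch of the Vectors behaviour exercised by the tests above.
    # Assumption: at this commit the class is importable as
    # spacy.vectors.Vectors and accepts (keys, data_or_width), as the
    # test file suggests.
    import numpy
    from spacy.vectors import Vectors

    strings = ["apple", "orange"]
    data = numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype='f')

    v = Vectors(strings, data)    # shape comes from the data table
    assert v.shape == (2, 3)

    w = Vectors(strings, 3)       # or pass a width to allocate rows
    assert w.shape == (2, 3)

    row = v["apple"]              # keys look up rows in the table
    assert list(row) == [0.0, 1.0, 2.0]

    v["apple"] = data[1]          # assignment overwrites the row by key
    assert list(v["apple"]) == [3.0, -2.0, 4.0]

The commented-out block keeps the old tokenizer- and similarity-based tests in view. They depend on the vocab and vectors fixtures this commit removes, so they presumably stay disabled until those code paths are rewired against the new class.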