From e9c631453968288f224a1ab5861bf59a9c109f63 Mon Sep 17 00:00:00 2001
From: Richard Hudson
Date: Thu, 20 Jan 2022 11:40:46 +0100
Subject: [PATCH] Bugfix for similarity return types (#10051)

---
 spacy/lexeme.pyx                              |  6 ++--
 spacy/tests/vocab_vectors/test_similarity.py  | 34 ++++++++++++++++----
 spacy/tokens/span.pyx                         |  6 ++--
 spacy/tokens/token.pyx                        |  6 ++--
 4 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 792e405dd..6c66effde 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -130,8 +130,10 @@ cdef class Lexeme:
             return 0.0
         vector = self.vector
         xp = get_array_module(vector)
-        return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
-
+        result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
+        # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
+        return result.item()
+
     @property
     def has_vector(self):
         """RETURNS (bool): Whether a word vector is associated with the object.
diff --git a/spacy/tests/vocab_vectors/test_similarity.py b/spacy/tests/vocab_vectors/test_similarity.py
index 3b9308f4d..47cd1f060 100644
--- a/spacy/tests/vocab_vectors/test_similarity.py
+++ b/spacy/tests/vocab_vectors/test_similarity.py
@@ -35,6 +35,7 @@ def test_vectors_similarity_LL(vocab, vectors):
     assert lex1.vector_norm != 0
     assert lex2.vector_norm != 0
     assert lex1.vector[0] != lex2.vector[0] and lex1.vector[1] != lex2.vector[1]
+    assert isinstance(lex1.similarity(lex2), float)
     assert numpy.isclose(lex1.similarity(lex2), get_cosine(vec1, vec2))
     assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1))
 
@@ -47,25 +48,46 @@ def test_vectors_similarity_TT(vocab, vectors):
     assert doc[0].vector_norm != 0
     assert doc[1].vector_norm != 0
     assert doc[0].vector[0] != doc[1].vector[0] and doc[0].vector[1] != doc[1].vector[1]
+    assert isinstance(doc[0].similarity(doc[1]), float)
     assert numpy.isclose(doc[0].similarity(doc[1]), get_cosine(vec1, vec2))
     assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1]))
 
 
+def test_vectors_similarity_SS(vocab, vectors):
+    [(word1, vec1), (word2, vec2)] = vectors
+    doc = Doc(vocab, words=[word1, word2])
+    assert isinstance(doc[0:1].similarity(doc[0:2]), float)
+    assert doc[0:1].similarity(doc[0:2]) == doc[0:2].similarity(doc[0:1])
+
+
+def test_vectors_similarity_DD(vocab, vectors):
+    [(word1, vec1), (word2, vec2)] = vectors
+    doc1 = Doc(vocab, words=[word1, word2])
+    doc2 = Doc(vocab, words=[word2, word1])
+    assert isinstance(doc1.similarity(doc2), float)
+    assert doc1.similarity(doc2) == doc2.similarity(doc1)
+
+
 def test_vectors_similarity_TD(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     doc = Doc(vocab, words=[word1, word2])
     with pytest.warns(UserWarning):
+        assert isinstance(doc.similarity(doc[0]), float)
+        assert isinstance(doc[0].similarity(doc), float)
         assert doc.similarity(doc[0]) == doc[0].similarity(doc)
 
 
-def test_vectors_similarity_DS(vocab, vectors):
-    [(word1, vec1), (word2, vec2)] = vectors
-    doc = Doc(vocab, words=[word1, word2])
-    assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
-
-
 def test_vectors_similarity_TS(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     doc = Doc(vocab, words=[word1, word2])
     with pytest.warns(UserWarning):
+        assert isinstance(doc[:2].similarity(doc[0]), float)
+        assert isinstance(doc[0].similarity(doc[-2]), float)
         assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])
+
+
+def test_vectors_similarity_DS(vocab, vectors):
+    [(word1, vec1), (word2, vec2)] = vectors
+    doc = Doc(vocab, words=[word1, word2])
+    assert isinstance(doc.similarity(doc[:2]), float)
+    assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 9bb6bf2e7..f7ddc5136 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -364,8 +364,10 @@ cdef class Span:
             return 0.0
         vector = self.vector
         xp = get_array_module(vector)
-        return xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
-
+        result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
+        # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
+        return result.item()
+
     cpdef np.ndarray to_array(self, object py_attr_ids):
         """Given a list of M attribute IDs, export the tokens to a numpy
         `ndarray` of shape `(N, M)`, where `N` is the length of the document.
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index aa97e2b07..c09ec28d6 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -209,8 +209,10 @@ cdef class Token:
            return 0.0
        vector = self.vector
        xp = get_array_module(vector)
-       return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
-
+       result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
+       # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
+       return result.item()
+
    def has_morph(self):
        """Check whether the token has annotated morph information.
        Return False when the morph annotation is unset/missing.
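
The snippet below illustrates why the `.item()` calls are needed. It is a minimal sketch and not part of the patch: it only assumes numpy is installed, and the cupy behaviour is described in comments because reproducing it requires a GPU.

# Outside spaCy: the dot product of two 1-D float32 numpy arrays is a numpy
# scalar (numpy.float32), which fails isinstance(..., float); with cupy the
# same expression yields a 0-dimensional device array. Calling .item() on
# either result gives a plain Python float, which is what the similarity
# methods are expected to return.
import numpy

vec1 = numpy.asarray([1.0, 2.0, 3.0], dtype="float32")
vec2 = numpy.asarray([4.0, 5.0, 6.0], dtype="float32")

raw = numpy.dot(vec1, vec2) / (numpy.linalg.norm(vec1) * numpy.linalg.norm(vec2))
print(type(raw))               # <class 'numpy.float32'>
print(isinstance(raw, float))  # False
print(type(raw.item()))        # <class 'float'>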