From 144a93c2a5383dc51ef16226c71c36be172adb82 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 3 Nov 2017 20:56:33 +0100 Subject: [PATCH] Back-off to tensor for similarity if no vectors --- spacy/tests/lang/en/test_models.py | 8 ++++++++ spacy/tokens/doc.pyx | 6 +++--- spacy/tokens/span.pyx | 7 ++++++- spacy/tokens/token.pyx | 7 ++++++- 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/spacy/tests/lang/en/test_models.py b/spacy/tests/lang/en/test_models.py index ab318213c..a6006caba 100644 --- a/spacy/tests/lang/en/test_models.py +++ b/spacy/tests/lang/en/test_models.py @@ -75,3 +75,11 @@ def test_en_models_probs(example): assert not prob0 == prob1 assert not prob0 == prob2 assert not prob1 == prob2 + + +@pytest.mark.models('en') +def test_no_vectors_similarity(EN): + doc1 = EN(u'hallo') + doc2 = EN(u'hi') + assert doc1.similarity(doc2) > 0 + diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 9e9a52a8c..eef25c712 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -307,7 +307,7 @@ cdef class Doc: def __get__(self): if 'has_vector' in self.user_hooks: return self.user_hooks['has_vector'](self) - elif any(token.has_vector for token in self): + elif self.vocab.vectors.data.size: return True elif self.tensor.size: return True @@ -330,13 +330,13 @@ cdef class Doc: self._vector = numpy.zeros((self.vocab.vectors_length,), dtype='f') return self._vector - elif self.has_vector: + elif self.vocab.vectors.data.size > 0: vector = numpy.zeros((self.vocab.vectors_length,), dtype='f') for token in self.c[:self.length]: vector += self.vocab.get_vector(token.lex.orth) self._vector = vector / len(self) return self._vector - elif self.tensor.size: + elif self.tensor.size > 0: self._vector = self.tensor.mean(axis=0) return self._vector else: diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 49b892adb..4056ef615 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -283,7 +283,12 @@ cdef class Span: def __get__(self): if 'has_vector' in self.doc.user_span_hooks: return self.doc.user_span_hooks['has_vector'](self) - return any(token.has_vector for token in self) + elif self.vocab.vectors.data.size > 0: + return any(token.has_vector for token in self) + elif self.doc.tensor.size > 0: + return True + else: + return False property vector: """A real-valued meaning representation. Defaults to an average of the diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 3253fa738..6715c5098 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -292,6 +292,8 @@ cdef class Token: def __get__(self): if 'has_vector' in self.doc.user_token_hooks: return self.doc.user_token_hooks['has_vector'](self) + if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0: + return True return self.vocab.has_vector(self.c.lex.orth) property vector: @@ -303,7 +305,10 @@ cdef class Token: def __get__(self): if 'vector' in self.doc.user_token_hooks: return self.doc.user_token_hooks['vector'](self) - return self.vocab.get_vector(self.c.lex.orth) + if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0: + return self.doc.tensor[self.i] + else: + return self.vocab.get_vector(self.c.lex.orth) property vector_norm: """The L2 norm of the token's vector representation.