diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index f4836dd14..e4adb9d28 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -657,9 +657,6 @@ cdef class Doc: elif self.vocab.vectors.size > 0: self._vector = sum(t.vector for t in self) / len(self) return self._vector - elif self.tensor.size > 0: - self._vector = self.tensor.mean(axis=0) - return self._vector else: return xp.zeros((self.vocab.vectors_length,), dtype="float32") diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 25dbfecdf..8fcf5ad83 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -532,8 +532,6 @@ cdef class Span: return self.doc.user_span_hooks["has_vector"](self) elif self.vocab.vectors.size > 0: return any(token.has_vector for token in self) - elif self.doc.tensor.size > 0: - return True else: return False diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 64c707acd..74f812af7 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -389,8 +389,6 @@ cdef class Token: """ if "has_vector" in self.doc.user_token_hooks: return self.doc.user_token_hooks["has_vector"](self) - if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0: - return True return self.vocab.has_vector(self.c.lex.orth) @property @@ -404,8 +402,6 @@ cdef class Token: """ if "vector" in self.doc.user_token_hooks: return self.doc.user_token_hooks["vector"](self) - if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0: - return self.doc.tensor[self.i] else: return self.vocab.get_vector(self.c.lex.orth) diff --git a/website/docs/usage/101/_vectors-similarity.mdx b/website/docs/usage/101/_vectors-similarity.mdx index 6deab926d..39ee8e48a 100644 --- a/website/docs/usage/101/_vectors-similarity.mdx +++ b/website/docs/usage/101/_vectors-similarity.mdx @@ -22,17 +22,20 @@ array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01, To make them compact and fast, spaCy's small [pipeline packages](/models) (all -packages that end in `sm`) **don't ship with 
word vectors**, and only include -context-sensitive **tensors**. This means you can still use the `similarity()` -methods to compare documents, spans and tokens – but the result won't be as -good, and individual tokens won't have any vectors assigned. So in order to use -_real_ word vectors, you need to download a larger pipeline package: +packages that end in `sm`) **don't ship with word vectors**. In order to use +`similarity()`, you need to download a larger pipeline package that includes +vectors: ```diff - python -m spacy download en_core_web_sm -+ python -m spacy download en_core_web_lg ++ python -m spacy download en_core_web_md ``` +In spaCy v3 and earlier, small pipeline packages supported `similarity()` by +backing off to context-sensitive tensors from the `tok2vec` component. These +tensors do not work well for this purpose, so this backoff has been removed in +spaCy v4. + Pipeline packages that come with built-in word vectors make them available as