Remove backoff from .vector to .tensor (#12292)

2025-08-04 20:30:24 +03:00 · 2023-02-23 11:36:50 +01:00 · 2023-02-23 11:36:50 +01:00 · df4c069a13
commit df4c069a13
parent e27c60a702
4 changed files with 9 additions and 15 deletions
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -657,9 +657,6 @@ cdef class Doc:
            elif self.vocab.vectors.size > 0:
                self._vector = sum(t.vector for t in self) / len(self)
                return self._vector
-            elif self.tensor.size > 0:
-                self._vector = self.tensor.mean(axis=0)
-                return self._vector
            else:
                return xp.zeros((self.vocab.vectors_length,), dtype="float32")

--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -532,8 +532,6 @@ cdef class Span:
            return self.doc.user_span_hooks["has_vector"](self)
        elif self.vocab.vectors.size > 0:
            return any(token.has_vector for token in self)
-        elif self.doc.tensor.size > 0:
-            return True
        else:
            return False

--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -389,8 +389,6 @@ cdef class Token:
        """
        if "has_vector" in self.doc.user_token_hooks:
            return self.doc.user_token_hooks["has_vector"](self)
-        if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
-            return True
        return self.vocab.has_vector(self.c.lex.orth)

    @property
@@ -404,8 +402,6 @@ cdef class Token:
        """
        if "vector" in self.doc.user_token_hooks:
            return self.doc.user_token_hooks["vector"](self)
-        if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
-            return self.doc.tensor[self.i]
        else:
            return self.vocab.get_vector(self.c.lex.orth)

--- a/website/docs/usage/101/_vectors-similarity.mdx
+++ b/website/docs/usage/101/_vectors-similarity.mdx
@@ -22,17 +22,20 @@ array([2.02280000e-01,  -7.66180009e-02,   3.70319992e-01,
 <Infobox title="Important note" variant="warning">

 To make them compact and fast, spaCy's small [pipeline packages](/models) (all
-packages that end in `sm`) **don't ship with word vectors**, and only include
-context-sensitive **tensors**. This means you can still use the `similarity()`
-methods to compare documents, spans and tokens – but the result won't be as
-good, and individual tokens won't have any vectors assigned. So in order to use
-_real_ word vectors, you need to download a larger pipeline package:
+packages that end in `sm`) **don't ship with word vectors**. In order to use
+`similarity()`, you need to download a larger pipeline package that includes
+vectors:

 ```diff
 - python -m spacy download en_core_web_sm
-+ python -m spacy download en_core_web_lg
++ python -m spacy download en_core_web_md
 ```

+In spaCy v3 and earlier, small pipeline packages supported `similarity()` by
+backing off to context-sensitive tensors from the `tok2vec` component. These
+tensors do not work well for this purpose and this backoff has been removed in
+spaCy v4.
+
 </Infobox>

 Pipeline packages that come with built-in word vectors make them available as