From 50d2a2c93071f4d96606ba0d5985c54b59184cbf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?=
Date: Tue, 18 Jan 2022 17:14:35 +0100
Subject: [PATCH] Use fewer Vector internals (#9879)

* Use Vectors.shape rather than Vectors.data.shape

* Use Vectors.size rather than Vectors.data.size

* Add Vectors.to_ops to move data between different ops

* Add documentation for Vectors.to_ops
---
 spacy/language.py                         |  8 ++++----
 spacy/ml/models/multi_task.py             |  4 ++--
 spacy/ml/staticvectors.py                 |  2 +-
 spacy/tests/vocab_vectors/test_vectors.py | 10 +++++-----
 spacy/tokens/doc.pyx                      |  4 ++--
 spacy/tokens/span.pyx                     |  2 +-
 spacy/training/initialize.py              |  2 +-
 spacy/vectors.pyx                         |  7 +++++--
 spacy/vocab.pyx                           |  4 ++--
 website/docs/api/vectors.md               | 17 +++++++++++++++++
 10 files changed, 40 insertions(+), 20 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 638616316..798254b80 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1285,9 +1285,9 @@ class Language:
             )
         except IOError:
             raise IOError(Errors.E884.format(vectors=I["vectors"]))
-        if self.vocab.vectors.data.shape[1] >= 1:
+        if self.vocab.vectors.shape[1] >= 1:
             ops = get_current_ops()
-            self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
+            self.vocab.vectors.to_ops(ops)
         if hasattr(self.tokenizer, "initialize"):
             tok_settings = validate_init_settings(
                 self.tokenizer.initialize,  # type: ignore[union-attr]
@@ -1332,8 +1332,8 @@ class Language:
         DOCS: https://spacy.io/api/language#resume_training
         """
         ops = get_current_ops()
-        if self.vocab.vectors.data.shape[1] >= 1:
-            self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
+        if self.vocab.vectors.shape[1] >= 1:
+            self.vocab.vectors.to_ops(ops)
         for name, proc in self.pipeline:
             if hasattr(proc, "_rehearsal_model"):
                 proc._rehearsal_model = deepcopy(proc.model)  # type: ignore[attr-defined]
diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py
index 37473b7f4..9e1face63 100644
--- a/spacy/ml/models/multi_task.py
+++ b/spacy/ml/models/multi_task.py
@@ -23,7 +23,7 @@ def create_pretrain_vectors(
     maxout_pieces: int, hidden_size: int, loss: str
 ) -> Callable[["Vocab", Model], Model]:
     def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
-        if vocab.vectors.data.shape[1] == 0:
+        if vocab.vectors.shape[1] == 0:
             raise ValueError(Errors.E875)
         model = build_cloze_multi_task_model(
             vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
@@ -116,7 +116,7 @@ def build_multi_task_model(
 def build_cloze_multi_task_model(
     vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int
 ) -> Model:
-    nO = vocab.vectors.data.shape[1]
+    nO = vocab.vectors.shape[1]
     output_layer = chain(
         cast(Model[List["Floats2d"], Floats2d], list2array()),
         Maxout(
diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py
index 8dd65833b..8d9b1af9b 100644
--- a/spacy/ml/staticvectors.py
+++ b/spacy/ml/staticvectors.py
@@ -94,7 +94,7 @@ def init(
     nM = model.get_dim("nM") if model.has_dim("nM") else None
     nO = model.get_dim("nO") if model.has_dim("nO") else None
     if X is not None and len(X):
-        nM = X[0].vocab.vectors.data.shape[1]
+        nM = X[0].vocab.vectors.shape[1]
     if Y is not None:
         nO = Y.data.shape[1]
 
diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py
index 9dc40b499..0650a7487 100644
--- a/spacy/tests/vocab_vectors/test_vectors.py
+++ b/spacy/tests/vocab_vectors/test_vectors.py
@@ -421,7 +421,7 @@ def test_vector_is_oov():
 def test_init_vectors_unset():
     v = Vectors(shape=(10, 10))
     assert v.is_full is False
-    assert v.data.shape == (10, 10)
+    assert v.shape == (10, 10)
 
     with pytest.raises(ValueError):
         v = Vectors(shape=(10, 10), mode="floret")
@@ -514,7 +514,7 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
         # rows: 2 rows per ngram
         rows = OPS.xp.asarray(
             [
-                h % nlp.vocab.vectors.data.shape[0]
+                h % nlp.vocab.vectors.shape[0]
                 for ngram in ngrams
                 for h in nlp.vocab.vectors._get_ngram_hashes(ngram)
             ],
@@ -544,17 +544,17 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
         # an empty key returns 0s
         assert_equal(
             OPS.to_numpy(nlp.vocab[""].vector),
-            numpy.zeros((nlp.vocab.vectors.data.shape[0],)),
+            numpy.zeros((nlp.vocab.vectors.shape[0],)),
         )
         # an empty batch returns 0s
         assert_equal(
             OPS.to_numpy(nlp.vocab.vectors.get_batch([""])),
-            numpy.zeros((1, nlp.vocab.vectors.data.shape[0])),
+            numpy.zeros((1, nlp.vocab.vectors.shape[0])),
         )
         # an empty key within a batch returns 0s
         assert_equal(
             OPS.to_numpy(nlp.vocab.vectors.get_batch(["a", "", "b"])[1]),
-            numpy.zeros((nlp.vocab.vectors.data.shape[0],)),
+            numpy.zeros((nlp.vocab.vectors.shape[0],)),
         )
 
         # the loaded ngram vector table cannot be modified
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 2f82a0d1b..5a0db115d 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -616,7 +616,7 @@ cdef class Doc:
         """
         if "has_vector" in self.user_hooks:
             return self.user_hooks["has_vector"](self)
-        elif self.vocab.vectors.data.size:
+        elif self.vocab.vectors.size:
             return True
         elif self.tensor.size:
             return True
@@ -641,7 +641,7 @@ cdef class Doc:
         if not len(self):
             self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f")
             return self._vector
-        elif self.vocab.vectors.data.size > 0:
+        elif self.vocab.vectors.size > 0:
             self._vector = sum(t.vector for t in self) / len(self)
             return self._vector
         elif self.tensor.size > 0:
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index cd02cab36..9bb6bf2e7 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -497,7 +497,7 @@ cdef class Span:
         """
         if "has_vector" in self.doc.user_span_hooks:
             return self.doc.user_span_hooks["has_vector"](self)
-        elif self.vocab.vectors.data.size > 0:
+        elif self.vocab.vectors.size > 0:
             return any(token.has_vector for token in self)
         elif self.doc.tensor.size > 0:
             return True
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 084204389..b59288e38 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -164,7 +164,7 @@ def load_vectors_into_model(
         len(vectors_nlp.vocab.vectors.keys()) == 0
         and vectors_nlp.vocab.vectors.mode != VectorsMode.floret
     ) or (
-        vectors_nlp.vocab.vectors.data.shape[0] == 0
+        vectors_nlp.vocab.vectors.shape[0] == 0
         and vectors_nlp.vocab.vectors.mode == VectorsMode.floret
     ):
         logger.warning(Warnings.W112.format(name=name))
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 345e8df68..bc4863703 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -10,7 +10,7 @@ from typing import cast
 import warnings
 from enum import Enum
 import srsly
-from thinc.api import get_array_module, get_current_ops
+from thinc.api import Ops, get_array_module, get_current_ops
 from thinc.backends import get_array_ops
 from thinc.types import Floats2d
 
@@ -146,7 +146,7 @@ cdef class Vectors:
 
         DOCS: https://spacy.io/api/vectors#size
         """
-        return self.data.shape[0] * self.data.shape[1]
+        return self.data.size
 
     @property
     def is_full(self):
@@ -517,6 +517,9 @@ cdef class Vectors:
                 for i in range(len(queries))
             ],
dtype="uint64") return (keys, best_rows, scores) + def to_ops(self, ops: Ops): + self.data = ops.asarray(self.data) + def _get_cfg(self): if self.mode == Mode.default: return { diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index e2e7ad1db..badd291ed 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -283,7 +283,7 @@ cdef class Vocab: @property def vectors_length(self): - return self.vectors.data.shape[1] + return self.vectors.shape[1] def reset_vectors(self, *, width=None, shape=None): """Drop the current vector table. Because all vectors must be the same @@ -294,7 +294,7 @@ cdef class Vocab: elif shape is not None: self.vectors = Vectors(strings=self.strings, shape=shape) else: - width = width if width is not None else self.vectors.data.shape[1] + width = width if width is not None else self.vectors.shape[1] self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width)) def prune_vectors(self, nr_row, batch_size=1024): diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index 84d2c00ad..b3bee822c 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -371,6 +371,23 @@ Get the vectors for the provided keys efficiently as a batch. | ------ | --------------------------------------- | | `keys` | The keys. ~~Iterable[Union[int, str]]~~ | +## Vectors.to_ops {#to_ops tag="method"} + +Change the embedding matrix to use different Thinc ops. + +> #### Example +> +> ```python +> from thinc.api import NumpyOps +> +> vectors.to_ops(NumpyOps()) +> +> ``` + +| Name | Description | +|-------|----------------------------------------------------------| +| `ops` | The Thinc ops to switch the embedding matrix to. ~~Ops~~ | + ## Vectors.to_disk {#to_disk tag="method"} Save the current state to a directory.