From 50d2a2c93071f4d96606ba0d5985c54b59184cbf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?=
Date: Tue, 18 Jan 2022 17:14:35 +0100
Subject: [PATCH] Use fewer Vector internals (#9879)

* Use Vectors.shape rather than Vectors.data.shape

* Use Vectors.size rather than Vectors.data.size

* Add Vectors.to_ops to move data between different ops

* Add documentation for Vectors.to_ops
---
 spacy/language.py                         |  8 ++++----
 spacy/ml/models/multi_task.py             |  4 ++--
 spacy/ml/staticvectors.py                 |  2 +-
 spacy/tests/vocab_vectors/test_vectors.py | 10 +++++-----
 spacy/tokens/doc.pyx                      |  4 ++--
 spacy/tokens/span.pyx                     |  2 +-
 spacy/training/initialize.py              |  2 +-
 spacy/vectors.pyx                         |  7 +++++--
 spacy/vocab.pyx                           |  4 ++--
 website/docs/api/vectors.md               | 17 +++++++++++++++++
 10 files changed, 40 insertions(+), 20 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 638616316..798254b80 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1285,9 +1285,9 @@ class Language:
             )
         except IOError:
             raise IOError(Errors.E884.format(vectors=I["vectors"]))
-        if self.vocab.vectors.data.shape[1] >= 1:
+        if self.vocab.vectors.shape[1] >= 1:
             ops = get_current_ops()
-            self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
+            self.vocab.vectors.to_ops(ops)
         if hasattr(self.tokenizer, "initialize"):
             tok_settings = validate_init_settings(
                 self.tokenizer.initialize,  # type: ignore[union-attr]
@@ -1332,8 +1332,8 @@ class Language:
         DOCS: https://spacy.io/api/language#resume_training
         """
         ops = get_current_ops()
-        if self.vocab.vectors.data.shape[1] >= 1:
-            self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
+        if self.vocab.vectors.shape[1] >= 1:
+            self.vocab.vectors.to_ops(ops)
         for name, proc in self.pipeline:
             if hasattr(proc, "_rehearsal_model"):
                 proc._rehearsal_model = deepcopy(proc.model)  # type: ignore[attr-defined]
diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py
index 37473b7f4..9e1face63 100644
--- a/spacy/ml/models/multi_task.py
+++ b/spacy/ml/models/multi_task.py
@@ -23,7 +23,7 @@ def create_pretrain_vectors(
     maxout_pieces: int, hidden_size: int, loss: str
 ) -> Callable[["Vocab", Model], Model]:
     def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
-        if vocab.vectors.data.shape[1] == 0:
+        if vocab.vectors.shape[1] == 0:
             raise ValueError(Errors.E875)
         model = build_cloze_multi_task_model(
             vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
@@ -116,7 +116,7 @@ def build_multi_task_model(
 def build_cloze_multi_task_model(
     vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int
 ) -> Model:
-    nO = vocab.vectors.data.shape[1]
+    nO = vocab.vectors.shape[1]
     output_layer = chain(
         cast(Model[List["Floats2d"], Floats2d], list2array()),
         Maxout(
diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py
index 8dd65833b..8d9b1af9b 100644
--- a/spacy/ml/staticvectors.py
+++ b/spacy/ml/staticvectors.py
@@ -94,7 +94,7 @@ def init(
     nM = model.get_dim("nM") if model.has_dim("nM") else None
     nO = model.get_dim("nO") if model.has_dim("nO") else None
     if X is not None and len(X):
-        nM = X[0].vocab.vectors.data.shape[1]
+        nM = X[0].vocab.vectors.shape[1]
     if Y is not None:
         nO = Y.data.shape[1]
 
diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py
index 9dc40b499..0650a7487 100644
--- a/spacy/tests/vocab_vectors/test_vectors.py
+++ b/spacy/tests/vocab_vectors/test_vectors.py
@@ -421,7 +421,7 @@ def test_vector_is_oov():
 def test_init_vectors_unset():
     v = Vectors(shape=(10, 10))
     assert v.is_full is False
-    assert v.data.shape == (10, 10)
+    assert v.shape == (10, 10)
 
     with pytest.raises(ValueError):
         v = Vectors(shape=(10, 10), mode="floret")
@@ -514,7 +514,7 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
         # rows: 2 rows per ngram
         rows = OPS.xp.asarray(
             [
-                h % nlp.vocab.vectors.data.shape[0]
+                h % nlp.vocab.vectors.shape[0]
                 for ngram in ngrams
                 for h in nlp.vocab.vectors._get_ngram_hashes(ngram)
             ],
@@ -544,17 +544,17 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
         # an empty key returns 0s
         assert_equal(
             OPS.to_numpy(nlp.vocab[""].vector),
-            numpy.zeros((nlp.vocab.vectors.data.shape[0],)),
+            numpy.zeros((nlp.vocab.vectors.shape[0],)),
         )
         # an empty batch returns 0s
         assert_equal(
             OPS.to_numpy(nlp.vocab.vectors.get_batch([""])),
-            numpy.zeros((1, nlp.vocab.vectors.data.shape[0])),
+            numpy.zeros((1, nlp.vocab.vectors.shape[0])),
         )
         # an empty key within a batch returns 0s
         assert_equal(
             OPS.to_numpy(nlp.vocab.vectors.get_batch(["a", "", "b"])[1]),
-            numpy.zeros((nlp.vocab.vectors.data.shape[0],)),
+            numpy.zeros((nlp.vocab.vectors.shape[0],)),
         )
 
         # the loaded ngram vector table cannot be modified
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 2f82a0d1b..5a0db115d 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -616,7 +616,7 @@ cdef class Doc:
         """
         if "has_vector" in self.user_hooks:
             return self.user_hooks["has_vector"](self)
-        elif self.vocab.vectors.data.size:
+        elif self.vocab.vectors.size:
             return True
         elif self.tensor.size:
             return True
@@ -641,7 +641,7 @@ cdef class Doc:
         if not len(self):
             self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f")
             return self._vector
-        elif self.vocab.vectors.data.size > 0:
+        elif self.vocab.vectors.size > 0:
             self._vector = sum(t.vector for t in self) / len(self)
             return self._vector
         elif self.tensor.size > 0:
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index cd02cab36..9bb6bf2e7 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -497,7 +497,7 @@ cdef class Span:
         """
         if "has_vector" in self.doc.user_span_hooks:
             return self.doc.user_span_hooks["has_vector"](self)
-        elif self.vocab.vectors.data.size > 0:
+        elif self.vocab.vectors.size > 0:
             return any(token.has_vector for token in self)
         elif self.doc.tensor.size > 0:
             return True
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 084204389..b59288e38 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -164,7 +164,7 @@ def load_vectors_into_model(
         len(vectors_nlp.vocab.vectors.keys()) == 0
         and vectors_nlp.vocab.vectors.mode != VectorsMode.floret
     ) or (
-        vectors_nlp.vocab.vectors.data.shape[0] == 0
+        vectors_nlp.vocab.vectors.shape[0] == 0
         and vectors_nlp.vocab.vectors.mode == VectorsMode.floret
     ):
         logger.warning(Warnings.W112.format(name=name))
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 345e8df68..bc4863703 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -10,7 +10,7 @@ from typing import cast
 import warnings
 from enum import Enum
 import srsly
-from thinc.api import get_array_module, get_current_ops
+from thinc.api import Ops, get_array_module, get_current_ops
 from thinc.backends import get_array_ops
 from thinc.types import Floats2d
 
@@ -146,7 +146,7 @@ cdef class Vectors:
 
         DOCS: https://spacy.io/api/vectors#size
         """
-        return self.data.shape[0] * self.data.shape[1]
+        return self.data.size
 
     @property
     def is_full(self):
@@ -517,6 +517,9 @@ cdef class Vectors:
                 for i in range(len(queries))
             ],
dtype="uint64") return (keys, best_rows, scores) + def to_ops(self, ops: Ops): + self.data = ops.asarray(self.data) + def _get_cfg(self): if self.mode == Mode.default: return { diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index e2e7ad1db..badd291ed 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -283,7 +283,7 @@ cdef class Vocab: @property def vectors_length(self): - return self.vectors.data.shape[1] + return self.vectors.shape[1] def reset_vectors(self, *, width=None, shape=None): """Drop the current vector table. Because all vectors must be the same @@ -294,7 +294,7 @@ cdef class Vocab: elif shape is not None: self.vectors = Vectors(strings=self.strings, shape=shape) else: - width = width if width is not None else self.vectors.data.shape[1] + width = width if width is not None else self.vectors.shape[1] self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width)) def prune_vectors(self, nr_row, batch_size=1024): diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index 84d2c00ad..b3bee822c 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -371,6 +371,23 @@ Get the vectors for the provided keys efficiently as a batch. | ------ | --------------------------------------- | | `keys` | The keys. ~~Iterable[Union[int, str]]~~ | +## Vectors.to_ops {#to_ops tag="method"} + +Change the embedding matrix to use different Thinc ops. + +> #### Example +> +> ```python +> from thinc.api import NumpyOps +> +> vectors.to_ops(NumpyOps()) +> +> ``` + +| Name | Description | +|-------|----------------------------------------------------------| +| `ops` | The Thinc ops to switch the embedding matrix to. ~~Ops~~ | + ## Vectors.to_disk {#to_disk tag="method"} Save the current state to a directory.