From 294f89e1caf68870f2ae75a894d345771c9ba69a Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 1 Aug 2023 11:19:04 +0200 Subject: [PATCH] Add initial draft of docs --- website/docs/api/basevectors.mdx | 149 +++++++++++++++++ website/docs/api/vectors.mdx | 9 +- .../docs/usage/embeddings-transformers.mdx | 156 ++++++++++++++++++ website/meta/sidebars.json | 1 + 4 files changed, 310 insertions(+), 5 deletions(-) create mode 100644 website/docs/api/basevectors.mdx diff --git a/website/docs/api/basevectors.mdx b/website/docs/api/basevectors.mdx new file mode 100644 index 000000000..03eb6e836 --- /dev/null +++ b/website/docs/api/basevectors.mdx @@ -0,0 +1,149 @@ +--- +title: BaseVectors +teaser: Abstract class for word vectors +tag: class +source: spacy/vectors.pyx +version: 3.7 +--- + +`BaseVectors` is an abstract class to support the development of custom vectors +implementations. + +For use in training with [`StaticVectors`](/api/architectures#staticvectors), +`get_batch` must be implemented. For improved performance, use efficient +batching in `get_batch` and implement `to_ops` to copy the vector data to the +current device. See an example custom implementation for +[BPEMb subword embeddings](/usage/embeddings-transformers#custom-vectors). + +## BaseVectors.\_\_init\_\_ {id="init",tag="method"} + +Create a new vector store. + +| Name | Description | +| -------------- | --------------------------------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `strings` | The string store. A new string store is created if one is not provided. Defaults to `None`. ~~Optional[StringStore]~~ | + +## BaseVectors.\_\_getitem\_\_ {id="getitem",tag="method"} + +Get a vector by key. If the key is not found in the table, a `KeyError` should +be raised. + +| Name | Description | +| ----------- | ---------------------------------------------------------------- | +| `key` | The key to get the vector for. ~~Union[int, str]~~ | +| **RETURNS** | The vector for the key. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | + +## BaseVectors.\_\_len\_\_ {id="len",tag="method"} + +Return the number of vectors in the table. + +| Name | Description | +| ----------- | ------------------------------------------- | +| **RETURNS** | The number of vectors in the table. ~~int~~ | + +## BaseVectors.\_\_contains\_\_ {id="contains",tag="method"} + +Check whether there is a vector entry for key. + +| Name | Description | +| ----------- | -------------------------------------------- | +| `key` | The key to check. ~~int~~ | +| **RETURNS** | Whether the key has a vector entry. ~~bool~~ | + +## BaseVectors.add {id="add",tag="method"} + +Add a key to the table, if possible. If no keys can be added, return `-1`. + +| Name | Description | +| ----------- | ----------------------------------------------------------------------------------- | +| `key` | The key to add. ~~Union[str, int]~~ | +| **RETURNS** | The row the vector was added to, or `-1` if the operation is not supported. ~~int~~ | + +## BaseVectors.shape {id="shape",tag="property"} + +Get `(rows, dims)` tuples of number of rows and number of dimensions in the +vector table. + +| Name | Description | +| ----------- | ------------------------------------------ | +| **RETURNS** | A `(rows, dims)` pair. ~~Tuple[int, int]~~ | + +## BaseVectors.size {id="size",tag="property"} + +The vector size, i.e. `rows * dims`. + +| Name | Description | +| ----------- | ------------------------ | +| **RETURNS** | The vector size. ~~int~~ | + +## BaseVectors.is_full {id="is_full",tag="property"} + +Whether the vectors table is full and no slots are available for new keys. + +| Name | Description | +| ----------- | ------------------------------------------- | +| **RETURNS** | Whether the vectors table is full. ~~bool~~ | + +## BaseVectors.get_batch {id="get_batch",tag="method",version="3.2"} + +Get the vectors for the provided keys efficiently as a batch. Required to use +the vectors with [`StaticVectors`](/api/architectures#StaticVectors) for +training. + +| Name | Description | +| ------ | --------------------------------------- | +| `keys` | The keys. ~~Iterable[Union[int, str]]~~ | + +## BaseVectors.to_ops {id="to_ops",tag="method"} + +Dummy method. Implement this to change the embedding matrix to use different +Thinc ops. + +| Name | Description | +| ----- | -------------------------------------------------------- | +| `ops` | The Thinc ops to switch the embedding matrix to. ~~Ops~~ | + +## BaseVectors.to_disk {id="to_disk",tag="method"} + +Dummy method to allow serialization. Implement to save vector data with the +pipeline. + +| Name | Description | +| ------ | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | + +## BaseVectors.from_disk {id="from_disk",tag="method"} + +Dummy method to allow serialization. Implement to load vector data from a saved +pipeline. + +| Name | Description | +| ----------- | ----------------------------------------------------------------------------------------------- | +| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| **RETURNS** | The modified vectors object. ~~BaseVectors~~ | + +## BaseVectors.to_bytes {id="to_bytes",tag="method"} + +Dummy method to allow serialization. Implement to serialize vector data to a +binary string. + +> #### Example +> +> ```python +> vectors_bytes = vectors.to_bytes() +> ``` + +| Name | Description | +| ----------- | ---------------------------------------------------- | +| **RETURNS** | The serialized form of the vectors object. ~~bytes~~ | + +## BaseVectors.from_bytes {id="from_bytes",tag="method"} + +Dummy method to allow serialization. Implement to load vector data from a binary +string. + +| Name | Description | +| ----------- | ----------------------------------- | +| `data` | The data to load from. ~~bytes~~ | +| **RETURNS** | The vectors object. ~~BaseVectors~~ | diff --git a/website/docs/api/vectors.mdx b/website/docs/api/vectors.mdx index fa4cd0c7a..0e92eb12b 100644 --- a/website/docs/api/vectors.mdx +++ b/website/docs/api/vectors.mdx @@ -297,10 +297,9 @@ The vector size, i.e. `rows * dims`. ## Vectors.is_full {id="is_full",tag="property"} -Whether the vectors table is full and has no slots are available for new keys. -If a table is full, it can be resized using -[`Vectors.resize`](/api/vectors#resize). In `floret` mode, the table is always -full and cannot be resized. +Whether the vectors table is full and no slots are available for new keys. If a +table is full, it can be resized using [`Vectors.resize`](/api/vectors#resize). +In `floret` mode, the table is always full and cannot be resized. > #### Example > @@ -441,7 +440,7 @@ Load state from a binary string. > #### Example > > ```python -> fron spacy.vectors import Vectors +> from spacy.vectors import Vectors > vectors_bytes = vectors.to_bytes() > new_vectors = Vectors(StringStore()) > new_vectors.from_bytes(vectors_bytes) diff --git a/website/docs/usage/embeddings-transformers.mdx b/website/docs/usage/embeddings-transformers.mdx index 5f1e5b817..3d4590b1b 100644 --- a/website/docs/usage/embeddings-transformers.mdx +++ b/website/docs/usage/embeddings-transformers.mdx @@ -632,6 +632,162 @@ def MyCustomVectors( ) ``` +#### Creating a custom vectors implementation {id="custom-vectors",version="3.7"} + +You can specify a custom registered vectors class under `[nlp.vectors]` in order +to use static vectors in formats other than the ones supported by +[`Vectors`](/api/vectors). Extend the abstract [`BaseVectors`](/api/basevectors) +class to implement your custom vectors. + +As an example, the following `BPEmbVectors` class implements support for +[BPEmb subword embeddings](https://bpemb.h-its.org/): + +```python +# requires: pip install bpemb +from typing import cast, Callable, Optional +from pathlib import Path +import warnings +from bpemb import BPEmb +from spacy.util import registry +from spacy.vectors import BaseVectors +from spacy.vocab import Vocab +from thinc.api import Ops, get_current_ops +from thinc.backends import get_array_ops +from thinc.types import Floats2d + + +class BPEmbVectors(BaseVectors): + def __init__( + self, + *, + strings: Optional[str] = None, + lang: Optional[str] = None, + vs: Optional[int] = None, + dim: Optional[int] = None, + cache_dir: Optional[Path] = None, + encode_extra_options: Optional[str] = None, + model_file: Optional[Path] = None, + emb_file: Optional[Path] = None, + ): + kwargs = {} + if lang is not None: + kwargs["lang"] = lang + if vs is not None: + kwargs["vs"] = vs + if dim is not None: + kwargs["dim"] = dim + if cache_dir is not None: + kwargs["cache_dir"] = cache_dir + if encode_extra_options is not None: + kwargs["encode_extra_options"] = encode_extra_options + if model_file is not None: + kwargs["model_file"] = model_file + if emb_file is not None: + kwargs["emb_file"] = emb_file + self.bpemb = BPEmb(**kwargs) + self.strings = strings + self.name = repr(self.bpemb) + self.n_keys = -1 + self.mode = "BPEmb" + self.to_ops(get_current_ops()) + + def __contains__(self, key): + return True + + def is_full(self): + return True + + def add(self, key, *, vector=None, row=None): + warnings.warn( + ( + "Skipping BPEmbVectors.add: the bpemb vector table cannot be " + "modified. Vectors are calculated from bytepieces." + ) + ) + return -1 + + def __getitem__(self, key): + return self.get_batch([key])[0] + + def get_batch(self, keys): + keys = [self.strings.as_string(key) for key in keys] + bp_ids = self.bpemb.encode_ids(keys) + ops = get_array_ops(self.bpemb.emb.vectors) + indices = ops.asarray(ops.xp.hstack(bp_ids), dtype="int32") + lengths = ops.asarray([len(x) for x in bp_ids], dtype="int32") + vecs = ops.reduce_mean(cast(Floats2d, self.bpemb.emb.vectors[indices]), lengths) + return vecs + + @property + def shape(self): + return self.bpemb.vectors.shape + + def __len__(self): + return self.shape[0] + + @property + def vectors_length(self): + return self.shape[1] + + @property + def size(self): + return self.bpemb.vectors.size + + def to_ops(self, ops: Ops): + self.bpemb.emb.vectors = ops.asarray(self.bpemb.emb.vectors) + + +@registry.vectors("BPEmbVectors.v1") +def create_bpemb_vectors( + lang: Optional[str] = "multi", + vs: Optional[int] = None, + dim: Optional[int] = None, + cache_dir: Optional[Path] = None, + encode_extra_options: Optional[str] = None, + model_file: Optional[Path] = None, + emb_file: Optional[Path] = None, +) -> Callable[[Vocab], BPEmbVectors]: + def bpemb_vectors_factory(vocab: Vocab) -> BPEmbVectors: + return BPEmbVectors( + strings=vocab.strings, + lang=lang, + vs=vs, + dim=dim, + cache_dir=cache_dir, + encode_extra_options=encode_extra_options, + model_file=model_file, + emb_file=emb_file, + ) + + return bpemb_vectors_factory +``` + + + +Note that the serialization methods are not implemented, so the embeddings are +loaded from your local cache or downloaded by `BPEmb` each time the pipeline is +loaded. + + + +To use this in your pipeline, specify this registered function under +`[nlp.vectors]` in your config: + +```ini +[nlp.vectors] +@vectors = "BPEmbVectors.v1" +lang = "en" +``` + +Or specify it when creating a blank pipeline: + +```python +nlp = spacy.blank("en", config={"nlp.vectors": {"@vectors": "BPEmbVectors.v1", "lang": "en"}}) +``` + +Remember to include this code with `--code` when using +[`spacy train`](/api/cli#train) and [`spacy package`](/api/cli#package). + ## Pretraining {id="pretraining"} The [`spacy pretrain`](/api/cli#pretrain) command lets you initialize your diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 04102095f..d2f73d83a 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -131,6 +131,7 @@ "label": "Other", "items": [ { "text": "Attributes", "url": "/api/attributes" }, + { "text": "BaseVectors", "url": "/api/basevectors" }, { "text": "Corpus", "url": "/api/corpus" }, { "text": "InMemoryLookupKB", "url": "/api/inmemorylookupkb" }, { "text": "KnowledgeBase", "url": "/api/kb" },