mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Update docs [ci skip]
This commit is contained in:
parent
9299166c75
commit
728fec0194
|
@ -36,11 +36,11 @@ redirects = [
|
||||||
{from = "/docs/api/features", to = "/models/#architecture", force = true},
|
{from = "/docs/api/features", to = "/models/#architecture", force = true},
|
||||||
{from = "/docs/api/philosophy", to = "/usage/spacy-101", force = true},
|
{from = "/docs/api/philosophy", to = "/usage/spacy-101", force = true},
|
||||||
{from = "/docs/usage/showcase", to = "/universe", force = true},
|
{from = "/docs/usage/showcase", to = "/universe", force = true},
|
||||||
{from = "/tutorials/load-new-word-vectors", to = "/usage/vectors-similarity#custom", force = true},
|
{from = "/tutorials/load-new-word-vectors", to = "/usage/linguistic-features", force = true},
|
||||||
{from = "/tutorials", to = "/usage/examples", force = true},
|
{from = "/tutorials", to = "/usage/examples", force = true},
|
||||||
# Old documentation pages (v2.x)
|
# Old documentation pages (v2.x)
|
||||||
{from = "/usage/adding-languages", to = "/usage/linguistic-features", force = true},
|
{from = "/usage/adding-languages", to = "/usage/linguistic-features", force = true},
|
||||||
{from = "/usage/vectors-similarity", to = "/usage/vectors-embeddings", force = true},
|
{from = "/usage/vectors-similarity", to = "/usage/linguistic-features#vectors-similarity", force = true},
|
||||||
{from = "/api/goldparse", to = "/api/top-level", force = true},
|
{from = "/api/goldparse", to = "/api/top-level", force = true},
|
||||||
{from = "/api/goldcorpus", to = "/api/corpus", force = true},
|
{from = "/api/goldcorpus", to = "/api/corpus", force = true},
|
||||||
{from = "/api/annotation", to = "/api/data-formats", force = true},
|
{from = "/api/annotation", to = "/api/data-formats", force = true},
|
||||||
|
|
|
@ -243,11 +243,15 @@ Encode context using bidirectional LSTM layers. Requires
|
||||||
| `window_size` | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. ~~int~~ |
|
| `window_size` | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. ~~int~~ |
|
||||||
| `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ |
|
| `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ |
|
||||||
|
|
||||||
|
### spacy.StaticVectors.v1 {#StaticVectors}
|
||||||
|
|
||||||
|
<!-- TODO: -->
|
||||||
|
|
||||||
## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"}
|
## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"}
|
||||||
|
|
||||||
The following architectures are provided by the package
|
The following architectures are provided by the package
|
||||||
[`spacy-transformers`](https://github.com/explosion/spacy-transformers). See the
|
[`spacy-transformers`](https://github.com/explosion/spacy-transformers). See the
|
||||||
[usage documentation](/usage/transformers) for how to integrate the
|
[usage documentation](/usage/embeddings-transformers) for how to integrate the
|
||||||
architectures into your training config.
|
architectures into your training config.
|
||||||
|
|
||||||
### spacy-transformers.TransformerModel.v1 {#TransformerModel}
|
### spacy-transformers.TransformerModel.v1 {#TransformerModel}
|
||||||
|
|
|
@ -162,14 +162,12 @@ $ python -m spacy init fill-config [base_path] [output_file] [--diff]
|
||||||
|
|
||||||
### init model {#init-model new="2" tag="command"}
|
### init model {#init-model new="2" tag="command"}
|
||||||
|
|
||||||
<!-- TODO: update for v3 -->
|
|
||||||
|
|
||||||
Create a new model directory from raw data, like word frequencies, Brown
|
Create a new model directory from raw data, like word frequencies, Brown
|
||||||
clusters and word vectors. This command is similar to the `spacy model` command
|
clusters and word vectors. Note that in order to populate the model's vocab, you
|
||||||
in v1.x. Note that in order to populate the model's vocab, you need to pass in a
|
need to pass in a JSONL-formatted
|
||||||
JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) as
|
[vocabulary file](/api/data-formats#vocab-jsonl) as `--jsonl-loc` with optional
|
||||||
`--jsonl-loc` with optional `id` values that correspond to the vectors table.
|
`id` values that correspond to the vectors table. Just loading in vectors will
|
||||||
Just loading in vectors will not automatically populate the vocab.
|
not automatically populate the vocab.
|
||||||
|
|
||||||
<Infobox title="New in v3.0" variant="warning">
|
<Infobox title="New in v3.0" variant="warning">
|
||||||
|
|
||||||
|
|
|
@ -316,7 +316,7 @@ factories.
|
||||||
The following registries are added by the
|
The following registries are added by the
|
||||||
[`spacy-transformers`](https://github.com/explosion/spacy-transformers) package.
|
[`spacy-transformers`](https://github.com/explosion/spacy-transformers) package.
|
||||||
See the [`Transformer`](/api/transformer) API reference and
|
See the [`Transformer`](/api/transformer) API reference and
|
||||||
[usage docs](/usage/transformers) for details.
|
[usage docs](/usage/embeddings-transformers) for details.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
|
|
@ -41,7 +41,8 @@ token, the spaCy token receives the sum of their values. To access the values,
|
||||||
you can use the custom [`Doc._.trf_data`](#custom-attributes) attribute. The
|
you can use the custom [`Doc._.trf_data`](#custom-attributes) attribute. The
|
||||||
package also adds the function registries [`@span_getters`](#span_getters) and
|
package also adds the function registries [`@span_getters`](#span_getters) and
|
||||||
[`@annotation_setters`](#annotation_setters) with several built-in registered
|
[`@annotation_setters`](#annotation_setters) with several built-in registered
|
||||||
functions. For more details, see the [usage documentation](/usage/transformers).
|
functions. For more details, see the
|
||||||
|
[usage documentation](/usage/embeddings-transformers).
|
||||||
|
|
||||||
## Config and implementation {#config}
|
## Config and implementation {#config}
|
||||||
|
|
||||||
|
|
|
@ -77,12 +77,14 @@ or flagging duplicates. For example, you can suggest a user content that's
|
||||||
similar to what they're currently looking at, or label a support ticket as a
|
similar to what they're currently looking at, or label a support ticket as a
|
||||||
duplicate if it's very similar to an already existing one.
|
duplicate if it's very similar to an already existing one.
|
||||||
|
|
||||||
Each `Doc`, `Span` and `Token` comes with a
|
Each [`Doc`](/api/doc), [`Span`](/api/span), [`Token`](/api/token) and
|
||||||
[`.similarity()`](/api/token#similarity) method that lets you compare it with
|
[`Lexeme`](/api/lexeme) comes with a [`.similarity`](/api/token#similarity)
|
||||||
another object, and determine the similarity. Of course similarity is always
|
method that lets you compare it with another object, and determine the
|
||||||
subjective – whether "dog" and "cat" are similar really depends on how you're
|
similarity. Of course similarity is always subjective – whether "dog" and "cat"
|
||||||
looking at it. spaCy's similarity model usually assumes a pretty general-purpose
|
are similar really depends on how you're looking at it. spaCy's similarity model
|
||||||
definition of similarity.
|
usually assumes a pretty general-purpose definition of similarity.
|
||||||
|
|
||||||
|
<!-- TODO: use better example here -->
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### {executable="true"}
|
### {executable="true"}
|
||||||
|
|
459
website/docs/usage/embeddings-transformers.md
Normal file
459
website/docs/usage/embeddings-transformers.md
Normal file
|
@ -0,0 +1,459 @@
|
||||||
|
---
|
||||||
|
title: Embeddings, Transformers and Transfer Learning
|
||||||
|
teaser: Using transformer embeddings like BERT in spaCy
|
||||||
|
menu:
|
||||||
|
- ['Embedding Layers', 'embedding-layers']
|
||||||
|
- ['Transformers', 'transformers']
|
||||||
|
- ['Static Vectors', 'static-vectors']
|
||||||
|
- ['Pretraining', 'pretraining']
|
||||||
|
next: /usage/training
|
||||||
|
---
|
||||||
|
|
||||||
|
<!-- TODO: intro, short explanation of embeddings/transformers, point user to processing pipelines docs for intro -->
|
||||||
|
|
||||||
|
## Shared embedding layers {#embedding-layers}
|
||||||
|
|
||||||
|
<!-- TODO: write: `Tok2Vec` and `Transformer` components -->
|
||||||
|
|
||||||
|
<Accordion title="What’s the difference between word vectors and language models?" id="vectors-vs-language-models">
|
||||||
|
|
||||||
|
The key difference between [word vectors](#static-vectors) and contextual language
|
||||||
|
models such as [transformers](#transformers) is that word vectors model
|
||||||
|
**lexical types**, rather than _tokens_. If you have a list of terms with no
|
||||||
|
context around them, a transformer model like BERT can't really help you. BERT
|
||||||
|
is designed to understand language **in context**, which isn't what you have. A
|
||||||
|
word vectors table will be a much better fit for your task. However, if you do
|
||||||
|
have words in context — whole sentences or paragraphs of running text — word
|
||||||
|
vectors will only provide a very rough approximation of what the text is about.
|
||||||
|
|
||||||
|
Word vectors are also very computationally efficient, as they map a word to a
|
||||||
|
vector with a single indexing operation. Word vectors are therefore useful as a
|
||||||
|
way to **improve the accuracy** of neural network models, especially models that
|
||||||
|
are small or have received little or no pretraining. In spaCy, word vector
|
||||||
|
tables are only used as **static features**. spaCy does not backpropagate
|
||||||
|
gradients to the pretrained word vectors table. The static vectors table is
|
||||||
|
usually used in combination with a smaller table of learned task-specific
|
||||||
|
embeddings.
|
||||||
|
|
||||||
|
</Accordion>
|
||||||
|
|
||||||
|
<Accordion title="When should I add word vectors to my model?">
|
||||||
|
|
||||||
|
Word vectors are not compatible with most [transformer models](#transformers),
|
||||||
|
but if you're training another type of NLP network, it's almost always worth
|
||||||
|
adding word vectors to your model. As well as improving your final accuracy,
|
||||||
|
word vectors often make experiments more consistent, as the accuracy you reach
|
||||||
|
will be less sensitive to how the network is randomly initialized. High variance
|
||||||
|
due to random chance can slow down your progress significantly, as you need to
|
||||||
|
run many experiments to filter the signal from the noise.
|
||||||
|
|
||||||
|
Word vector features need to be enabled prior to training, and the same word
|
||||||
|
vectors table will need to be available at runtime as well. You cannot add word
|
||||||
|
vector features once the model has already been trained, and you usually cannot
|
||||||
|
replace one word vectors table with another without causing a significant loss
|
||||||
|
of performance.
|
||||||
|
|
||||||
|
</Accordion>
|
||||||
|
|
||||||
|
## Using transformer models {#transformers}
|
||||||
|
|
||||||
|
Transformers are a family of neural network architectures that compute **dense,
|
||||||
|
context-sensitive representations** for the tokens in your documents. Downstream
|
||||||
|
models in your pipeline can then use these representations as input features to
|
||||||
|
**improve their predictions**. You can connect multiple components to a single
|
||||||
|
transformer model, with any or all of those components giving feedback to the
|
||||||
|
transformer to fine-tune it to your tasks. spaCy's transformer support
|
||||||
|
interoperates with [PyTorch](https://pytorch.org) and the
|
||||||
|
[HuggingFace `transformers`](https://huggingface.co/transformers/) library,
|
||||||
|
giving you access to thousands of pretrained models for your pipelines. There
|
||||||
|
are many [great guides](http://jalammar.github.io/illustrated-transformer/) to
|
||||||
|
transformer models, but for practical purposes, you can simply think of them as
|
||||||
|
a drop-in replacement that lets you achieve **higher accuracy** in exchange for
|
||||||
|
**higher training and runtime costs**.
|
||||||
|
|
||||||
|
### Setup and installation {#transformers-installation}
|
||||||
|
|
||||||
|
> #### System requirements
|
||||||
|
>
|
||||||
|
> We recommend an NVIDIA **GPU** with at least **10GB of memory** in order to
|
||||||
|
> work with transformer models. Make sure your GPU drivers are up to date and
|
||||||
|
> you have **CUDA v9+** installed.
|
||||||
|
|
||||||
|
> The exact requirements will depend on the transformer model. Training a
|
||||||
|
> transformer-based model without a GPU will be too slow for most practical
|
||||||
|
> purposes.
|
||||||
|
>
|
||||||
|
> Provisioning a new machine will require about **5GB** of data to be
|
||||||
|
> downloaded: 3GB CUDA runtime, 800MB PyTorch, 400MB CuPy, 500MB weights, 200MB
|
||||||
|
> spaCy and dependencies.
|
||||||
|
|
||||||
|
Once you have CUDA installed, you'll need to install two pip packages,
|
||||||
|
[`cupy`](https://docs.cupy.dev/en/stable/install.html) and
|
||||||
|
[`spacy-transformers`](https://github.com/explosion/spacy-transformers). `cupy`
|
||||||
|
is just like `numpy`, but for GPU. The best way to install it is to choose a
|
||||||
|
wheel that matches the version of CUDA you're using. You may also need to set
|
||||||
|
the `CUDA_PATH` environment variable if your CUDA runtime is installed in a
|
||||||
|
non-standard location. Putting it all together, if you had installed CUDA 10.2
|
||||||
|
in `/opt/nvidia/cuda`, you would run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
### Installation with CUDA
|
||||||
|
export CUDA_PATH="/opt/nvidia/cuda"
|
||||||
|
pip install cupy-cuda102
|
||||||
|
pip install spacy-transformers
|
||||||
|
```
|
||||||
|
|
||||||
|
### Runtime usage {#transformers-runtime}
|
||||||
|
|
||||||
|
Transformer models can be used as **drop-in replacements** for other types of
|
||||||
|
neural networks, so your spaCy pipeline can include them in a way that's
|
||||||
|
completely invisible to the user. Users will download, load and use the model in
|
||||||
|
the standard way, like any other spaCy pipeline. Instead of using the
|
||||||
|
transformers as subnetworks directly, you can also use them via the
|
||||||
|
[`Transformer`](/api/transformer) pipeline component.
|
||||||
|
|
||||||
|
![The processing pipeline with the transformer component](../images/pipeline_transformer.svg)
|
||||||
|
|
||||||
|
The `Transformer` component sets the
|
||||||
|
[`Doc._.trf_data`](/api/transformer#custom-attributes) extension attribute,
|
||||||
|
which lets you access the transformer's outputs at runtime.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ python -m spacy download en_core_trf_lg
|
||||||
|
```
|
||||||
|
|
||||||
|
```python
|
||||||
|
### Example
|
||||||
|
import spacy
|
||||||
|
from thinc.api import use_pytorch_for_gpu_memory, require_gpu
|
||||||
|
|
||||||
|
# Use the GPU, with memory allocations directed via PyTorch.
|
||||||
|
# This prevents out-of-memory errors that would otherwise occur from competing
|
||||||
|
# memory pools.
|
||||||
|
use_pytorch_for_gpu_memory()
|
||||||
|
require_gpu(0)
|
||||||
|
|
||||||
|
nlp = spacy.load("en_core_trf_lg")
|
||||||
|
for doc in nlp.pipe(["some text", "some other text"]):
|
||||||
|
tokvecs = doc._.trf_data.tensors[-1]
|
||||||
|
```
|
||||||
|
|
||||||
|
You can also customize how the [`Transformer`](/api/transformer) component sets
|
||||||
|
annotations onto the [`Doc`](/api/doc), by customizing the `annotation_setter`.
|
||||||
|
This callback will be called with the raw input and output data for the whole
|
||||||
|
batch, along with the batch of `Doc` objects, allowing you to implement whatever
|
||||||
|
you need. The annotation setter is called with a batch of [`Doc`](/api/doc)
|
||||||
|
objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch)
|
||||||
|
containing the transformer data for the batch.
|
||||||
|
|
||||||
|
```python
|
||||||
|
def custom_annotation_setter(docs, trf_data):
|
||||||
|
# TODO:
|
||||||
|
...
|
||||||
|
|
||||||
|
nlp = spacy.load("en_core_trf_lg")
|
||||||
|
nlp.get_pipe("transformer").annotation_setter = custom_annotation_setter
|
||||||
|
doc = nlp("This is a text")
|
||||||
|
print() # TODO:
|
||||||
|
```
|
||||||
|
|
||||||
|
### Training usage {#transformers-training}
|
||||||
|
|
||||||
|
The recommended workflow for training is to use spaCy's
|
||||||
|
[config system](/usage/training#config), usually via the
|
||||||
|
[`spacy train`](/api/cli#train) command. The training config defines all
|
||||||
|
component settings and hyperparameters in one place and lets you describe a tree
|
||||||
|
of objects by referring to creation functions, including functions you register
|
||||||
|
yourself. For details on how to get started with training your own model, check
|
||||||
|
out the [training quickstart](/usage/training#quickstart).
|
||||||
|
|
||||||
|
<Project id="en_core_bert">
|
||||||
|
|
||||||
|
The easiest way to get started is to clone a transformers-based project
|
||||||
|
template. Swap in your data, edit the settings and hyperparameters and train,
|
||||||
|
evaluate, package and visualize your model.
|
||||||
|
|
||||||
|
</Project>
|
||||||
|
|
||||||
|
The `[components]` section in the [`config.cfg`](/api/data-formats#config)
|
||||||
|
describes the pipeline components and the settings used to construct them,
|
||||||
|
including their model implementation. Here's a config snippet for the
|
||||||
|
[`Transformer`](/api/transformer) component, along with matching Python code. In
|
||||||
|
this case, the `[components.transformer]` block describes the `transformer`
|
||||||
|
component:
|
||||||
|
|
||||||
|
> #### Python equivalent
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> from spacy_transformers import Transformer, TransformerModel
|
||||||
|
> from spacy_transformers.annotation_setters import null_annotation_setter
|
||||||
|
> from spacy_transformers.span_getters import get_doc_spans
|
||||||
|
>
|
||||||
|
> trf = Transformer(
|
||||||
|
> nlp.vocab,
|
||||||
|
> TransformerModel(
|
||||||
|
> "bert-base-cased",
|
||||||
|
> get_spans=get_doc_spans,
|
||||||
|
> tokenizer_config={"use_fast": True},
|
||||||
|
> ),
|
||||||
|
> annotation_setter=null_annotation_setter,
|
||||||
|
> max_batch_items=4096,
|
||||||
|
> )
|
||||||
|
> ```
|
||||||
|
|
||||||
|
```ini
|
||||||
|
### config.cfg (excerpt)
|
||||||
|
[components.transformer]
|
||||||
|
factory = "transformer"
|
||||||
|
max_batch_items = 4096
|
||||||
|
|
||||||
|
[components.transformer.model]
|
||||||
|
@architectures = "spacy-transformers.TransformerModel.v1"
|
||||||
|
name = "bert-base-cased"
|
||||||
|
tokenizer_config = {"use_fast": true}
|
||||||
|
|
||||||
|
[components.transformer.model.get_spans]
|
||||||
|
@span_getters = "doc_spans.v1"
|
||||||
|
|
||||||
|
[components.transformer.annotation_setter]
|
||||||
|
@annotation_setters = "spacy-transformers.null_annotation_setter.v1"
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
The `[components.transformer.model]` block describes the `model` argument passed
|
||||||
|
to the transformer component. It's a Thinc
|
||||||
|
[`Model`](https://thinc.ai/docs/api-model) object that will be passed into the
|
||||||
|
component. Here, it references the function
|
||||||
|
[spacy-transformers.TransformerModel.v1](/api/architectures#TransformerModel)
|
||||||
|
registered in the [`architectures` registry](/api/top-level#registry). If a key
|
||||||
|
in a block starts with `@`, it's **resolved to a function** and all other
|
||||||
|
settings are passed to the function as arguments. In this case, `name`,
|
||||||
|
`tokenizer_config` and `get_spans`.
|
||||||
|
|
||||||
|
`get_spans` is a function that takes a batch of `Doc` objects and returns lists
|
||||||
|
of potentially overlapping `Span` objects to process by the transformer. Several
|
||||||
|
[built-in functions](/api/transformer#span_getters) are available – for example,
|
||||||
|
to process the whole document or individual sentences. When the config is
|
||||||
|
resolved, the function is created and passed into the model as an argument.
|
||||||
|
|
||||||
|
<Infobox variant="warning">
|
||||||
|
|
||||||
|
Remember that the `config.cfg` used for training should contain **no missing
|
||||||
|
values** and requires all settings to be defined. You don't want any hidden
|
||||||
|
defaults creeping in and changing your results! spaCy will tell you if settings
|
||||||
|
are missing, and you can run
|
||||||
|
[`spacy init fill-config`](/api/cli#init-fill-config) to automatically fill in
|
||||||
|
all defaults.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
|
### Customizing the settings {#transformers-training-custom-settings}
|
||||||
|
|
||||||
|
To change any of the settings, you can edit the `config.cfg` and re-run the
|
||||||
|
training. To change any of the functions, like the span getter, you can replace
|
||||||
|
the name of the referenced function – e.g. `@span_getters = "sent_spans.v1"` to
|
||||||
|
process sentences. You can also register your own functions using the
|
||||||
|
`span_getters` registry:
|
||||||
|
|
||||||
|
> #### config.cfg
|
||||||
|
>
|
||||||
|
> ```ini
|
||||||
|
> [components.transformer.model.get_spans]
|
||||||
|
> @span_getters = "custom_sent_spans"
|
||||||
|
> ```
|
||||||
|
|
||||||
|
```python
|
||||||
|
### code.py
|
||||||
|
import spacy_transformers
|
||||||
|
|
||||||
|
@spacy_transformers.registry.span_getters("custom_sent_spans")
|
||||||
|
def configure_custom_sent_spans():
|
||||||
|
# TODO: write custom example
|
||||||
|
def get_sent_spans(docs):
|
||||||
|
return [list(doc.sents) for doc in docs]
|
||||||
|
|
||||||
|
return get_sent_spans
|
||||||
|
```
|
||||||
|
|
||||||
|
To resolve the config during training, spaCy needs to know about your custom
|
||||||
|
function. You can make it available via the `--code` argument that can point to
|
||||||
|
a Python file. For more details on training with custom code, see the
|
||||||
|
[training documentation](/usage/training#custom-code).
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ python -m spacy train ./config.cfg --code ./code.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### Customizing the model implementations {#training-custom-model}
|
||||||
|
|
||||||
|
The [`Transformer`](/api/transformer) component expects a Thinc
|
||||||
|
[`Model`](https://thinc.ai/docs/api-model) object to be passed in as its `model`
|
||||||
|
argument. You're not limited to the implementation provided by
|
||||||
|
`spacy-transformers` – the only requirement is that your registered function
|
||||||
|
must return an object of type ~~Model[List[Doc], FullTransformerBatch]~~: that
|
||||||
|
is, a Thinc model that takes a list of [`Doc`](/api/doc) objects, and returns a
|
||||||
|
[`FullTransformerBatch`](/api/transformer#fulltransformerbatch) object with the
|
||||||
|
transformer data.
|
||||||
|
|
||||||
|
> #### Model type annotations
|
||||||
|
>
|
||||||
|
> In the documentation and code base, you may come across type annotations and
|
||||||
|
> descriptions of [Thinc](https://thinc.ai) model types, like ~~Model[List[Doc],
|
||||||
|
> List[Floats2d]]~~. This so-called generic type describes the layer and its
|
||||||
|
> input and output type – in this case, it takes a list of `Doc` objects as the
|
||||||
|
> input and a list of 2-dimensional arrays of floats as the output. You can read
|
||||||
|
> more about defining Thinc models [here](https://thinc.ai/docs/usage-models).
|
||||||
|
> Also see the [type checking](https://thinc.ai/docs/usage-type-checking) for
|
||||||
|
> how to enable linting in your editor to see live feedback if your inputs and
|
||||||
|
> outputs don't match.
|
||||||
|
|
||||||
|
The same idea applies to task models that power the **downstream components**.
|
||||||
|
Most of spaCy's built-in model creation functions support a `tok2vec` argument,
|
||||||
|
which should be a Thinc layer of type ~~Model[List[Doc], List[Floats2d]]~~. This
|
||||||
|
is where we'll plug in our transformer model, using the
|
||||||
|
[Tok2VecListener](/api/architectures#Tok2VecListener) layer, which sneakily
|
||||||
|
delegates to the `Transformer` pipeline component.
|
||||||
|
|
||||||
|
```ini
|
||||||
|
### config.cfg (excerpt) {highlight="12"}
|
||||||
|
[components.ner]
|
||||||
|
factory = "ner"
|
||||||
|
|
||||||
|
[nlp.pipeline.ner.model]
|
||||||
|
@architectures = "spacy.TransitionBasedParser.v1"
|
||||||
|
nr_feature_tokens = 3
|
||||||
|
hidden_width = 128
|
||||||
|
maxout_pieces = 3
|
||||||
|
use_upper = false
|
||||||
|
|
||||||
|
[nlp.pipeline.ner.model.tok2vec]
|
||||||
|
@architectures = "spacy-transformers.Tok2VecListener.v1"
|
||||||
|
grad_factor = 1.0
|
||||||
|
|
||||||
|
[nlp.pipeline.ner.model.tok2vec.pooling]
|
||||||
|
@layers = "reduce_mean.v1"
|
||||||
|
```
|
||||||
|
|
||||||
|
The [Tok2VecListener](/api/architectures#Tok2VecListener) layer expects a
|
||||||
|
[pooling layer](https://thinc.ai/docs/api-layers#reduction-ops) as the argument
|
||||||
|
`pooling`, which needs to be of type ~~Model[Ragged, Floats2d]~~. This layer
|
||||||
|
determines how the vector for each spaCy token will be computed from the zero or
|
||||||
|
more source rows the token is aligned against. Here we use the
|
||||||
|
[`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean) layer, which
|
||||||
|
averages the wordpiece rows. We could instead use
|
||||||
|
[`reduce_max`](https://thinc.ai/docs/api-layers#reduce_max), or a custom
|
||||||
|
function you write yourself.
|
||||||
|
|
||||||
|
You can have multiple components all listening to the same transformer model,
|
||||||
|
and all passing gradients back to it. By default, all of the gradients will be
|
||||||
|
**equally weighted**. You can control this with the `grad_factor` setting, which
|
||||||
|
lets you reweight the gradients from the different listeners. For instance,
|
||||||
|
setting `grad_factor = 0` would disable gradients from one of the listeners,
|
||||||
|
while `grad_factor = 2.0` would multiply them by 2. This is similar to having a
|
||||||
|
custom learning rate for each component. Instead of a constant, you can also
|
||||||
|
provide a schedule, allowing you to freeze the shared parameters at the start of
|
||||||
|
training.
|
||||||
|
|
||||||
|
## Static vectors {#static-vectors}
|
||||||
|
|
||||||
|
<!-- TODO: write -->
|
||||||
|
|
||||||
|
### Using word vectors in your models {#word-vectors-models}
|
||||||
|
|
||||||
|
Many neural network models are able to use word vector tables as additional
|
||||||
|
features, which sometimes results in significant improvements in accuracy.
|
||||||
|
spaCy's built-in embedding layer,
|
||||||
|
[MultiHashEmbed](/api/architectures#MultiHashEmbed), can be configured to use
|
||||||
|
word vector tables using the `also_use_static_vectors` flag. This setting is
|
||||||
|
also available on the [MultiHashEmbedCNN](/api/architectures#MultiHashEmbedCNN)
|
||||||
|
layer, which builds the default token-to-vector encoding architecture.
|
||||||
|
|
||||||
|
```ini
|
||||||
|
[tagger.model.tok2vec.embed]
|
||||||
|
@architectures = "spacy.MultiHashEmbed.v1"
|
||||||
|
width = 128
|
||||||
|
rows = 7000
|
||||||
|
also_embed_subwords = true
|
||||||
|
also_use_static_vectors = true
|
||||||
|
```
|
||||||
|
|
||||||
|
<Infobox title="How it works" emoji="💡">
|
||||||
|
|
||||||
|
The configuration system will look up the string `"spacy.MultiHashEmbed.v1"` in
|
||||||
|
the `architectures` [registry](/api/top-level#registry), and call the returned
|
||||||
|
object with the rest of the arguments from the block. This will result in a call
|
||||||
|
to the
|
||||||
|
[`MultiHashEmbed`](https://github.com/explosion/spacy/tree/develop/spacy/ml/models/tok2vec.py)
|
||||||
|
function, which will return a [Thinc](https://thinc.ai) model object with the
|
||||||
|
type signature ~~Model[List[Doc], List[Floats2d]]~~. Because the embedding layer
|
||||||
|
takes a list of `Doc` objects as input, it does not need to store a copy of the
|
||||||
|
vectors table. The vectors will be retrieved from the `Doc` objects that are
|
||||||
|
passed in, via the `doc.vocab.vectors` attribute. This part of the process is
|
||||||
|
handled by the [StaticVectors](/api/architectures#StaticVectors) layer.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
|
#### Creating a custom embedding layer {#custom-embedding-layer}
|
||||||
|
|
||||||
|
The [MultiHashEmbed](/api/architectures#MultiHashEmbed) layer is spaCy's
|
||||||
|
recommended strategy for constructing initial word representations for your
|
||||||
|
neural network models, but you can also implement your own. You can register any
|
||||||
|
function to a string name, and then reference that function within your config
|
||||||
|
(see the [training docs](/usage/training) for more details). To try this out,
|
||||||
|
you can save the following little example to a new Python file:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from spacy.ml.staticvectors import StaticVectors
|
||||||
|
from spacy.util import registry
|
||||||
|
|
||||||
|
print("I was imported!")
|
||||||
|
|
||||||
|
@registry.architectures("my_example.MyEmbedding.v1")
|
||||||
|
def MyEmbedding(output_width: int) -> Model[List[Doc], List[Floats2d]]:
|
||||||
|
print("I was called!")
|
||||||
|
return StaticVectors(nO=output_width)
|
||||||
|
```
|
||||||
|
|
||||||
|
If you pass the path to your file to the [`spacy train`](/api/cli#train) command
|
||||||
|
using the `--code` argument, your file will be imported, which means the
|
||||||
|
decorator registering the function will be run. Your function is now on equal
|
||||||
|
footing with any of spaCy's built-ins, so you can drop it in instead of any
|
||||||
|
other model with the same input and output signature. For instance, you could
|
||||||
|
use it in the tagger model as follows:
|
||||||
|
|
||||||
|
```ini
|
||||||
|
[tagger.model.tok2vec.embed]
|
||||||
|
@architectures = "my_example.MyEmbedding.v1"
|
||||||
|
output_width = 128
|
||||||
|
```
|
||||||
|
|
||||||
|
Now that you have a custom function wired into the network, you can start
|
||||||
|
implementing the logic you're interested in. For example, let's say you want to
|
||||||
|
try a relatively simple embedding strategy that makes use of static word
|
||||||
|
vectors, but combines them via summation with a smaller table of learned
|
||||||
|
embeddings.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from thinc.api import add, chain, remap_ids, Embed
|
||||||
|
from spacy.ml.staticvectors import StaticVectors
|
||||||
|
|
||||||
|
@registry.architectures("my_example.MyEmbedding.v1")
|
||||||
|
def MyCustomVectors(
|
||||||
|
output_width: int,
|
||||||
|
vector_width: int,
|
||||||
|
embed_rows: int,
|
||||||
|
key2row: Dict[int, int]
|
||||||
|
) -> Model[List[Doc], List[Floats2d]]:
|
||||||
|
return add(
|
||||||
|
StaticVectors(nO=output_width),
|
||||||
|
chain(
|
||||||
|
FeatureExtractor(["ORTH"]),
|
||||||
|
remap_ids(key2row),
|
||||||
|
Embed(nO=output_width, nV=embed_rows)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Pretraining {#pretraining}
|
||||||
|
|
||||||
|
<!-- TODO: write -->
|
|
@ -9,6 +9,7 @@ menu:
|
||||||
- ['Tokenization', 'tokenization']
|
- ['Tokenization', 'tokenization']
|
||||||
- ['Merging & Splitting', 'retokenization']
|
- ['Merging & Splitting', 'retokenization']
|
||||||
- ['Sentence Segmentation', 'sbd']
|
- ['Sentence Segmentation', 'sbd']
|
||||||
|
- ['Vectors & Similarity', 'vectors-similarity']
|
||||||
- ['Language data', 'language-data']
|
- ['Language data', 'language-data']
|
||||||
---
|
---
|
||||||
|
|
||||||
|
@ -1024,10 +1025,10 @@ produced by the tokenizer.
|
||||||
>
|
>
|
||||||
> If you're working with transformer models like BERT, check out the
|
> If you're working with transformer models like BERT, check out the
|
||||||
> [`spacy-transformers`](https://github.com/explosion/spacy-transformers)
|
> [`spacy-transformers`](https://github.com/explosion/spacy-transformers)
|
||||||
> extension package and [documentation](/usage/transformers). It includes a
|
> extension package and [documentation](/usage/embeddings-transformers). It
|
||||||
> pipeline component for using pretrained transformer weights and **training
|
> includes a pipeline component for using pretrained transformer weights and
|
||||||
> transformer models** in spaCy, as well as helpful utilities for aligning word
|
> **training transformer models** in spaCy, as well as helpful utilities for
|
||||||
> pieces to linguistic tokenization.
|
> aligning word pieces to linguistic tokenization.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### Custom BERT word piece tokenizer
|
### Custom BERT word piece tokenizer
|
||||||
|
@ -1510,7 +1511,7 @@ adding it to the pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
Here's an example of a component that implements a pre-processing rule for
|
Here's an example of a component that implements a pre-processing rule for
|
||||||
splitting on `'...'` tokens. The component is added before the parser, which is
|
splitting on `"..."` tokens. The component is added before the parser, which is
|
||||||
then used to further segment the text. That's possible, because `is_sent_start`
|
then used to further segment the text. That's possible, because `is_sent_start`
|
||||||
is only set to `True` for some of the tokens – all others still specify `None`
|
is only set to `True` for some of the tokens – all others still specify `None`
|
||||||
for unset sentence boundaries. This approach can be useful if you want to
|
for unset sentence boundaries. This approach can be useful if you want to
|
||||||
|
@ -1540,6 +1541,152 @@ doc = nlp(text)
|
||||||
print("After:", [sent.text for sent in doc.sents])
|
print("After:", [sent.text for sent in doc.sents])
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Word vectors and semantic similarity {#vectors-similarity}
|
||||||
|
|
||||||
|
import Vectors101 from 'usage/101/\_vectors-similarity.md'
|
||||||
|
|
||||||
|
<Vectors101 />
|
||||||
|
|
||||||
|
<Infobox title="What to expect from similarity results" variant="warning">
|
||||||
|
|
||||||
|
Computing similarity scores can be helpful in many situations, but it's also
|
||||||
|
important to maintain **realistic expectations** about what information it can
|
||||||
|
provide. Words can be related to each other in many ways, so a single
|
||||||
|
"similarity" score will always be a **mix of different signals**, and vectors
|
||||||
|
trained on different data can produce very different results that may not be
|
||||||
|
useful for your purpose.
|
||||||
|
|
||||||
|
Also note that the similarity of `Doc` or `Span` objects defaults to the
|
||||||
|
**average** of the token vectors. This means it's insensitive to the order of
|
||||||
|
the words. Two documents expressing the same meaning with dissimilar wording
|
||||||
|
will return a lower similarity score than two documents that happen to contain
|
||||||
|
the same words while expressing different meanings.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
|
### Adding word vectors {#adding-vectors}
|
||||||
|
|
||||||
|
Custom word vectors can be trained using a number of open-source libraries, such
|
||||||
|
as [Gensim](https://radimrehurek.com/gensim), [FastText](https://fasttext.cc),
|
||||||
|
or Tomas Mikolov's original
|
||||||
|
[Word2vec implementation](https://code.google.com/archive/p/word2vec/). Most
|
||||||
|
word vector libraries output an easy-to-read text-based format, where each line
|
||||||
|
consists of the word followed by its vector. For everyday use, we want to
|
||||||
|
convert the vectors model into a binary format that loads faster and takes up
|
||||||
|
less space on disk. The easiest way to do this is the
|
||||||
|
[`init model`](/api/cli#init-model) command-line utility. This will output a
|
||||||
|
spaCy model in the directory `/tmp/la_vectors_wiki_lg`, giving you access to
|
||||||
|
some nice Latin vectors. You can then pass the directory path to
|
||||||
|
[`spacy.load`](/api/top-level#spacy.load).
|
||||||
|
|
||||||
|
> #### Usage example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> nlp_latin = spacy.load("/tmp/la_vectors_wiki_lg")
|
||||||
|
> doc1 = nlp_latin("Caecilius est in horto")
|
||||||
|
> doc2 = nlp_latin("servus est in atrio")
|
||||||
|
> doc1.similarity(doc2)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/word-vectors-v2/cc.la.300.vec.gz
|
||||||
|
python -m spacy init model en /tmp/la_vectors_wiki_lg --vectors-loc cc.la.300.vec.gz
|
||||||
|
```
|
||||||
|
|
||||||
|
<Accordion title="How to optimize vector coverage" id="custom-vectors-coverage" spaced>
|
||||||
|
|
||||||
|
To help you strike a good balance between coverage and memory usage, spaCy's
|
||||||
|
[`Vectors`](/api/vectors) class lets you map **multiple keys** to the **same
|
||||||
|
row** of the table. If you're using the
|
||||||
|
[`spacy init model`](/api/cli#init-model) command to create a vocabulary,
|
||||||
|
pruning the vectors will be taken care of automatically if you set the
|
||||||
|
`--prune-vectors` flag. You can also do it manually in the following steps:
|
||||||
|
|
||||||
|
1. Start with a **word vectors model** that covers a huge vocabulary. For
|
||||||
|
instance, the [`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg)
|
||||||
|
model provides 300-dimensional GloVe vectors for over 1 million terms of
|
||||||
|
English.
|
||||||
|
2. If your vocabulary has values set for the `Lexeme.prob` attribute, the
|
||||||
|
lexemes will be sorted by descending probability to determine which vectors
|
||||||
|
to prune. Otherwise, lexemes will be sorted by their order in the `Vocab`.
|
||||||
|
3. Call [`Vocab.prune_vectors`](/api/vocab#prune_vectors) with the number of
|
||||||
|
vectors you want to keep.
|
||||||
|
|
||||||
|
```python
|
||||||
|
nlp = spacy.load('en_vectors_web_lg')
|
||||||
|
n_vectors = 105000 # number of vectors to keep
|
||||||
|
removed_words = nlp.vocab.prune_vectors(n_vectors)
|
||||||
|
|
||||||
|
assert len(nlp.vocab.vectors) <= n_vectors # unique vectors have been pruned
|
||||||
|
assert nlp.vocab.vectors.n_keys > n_vectors # but not the total entries
|
||||||
|
```
|
||||||
|
|
||||||
|
[`Vocab.prune_vectors`](/api/vocab#prune_vectors) reduces the current vector
|
||||||
|
table to a given number of unique entries, and returns a dictionary containing
|
||||||
|
the removed words, mapped to `(string, score)` tuples, where `string` is the
|
||||||
|
entry the removed word was mapped to, and `score` the similarity score between
|
||||||
|
the two words.
|
||||||
|
|
||||||
|
```python
|
||||||
|
### Removed words
|
||||||
|
{
|
||||||
|
"Shore": ("coast", 0.732257),
|
||||||
|
"Precautionary": ("caution", 0.490973),
|
||||||
|
"hopelessness": ("sadness", 0.742366),
|
||||||
|
"Continous": ("continuous", 0.732549),
|
||||||
|
"Disemboweled": ("corpse", 0.499432),
|
||||||
|
"biostatistician": ("scientist", 0.339724),
|
||||||
|
"somewheres": ("somewheres", 0.402736),
|
||||||
|
"observing": ("observe", 0.823096),
|
||||||
|
"Leaving": ("leaving", 1.0),
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
In the example above, the vector for "Shore" was removed and remapped to the
|
||||||
|
vector of "coast", which is deemed about 73% similar. "Leaving" was remapped to
|
||||||
|
the vector of "leaving", which is identical. If you're using the
|
||||||
|
[`init model`](/api/cli#init-model) command, you can set the `--prune-vectors`
|
||||||
|
option to easily reduce the size of the vectors as you add them to a spaCy
|
||||||
|
model:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ python -m spacy init model /tmp/la_vectors_web_md --vectors-loc la.300d.vec.tgz --prune-vectors 10000
|
||||||
|
```
|
||||||
|
|
||||||
|
This will create a spaCy model with vectors for the first 10,000 words in the
|
||||||
|
vectors model. All other words in the vectors model are mapped to the closest
|
||||||
|
vector among those retained.
|
||||||
|
|
||||||
|
</Accordion>
|
||||||
|
|
||||||
|
### Adding vectors individually {#adding-individual-vectors}
|
||||||
|
|
||||||
|
The `vector` attribute is a **read-only** numpy or cupy array (depending on
|
||||||
|
whether you've configured spaCy to use GPU memory), with dtype `float32`. The
|
||||||
|
array is read-only so that spaCy can avoid unnecessary copy operations where
|
||||||
|
possible. You can modify the vectors via the [`Vocab`](/api/vocab) or
|
||||||
|
[`Vectors`](/api/vectors) table. Using the
|
||||||
|
[`Vocab.set_vector`](/api/vocab#set_vector) method is often the easiest approach
|
||||||
|
if you have vectors in an arbitrary format, as you can read in the vectors with
|
||||||
|
your own logic, and just set them with a simple loop. This method is likely to
|
||||||
|
be slower than approaches that work with the whole vectors table at once, but
|
||||||
|
it's a great approach for once-off conversions before you save out your model to
|
||||||
|
disk.
|
||||||
|
|
||||||
|
```python
|
||||||
|
### Adding vectors
|
||||||
|
from spacy.vocab import Vocab
|
||||||
|
|
||||||
|
vector_data = {
|
||||||
|
"dog": numpy.random.uniform(-1, 1, (300,)),
|
||||||
|
"cat": numpy.random.uniform(-1, 1, (300,)),
|
||||||
|
"orange": numpy.random.uniform(-1, 1, (300,))
|
||||||
|
}
|
||||||
|
vocab = Vocab()
|
||||||
|
for word, vector in vector_data.items():
|
||||||
|
vocab.set_vector(word, vector)
|
||||||
|
```
|
||||||
|
|
||||||
## Language data {#language-data}
|
## Language data {#language-data}
|
||||||
|
|
||||||
import LanguageData101 from 'usage/101/\_language-data.md'
|
import LanguageData101 from 'usage/101/\_language-data.md'
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
---
|
---
|
||||||
title: Language Processing Pipelines
|
title: Language Processing Pipelines
|
||||||
next: /usage/vectors-embeddings
|
next: /usage/embeddings-transformers
|
||||||
menu:
|
menu:
|
||||||
- ['Processing Text', 'processing']
|
- ['Processing Text', 'processing']
|
||||||
- ['How Pipelines Work', 'pipelines']
|
- ['How Pipelines Work', 'pipelines']
|
||||||
|
@ -324,9 +324,9 @@ pretrained components and new components trained on your data.
|
||||||
|
|
||||||
When reusing components across models, keep in mind that the **vocabulary**,
|
When reusing components across models, keep in mind that the **vocabulary**,
|
||||||
**vectors** and model settings **must match**. If a pretrained model includes
|
**vectors** and model settings **must match**. If a pretrained model includes
|
||||||
[word vectors](/usage/vectors-embeddings) and the component uses them as
|
[word vectors](/usage/linguistic-features#vectors-similarity) and the component
|
||||||
features, the model you copy it to needs to have the _same_ vectors available –
|
uses them as features, the model you copy it to needs to have the _same_ vectors
|
||||||
otherwise, it won't be able to make the same predictions.
|
available – otherwise, it won't be able to make the same predictions.
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
|
@ -1202,7 +1202,7 @@ document similarity method.
|
||||||
Hooks let you customize some of the behaviors of the `Doc`, `Span` or `Token`
|
Hooks let you customize some of the behaviors of the `Doc`, `Span` or `Token`
|
||||||
objects by adding a component to the pipeline. For instance, to customize the
|
objects by adding a component to the pipeline. For instance, to customize the
|
||||||
[`Doc.similarity`](/api/doc#similarity) method, you can add a component that
|
[`Doc.similarity`](/api/doc#similarity) method, you can add a component that
|
||||||
sets a custom function to `doc.user_hooks['similarity']`. The built-in
|
sets a custom function to `doc.user_hooks["similarity"]`. The built-in
|
||||||
`Doc.similarity` method will check the `user_hooks` dict, and delegate to your
|
`Doc.similarity` method will check the `user_hooks` dict, and delegate to your
|
||||||
function if you've set one. Similar results can be achieved by setting functions
|
function if you've set one. Similar results can be achieved by setting functions
|
||||||
to `Doc.user_span_hooks` and `Doc.user_token_hooks`.
|
to `Doc.user_span_hooks` and `Doc.user_token_hooks`.
|
||||||
|
|
|
@ -247,7 +247,7 @@ import Vectors101 from 'usage/101/\_vectors-similarity.md'
|
||||||
|
|
||||||
To learn more about word vectors, how to **customize them** and how to load
|
To learn more about word vectors, how to **customize them** and how to load
|
||||||
**your own vectors** into spaCy, see the usage guide on
|
**your own vectors** into spaCy, see the usage guide on
|
||||||
[using word vectors and semantic similarities](/usage/vectors-embeddings).
|
[using word vectors and semantic similarities](/usage/linguistic-features#vectors-similarity).
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
|
|
|
@ -30,7 +30,7 @@ ready-to-use spaCy models.
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
## Quickstart {#quickstart}
|
## Quickstart {#quickstart tag="new"}
|
||||||
|
|
||||||
The recommended way to train your spaCy models is via the
|
The recommended way to train your spaCy models is via the
|
||||||
[`spacy train`](/api/cli#train) command on the command line. It only needs a
|
[`spacy train`](/api/cli#train) command on the command line. It only needs a
|
||||||
|
@ -131,7 +131,7 @@ Some of the main advantages and features of spaCy's training config are:
|
||||||
multiple components, define them once and reference them as
|
multiple components, define them once and reference them as
|
||||||
[variables](#config-interpolation).
|
[variables](#config-interpolation).
|
||||||
- **Reproducibility with no hidden defaults.** The config file is the "single
|
- **Reproducibility with no hidden defaults.** The config file is the "single
|
||||||
source of truth" and includes all settings. <!-- TODO: explain this better -->
|
source of truth" and includes all settings.
|
||||||
- **Automated checks and validation.** When you load a config, spaCy checks if
|
- **Automated checks and validation.** When you load a config, spaCy checks if
|
||||||
the settings are complete and if all values have the correct types. This lets
|
the settings are complete and if all values have the correct types. This lets
|
||||||
you catch potential mistakes early. In your custom architectures, you can use
|
you catch potential mistakes early. In your custom architectures, you can use
|
||||||
|
@ -667,7 +667,7 @@ visualize your model.
|
||||||
|
|
||||||
For more details on how to integrate transformer models into your training
|
For more details on how to integrate transformer models into your training
|
||||||
config and customize the implementations, see the usage guide on
|
config and customize the implementations, see the usage guide on
|
||||||
[training transformers](/usage/transformers#training).
|
[training transformers](/usage/embeddings-transformers#transformers-training).
|
||||||
|
|
||||||
### Pretraining with spaCy {#pretraining}
|
### Pretraining with spaCy {#pretraining}
|
||||||
|
|
||||||
|
|
|
@ -218,7 +218,7 @@ available via `token.orth`.
|
||||||
|
|
||||||
The new [`Vectors`](/api/vectors) class helps the `Vocab` manage the vectors
|
The new [`Vectors`](/api/vectors) class helps the `Vocab` manage the vectors
|
||||||
assigned to strings, and lets you assign vectors individually, or
|
assigned to strings, and lets you assign vectors individually, or
|
||||||
[load in GloVe vectors](/usage/vectors-embeddings#custom-loading-glove) from a
|
[load in GloVe vectors](/usage/linguistic-features#adding-vectors) from a
|
||||||
directory. To help you strike a good balance between coverage and memory usage,
|
directory. To help you strike a good balance between coverage and memory usage,
|
||||||
the `Vectors` class lets you map **multiple keys** to the **same row** of the
|
the `Vectors` class lets you map **multiple keys** to the **same row** of the
|
||||||
table. If you're using the [`spacy init-model`](/api/cli#init-model) command to
|
table. If you're using the [`spacy init-model`](/api/cli#init-model) command to
|
||||||
|
|
|
@ -30,7 +30,7 @@ menu:
|
||||||
|
|
||||||
<Infobox title="Details & Documentation" emoji="📖" list>
|
<Infobox title="Details & Documentation" emoji="📖" list>
|
||||||
|
|
||||||
- **Usage:** [Transformers](/usage/transformers),
|
- **Usage:** [Embeddings & Transformers](/usage/embeddings-transformers),
|
||||||
[Training models](/usage/training)
|
[Training models](/usage/training)
|
||||||
- **API:** [`Transformer`](/api/transformer),
|
- **API:** [`Transformer`](/api/transformer),
|
||||||
[`TransformerData`](/api/transformer#transformerdata),
|
[`TransformerData`](/api/transformer#transformerdata),
|
||||||
|
@ -60,12 +60,12 @@ menu:
|
||||||
### New built-in pipeline components {#features-pipeline-components}
|
### New built-in pipeline components {#features-pipeline-components}
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| ----------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| [`SentenceRecognizer`](/api/sentencerecognizer) | Trainable component for sentence segmentation. |
|
| [`SentenceRecognizer`](/api/sentencerecognizer) | Trainable component for sentence segmentation. |
|
||||||
| [`Morphologizer`](/api/morphologizer) | Trainable component to predict morphological features. |
|
| [`Morphologizer`](/api/morphologizer) | Trainable component to predict morphological features. |
|
||||||
| [`Lemmatizer`](/api/lemmatizer) | Standalone component for rule-based and lookup lemmatization. |
|
| [`Lemmatizer`](/api/lemmatizer) | Standalone component for rule-based and lookup lemmatization. |
|
||||||
| [`AttributeRuler`](/api/attributeruler) | Component for setting token attributes using match patterns. |
|
| [`AttributeRuler`](/api/attributeruler) | Component for setting token attributes using match patterns. |
|
||||||
| [`Transformer`](/api/transformer) | Component for using [transformer models](/usage/transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). |
|
| [`Transformer`](/api/transformer) | Component for using [transformer models](/usage/embeddings-transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). |
|
||||||
|
|
||||||
<Infobox title="Details & Documentation" emoji="📖" list>
|
<Infobox title="Details & Documentation" emoji="📖" list>
|
||||||
|
|
||||||
|
|
|
@ -1,340 +0,0 @@
|
||||||
---
|
|
||||||
title: Vectors and Embeddings
|
|
||||||
menu:
|
|
||||||
- ["What's a Word Vector?", 'whats-a-vector']
|
|
||||||
- ['Using Word Vectors', 'usage']
|
|
||||||
- ['Converting and Importing', 'converting']
|
|
||||||
next: /usage/transformers
|
|
||||||
---
|
|
||||||
|
|
||||||
Word vector tables (or "embeddings") let you find similar terms, and can improve
|
|
||||||
the accuracy of some of your components. You can even use word vectors as a
|
|
||||||
quick-and-dirty text-classification solution when you don't have any training data.
|
|
||||||
Word vector tables are included in some of the spaCy [model packages](/models)
|
|
||||||
we distribute, and you can easily create your own model packages with word
|
|
||||||
vectors you train or download yourself.
|
|
||||||
|
|
||||||
## What's a word vector? {#whats-a-vector}
|
|
||||||
|
|
||||||
For spaCy's purposes, a "word vector" is a 1-dimensional slice from a
|
|
||||||
2-dimensional **vectors table**, with a deterministic mapping from word types to
|
|
||||||
rows in the table.
|
|
||||||
|
|
||||||
```python
|
|
||||||
def what_is_a_word_vector(
|
|
||||||
word_id: int,
|
|
||||||
key2row: Dict[int, int],
|
|
||||||
vectors_table: Floats2d,
|
|
||||||
*,
|
|
||||||
default_row: int=0
|
|
||||||
) -> Floats1d:
|
|
||||||
return vectors_table[key2row.get(word_id, default_row)]
|
|
||||||
```
|
|
||||||
|
|
||||||
An old idea in linguistics is that you can "know a word by the company it
|
|
||||||
keeps": that is, word meanings can be understood relationally, based on their
|
|
||||||
patterns of usage. This idea inspired a branch of NLP research known as
|
|
||||||
"distributional semantics" that has aimed to compute databases of lexical
|
|
||||||
knowledge automatically. The [Word2vec](https://en.wikipedia.org/wiki/Word2vec)
|
|
||||||
family of algorithms is a key milestone in this line of research. For
|
|
||||||
simplicity, we will refer to a distributional word representation as a "word
|
|
||||||
vector", and algorithms that compute word vectors (such as
|
|
||||||
[GloVe](https://nlp.stanford.edu/projects/glove/),
|
|
||||||
[FastText](https://fasttext.cc), etc.) as "Word2vec algorithms".
|
|
||||||
|
|
||||||
Word2vec algorithms try to produce vectors tables that let you estimate useful
|
|
||||||
relationships between words using simple linear algebra operations. For
|
|
||||||
instance, you can often find close synonyms of a word by finding the vectors
|
|
||||||
closest to it by cosine distance, and then finding the words that are mapped to
|
|
||||||
those neighboring vectors. Word vectors can also be useful as features in
|
|
||||||
statistical models.
|
|
||||||
|
|
||||||
### Word vectors vs. contextual language models {#vectors-vs-language-models}
|
|
||||||
|
|
||||||
The key difference between word vectors and contextual language models such
|
|
||||||
as [transformers](/usage/transformers)
|
|
||||||
is that word vectors model **lexical types**, rather than
|
|
||||||
_tokens_. If you have a list of terms with no context around them,
|
|
||||||
a transformer model like BERT can't really help you. BERT is designed to understand
|
|
||||||
language **in context**, which isn't what you have. A word vectors table will be
|
|
||||||
a much better fit for your task. However, if you do have words in context — whole
|
|
||||||
sentences or paragraphs of running text — word vectors will only provide a very
|
|
||||||
rough approximation of what the text is about.
|
|
||||||
|
|
||||||
Word vectors are also very computationally efficient, as they map a word to a
|
|
||||||
vector with a single indexing operation. Word vectors are therefore useful as a
|
|
||||||
way to **improve the accuracy** of neural network models, especially models that
|
|
||||||
are small or have received little or no pretraining. In spaCy, word vector
|
|
||||||
tables are only used as **static features**. spaCy does not backpropagate
|
|
||||||
gradients to the pretrained word vectors table. The static vectors table is
|
|
||||||
usually used in combination with a smaller table of learned task-specific
|
|
||||||
embeddings.
|
|
||||||
|
|
||||||
## Using word vectors {#usage}
|
|
||||||
|
|
||||||
spaCy stores word vector information in the
|
|
||||||
[`Vocab.vectors`](/api/vocab#attributes) attribute, so you can access the whole
|
|
||||||
vectors table from most spaCy objects. You can also access the vector for a
|
|
||||||
[`Doc`](/api/doc), [`Span`](/api/span), [`Token`](/api/token) or
|
|
||||||
[`Lexeme`](/api/lexeme) instance via the `vector` attribute. If your `Doc` or
|
|
||||||
`Span` has multiple tokens, the average of the word vectors will be returned,
|
|
||||||
excluding any "out of vocabulary" entries that have no vector available. If none
|
|
||||||
of the words have a vector, a zeroed vector will be returned.
|
|
||||||
|
|
||||||
The `vector` attribute is a **read-only** numpy or cupy array (depending on
|
|
||||||
whether you've configured spaCy to use GPU memory), with dtype `float32`. The
|
|
||||||
array is read-only so that spaCy can avoid unnecessary copy operations where
|
|
||||||
possible. You can modify the vectors via the `Vocab` or `Vectors` table.
|
|
||||||
|
|
||||||
### Word vectors and similarity
|
|
||||||
|
|
||||||
A common use-case of word vectors is to answer _similarity questions_. You can
|
|
||||||
ask how similar a `token`, `span`, `doc` or `lexeme` is to another object using
|
|
||||||
the `.similarity()` method. You can even check the similarity of mismatched
|
|
||||||
types, asking how similar a whole document is to a particular word, how similar
|
|
||||||
a span is to a document, etc. By default, the `.similarity()` method will
|
|
||||||
return the cosine of the `.vector` attribute of the two objects being compared.
|
|
||||||
You can customize this behavior by setting one or more
|
|
||||||
[user hooks](/usage/processing-pipelines#custom-components-user-hooks) for the
|
|
||||||
types you want to customize.
|
|
||||||
|
|
||||||
Word vector similarity is a practical technique for many situations, especially
|
|
||||||
since it's easy to use and relatively efficient to compute. However, it's
|
|
||||||
important to maintain realistic expectations about what information it can
|
|
||||||
provide. Words can be related to each other in many ways, so a single
|
|
||||||
"similarity" score will always be a mix of different signals. The word vectors
|
|
||||||
model is also not trained for your specific use-case, so you have no way of
|
|
||||||
telling it which results are more or less useful for your purpose. These
|
|
||||||
problems are even more acute when you go from measuring the similarity of
|
|
||||||
single words to the similarity of spans or documents. The vector averaging
|
|
||||||
process is insensitive to the order of the words, so `doc1.similarity(doc2)`
|
|
||||||
will mostly be based on the overlap in lexical items between the two document
|
|
||||||
objects. Two documents expressing the same meaning with dissimilar wording will
|
|
||||||
return a lower similarity score than two documents that happen to contain the
|
|
||||||
same words while expressing different meanings.
|
|
||||||
|
|
||||||
### Using word vectors in your models
|
|
||||||
|
|
||||||
Many neural network models are able to use word vector tables as additional
|
|
||||||
features, which sometimes results in significant improvements in accuracy.
|
|
||||||
spaCy's built-in embedding layer, `spacy.MultiHashEmbed.v1`, can be configured
|
|
||||||
to use word vector tables using the `also_use_static_vectors` flag. This
|
|
||||||
setting is also available on the `spacy.MultiHashEmbedCNN.v1` layer, which
|
|
||||||
builds the default token-to-vector encoding architecture.
|
|
||||||
|
|
||||||
```
|
|
||||||
[tagger.model.tok2vec.embed]
|
|
||||||
@architectures = "spacy.MultiHashEmbed.v1"
|
|
||||||
width = 128
|
|
||||||
rows = 7000
|
|
||||||
also_embed_subwords = true
|
|
||||||
also_use_static_vectors = true
|
|
||||||
```
|
|
||||||
|
|
||||||
<Infobox title="How it works">
|
|
||||||
The configuration system will look up the string `spacy.MultiHashEmbed.v1`
|
|
||||||
in the `architectures` registry, and call the returned object with the
|
|
||||||
rest of the arguments from the block. This will result in a call to the
|
|
||||||
`spacy.ml.models.tok2vec.MultiHashEmbed` function, which will return
|
|
||||||
a Thinc model object with the type signature `Model[List[Doc],
|
|
||||||
List[Floats2d]]`. Because the embedding layer takes a list of `Doc` objects as
|
|
||||||
input, it does not need to store a copy of the vectors table. The vectors will
|
|
||||||
be retrieved from the `Doc` objects that are passed in, via the
|
|
||||||
`doc.vocab.vectors` attribute. This part of the process is handled by the
|
|
||||||
`spacy.ml.staticvectors.StaticVectors` layer.
|
|
||||||
</Infobox>
|
|
||||||
|
|
||||||
#### Creating a custom embedding layer
|
|
||||||
|
|
||||||
The `MultiHashEmbed` layer is spaCy's recommended strategy for constructing
|
|
||||||
initial word representations for your neural network models, but you can also
|
|
||||||
implement your own. You can register any function to a string name, and then
|
|
||||||
reference that function within your config (see the [training](/usage/training)
|
|
||||||
section for more details). To try this out, you can save the following little
|
|
||||||
example to a new Python file:
|
|
||||||
|
|
||||||
```
|
|
||||||
from spacy.ml.staticvectors import StaticVectors
|
|
||||||
from spacy.util import registry
|
|
||||||
|
|
||||||
print("I was imported!")
|
|
||||||
|
|
||||||
@registry.architectures("my_example.MyEmbedding.v1")
|
|
||||||
def MyEmbedding(output_width: int) -> Model[List[Doc], List[Floats2d]]:
|
|
||||||
print("I was called!")
|
|
||||||
return StaticVectors(nO=output_width)
|
|
||||||
```
|
|
||||||
|
|
||||||
If you pass the path to your file to the `spacy train` command using the `-c`
|
|
||||||
argument, your file will be imported, which means the decorator registering the
|
|
||||||
function will be run. Your function is now on equal footing with any of spaCy's
|
|
||||||
built-ins, so you can drop it in instead of any other model with the same input
|
|
||||||
and output signature. For instance, you could use it in the tagger model as
|
|
||||||
follows:
|
|
||||||
|
|
||||||
```
|
|
||||||
[tagger.model.tok2vec.embed]
|
|
||||||
@architectures = "my_example.MyEmbedding.v1"
|
|
||||||
output_width = 128
|
|
||||||
```
|
|
||||||
|
|
||||||
Now that you have a custom function wired into the network, you can start
|
|
||||||
implementing the logic you're interested in. For example, let's say you want to
|
|
||||||
try a relatively simple embedding strategy that makes use of static word vectors,
|
|
||||||
but combines them via summation with a smaller table of learned embeddings.
|
|
||||||
|
|
||||||
```python
|
|
||||||
from thinc.api import add, chain, remap_ids, Embed
|
|
||||||
from spacy.ml.staticvectors import StaticVectors
|
|
||||||
|
|
||||||
@registry.architectures("my_example.MyEmbedding.v1")
|
|
||||||
def MyCustomVectors(
|
|
||||||
output_width: int,
|
|
||||||
vector_width: int,
|
|
||||||
embed_rows: int,
|
|
||||||
key2row: Dict[int, int]
|
|
||||||
) -> Model[List[Doc], List[Floats2d]]:
|
|
||||||
return add(
|
|
||||||
StaticVectors(nO=output_width),
|
|
||||||
chain(
|
|
||||||
FeatureExtractor(["ORTH"]),
|
|
||||||
remap_ids(key2row),
|
|
||||||
Embed(nO=output_width, nV=embed_rows)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
#### When should you add word vectors to your model?
|
|
||||||
|
|
||||||
Word vectors are not compatible with most [transformer models](/usage/transformers),
|
|
||||||
but if you're training another type of NLP network, it's almost always worth
|
|
||||||
adding word vectors to your model. As well as improving your final accuracy,
|
|
||||||
word vectors often make experiments more consistent, as the accuracy you
|
|
||||||
reach will be less sensitive to how the network is randomly initialized. High
|
|
||||||
variance due to random chance can slow down your progress significantly, as you
|
|
||||||
need to run many experiments to filter the signal from the noise.
|
|
||||||
|
|
||||||
Word vector features need to be enabled prior to training, and the same word vectors
|
|
||||||
table will need to be available at runtime as well. You cannot add word vector
|
|
||||||
features once the model has already been trained, and you usually cannot
|
|
||||||
replace one word vectors table with another without causing a significant loss
|
|
||||||
of performance.
|
|
||||||
|
|
||||||
## Converting word vectors for use in spaCy {#converting}
|
|
||||||
|
|
||||||
Custom word vectors can be trained using a number of open-source libraries, such
|
|
||||||
as [Gensim](https://radimrehurek.com/gensim), [fastText](https://fasttext.cc),
|
|
||||||
or Tomas Mikolov's original
|
|
||||||
[Word2vec implementation](https://code.google.com/archive/p/word2vec/). Most
|
|
||||||
word vector libraries output an easy-to-read text-based format, where each line
|
|
||||||
consists of the word followed by its vector. For everyday use, we want to
|
|
||||||
convert the vectors model into a binary format that loads faster and takes up
|
|
||||||
less space on disk. The easiest way to do this is the
|
|
||||||
[`init-model`](/api/cli#init-model) command-line utility:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/word-vectors-v2/cc.la.300.vec.gz
|
|
||||||
python -m spacy init-model en /tmp/la_vectors_wiki_lg --vectors-loc cc.la.300.vec.gz
|
|
||||||
```
|
|
||||||
|
|
||||||
This will output a spaCy model in the directory `/tmp/la_vectors_wiki_lg`,
|
|
||||||
giving you access to some nice Latin vectors 😉 You can then pass the directory
|
|
||||||
path to [`spacy.load()`](/api/top-level#spacy.load).
|
|
||||||
|
|
||||||
```python
|
|
||||||
nlp_latin = spacy.load("/tmp/la_vectors_wiki_lg")
|
|
||||||
doc1 = nlp_latin("Caecilius est in horto")
|
|
||||||
doc2 = nlp_latin("servus est in atrio")
|
|
||||||
doc1.similarity(doc2)
|
|
||||||
```
|
|
||||||
|
|
||||||
The model directory will have a `/vocab` directory with the strings, lexical
|
|
||||||
entries and word vectors from the input vectors model. The
|
|
||||||
[`init-model`](/api/cli#init-model) command supports a number of archive formats
|
|
||||||
for the word vectors: the vectors can be in plain text (`.txt`), zipped
|
|
||||||
(`.zip`), or tarred and zipped (`.tgz`).
|
|
||||||
|
|
||||||
### Optimizing vector coverage {#custom-vectors-coverage new="2"}
|
|
||||||
|
|
||||||
To help you strike a good balance between coverage and memory usage, spaCy's
|
|
||||||
[`Vectors`](/api/vectors) class lets you map **multiple keys** to the **same
|
|
||||||
row** of the table. If you're using the
|
|
||||||
[`spacy init-model`](/api/cli#init-model) command to create a vocabulary,
|
|
||||||
pruning the vectors will be taken care of automatically if you set the
|
|
||||||
`--prune-vectors` flag. You can also do it manually in the following steps:
|
|
||||||
|
|
||||||
1. Start with a **word vectors model** that covers a huge vocabulary. For
|
|
||||||
instance, the [`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg)
|
|
||||||
model provides 300-dimensional GloVe vectors for over 1 million terms of
|
|
||||||
English.
|
|
||||||
2. If your vocabulary has values set for the `Lexeme.prob` attribute, the
|
|
||||||
lexemes will be sorted by descending probability to determine which vectors
|
|
||||||
to prune. Otherwise, lexemes will be sorted by their order in the `Vocab`.
|
|
||||||
3. Call [`Vocab.prune_vectors`](/api/vocab#prune_vectors) with the number of
|
|
||||||
vectors you want to keep.
|
|
||||||
|
|
||||||
```python
|
|
||||||
nlp = spacy.load('en_vectors_web_lg')
|
|
||||||
n_vectors = 105000 # number of vectors to keep
|
|
||||||
removed_words = nlp.vocab.prune_vectors(n_vectors)
|
|
||||||
|
|
||||||
assert len(nlp.vocab.vectors) <= n_vectors # unique vectors have been pruned
|
|
||||||
assert nlp.vocab.vectors.n_keys > n_vectors # but not the total entries
|
|
||||||
```
|
|
||||||
|
|
||||||
[`Vocab.prune_vectors`](/api/vocab#prune_vectors) reduces the current vector
|
|
||||||
table to a given number of unique entries, and returns a dictionary containing
|
|
||||||
the removed words, mapped to `(string, score)` tuples, where `string` is the
|
|
||||||
entry the removed word was mapped to, and `score` the similarity score between
|
|
||||||
the two words.
|
|
||||||
|
|
||||||
```python
|
|
||||||
### Removed words
|
|
||||||
{
|
|
||||||
"Shore": ("coast", 0.732257),
|
|
||||||
"Precautionary": ("caution", 0.490973),
|
|
||||||
"hopelessness": ("sadness", 0.742366),
|
|
||||||
"Continous": ("continuous", 0.732549),
|
|
||||||
"Disemboweled": ("corpse", 0.499432),
|
|
||||||
"biostatistician": ("scientist", 0.339724),
|
|
||||||
"somewheres": ("somewheres", 0.402736),
|
|
||||||
"observing": ("observe", 0.823096),
|
|
||||||
"Leaving": ("leaving", 1.0),
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
In the example above, the vector for "Shore" was removed and remapped to the
|
|
||||||
vector of "coast", which is deemed about 73% similar. "Leaving" was remapped to
|
|
||||||
the vector of "leaving", which is identical. If you're using the
|
|
||||||
[`init-model`](/api/cli#init-model) command, you can set the `--prune-vectors`
|
|
||||||
option to easily reduce the size of the vectors as you add them to a spaCy
|
|
||||||
model:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
$ python -m spacy init-model en /tmp/la_vectors_web_md --vectors-loc la.300d.vec.tgz --prune-vectors 10000
|
|
||||||
```
|
|
||||||
|
|
||||||
This will create a spaCy model with vectors for the first 10,000 words in the
|
|
||||||
vectors model. All other words in the vectors model are mapped to the closest
|
|
||||||
vector among those retained.
|
|
||||||
|
|
||||||
### Adding vectors {#adding-vectors}
|
|
||||||
|
|
||||||
You can also add word vectors individually, using the method `vocab.set_vector`.
|
|
||||||
This is often the easiest approach if you have vectors in an arbitrary format,
|
|
||||||
as you can read in the vectors with your own logic, and just set them with
|
|
||||||
a simple loop. This method is likely to be slower than approaches that work
|
|
||||||
with the whole vectors table at once, but it's a great approach for once-off
|
|
||||||
conversions before you save out your model to disk.
|
|
||||||
|
|
||||||
```python
|
|
||||||
### Adding vectors
|
|
||||||
from spacy.vocab import Vocab
|
|
||||||
|
|
||||||
vector_data = {"dog": numpy.random.uniform(-1, 1, (300,)),
|
|
||||||
"cat": numpy.random.uniform(-1, 1, (300,)),
|
|
||||||
"orange": numpy.random.uniform(-1, 1, (300,))}
|
|
||||||
vocab = Vocab()
|
|
||||||
for word, vector in vector_data.items():
|
|
||||||
vocab.set_vector(word, vector)
|
|
||||||
```
|
|
|
@ -18,8 +18,11 @@
|
||||||
{ "text": "Linguistic Features", "url": "/usage/linguistic-features" },
|
{ "text": "Linguistic Features", "url": "/usage/linguistic-features" },
|
||||||
{ "text": "Rule-based Matching", "url": "/usage/rule-based-matching" },
|
{ "text": "Rule-based Matching", "url": "/usage/rule-based-matching" },
|
||||||
{ "text": "Processing Pipelines", "url": "/usage/processing-pipelines" },
|
{ "text": "Processing Pipelines", "url": "/usage/processing-pipelines" },
|
||||||
{ "text": "Vectors & Embeddings", "url": "/usage/vectors-embeddings" },
|
{
|
||||||
{ "text": "Transformers", "url": "/usage/transformers", "tag": "new" },
|
"text": "Embeddings & Transformers",
|
||||||
|
"url": "/usage/embeddings-transformers",
|
||||||
|
"tag": "new"
|
||||||
|
},
|
||||||
{ "text": "Training Models", "url": "/usage/training", "tag": "new" },
|
{ "text": "Training Models", "url": "/usage/training", "tag": "new" },
|
||||||
{ "text": "spaCy Projects", "url": "/usage/projects", "tag": "new" },
|
{ "text": "spaCy Projects", "url": "/usage/projects", "tag": "new" },
|
||||||
{ "text": "Saving & Loading", "url": "/usage/saving-loading" },
|
{ "text": "Saving & Loading", "url": "/usage/saving-loading" },
|
||||||
|
|
|
@ -9,7 +9,12 @@ import { isString, github, headingTextClassName } from './util'
|
||||||
import classes from '../styles/typography.module.sass'
|
import classes from '../styles/typography.module.sass'
|
||||||
|
|
||||||
export const H1 = ({ Component = 'h1', className, ...props }) => (
|
export const H1 = ({ Component = 'h1', className, ...props }) => (
|
||||||
<Headline Component={Component} className={classNames(classes.h1, className)} {...props} />
|
<Headline
|
||||||
|
Component={Component}
|
||||||
|
className={classNames(classes.h1, className)}
|
||||||
|
permalink={false}
|
||||||
|
{...props}
|
||||||
|
/>
|
||||||
)
|
)
|
||||||
export const H2 = ({ className, ...props }) => (
|
export const H2 = ({ className, ...props }) => (
|
||||||
<Headline Component="h2" className={classNames(classes.h2, className)} {...props} />
|
<Headline Component="h2" className={classNames(classes.h2, className)} {...props} />
|
||||||
|
@ -90,6 +95,7 @@ const Headline = ({
|
||||||
source,
|
source,
|
||||||
hidden,
|
hidden,
|
||||||
action,
|
action,
|
||||||
|
permalink = true,
|
||||||
className,
|
className,
|
||||||
children,
|
children,
|
||||||
}) => {
|
}) => {
|
||||||
|
@ -102,7 +108,7 @@ const Headline = ({
|
||||||
const tags = tag ? tag.split(',').map(t => t.trim()) : []
|
const tags = tag ? tag.split(',').map(t => t.trim()) : []
|
||||||
return (
|
return (
|
||||||
<Component id={id} name={name} className={headingClassNames}>
|
<Component id={id} name={name} className={headingClassNames}>
|
||||||
<Permalink id={id}>{children} </Permalink>
|
<Permalink id={permalink ? id : null}>{children} </Permalink>
|
||||||
{tags.map((tag, i) => (
|
{tags.map((tag, i) => (
|
||||||
<Tag spaced key={i}>
|
<Tag spaced key={i}>
|
||||||
{tag}
|
{tag}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user