diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index 73631c64a..cc6f44fcc 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -274,7 +274,7 @@ architectures into your training config.
| `get_spans` | `Callable` | Function that takes a batch of [`Doc`](/api/doc) objects and returns lists of [`Span`](/api/span) objects for the transformer to process. [See here](/api/transformer#span_getters) for built-in options and examples. |
| `tokenizer_config` | `Dict[str, Any]` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). |
-### spacy-transformers.Tok2VecListener.v1 {#Tok2VecListener}
+### spacy-transformers.Tok2VecListener.v1 {#transformers-Tok2VecListener}
> #### Example Config
>
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 5c971effa..32aaee7b8 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -43,7 +43,7 @@ $ python -m spacy download [model] [--direct] [pip args]
| Argument | Type | Description |
| ------------------------------------- | ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `model` | positional | Model name, e.g. `en_core_web_sm`.. |
+| `model` | positional | Model name, e.g. [`en_core_web_sm`](/models/en#en_core_web_sm). |
| `--direct`, `-d` | flag | Force direct download of exact model version. |
| pip args <Tag variant="new">2.1</Tag> | - | Additional installation options to be passed to `pip install` when installing the model package. For example, `--user` to install to the user home directory or `--no-deps` to not install model dependencies. |
| `--help`, `-h` | flag | Show help message and available arguments. |
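+
+For example, a typical invocation, optionally passing a pip argument through
+(per the table above):
+
+```bash
+$ python -m spacy download en_core_web_sm
+$ python -m spacy download en_core_web_sm --user
+```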
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index af7cb26de..32633330e 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -182,10 +182,10 @@ run [`spacy pretrain`](/api/cli#pretrain).
> ```
The main data format used in spaCy v3.0 is a **binary format** created by
-serializing a [`DocBin`](/api/docbin) object, which represents a collection of
-`Doc` objects. This means that you can train spaCy models using the same format
-it outputs: annotated `Doc` objects. The binary format is extremely **efficient
-in storage**, especially when packing multiple documents together.
+serializing a [`DocBin`](/api/docbin), which represents a collection of `Doc`
+objects. This means that you can train spaCy models using the same format it
+outputs: annotated `Doc` objects. The binary format is extremely **efficient in
+storage**, especially when packing multiple documents together.
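+
+As a brief sketch, this is how such a file can be produced with the
+[`DocBin`](/api/docbin) API (texts and output path here are made up for
+illustration):
+
+```python
+import spacy
+from spacy.tokens import DocBin
+
+nlp = spacy.blank("en")
+doc_bin = DocBin()  # collects Doc objects for binary serialization
+for text in ["I like stuff", "You like things"]:
+    doc_bin.add(nlp(text))
+doc_bin.to_disk("./train.spacy")  # conventional .spacy extension
+```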
Typically, the extension for these binary files is `.spacy`, and they are used
as input format for specifying a [training corpus](/api/corpus) and for spaCy's
diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md
index 187abfdbb..c7af8ffae 100644
--- a/website/docs/api/dependencyparser.md
+++ b/website/docs/api/dependencyparser.md
@@ -142,14 +142,20 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
## DependencyParser.begin_training {#begin_training tag="method"}
-Initialize the pipe for training, using data examples if available. Returns an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
+Initialize the component for training and return an
+[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
+function that returns an iterable of [`Example`](/api/example) objects. The data
+examples are used to **initialize the model** of the component and can either be
+the full training data or a representative sample. Initialization includes
+validating the network,
+[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
+setting up the label scheme based on the data.
> #### Example
>
> ```python
> parser = nlp.add_pipe("parser")
-> optimizer = parser.begin_training(pipeline=nlp.pipeline)
+> optimizer = parser.begin_training(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Type | Description |
diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md
index 930188e26..fa8918dba 100644
--- a/website/docs/api/entitylinker.md
+++ b/website/docs/api/entitylinker.md
@@ -142,14 +142,20 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
## EntityLinker.begin_training {#begin_training tag="method"}
-Initialize the pipe for training, using data examples if available. Returns an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
+Initialize the component for training and return an
+[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
+function that returns an iterable of [`Example`](/api/example) objects. The data
+examples are used to **initialize the model** of the component and can either be
+the full training data or a representative sample. Initialization includes
+validating the network,
+[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
+setting up the label scheme based on the data.
> #### Example
>
> ```python
> entity_linker = nlp.add_pipe("entity_linker", last=True)
-> optimizer = entity_linker.begin_training(pipeline=nlp.pipeline)
+> optimizer = entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Type | Description |
diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md
index 2d66710d7..8d30463ff 100644
--- a/website/docs/api/entityrecognizer.md
+++ b/website/docs/api/entityrecognizer.md
@@ -131,14 +131,20 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
## EntityRecognizer.begin_training {#begin_training tag="method"}
-Initialize the pipe for training, using data examples if available. Returns an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
+Initialize the component for training and return an
+[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
+function that returns an iterable of [`Example`](/api/example) objects. The data
+examples are used to **initialize the model** of the component and can either be
+the full training data or a representative sample. Initialization includes
+validating the network,
+[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
+setting up the label scheme based on the data.
> #### Example
>
> ```python
> ner = nlp.add_pipe("ner")
-> optimizer = ner.begin_training(pipeline=nlp.pipeline)
+> optimizer = ner.begin_training(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Type | Description |
diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index 79782fd72..41d660421 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -200,12 +200,28 @@ more efficient than processing texts one-by-one.
## Language.begin_training {#begin_training tag="method"}
-Initialize the pipe for training, using data examples if available. Returns an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
+Initialize the pipeline for training and return an
+[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
+function that returns an iterable of [`Example`](/api/example) objects. The data
+examples can either be the full training data or a representative sample. They
+are used to **initialize the models** of trainable pipeline components and are
+passed to each component's [`begin_training`](/api/pipe#begin_training) method, if
+available. Initialization includes validating the network,
+[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
+setting up the label scheme based on the data.
+
+<Infobox variant="warning" title="Changed in v3.0">
+
+The `Language.begin_training` method now takes a **function** that is called
+with no arguments and returns a sequence of [`Example`](/api/example) objects
+instead of tuples of `Doc` and `GoldParse` objects.
+
+</Infobox>
> #### Example
>
> ```python
+> get_examples = lambda: examples
> optimizer = nlp.begin_training(get_examples)
> ```
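+
+For illustration, a minimal end-to-end sketch of initializing a blank pipeline
+(the import path and the label used are assumptions for this example):
+
+```python
+import spacy
+from spacy.training import Example
+
+nlp = spacy.blank("en")
+nlp.add_pipe("ner")
+# begin_training expects a zero-argument function returning Example objects
+doc = nlp.make_doc("I like London.")
+examples = [Example.from_dict(doc, {"entities": [(7, 13, "LOC")]})]
+optimizer = nlp.begin_training(lambda: examples)
+```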
@@ -276,7 +292,7 @@ and custom registered functions if needed. See the
| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
-## Language.rehearse {#rehearse tag="method,experimental"}
+## Language.rehearse {#rehearse tag="method,experimental" new="3"}
Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
current model to make predictions similar to an initial model, to try to address
@@ -302,6 +318,13 @@ the "catastrophic forgetting" problem. This feature is experimental.
Evaluate a model's pipeline components.
+
+<Infobox variant="warning" title="Changed in v3.0">
+The `Language.evaluate` method now takes a batch of [`Example`](/api/example)
+objects instead of tuples of `Doc` and `GoldParse` objects.
+
+</Infobox>
+
> #### Example
>
> ```python
diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md
index 04d189939..12d3050f6 100644
--- a/website/docs/api/morphologizer.md
+++ b/website/docs/api/morphologizer.md
@@ -121,15 +121,21 @@ applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and
## Morphologizer.begin_training {#begin_training tag="method"}
-Initialize the pipe for training, using data examples if available. Returns an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
+Initialize the component for training and return an
+[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
+function that returns an iterable of [`Example`](/api/example) objects. The data
+examples are used to **initialize the model** of the component and can either be
+the full training data or a representative sample. Initialization includes
+validating the network,
+[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
+setting up the label scheme based on the data.
> #### Example
>
> ```python
> morphologizer = nlp.add_pipe("morphologizer")
-> optimizer = morphologizer.begin_training(pipeline=nlp.pipeline)
+> optimizer = morphologizer.begin_training(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Type | Description |
diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md
index b41ec210e..81ecc5faf 100644
--- a/website/docs/api/pipe.md
+++ b/website/docs/api/pipe.md
@@ -9,8 +9,8 @@ components like the [`EntityRecognizer`](/api/entityrecognizer) or
[`TextCategorizer`](/api/textcategorizer) inherit from it and it defines the
interface that components should follow to function as trainable components in a
spaCy pipeline. See the docs on
-[writing trainable components](/usage/processing-pipelines#trainable) for how to
-use the `Pipe` base class to implement custom components.
+[writing trainable components](/usage/processing-pipelines#trainable-components)
+for how to use the `Pipe` base class to implement custom components.
> #### Why is Pipe implemented in Cython?
>
@@ -106,14 +106,20 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
## Pipe.begin_training {#begin_training tag="method"}
-Initialize the pipe for training, using data examples if available. Returns an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
+Initialize the component for training and return an
+[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
+function that returns an iterable of [`Example`](/api/example) objects. The data
+examples are used to **initialize the model** of the component and can either be
+the full training data or a representative sample. Initialization includes
+validating the network,
+[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
+setting up the label scheme based on the data.
> #### Example
>
> ```python
> pipe = nlp.add_pipe("your_custom_pipe")
-> optimizer = pipe.begin_training(pipeline=nlp.pipeline)
+> optimizer = pipe.begin_training(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Type | Description |
@@ -200,7 +206,7 @@ This method needs to be overwritten with your own custom `update` method.
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
-## Pipe.rehearse {#rehearse tag="method,experimental"}
+## Pipe.rehearse {#rehearse tag="method,experimental" new="3"}
Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
current model to make predictions similar to an initial model, to try to address
diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md
index 59ada7fcb..cefdbea88 100644
--- a/website/docs/api/sentencerecognizer.md
+++ b/website/docs/api/sentencerecognizer.md
@@ -116,14 +116,20 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the
## SentenceRecognizer.begin_training {#begin_training tag="method"}
-Initialize the pipe for training, using data examples if available. Returns an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
+Initialize the component for training and return an
+[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
+function that returns an iterable of [`Example`](/api/example) objects. The data
+examples are used to **initialize the model** of the component and can either be
+the full training data or a representative sample. Initialization includes
+validating the network,
+[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
+setting up the label scheme based on the data.
> #### Example
>
> ```python
> senter = nlp.add_pipe("senter")
-> optimizer = senter.begin_training(pipeline=nlp.pipeline)
+> optimizer = senter.begin_training(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Type | Description |
@@ -193,7 +199,7 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
-## SentenceRecognizer.rehearse {#rehearse tag="method,experimental"}
+## SentenceRecognizer.rehearse {#rehearse tag="method,experimental" new="3"}
Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
current model to make predictions similar to an initial model, to try to address
diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md
index 7ea29e53c..9761dea15 100644
--- a/website/docs/api/tagger.md
+++ b/website/docs/api/tagger.md
@@ -114,14 +114,20 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
## Tagger.begin_training {#begin_training tag="method"}
-Initialize the pipe for training, using data examples if available. Returns an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
+Initialize the component for training and return an
+[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
+function that returns an iterable of [`Example`](/api/example) objects. The data
+examples are used to **initialize the model** of the component and can either be
+the full training data or a representative sample. Initialization includes
+validating the network,
+[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
+setting up the label scheme based on the data.
> #### Example
>
> ```python
> tagger = nlp.add_pipe("tagger")
-> optimizer = tagger.begin_training(pipeline=nlp.pipeline)
+> optimizer = tagger.begin_training(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Type | Description |
@@ -191,7 +197,7 @@ Delegates to [`predict`](/api/tagger#predict) and
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
-## Tagger.rehearse {#rehearse tag="method,experimental"}
+## Tagger.rehearse {#rehearse tag="method,experimental" new="3"}
Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
current model to make predictions similar to an initial model, to try to address
diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md
index 494bc569f..73b50b865 100644
--- a/website/docs/api/textcategorizer.md
+++ b/website/docs/api/textcategorizer.md
@@ -122,14 +122,20 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
## TextCategorizer.begin_training {#begin_training tag="method"}
-Initialize the pipe for training, using data examples if available. Returns an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
+Initialize the component for training and return an
+[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
+function that returns an iterable of [`Example`](/api/example) objects. The data
+examples are used to **initialize the model** of the component and can either be
+the full training data or a representative sample. Initialization includes
+validating the network,
+[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
+setting up the label scheme based on the data.
> #### Example
>
> ```python
> textcat = nlp.add_pipe("textcat")
-> optimizer = textcat.begin_training(pipeline=nlp.pipeline)
+> optimizer = textcat.begin_training(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Type | Description |
@@ -199,7 +205,7 @@ Delegates to [`predict`](/api/textcategorizer#predict) and
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
-## TextCategorizer.rehearse {#rehearse tag="method,experimental"}
+## TextCategorizer.rehearse {#rehearse tag="method,experimental" new="3"}
Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
current model to make predictions similar to an initial model, to try to address
diff --git a/website/docs/api/tok2vec.md b/website/docs/api/tok2vec.md
index 8e5f78bf7..4c820c07c 100644
--- a/website/docs/api/tok2vec.md
+++ b/website/docs/api/tok2vec.md
@@ -125,14 +125,20 @@ and [`set_annotations`](/api/tok2vec#set_annotations) methods.
## Tok2Vec.begin_training {#begin_training tag="method"}
-Initialize the pipe for training, using data examples if available. Returns an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
+Initialize the component for training and return an
+[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
+function that returns an iterable of [`Example`](/api/example) objects. The data
+examples are used to **initialize the model** of the component and can either be
+the full training data or a representative sample. Initialization includes
+validating the network,
+[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
+setting up the label scheme based on the data.
> #### Example
>
> ```python
> tok2vec = nlp.add_pipe("tok2vec")
-> optimizer = tok2vec.begin_training(pipeline=nlp.pipeline)
+> optimizer = tok2vec.begin_training(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Type | Description |
diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md
index a8b328688..dda212906 100644
--- a/website/docs/api/transformer.md
+++ b/website/docs/api/transformer.md
@@ -159,14 +159,20 @@ applied to the `Doc` in order. Both [`__call__`](/api/transformer#call) and
## Transformer.begin_training {#begin_training tag="method"}
-Initialize the pipe for training, using data examples if available. Returns an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
+Initialize the component for training and return an
+[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
+function that returns an iterable of [`Example`](/api/example) objects. The data
+examples are used to **initialize the model** of the component and can either be
+the full training data or a representative sample. Initialization includes
+validating the network,
+[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
+setting up the label scheme based on the data.
> #### Example
>
> ```python
> trf = nlp.add_pipe("transformer")
-> optimizer = trf.begin_training(pipeline=nlp.pipeline)
+> optimizer = trf.begin_training(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Type | Description |
diff --git a/website/docs/models/index.md b/website/docs/models/index.md
index b25e46f1e..d5f87d3b5 100644
--- a/website/docs/models/index.md
+++ b/website/docs/models/index.md
@@ -45,9 +45,9 @@ three components:
2. **Genre:** Type of text the model is trained on, e.g. `web` or `news`.
3. **Size:** Model size indicator, `sm`, `md` or `lg`.
-For example, `en_core_web_sm` is a small English model trained on written web
-text (blogs, news, comments), that includes vocabulary, vectors, syntax and
-entities.
+For example, [`en_core_web_sm`](/models/en#en_core_web_sm) is a small English
+model trained on written web text (blogs, news, comments) that includes
+vocabulary, vectors, syntax and entities.
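+
+In code, that means always loading a model by its full name, e.g.:
+
+```python
+import spacy
+
+nlp = spacy.load("en_core_web_sm")  # full name instead of a shortcut like "en"
+```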
### Model versioning {#model-versioning}
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index e6d328d02..a3a2e7102 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -687,13 +687,13 @@ give you everything you need to train fully custom models with
-
-
The [`Example`](/api/example) object contains annotated training data, also
called the **gold standard**. It's initialized with a [`Doc`](/api/doc) object
that will hold the predictions, and another `Doc` object that holds the
-gold-standard annotations. Here's an example of a simple `Example` for
-part-of-speech tags:
+gold-standard annotations. It also includes the **alignment** between those two
+documents if they differ in tokenization. The `Example` class ensures that spaCy
+can rely on one **standardized format** that's passed through the pipeline.
+Here's an example of a simple `Example` for part-of-speech tags:
```python
words = ["I", "like", "stuff"]
@@ -744,7 +744,8 @@ example = Example.from_dict(doc, {"entities": ["U-ORG", "O", "U-TECHNOLOGY", "O"
As of v3.0, the [`Example`](/api/example) object replaces the `GoldParse` class.
It can be constructed in a very similar way, from a `Doc` and a dictionary of
-annotations:
+annotations. For more details, see the
+[migration guide](/usage/v3#migrating-training).
```diff
- gold = GoldParse(doc, entities=entities)
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 02f6882e4..919af3ffb 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -14,12 +14,49 @@ menu:
### New training workflow and config system {#features-training}
+
+<Infobox title="Details & Documentation" emoji="📖" list>
+- **Usage:** [Training models](/usage/training)
+- **Thinc:** [Thinc's config system](https://thinc.ai/docs/usage-config),
+ [`Config`](https://thinc.ai/docs/api-config#config)
+- **CLI:** [`train`](/api/cli#train), [`pretrain`](/api/cli#pretrain),
+ [`evaluate`](/api/cli#evaluate)
+- **API:** [Config format](/api/data-formats#config),
+ [`registry`](/api/top-level#registry)
+
+</Infobox>
+
### Transformer-based pipelines {#features-transformers}
+
+<Infobox title="Details & Documentation" emoji="📖" list>
+- **Usage:** [Transformers](/usage/transformers),
+ [Training models](/usage/training)
+- **API:** [`Transformer`](/api/transformer),
+ [`TransformerData`](/api/transformer#transformerdata),
+ [`FullTransformerBatch`](/api/transformer#fulltransformerbatch)
+- **Architectures:** [TransformerModel](/api/architectures#TransformerModel),
+ [Tok2VecListener](/api/architectures#transformers-Tok2VecListener),
+ [Tok2VecTransformer](/api/architectures#Tok2VecTransformer)
+- **Models:** [`en_core_bert_sm`](/models/en)
+- **Implementation:**
+ [`spacy-transformers`](https://github.com/explosion/spacy-transformers)
+
+</Infobox>
+
### Custom models using any framework {#features-custom-models}
### Manage end-to-end workflows with projects {#features-projects}
+
+<Infobox title="Details & Documentation" emoji="📖" list>
+- **Usage:** [spaCy projects](/usage/projects),
+ [Training models](/usage/training)
+- **CLI:** [`project`](/api/cli#project), [`train`](/api/cli#train)
+- **Templates:** [`projects`](https://github.com/explosion/projects)
+
+</Infobox>
+
### New built-in pipeline components {#features-pipeline-components}
| Name | Description |
@@ -30,14 +67,48 @@ menu:
| [`AttributeRuler`](/api/attributeruler) | Component for setting token attributes using match patterns. |
| [`Transformer`](/api/transformer) | Component for using [transformer models](/usage/transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). |
+
+<Infobox title="Details & Documentation" emoji="📖" list>
+- **Usage:** [Processing pipelines](/usage/processing-pipelines)
+- **API:** [Built-in pipeline components](/api#architecture-pipeline)
+- **Implementation:**
+ [`spacy/pipeline`](https://github.com/explosion/spaCy/tree/develop/spacy/pipeline)
+
+</Infobox>
+
### New and improved pipeline component APIs {#features-components}
- `Language.factory`, `Language.component`
- `Language.analyze_pipes`
- Adding components from other models
+
+<Infobox title="Details & Documentation" emoji="📖" list>
+- **Usage:** [Custom components](/usage/processing-pipelines#custom-components),
+ [Defining components during training](/usage/training#config-components)
+- **API:** [`Language`](/api/language)
+- **Implementation:**
+ [`spacy/language.py`](https://github.com/explosion/spaCy/tree/develop/spacy/language.py)
+
+</Infobox>
+
### Type hints and type-based data validation {#features-types}
+> #### Example
+>
+> ```python
+> from spacy.language import Language
+> from pydantic import StrictBool
+>
+> @Language.factory("my_component")
+> def create_my_component(
+> nlp: Language,
+> name: str,
+> custom: StrictBool
+> ):
+> ...
+> ```
+
spaCy v3.0 officially drops support for Python 2 and now requires **Python
3.6+**. This also means that the code base can take full advantage of
[type hints](https://docs.python.org/3/library/typing.html). spaCy's user-facing
@@ -54,13 +125,36 @@ validation of Thinc's [config system](https://thinc.ai/docs/usage-config), which
lets you register **custom functions with typed arguments**, reference them
in your config and see validation errors if the argument values don't match.
-### CLI
+<Infobox title="Details & Documentation" emoji="📖" list>
-| Name | Description |
-| --------------------------------------- | -------------------------------------------------------------------------------------------------------- |
-| [`init config`](/api/cli#init-config) | Initialize a [training config](/usage/training) file for a blank language or auto-fill a partial config. |
-| [`debug config`](/api/cli#debug-config) | Debug a [training config](/usage/training) file and show validation errors. |
-| [`project`](/api/cli#project) | Subcommand for cloning and running [spaCy projects](/usage/projects). |
+- **Usage:**
+ [Component type hints and validation](/usage/processing-pipelines#type-hints),
+ [Training with custom code](/usage/training#custom-code)
+- **Thinc:**
+ [Type checking in Thinc](https://thinc.ai/docs/usage-type-checking),
+ [Thinc's config system](https://thinc.ai/docs/usage-config)
+
+</Infobox>
+
+### New methods, attributes and commands
+
+The following methods, attributes and commands are new in spaCy v3.0.
+
+| Name | Description |
+| ------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). |
+| [`Language.select_pipes`](/api/language#select_pipes) | Contextmanager for enabling or disabling specific pipeline components for a block. |
+| [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. |
+| [`Language.resume_training`](/api/language#resume_training) | Experimental: continue training a pretrained model and initialize "rehearsal" for components that implement a `rehearse` method to prevent catastrophic forgetting. |
+| [`@Language.factory`](/api/language#factory) [`@Language.component`](/api/language#component) | Decorators for [registering](/usage/processing-pipelines#custom-components) pipeline component factories and simple stateless component functions. |
| [`Language.has_factory`](/api/language#has_factory) | Check whether a component factory is registered on a language class. |
| [`Language.get_factory_meta`](/api/language#get_factory_meta) [`Language.get_pipe_meta`](/api/language#get_pipe_meta) | Get the [`FactoryMeta`](/api/language#factorymeta) with component metadata for a factory or instance name. |
| [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) that can be saved to disk and used for training. |
+| [`Pipe.score`](/api/pipe#score) | Method on trainable pipeline components that returns a dictionary of evaluation scores. |
+| [`registry`](/api/top-level#registry) | Function registry to map functions to string names that can be referenced in [configs](/usage/training#config). |
+| [`init config`](/api/cli#init-config) | CLI command for initializing a [training config](/usage/training) file for a blank language or auto-filling a partial config. |
+| [`debug config`](/api/cli#debug-config) | CLI command for debugging a [training config](/usage/training) file and showing validation errors. |
+| [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). |
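+
+As a quick sketch of two of these methods in action (details of the printed
+analysis may differ):
+
+```python
+import spacy
+
+nlp = spacy.load("en_core_web_sm")
+# Temporarily run only the entity recognizer for this block
+with nlp.select_pipes(enable="ner"):
+    doc = nlp("Some text about Facebook")
+# Print an overview of components and the annotations they assign
+nlp.analyze_pipes(pretty=True)
+```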
## Backwards Incompatibilities {#incompat}
@@ -70,12 +164,21 @@ usability. The following section lists the relevant changes to the user-facing
API. For specific examples of how to rewrite your code, check out the
[migration guide](#migrating).
-### Compatibility {#incompat-compat}
+<Infobox variant="warning">
-- spaCy now requires **Python 3.6+**.
+Note that spaCy v3.0 now requires **Python 3.6+**.
+
+</Infobox>
### API changes {#incompat-api}
+- Model symlinks, the `link` command and shortcut names are now deprecated.
+ There can be many [different models](/models) and not just one "English
+ model", so you should always use the full model name like
+ [`en_core_web_sm`](/models/en) explicitly.
+- The [`train`](/api/cli#train) and [`pretrain`](/api/cli#pretrain) commands now
+ only take a `config.cfg` file containing the full
+ [training config](/usage/training#config).
- [`Language.add_pipe`](/api/language#add_pipe) now takes the **string name** of
the component factory instead of the component function.
- **Custom pipeline components** now need to be decorated with the
@@ -87,6 +190,20 @@ API. For specific examples of how to rewrite your code, check out the
- The `Language.disable_pipes` contextmanager has been replaced by
[`Language.select_pipes`](/api/language#select_pipes), which can explicitly
disable or enable components.
+- The [`Language.update`](/api/language#update),
+ [`Language.evaluate`](/api/language#evaluate) and
+ [`Pipe.update`](/api/pipe#update) methods now all take batches of
+ [`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or
+ raw text and a dictionary of annotations.
+ [`Language.begin_training`](/api/language#begin_training) and
+ [`Pipe.begin_training`](/api/pipe#begin_training) now take a function that
+ returns a sequence of `Example` objects to initialize the model instead of a
+ list of tuples.
+- [`Matcher.add`](/api/matcher#add),
+ [`PhraseMatcher.add`](/api/phrasematcher#add) and
+ [`DependencyMatcher.add`](/api/dependencymatcher#add) now only accept a list
+ of patterns as the second argument (instead of a variable number of
+ arguments). The `on_match` callback becomes an optional keyword argument.
### Removed or renamed API {#incompat-removed}
@@ -96,6 +213,7 @@ API. For specific examples of how to rewrite your code, check out the
| `GoldParse` | [`Example`](/api/example) |
| `GoldCorpus` | [`Corpus`](/api/corpus) |
| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) |
+| `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) |
| `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, model symlinks are deprecated |
The following deprecated methods, attributes and arguments were removed in v3.0.
@@ -121,7 +239,7 @@ on them.
Model symlinks and shortcuts like `en` are now officially deprecated. There are
[many different models](/models) with different capabilities and not just one
"English model". In order to download and load a model, you should always use
-its full name – for instance, `en_core_web_sm`.
+its full name – for instance, [`en_core_web_sm`](/models/en#en_core_web_sm).
```diff
- python -m spacy download en
@@ -224,6 +342,51 @@ and you typically shouldn't have to use it in your code.
+ parser = nlp.add_pipe("parser")
```
+If you need to add a component from an existing pretrained model, you can now
+use the `source` argument on [`nlp.add_pipe`](/api/language#add_pipe). This will
+check that the component is compatible, and take care of porting over all of
+its config. During training, you can also reference existing pretrained components
+in your [config](/usage/training#config-components) and decide whether or not
+they should be updated with more data.
+
+> #### config.cfg (excerpt)
+>
+> ```ini
+> [components.ner]
+> source = "en_core_web_sm"
+> component = "ner"
+> ```
+
+```diff
+source_nlp = spacy.load("en_core_web_sm")
+nlp = spacy.blank("en")
+- ner = source_nlp.get_pipe("ner")
+- nlp.add_pipe(ner)
++ nlp.add_pipe("ner", source=source_nlp)
+```
+
+### Adding match patterns {#migrating-matcher}
+
+The [`Matcher.add`](/api/matcher#add),
+[`PhraseMatcher.add`](/api/phrasematcher#add) and
+[`DependencyMatcher.add`](/api/dependencymatcher#add) methods now only accept a
+**list of patterns** as the second argument (instead of a variable number of
+arguments). The `on_match` callback becomes an optional keyword argument.
+
+```diff
+matcher = Matcher(nlp.vocab)
+patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
+- matcher.add("GoogleNow", on_match, *patterns)
++ matcher.add("GoogleNow", patterns, on_match=on_match)
+```
+
+```diff
+matcher = PhraseMatcher(nlp.vocab)
+patterns = [nlp("health care reform"), nlp("healthcare reform")]
+- matcher.add("HEALTH", on_match, *patterns)
++ matcher.add("HEALTH", patterns, on_match=on_match)
+```
+
### Training models {#migrating-training}
To train your models, you should now pretty much always use the
@@ -233,15 +396,20 @@ use a [flexible config file](/usage/training#config) that describes all training
settings and hyperparameters, as well as your pipeline, model components and
architectures to use. The `--code` argument lets you pass in code containing
[custom registered functions](/usage/training#custom-code) that you can
-reference in your config.
+reference in your config. To get started, check out the
+[quickstart widget](/usage/training#quickstart).
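+
+A typical invocation might look like this (flags shown are illustrative; see
+the [`train`](/api/cli#train) docs for the exact options):
+
+```bash
+$ python -m spacy train ./config.cfg --output ./output --code ./functions.py
+```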
#### Binary .spacy training data format {#migrating-training-format}
-spaCy now uses a new
-[binary training data format](/api/data-formats#binary-training), which is much
-smaller and consists of `Doc` objects, serialized via the
-[`DocBin`](/api/docbin). You can convert your existing JSON-formatted data using
-the [`spacy convert`](/api/cli#convert) command, which outputs `.spacy` files:
+spaCy v3.0 uses a new
+[binary training data format](/api/data-formats#binary-training) created by
+serializing a [`DocBin`](/api/docbin), which represents a collection of `Doc`
+objects. This means that you can train spaCy models using the same format it
+outputs: annotated `Doc` objects. The binary format is extremely **efficient in
+storage**, especially when packing multiple documents together.
+
+You can convert your existing JSON-formatted data using the
+[`spacy convert`](/api/cli#convert) command, which outputs `.spacy` files:
```bash
$ python -m spacy convert ./training.json ./output
@@ -273,13 +441,72 @@ workflows, from data preprocessing to training and packaging your model.
-#### Migrating training scripts to CLI command and config {#migrating-training-scripts}
-
-
-
#### Training via the Python API {#migrating-training-python}
-
+For most use cases, you **shouldn't** have to write your own training scripts
+anymore. Instead, you can use [`spacy train`](/api/cli#train) with a
+[config file](/usage/training#config) and custom
+[registered functions](/usage/training#custom-code) if needed. You can even
+register callbacks that can modify the `nlp` object at different stages of its
+lifecycle to fully customize it before training.
+
+If you do decide to use the [internal training API](/usage/training#api) from
+Python, you should only need a few small modifications to convert your scripts
+from spaCy v2.x to v3.x. The [`Example.from_dict`](/api/example#from_dict)
+classmethod takes a reference `Doc` and a
+[dictionary of annotations](/api/data-formats#dict-input), similar to the
+"simple training style" in spaCy v2.x:
+
+```diff
+### Migrating Doc and GoldParse
+doc = nlp.make_doc("Mark Zuckerberg is the CEO of Facebook")
+entities = [(0, 15, "PERSON"), (30, 38, "ORG")]
+- gold = GoldParse(doc, entities=entities)
++ example = Example.from_dict(doc, {"entities": entities})
+```
+
+```diff
+### Migrating simple training style
+text = "Mark Zuckerberg is the CEO of Facebook"
+annotations = {"entities": [(0, 15, "PERSON"), (30, 38, "ORG")]}
++ doc = nlp.make_doc(text)
++ example = Example.from_dict(doc, annotations)
+```
+
+The [`Language.update`](/api/language#update),
+[`Language.evaluate`](/api/language#evaluate) and
+[`Pipe.update`](/api/pipe#update) methods now all take batches of
+[`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or
+raw text and a dictionary of annotations.
+
+```python
+### Training loop {highlight="11"}
+TRAIN_DATA = [
+ ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
+ ("I like London.", {"entities": [(7, 13, "LOC")]}),
+]
+nlp.begin_training()
+for i in range(20):
+ random.shuffle(TRAIN_DATA)
+ for batch in minibatch(TRAIN_DATA):
+ examples = []
+ for text, annots in batch:
+ examples.append(Example.from_dict(nlp.make_doc(text), annots))
+ nlp.update(examples)
+```
+
+[`Language.begin_training`](/api/language#begin_training) and
+[`Pipe.begin_training`](/api/pipe#begin_training) now take a function that
+returns a sequence of `Example` objects to initialize the model instead of a
+list of tuples. The data examples are used to **initialize the models** of
+trainable pipeline components, which includes validating the network,
+[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
+setting up the label scheme.
+
+```diff
+- nlp.begin_training(examples)
++ nlp.begin_training(lambda: examples)
+```
#### Packaging models {#migrating-training-packaging}
diff --git a/website/src/components/icon.js b/website/src/components/icon.js
index a5ccf1bde..322337955 100644
--- a/website/src/components/icon.js
+++ b/website/src/components/icon.js
@@ -23,6 +23,7 @@ import { ReactComponent as MoonIcon } from '../images/icons/moon.svg'
import { ReactComponent as ClipboardIcon } from '../images/icons/clipboard.svg'
import { ReactComponent as NetworkIcon } from '../images/icons/network.svg'
import { ReactComponent as DownloadIcon } from '../images/icons/download.svg'
+import { ReactComponent as PackageIcon } from '../images/icons/package.svg'
import classes from '../styles/icon.module.sass'
@@ -49,6 +50,7 @@ const icons = {
clipboard: ClipboardIcon,
network: NetworkIcon,
download: DownloadIcon,
+ package: PackageIcon,
}
export default function Icon({ name, width = 20, height, inline = false, variant, className }) {
diff --git a/website/src/components/infobox.js b/website/src/components/infobox.js
index 046384986..363638bf2 100644
--- a/website/src/components/infobox.js
+++ b/website/src/components/infobox.js
@@ -5,8 +5,17 @@ import classNames from 'classnames'
import Icon from './icon'
import classes from '../styles/infobox.module.sass'
-export default function Infobox({ title, emoji, id, variant = 'default', className, children }) {
+export default function Infobox({
+ title,
+ emoji,
+ id,
+ variant = 'default',
+ list = false,
+ className,
+ children,
+}) {
const infoboxClassNames = classNames(classes.root, className, {
+ [classes.list]: !!list,
[classes.warning]: variant === 'warning',
[classes.danger]: variant === 'danger',
})
diff --git a/website/src/components/link.js b/website/src/components/link.js
index 34df20554..3644479c5 100644
--- a/website/src/components/link.js
+++ b/website/src/components/link.js
@@ -8,13 +8,21 @@ import Icon from './icon'
import classes from '../styles/link.module.sass'
import { isString } from './util'
-const internalRegex = /(http(s?)):\/\/(prodi.gy|spacy.io|irl.spacy.io)/gi
+const internalRegex = /(http(s?)):\/\/(prodi.gy|spacy.io|irl.spacy.io|explosion.ai|course.spacy.io)/gi
const Whitespace = ({ children }) => (
// Ensure that links are always wrapped in spaces
<> {children} </>
)
+function getIcon(dest) {
+ if (/(github.com)/.test(dest)) return 'code'
+ if (/^\/?api\/architectures#/.test(dest)) return 'network'
+ if (/^\/?api/.test(dest)) return 'docs'
+ if (/^\/?models\/(.+)/.test(dest)) return 'package'
+ return null
+}
+
export default function Link({
children,
to,
@@ -30,22 +38,19 @@ export default function Link({
}) {
const dest = to || href
const external = forceExternal || /(http(s?)):\/\//gi.test(dest)
- const isApi = !external && !hidden && !hideIcon && /^\/?api/.test(dest)
- const isArch = !external && !hidden && !hideIcon && /^\/?api\/architectures#/.test(dest)
- const isSource = external && !hidden && !hideIcon && /(github.com)/.test(dest)
- const withIcon = isApi || isArch || isSource
+ const icon = getIcon(dest)
+ const withIcon = !hidden && !hideIcon && !!icon
const sourceWithText = withIcon && isString(children)
const linkClassNames = classNames(classes.root, className, {
[classes.hidden]: hidden,
- [classes.nowrap]: (withIcon && !sourceWithText) || isArch,
+ [classes.nowrap]: (withIcon && !sourceWithText) || icon === 'network',
[classes.withIcon]: withIcon,
})
const Wrapper = ws ? Whitespace : Fragment
- const icon = isArch ? 'network' : isApi ? 'docs' : isSource ? 'code' : null
const content = (
<>
{sourceWithText ? <span>{children}</span> : children}
- {icon && <Icon name={icon} width={16} inline className={classes.icon} />}
+ {withIcon && <Icon name={icon} width={16} inline className={classes.icon} />}
</>
)
diff --git a/website/src/images/icons/package.svg b/website/src/images/icons/package.svg
new file mode 100644
index 000000000..4edaf4e6f
--- /dev/null
+++ b/website/src/images/icons/package.svg
@@ -0,0 +1,5 @@
+
diff --git a/website/src/styles/infobox.module.sass b/website/src/styles/infobox.module.sass
index baf9919c3..8d6071f18 100644
--- a/website/src/styles/infobox.module.sass
+++ b/website/src/styles/infobox.module.sass
@@ -14,6 +14,21 @@
font-size: inherit
line-height: inherit
+ ul li
+ padding-left: 0.75em
+
+.list ul li
+ font-size: var(--font-size-sm)
+ list-style: none
+ padding: 0
+ margin: 0 0 0.35rem 0
+
+ &:before
+ all: initial
+
+ a, a span
+ border-bottom: 0 !important
+
.title
font-weight: bold
color: var(--color-theme)