diff --git a/setup.cfg b/setup.cfg
index 5dd0227f2..e101a2eb6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -70,7 +70,7 @@ console_scripts =
lookups =
spacy_lookups_data>=1.0.2,<1.1.0
transformers =
- spacy_transformers>=1.0.1,<1.1.0
+ spacy_transformers>=1.1.0,<1.2.0
ray =
spacy_ray>=0.1.0,<1.0.0
cuda =
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 339fb1e96..bb4061177 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -32,7 +32,7 @@ batch_size = {{ 128 if hardware == "gpu" else 1000 }}
factory = "transformer"
[components.transformer.model]
-@architectures = "spacy-transformers.TransformerModel.v1"
+@architectures = "spacy-transformers.TransformerModel.v3"
name = "{{ transformer["name"] }}"
tokenizer_config = {"use_fast": true}
diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index ceeb388ab..047de0164 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -332,15 +332,18 @@ for details and system requirements.
-### spacy-transformers.TransformerModel.v1 {#TransformerModel}
+### spacy-transformers.TransformerModel.v3 {#TransformerModel}
> #### Example Config
>
> ```ini
> [model]
-> @architectures = "spacy-transformers.TransformerModel.v1"
+> @architectures = "spacy-transformers.TransformerModel.v3"
> name = "roberta-base"
> tokenizer_config = {"use_fast": true}
+> transformer_config = {}
+> mixed_precision = true
+> grad_scaler_config = {"init_scale": 32768}
>
> [model.get_spans]
> @span_getters = "spacy-transformers.strided_spans.v1"
@@ -366,12 +369,31 @@ transformer weights across your pipeline. For a layer that's configured for use
in other components, see
[Tok2VecTransformer](/api/architectures#Tok2VecTransformer).
-| Name | Description |
-| ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `name` | Any model name that can be loaded by [`transformers.AutoModel`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoModel). ~~str~~ |
-| `get_spans` | Function that takes a batch of [`Doc`](/api/doc) object and returns lists of [`Span`](/api) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. ~~Callable[[List[Doc]], List[Span]]~~ |
-| `tokenizer_config` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). ~~Dict[str, Any]~~ |
-| **CREATES** | The model using the architecture. ~~Model[List[Doc], FullTransformerBatch]~~ |
+| Name | Description |
+|----------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `name` | Any model name that can be loaded by [`transformers.AutoModel`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoModel). ~~str~~ |
+| `get_spans`          | Function that takes a batch of [`Doc`](/api/doc) objects and returns lists of [`Span`](/api/span) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. ~~Callable[[List[Doc]], List[Span]]~~ |
+| `tokenizer_config` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). ~~Dict[str, Any]~~ |
+| `transformer_config` | Settings to pass to the transformers forward pass. ~~Dict[str, Any]~~ |
+| `mixed_precision` | Replace whitelisted ops by half-precision counterparts. Speeds up training and prediction on GPUs with [Tensor Cores](https://developer.nvidia.com/tensor-cores) and reduces GPU memory use. ~~bool~~ |
+| `grad_scaler_config` | Configuration to pass to `thinc.api.PyTorchGradScaler` during training when `mixed_precision` is enabled. ~~Dict[str, Any]~~ |
+| **CREATES** | The model using the architecture. ~~Model[List[Doc], FullTransformerBatch]~~ |
+
+
+
+Mixed-precision support is currently an experimental feature.
+
+
+
+
+* The `transformer_config` argument was added in
+`spacy-transformers.TransformerModel.v2`.
+* The `mixed_precision` and `grad_scaler_config` arguments were added in
+`spacy-transformers.TransformerModel.v3`.
+
+The other arguments are shared between all versions.
+
+
### spacy-transformers.TransformerListener.v1 {#TransformerListener}
@@ -403,16 +425,19 @@ a single token vector given zero or more wordpiece vectors.
| `upstream` | A string to identify the "upstream" `Transformer` component to communicate with. By default, the upstream name is the wildcard string `"*"`, but you could also specify the name of the `Transformer` component. You'll almost never have multiple upstream `Transformer` components, so the wildcard string will almost always be fine. ~~str~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
-### spacy-transformers.Tok2VecTransformer.v1 {#Tok2VecTransformer}
+### spacy-transformers.Tok2VecTransformer.v3 {#Tok2VecTransformer}
> #### Example Config
>
> ```ini
> [model]
-> @architectures = "spacy-transformers.Tok2VecTransformer.v1"
+> @architectures = "spacy-transformers.Tok2VecTransformer.v3"
> name = "albert-base-v2"
> tokenizer_config = {"use_fast": false}
+> transformer_config = {}
> grad_factor = 1.0
+> mixed_precision = true
+> grad_scaler_config = {"init_scale": 32768}
> ```
Use a transformer as a [`Tok2Vec`](/api/tok2vec) layer directly. This does
@@ -421,13 +446,32 @@ Use a transformer as a [`Tok2Vec`](/api/tok2vec) layer directly. This does
object, but it's a **simpler solution** if you only need the transformer within
one component.
-| Name | Description |
-| ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `get_spans` | Function that takes a batch of [`Doc`](/api/doc) object and returns lists of [`Span`](/api) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. ~~Callable[[List[Doc]], List[Span]]~~ |
-| `tokenizer_config` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). ~~Dict[str, Any]~~ |
-| `pooling` | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. ~~Model[Ragged, Floats2d]~~ |
-| `grad_factor` | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. ~~float~~ |
-| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
+| Name | Description |
+|----------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `get_spans`          | Function that takes a batch of [`Doc`](/api/doc) objects and returns lists of [`Span`](/api/span) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. ~~Callable[[List[Doc]], List[Span]]~~ |
+| `tokenizer_config` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). ~~Dict[str, Any]~~ |
+| `transformer_config` | Settings to pass to the transformers forward pass. ~~Dict[str, Any]~~ |
+| `pooling` | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. ~~Model[Ragged, Floats2d]~~ |
+| `grad_factor` | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. ~~float~~ |
+| `mixed_precision` | Replace whitelisted ops by half-precision counterparts. Speeds up training and prediction on GPUs with [Tensor Cores](https://developer.nvidia.com/tensor-cores) and reduces GPU memory use. ~~bool~~ |
+| `grad_scaler_config` | Configuration to pass to `thinc.api.PyTorchGradScaler` during training when `mixed_precision` is enabled. ~~Dict[str, Any]~~ |
+| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
+
+
+
+Mixed-precision support is currently an experimental feature.
+
+
+
+
+* The `transformer_config` argument was added in
+`spacy-transformers.Tok2VecTransformer.v2`.
+* The `mixed_precision` and `grad_scaler_config` arguments were added in
+`spacy-transformers.Tok2VecTransformer.v3`.
+
+The other arguments are shared between all versions.
+
+
## Pretraining architectures {#pretrain source="spacy/ml/models/multi_task.py"}
diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md
index 6e68ac599..571fb8d80 100644
--- a/website/docs/api/transformer.md
+++ b/website/docs/api/transformer.md
@@ -92,9 +92,12 @@ https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/p
> # Construction via add_pipe with custom config
> config = {
> "model": {
-> "@architectures": "spacy-transformers.TransformerModel.v1",
+> "@architectures": "spacy-transformers.TransformerModel.v3",
> "name": "bert-base-uncased",
-> "tokenizer_config": {"use_fast": True}
+> "tokenizer_config": {"use_fast": True},
+> "transformer_config": {"output_attentions": True},
+> "mixed_precision": True,
+> "grad_scaler_config": {"init_scale": 32768}
> }
> }
> trf = nlp.add_pipe("transformer", config=config)
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index 88fb39f61..253b3d0b5 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -351,7 +351,7 @@ factory = "transformer"
max_batch_items = 4096
[components.transformer.model]
-@architectures = "spacy-transformers.TransformerModel.v1"
+@architectures = "spacy-transformers.TransformerModel.v3"
name = "bert-base-cased"
tokenizer_config = {"use_fast": true}
@@ -367,7 +367,7 @@ The `[components.transformer.model]` block describes the `model` argument passed
to the transformer component. It's a Thinc
[`Model`](https://thinc.ai/docs/api-model) object that will be passed into the
component. Here, it references the function
-[spacy-transformers.TransformerModel.v1](/api/architectures#TransformerModel)
+[spacy-transformers.TransformerModel.v3](/api/architectures#TransformerModel)
registered in the [`architectures` registry](/api/top-level#registry). If a key
in a block starts with `@`, it's **resolved to a function** and all other
settings are passed to the function as arguments. In this case, `name`,