diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 53cd954be..a6cb41e5e 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -754,7 +754,7 @@ in the section `[paths]`.
 
 ```cli
-$ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides]
+$ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id] [overrides]
 ```
 
 | Name | Description |
@@ -778,8 +778,8 @@ pretrained ones. The weights are saved to a directory after each epoch. You can
 then include a **path to one of these pretrained weights files** in your
 [training config](/usage/training#config) as the `init_tok2vec` setting when you
 train your pipeline. This technique may be especially helpful if you have little
-labelled data. See the usage docs on [pretraining](/usage/training#pretraining)
-for more info.
+labelled data. See the usage docs on
+[pretraining](/usage/embeddings-transformers#pretraining) for more info.
 
@@ -794,7 +794,7 @@ auto-generated by setting `--pretraining` on
 
 ```cli
-$ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [--epoch-resume] [overrides]
+$ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [--epoch-resume] [--gpu-id] [overrides]
 ```
 
 | Name | Description |
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index b00760e62..97249bfb2 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -752,7 +752,7 @@ network to model something about word cooccurrence statistics. Predicting
 leading and trailing characters does that more than adequately, as the exact
 word sequence could be recovered with high accuracy if the initial and trailing
 characters are predicted accurately. With the vectors objective, the pretraining
-is use the embedding space learned by an algorithm such as
+uses the embedding space learned by an algorithm such as
 [GloVe](https://nlp.stanford.edu/projects/glove/) or
 [Word2vec](https://code.google.com/archive/p/word2vec/), allowing the model to
-focus on the contextual modelling we actual care about.
+focus on the contextual modelling we actually care about.
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 65afd0eb4..54be6b367 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -175,7 +175,7 @@ sections of a config file are:
 
 | `paths` | Paths to data and other assets. Re-used across the config as variables, e.g. `${paths.train}`, and can be [overwritten](#config-overrides) on the CLI. |
 | `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system.seed}`, and can be [overwritten](#config-overrides) on the CLI. |
 | `training` | Settings and controls for the training and evaluation process. |
-| `pretraining` | Optional settings and controls for the [language model pretraining](#pretraining). |
+| `pretraining` | Optional settings and controls for the [language model pretraining](/usage/embeddings-transformers#pretraining). |
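
For reviewers, a quick sketch of how the documented changes fit together. The newly listed `--gpu-id` flag applies to both commands; the config name, output paths, and device ID below are illustrative values, not taken from the docs (`--gpu-id` defaults to `-1`, i.e. CPU):

```cli
$ python -m spacy pretrain config.cfg ./pretrain_output --gpu-id 0
$ python -m spacy train config.cfg --output ./output --gpu-id 0
```

The `init_tok2vec` hand-off that the pretrain section describes could then look like the hypothetical config excerpt below: the filename depends on which epoch's weights you pick, and placing the value under `[paths]` is an assumption about the config layout rather than something this patch specifies.

```ini
[paths]
# Hypothetical: one of the per-epoch weights files written by `spacy pretrain`
init_tok2vec = "pretrain_output/model99.bin"
```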