From 01c1538c720f529f433163d495c351ecbd13ccc2 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 2 Oct 2020 01:36:06 +0200
Subject: [PATCH] Integrate file readers

---
 pyproject.toml                                |   2 +-
 requirements.txt                              |   4 +-
 setup.cfg                                     |   6 +-
 spacy/default_config_pretraining.cfg          |   2 +-
 spacy/errors.py                               |   6 -
 spacy/tests/training/test_training.py         |   6 +-
 spacy/training/augment.py                     |  40 +++---
 spacy/training/corpus.py                      |  10 +-
 spacy/util.py                                 |   4 -
 website/docs/api/corpus.md                    |  16 +--
 website/docs/api/data-formats.md              |   4 +-
 website/docs/api/top-level.md                 | 115 ++++++++++++------
 website/docs/usage/embeddings-transformers.md |   2 +-
 website/meta/type-annotations.json            |   2 +-
 14 files changed, 126 insertions(+), 93 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index e88ba7db9..611a95d27 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a42,<8.0.0a50",
+    "thinc>=8.0.0a43,<8.0.0a50",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "pathy"
diff --git a/requirements.txt b/requirements.txt
index 064efed42..44dad38e3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,12 +1,12 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a42,<8.0.0a50
+thinc>=8.0.0a43,<8.0.0a50
 blis>=0.4.0,<0.5.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.8.0,<1.1.0
-srsly>=2.1.0,<3.0.0
+srsly>=2.3.0,<3.0.0
 catalogue>=2.0.1,<2.1.0
 typer>=0.3.0,<0.4.0
 pathy
diff --git a/setup.cfg b/setup.cfg
index 36ab64bd9..7a3a2cb30 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,16 +34,16 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a42,<8.0.0a50
+    thinc>=8.0.0a43,<8.0.0a50
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a42,<8.0.0a50
+    thinc>=8.0.0a43,<8.0.0a50
     blis>=0.4.0,<0.5.0
     wasabi>=0.8.0,<1.1.0
-    srsly>=2.1.0,<3.0.0
+    srsly>=2.3.0,<3.0.0
     catalogue>=2.0.1,<2.1.0
     typer>=0.3.0,<0.4.0
     pathy
diff --git a/spacy/default_config_pretraining.cfg b/spacy/default_config_pretraining.cfg
index 4011159a4..66987171a 100644
--- a/spacy/default_config_pretraining.cfg
+++ b/spacy/default_config_pretraining.cfg
@@ -34,7 +34,7 @@ learn_rate = 0.001
 [corpora]
 
 [corpora.pretrain]
-@readers = "spacy.JsonlReader.v1"
+@readers = "spacy.JsonlCorpus.v1"
 path = ${paths.raw_text}
 min_length = 5
 max_length = 500
diff --git a/spacy/errors.py b/spacy/errors.py
index 5236992e9..881a697f6 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -477,12 +477,6 @@ class Errors:
     E201 = ("Span index out of range.")
 
     # TODO: fix numbering after merging develop into master
-    E912 = ("No orth_variants lookups table for data augmentation available for "
-            "language '{lang}'. If orth_variants are available in "
-            "spacy-lookups-data, make sure the package is installed and the "
-            "table is loaded in the [initialize.lookups] block of your config. "
-            "Alternatively, you can provide your own Lookups object with a "
-            "table orth_variants as the argument 'lookuos' of the augmenter.")
     E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
             "config.cfg or override it on the CLI?")
     E914 = ("Executing {name} callback failed. Expected the function to "
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index 405801f62..c53042ef1 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -504,9 +504,9 @@ def test_make_orth_variants(doc):
             {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
         ]
     }
-    lookups = Lookups()
-    lookups.add_table("orth_variants", orth_variants)
-    augmenter = create_orth_variants_augmenter(level=0.2, lower=0.5, lookups=lookups)
+    augmenter = create_orth_variants_augmenter(
+        level=0.2, lower=0.5, orth_variants=orth_variants
+    )
     with make_tempdir() as tmpdir:
         output_file = tmpdir / "roundtrip.spacy"
         DocBin(docs=[doc]).to_disk(output_file)
diff --git a/spacy/training/augment.py b/spacy/training/augment.py
index 176530a1c..8965c5457 100644
--- a/spacy/training/augment.py
+++ b/spacy/training/augment.py
@@ -1,27 +1,43 @@
-from typing import Callable, Iterator, Dict, List, Tuple, Optional, TYPE_CHECKING
+from typing import Callable, Iterator, Dict, List, Tuple, TYPE_CHECKING
 import random
 import itertools
 import copy
 from functools import partial
+from pydantic import BaseModel, StrictStr
 
 from ..util import registry, logger
 from ..tokens import Doc
 from .example import Example
-from ..lookups import Lookups
-from ..errors import Errors
 
 if TYPE_CHECKING:
     from ..language import Language  # noqa: F401
 
 
+class OrthVariantsSingle(BaseModel):
+    tags: List[StrictStr]
+    variants: List[StrictStr]
+
+
+class OrthVariantsPaired(BaseModel):
+    tags: List[StrictStr]
+    variants: List[List[StrictStr]]
+
+
+class OrthVariants(BaseModel):  # schema for the augmenter's orth_variants data
+    paired: List[OrthVariantsPaired] = []  # list default, matching the annotation
+    single: List[OrthVariantsSingle] = []  # list default, matching the annotation
+
+
 @registry.augmenters("spacy.orth_variants.v1")
 def create_orth_variants_augmenter(
-    level: float, lower: float, lookups: Optional[Lookups] = None,
+    level: float, lower: float, orth_variants: OrthVariants,
 ) -> Callable[["Language", Example], Iterator[Example]]:
     """Create a data augmentation callback that uses orth-variant replacement.
     The callback can be added to a corpus or other data iterator during training.
     """
-    return partial(orth_variants_augmenter, level=level, lower=lower, lookups=lookups)
+    return partial(
+        orth_variants_augmenter, orth_variants=orth_variants, level=level, lower=lower
+    )
 
 
 def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:
@@ -31,20 +47,11 @@ def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:
 def orth_variants_augmenter(
     nlp: "Language",
     example: Example,
+    orth_variants: dict,
     *,
     level: float = 0.0,
     lower: float = 0.0,
-    lookups: Optional[Lookups] = None,
 ) -> Iterator[Example]:
-    table_name = "orth_variants"
-    if lookups is not None:
-        orth_variants = lookups.get_table(table_name, {})
-        logger.debug("Using data augmentation orth variants from provided lookups")
-    else:
-        orth_variants = nlp.vocab.lookups.get_table(table_name, {})
-        logger.debug("Using data augmentation orth variants from default vocab lookups")
-        if not orth_variants:
-            raise ValueError(Errors.E912.format(lang=nlp.lang))
     if random.random() >= level:
         yield example
     else:
@@ -74,13 +81,14 @@ def make_orth_variants(
     nlp: "Language",
     raw: str,
     token_dict: Dict[str, List[str]],
-    orth_variants: Dict[str, list],
+    orth_variants: Dict[str, List[Dict[str, List[str]]]],
     *,
     lower: bool = False,
 ) -> Tuple[str, Dict[str, List[str]]]:
     orig_token_dict = copy.deepcopy(token_dict)
     ndsv = orth_variants.get("single", [])
     ndpv = orth_variants.get("paired", [])
+    logger.debug(f"Data augmentation: {len(ndsv)} single / {len(ndpv)} paired variants")
     words = token_dict.get("words", [])
     tags = token_dict.get("tags", [])
     # keep unmodified if words or tags are not defined
diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py
index 57787cf76..b3ff30e66 100644
--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@@ -38,11 +38,11 @@ def create_docbin_reader(
     )
 
 
-@util.registry.readers("spacy.JsonlReader.v1")
+@util.registry.readers("spacy.JsonlCorpus.v1")
 def create_jsonl_reader(
     path: Path, min_length: int = 0, max_length: int = 0, limit: int = 0
 ) -> Callable[["Language"], Iterable[Doc]]:
-    return JsonlTexts(path, min_length=min_length, max_length=max_length, limit=limit)
+    return JsonlCorpus(path, min_length=min_length, max_length=max_length, limit=limit)
 
 
 @util.registry.readers("spacy.read_labels.v1")
@@ -193,7 +193,7 @@ class Corpus:
                             break
 
 
-class JsonlTexts:
+class JsonlCorpus:
     """Iterate Doc objects from a file or directory of jsonl
     formatted raw text files.
 
@@ -206,7 +206,7 @@ class JsonlTexts:
     limit (int): Limit corpus to a subset of examples, e.g. for debugging.
         Defaults to 0, which indicates no limit.
 
-    DOCS: https://nightly.spacy.io/api/corpus#jsonltexts
+    DOCS: https://nightly.spacy.io/api/corpus#jsonlcorpus
     """
 
     file_type = "jsonl"
@@ -230,7 +230,7 @@ class JsonlTexts:
         nlp (Language): The current nlp object.
         YIELDS (Example): The example objects.
 
-        DOCS: https://nightly.spacy.io/api/corpus#jsonltexts-call
+        DOCS: https://nightly.spacy.io/api/corpus#jsonlcorpus-call
         """
         for loc in walk_corpus(self.path, ".jsonl"):
             records = srsly.read_jsonl(loc)
diff --git a/spacy/util.py b/spacy/util.py
index 8a96ba4fe..f234927d6 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -103,10 +103,6 @@ class registry(thinc.registry):
     cli = catalogue.create("spacy", "cli", entry_points=True)
 
 
-# We want json loading in the registry, so manually register srsly.read_json.
-registry.readers("srsly.read_json.v0", srsly.read_json)
-
-
 class SimpleFrozenDict(dict):
     """Simplified implementation of a frozen dict, mainly used as default
     function or method argument (for arguments that should default to empty
diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md
index 58006a19b..986c6f458 100644
--- a/website/docs/api/corpus.md
+++ b/website/docs/api/corpus.md
@@ -100,7 +100,7 @@ Yield examples from the data.
 | `nlp`      | The current `nlp` object. ~~Language~~ |
 | **YIELDS** | The examples. ~~Example~~              |
 
-## JsonlTexts {#jsonltexts tag="class"}
+## JsonlCorpus {#jsonlcorpus tag="class"}
 
 Iterate Doc objects from a file or directory of JSONL (newline-delimited JSON)
 formatted raw text files. Can be used to read the raw text corpus for language
@@ -126,22 +126,22 @@ file.
 {"text": "My cynical view on this is that it will never be free to the public. Reason: what would be the draw of joining the military? Right now their selling point is free Healthcare and Education. Ironically both are run horribly and most, that I've talked to, come out wishing they never went in."}
 ```
 
-### JsonlTexts.\_\init\_\_ {#jsonltexts-init tag="method"}
+### JsonlCorpus.\_\_init\_\_ {#jsonlcorpus-init tag="method"}
 
 Initialize the reader.
 
 > #### Example
 >
 > ```python
-> from spacy.training import JsonlTexts
+> from spacy.training import JsonlCorpus
 >
-> corpus = JsonlTexts("./data/texts.jsonl")
+> corpus = JsonlCorpus("./data/texts.jsonl")
 > ```
 >
 > ```ini
 > ### Example config
 > [corpora.pretrain]
-> @readers = "spacy.JsonlReader.v1"
+> @readers = "spacy.JsonlCorpus.v1"
 > path = "corpus/raw_text.jsonl"
 > min_length = 0
 > max_length = 0
@@ -156,17 +156,17 @@ Initialize the reader.
 | `max_length`   | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~        |
 | `limit`        | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~                                  |
 
-### JsonlTexts.\_\_call\_\_ {#jsonltexts-call tag="method"}
+### JsonlCorpus.\_\_call\_\_ {#jsonlcorpus-call tag="method"}
 
 Yield examples from the data.
 
 > #### Example
 >
 > ```python
-> from spacy.training import JsonlTexts
+> from spacy.training import JsonlCorpus
 > import spacy
 >
-> corpus = JsonlTexts("./texts.jsonl")
+> corpus = JsonlCorpus("./texts.jsonl")
 > nlp = spacy.blank("en")
 > data = corpus(nlp)
 > ```
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index 22a0076cd..c1b9bfef4 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -135,7 +135,7 @@ $ python -m spacy train config.cfg --paths.train ./corpus/train.spacy
 > path = ${paths:dev}
 >
 > [corpora.pretrain]
-> @readers = "spacy.JsonlReader.v1"
+> @readers = "spacy.JsonlCorpus.v1"
 > path = ${paths.raw}
 >
 > [corpora.my_custom_data]
@@ -146,7 +146,7 @@ This section defines a **dictionary** mapping of string keys to functions. Each
 function takes an `nlp` object and yields [`Example`](/api/example) objects. By
 default, the two keys `train` and `dev` are specified and each refer to a
 [`Corpus`](/api/top-level#Corpus). When pretraining, an additional `pretrain`
-section is added that defaults to a [`JsonlReader`](/api/top-level#JsonlReader).
+section is added that defaults to a [`JsonlCorpus`](/api/top-level#JsonlCorpus).
 You can also register custom functions that return a callable.
 
 | Name       | Description                                                                                                                                                                 |
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 22de0ea83..876006774 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -327,7 +327,7 @@ factories.
 | `losses`          | Registry for functions that create [losses](https://thinc.ai/docs/api-loss).                                                                                                                                                                       |
 | `misc`            | Registry for miscellaneous functions that return data assets, knowledge bases or anything else you may need.                                                                                                                                       |
 | `optimizers`      | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers).                                                                                                                                                             |
-| `readers`         | Registry for training and evaluation data readers like [`Corpus`](/api/corpus).                                                                                                                                                                    |
+| `readers`         | Registry for file and data readers, including training and evaluation data readers like [`Corpus`](/api/corpus).                                                                                                                                   |
 | `schedules`       | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules).                                                                                                                                                               |
 | `tokenizers`      | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable.                                                                   |
 
@@ -470,7 +470,65 @@ logging the results.
 
 </Project>
 
-## Readers {#readers source="spacy/training/corpus.py" new="3"}
+## Readers {#readers}
+
+### File readers {#file-readers source="github.com/explosion/srsly" new="3"}
+
+The following file readers are provided by our serialization library
+[`srsly`](https://github.com/explosion/srsly). All registered functions take one
+argument `path`, pointing to the file path to load.
+
+> #### Example config
+>
+> ```ini
+> [corpora.train.augmenter.orth_variants]
+> @readers = "srsly.read_json.v1"
+> path = "corpus/en_orth_variants.json"
+> ```
+
+| Name                    | Description                                           |
+| ----------------------- | ----------------------------------------------------- |
+| `srsly.read_json.v1`    | Read data from a JSON file.                           |
+| `srsly.read_jsonl.v1`   | Read data from a JSONL (newline-delimited JSON) file. |
+| `srsly.read_yaml.v1`    | Read data from a YAML file.                           |
+| `srsly.read_msgpack.v1` | Read data from a binary MessagePack file.             |
+
+<Infobox title="Important note" variant="warning">
+
+Since the file readers expect a local path, you should only use them in config
+blocks that are **not executed at runtime** – for example, in `[training]` and
+`[corpora]` (to load data or resources like data augmentation tables) or in
+`[initialize]` (to pass data to pipeline components).
+
+</Infobox>
+
+#### spacy.read_labels.v1 {#read_labels tag="registered function"}
+
+Read a JSON-formatted labels file generated with
+[`init labels`](/api/cli#init-labels). Typically used in the
+[`[initialize]`](/api/data-formats#config-initialize) block of the training
+config to speed up the model initialization process and provide pre-generated
+label sets.
+
+> #### Example config
+>
+> ```ini
+> [initialize.components]
+>
+> [initialize.components.ner]
+>
+> [initialize.components.ner.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/ner.json"
+> ```
+
+| Name        | Description                                                                                                                                                                                                               |
+| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `path`      | The path to the labels file generated with [`init labels`](/api/cli#init-labels). ~~Path~~                                                                                                                                |
+| `require`   | Whether to require the file to exist. If set to `False` and the labels file doesn't exist, the loader will return `None` and the `initialize` method will extract the labels from the data. Defaults to `False`. ~~bool~~ |
+| **CREATES** | The labels loaded from the file, or `None` if the file doesn't exist and `require` is `False`.                                                                                                                            |
+
+### Corpus readers {#corpus-readers source="spacy/training/corpus.py" new="3"}
 
 Corpus readers are registered functions that load data and return a function
 that takes the current `nlp` object and yields [`Example`](/api/example) objects
@@ -480,7 +538,7 @@ with your own registered function in the
 [`@readers` registry](/api/top-level#registry) to customize the data loading and
 streaming.
 
-### spacy.Corpus.v1 {#corpus tag="registered function"}
+#### spacy.Corpus.v1 {#corpus tag="registered function"}
 
 The `Corpus` reader manages annotated corpora and can be used for training and
 development datasets in the [DocBin](/api/docbin) (`.spacy`) format. Also see
@@ -509,12 +567,12 @@ the [`Corpus`](/api/corpus) class.
 | `augmenter`     | Apply some simply data augmentation, where we replace tokens with variations. This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart-quotes, or only have smart quotes, etc. Defaults to `None`. ~~Optional[Callable]~~ |
 | **CREATES**     | The corpus reader. ~~Corpus~~                                                                                                                                                                                                                                                            |
 
-### spacy.JsonlReader.v1 {#jsonlreader tag="registered function"}
+#### spacy.JsonlCorpus.v1 {#jsonlcorpus tag="registered function"}
 
 Create [`Example`](/api/example) objects from a JSONL (newline-delimited JSON)
 file of texts keyed by `"text"`. Can be used to read the raw text corpus for
 language model [pretraining](/usage/embeddings-transformers#pretraining) from a
-JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
+JSONL file. Also see the [`JsonlCorpus`](/api/corpus#jsonlcorpus) class.
 
 > #### Example config
 >
@@ -523,7 +581,7 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
 > pretrain = "corpus/raw_text.jsonl"
 >
 > [corpora.pretrain]
-> @readers = "spacy.JsonlReader.v1"
+> @readers = "spacy.JsonlCorpus.v1"
 > path = ${paths.pretrain}
 > min_length = 0
 > max_length = 0
@@ -536,33 +594,7 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
 | `min_length` | Minimum document length (in tokens). Shorter documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~       |
 | `max_length` | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~        |
 | `limit`      | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~                                  |
-| **CREATES**  | The corpus reader. ~~JsonlTexts~~                                                                                                |
-
-### spacy.read_labels.v1 {#read_labels tag="registered function"}
-
-Read a JSON-formatted labels file generated with
-[`init labels`](/api/cli#init-labels). Typically used in the
-[`[initialize]`](/api/data-formats#config-initialize) block of the training
-config to speed up the model initialization process and provide pre-generated
-label sets.
-
-> #### Example config
->
-> ```ini
-> [initialize.components]
->
-> [initialize.components.ner]
->
-> [initialize.components.ner.labels]
-> @readers = "spacy.read_labels.v1"
-> path = "corpus/labels/ner.json"
-> ```
-
-| Name        | Description                                                                                                                                                                                                               |
-| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `path`      | The path to the labels file generated with [`init labels`](/api/cli#init-labels). ~~Path~~                                                                                                                                |
-| `require`   | Whether to require the file to exist. If set to `False` and the labels file doesn't exist, the loader will return `None` and the `initialize` method will extract the labels from the data. Defaults to `False`. ~~bool~~ |
-| **CREATES** | The                                                                                                                                                                                                                       |
+| **CREATES**  | The corpus reader. ~~JsonlCorpus~~                                                                                               |
 
 ## Batchers {#batchers source="spacy/training/batchers.py" new="3"}
 
@@ -664,7 +696,10 @@ sequences in the batch.
 > @augmenters = "spacy.orth_variants.v1"
 > level = 0.1
 > lower = 0.5
-> lookups = null
+>
+> [corpora.train.augmenter.orth_variants]
+> @readers = "srsly.read_json.v1"
+> path = "corpus/en_orth_variants.json"
 > ```
 
 Create a data augmentation callback that uses orth-variant replacement. The
@@ -672,12 +707,12 @@ callback can be added to a corpus or other data iterator during training. This
 is especially useful for punctuation and case replacement, to help generalize
 beyond corpora that don't have smart quotes, or only have smart quotes etc.
 
-| Name        | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
-| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `level`     | The percentage of texts that will be augmented. ~~float~~                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
-| `lower`     | The percentage of texts that will be lowercased. ~~float~~                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
-| `lookups`   | Lookups table containing the orth variants to use. See [`orth_variants.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_orth_variants.json) for an example. If not set, tables from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) are used if available and added in the [`[initialize]`](/api/data-formats#config-initialize) block of the config. If no orth variants are found, spaCy will raise an error. Defaults to `None`. ~~Optional[Lookups]~~ |
-| **CREATES** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~                                                                                                                                                                                                                                                                                                                                                   |
+| Name            | Description                                                                                                                                                                                                                                                                                               |
+| --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `level`         | The percentage of texts that will be augmented. ~~float~~                                                                                                                                                                                                                                                 |
+| `lower`         | The percentage of texts that will be lowercased. ~~float~~                                                                                                                                                                                                                                                |
+| `orth_variants` | A dictionary containing the single and paired orth variants. Typically loaded from a JSON file. See [`en_orth_variants.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_orth_variants.json) for an example. ~~Dict[str, List[Dict[str, List[Union[str, List[str]]]]]]~~ |
+| **CREATES**     | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~                                                                                                                              |
 
 ## Training data and alignment {#gold source="spacy/training"}
 
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index 1b78b8dc5..c615097d6 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -622,7 +622,7 @@ that are familiar from the training block: the `[pretraining.batcher]`,
 `[pretraining.optimizer]` and `[pretraining.corpus]` all work the same way and
 expect the same types of objects, although for pretraining your corpus does not
 need to have any annotations, so you will often use a different reader, such as
-the [`JsonlReader`](/api/top-level#jsonlreader).
+the [`JsonlCorpus`](/api/top-level#jsonlcorpus).
 
 > #### Raw text format
 >
diff --git a/website/meta/type-annotations.json b/website/meta/type-annotations.json
index 43a524e93..acbc88ae2 100644
--- a/website/meta/type-annotations.json
+++ b/website/meta/type-annotations.json
@@ -24,7 +24,7 @@
     "TransformerData": "/api/transformer#transformerdata",
     "FullTransformerBatch": "/api/transformer#fulltransformerbatch",
     "Corpus": "/api/corpus",
-    "JsonlTexts": "/api/corpus#jsonltexts",
+    "JsonlCorpus": "/api/corpus#jsonlcorpus",
     "LexemeC": "/api/cython-structs#lexemec",
     "TokenC": "/api/cython-structs#tokenc",
     "Config": "https://thinc.ai/docs/api-config#config",