Integrate file readers

2025-11-28 05:45:44 +03:00 · 2020-10-02 01:36:06 +02:00 · 2020-10-02 01:36:06 +02:00 · 01c1538c72
commit 01c1538c72
parent af282ae732
14 changed files with 126 additions and 93 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@ -6,7 +6,7 @@ requires = [
    "cymem>=2.0.2,<2.1.0",
    "preshed>=3.0.2,<3.1.0",
    "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a42,<8.0.0a50",
+    "thinc>=8.0.0a43,<8.0.0a50",
    "blis>=0.4.0,<0.5.0",
    "pytokenizations",
    "pathy"
--- a/requirements.txt
+++ b/requirements.txt
@ -1,12 +1,12 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a42,<8.0.0a50
+thinc>=8.0.0a43,<8.0.0a50
 blis>=0.4.0,<0.5.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.8.0,<1.1.0
-srsly>=2.1.0,<3.0.0
+srsly>=2.3.0,<3.0.0
 catalogue>=2.0.1,<2.1.0
 typer>=0.3.0,<0.4.0
 pathy
--- a/setup.cfg
+++ b/setup.cfg
@ -34,16 +34,16 @@ setup_requires =
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
    murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a42,<8.0.0a50
+    thinc>=8.0.0a43,<8.0.0a50
 install_requires =
    # Our libraries
    murmurhash>=0.28.0,<1.1.0
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a42,<8.0.0a50
+    thinc>=8.0.0a43,<8.0.0a50
    blis>=0.4.0,<0.5.0
    wasabi>=0.8.0,<1.1.0
-    srsly>=2.1.0,<3.0.0
+    srsly>=2.3.0,<3.0.0
    catalogue>=2.0.1,<2.1.0
    typer>=0.3.0,<0.4.0
    pathy
--- a/spacy/default_config_pretraining.cfg
+++ b/spacy/default_config_pretraining.cfg
@ -34,7 +34,7 @@ learn_rate = 0.001
 [corpora]

 [corpora.pretrain]
-@readers = "spacy.JsonlReader.v1"
+@readers = "spacy.JsonlCorpus.v1"
 path = ${paths.raw_text}
 min_length = 5
 max_length = 500
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -477,12 +477,6 @@ class Errors:
    E201 = ("Span index out of range.")

    # TODO: fix numbering after merging develop into master
-    E912 = ("No orth_variants lookups table for data augmentation available for "
-            "language '{lang}'. If orth_variants are available in "
-            "spacy-lookups-data, make sure the package is installed and the "
-            "table is loaded in the [initialize.lookups] block of your config. "
-            "Alternatively, you can provide your own Lookups object with a "
-            "table orth_variants as the argument 'lookuos' of the augmenter.")
    E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
            "config.cfg or override it on the CLI?")
    E914 = ("Executing {name} callback failed. Expected the function to "
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@ -504,9 +504,9 @@ def test_make_orth_variants(doc):
            {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
        ]
    }
-    lookups = Lookups()
-    lookups.add_table("orth_variants", orth_variants)
-    augmenter = create_orth_variants_augmenter(level=0.2, lower=0.5, lookups=lookups)
+    augmenter = create_orth_variants_augmenter(
+        level=0.2, lower=0.5, orth_variants=orth_variants
+    )
    with make_tempdir() as tmpdir:
        output_file = tmpdir / "roundtrip.spacy"
        DocBin(docs=[doc]).to_disk(output_file)
--- a/spacy/training/augment.py
+++ b/spacy/training/augment.py
@ -1,27 +1,43 @@
-from typing import Callable, Iterator, Dict, List, Tuple, Optional, TYPE_CHECKING
+from typing import Callable, Iterator, Dict, List, Tuple, TYPE_CHECKING
 import random
 import itertools
 import copy
 from functools import partial
+from pydantic import BaseModel, StrictStr

 from ..util import registry, logger
 from ..tokens import Doc
 from .example import Example
-from ..lookups import Lookups
-from ..errors import Errors

 if TYPE_CHECKING:
    from ..language import Language  # noqa: F401


+class OrthVariantsSingle(BaseModel):
+    """Schema for one entry of the "single" orth variants list.
+
+    tags: Strings naming the tags the entry applies to.
+    variants: The variant strings for those tags.
+    """
+
+    tags: List[StrictStr]
+    variants: List[StrictStr]
+
+
+class OrthVariantsPaired(BaseModel):
+    """Schema for one entry of the "paired" orth variants list.
+
+    tags: Strings naming the tags the entry applies to.
+    variants: Groups of variant strings, one inner list per group.
+    """
+
+    tags: List[StrictStr]
+    variants: List[List[StrictStr]]
+
+
+class OrthVariants(BaseModel):
+    """Schema for the orth variants data passed to the augmenter.
+
+    Both keys are optional and default to an empty list.
+    """
+
+    # Defaults must be lists (not dicts) to agree with the declared field types.
+    paired: List[OrthVariantsPaired] = []
+    single: List[OrthVariantsSingle] = []
+
+
@registry.augmenters("spacy.orth_variants.v1")
 def create_orth_variants_augmenter(
-    level: float, lower: float, lookups: Optional[Lookups] = None,
+    level: float, lower: float, orth_variants: OrthVariants,
 ) -> Callable[["Language", Example], Iterator[Example]]:
    """Create a data augmentation callback that uses orth-variant replacement.
    The callback can be added to a corpus or other data iterator during training.
    """
-    return partial(orth_variants_augmenter, level=level, lower=lower, lookups=lookups)
+    return partial(
+        orth_variants_augmenter, orth_variants=orth_variants, level=level, lower=lower
+    )


 def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:
@ -31,20 +47,11 @@ def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:
 def orth_variants_augmenter(
    nlp: "Language",
    example: Example,
+    orth_variants: dict,
    *,
    level: float = 0.0,
    lower: float = 0.0,
-    lookups: Optional[Lookups] = None,
 ) -> Iterator[Example]:
-    table_name = "orth_variants"
-    if lookups is not None:
-        orth_variants = lookups.get_table(table_name, {})
-        logger.debug("Using data augmentation orth variants from provided lookups")
-    else:
-        orth_variants = nlp.vocab.lookups.get_table(table_name, {})
-        logger.debug("Using data augmentation orth variants from default vocab lookups")
-        if not orth_variants:
-            raise ValueError(Errors.E912.format(lang=nlp.lang))
    if random.random() >= level:
        yield example
    else:
@ -74,13 +81,14 @@ def make_orth_variants(
    nlp: "Language",
    raw: str,
    token_dict: Dict[str, List[str]],
-    orth_variants: Dict[str, list],
+    orth_variants: Dict[str, List[Dict[str, List[str]]]],
    *,
    lower: bool = False,
 ) -> Tuple[str, Dict[str, List[str]]]:
    orig_token_dict = copy.deepcopy(token_dict)
    ndsv = orth_variants.get("single", [])
    ndpv = orth_variants.get("paired", [])
+    logger.debug(f"Data augmentation: {len(ndsv)} single / {len(ndpv)} paired variants")
    words = token_dict.get("words", [])
    tags = token_dict.get("tags", [])
    # keep unmodified if words or tags are not defined
--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@ -38,11 +38,11 @@ def create_docbin_reader(
    )


-@util.registry.readers("spacy.JsonlReader.v1")
+@util.registry.readers("spacy.JsonlCorpus.v1")
 def create_jsonl_reader(
    path: Path, min_length: int = 0, max_length: int = 0, limit: int = 0
 ) -> Callable[["Language"], Iterable[Doc]]:
-    return JsonlTexts(path, min_length=min_length, max_length=max_length, limit=limit)
+    return JsonlCorpus(path, min_length=min_length, max_length=max_length, limit=limit)


@util.registry.readers("spacy.read_labels.v1")
@ -193,7 +193,7 @@ class Corpus:
                            break


-class JsonlTexts:
+class JsonlCorpus:
    """Iterate Doc objects from a file or directory of jsonl
    formatted raw text files.

@ -206,7 +206,7 @@ class JsonlTexts:
    limit (int): Limit corpus to a subset of examples, e.g. for debugging.
        Defaults to 0, which indicates no limit.

-    DOCS: https://nightly.spacy.io/api/corpus#jsonltexts
+    DOCS: https://nightly.spacy.io/api/corpus#jsonlcorpus
    """

    file_type = "jsonl"
@ -230,7 +230,7 @@ class JsonlTexts:
        nlp (Language): The current nlp object.
        YIELDS (Example): The example objects.

-        DOCS: https://nightly.spacy.io/api/corpus#jsonltexts-call
+        DOCS: https://nightly.spacy.io/api/corpus#jsonlcorpus-call
        """
        for loc in walk_corpus(self.path, ".jsonl"):
            records = srsly.read_jsonl(loc)
--- a/spacy/util.py
+++ b/spacy/util.py
@ -103,10 +103,6 @@ class registry(thinc.registry):
    cli = catalogue.create("spacy", "cli", entry_points=True)


-# We want json loading in the registry, so manually register srsly.read_json.
-registry.readers("srsly.read_json.v0", srsly.read_json)
-
-
 class SimpleFrozenDict(dict):
    """Simplified implementation of a frozen dict, mainly used as default
    function or method argument (for arguments that should default to empty
--- a/website/docs/api/corpus.md
+++ b/website/docs/api/corpus.md
@ -100,7 +100,7 @@ Yield examples from the data.
 | `nlp`      | The current `nlp` object. ~~Language~~ |
 | **YIELDS** | The examples. ~~Example~~              |

-## JsonlTexts {#jsonltexts tag="class"}
+## JsonlCorpus {#jsonlcorpus tag="class"}

 Iterate Doc objects from a file or directory of JSONL (newline-delimited JSON)
 formatted raw text files. Can be used to read the raw text corpus for language
@ -126,22 +126,22 @@ file.
 {"text": "My cynical view on this is that it will never be free to the public. Reason: what would be the draw of joining the military? Right now their selling point is free Healthcare and Education. Ironically both are run horribly and most, that I've talked to, come out wishing they never went in."}
 ```

-### JsonlTexts.\_\init\_\_ {#jsonltexts-init tag="method"}
+### JsonlCorpus.\_\_init\_\_ {#jsonlcorpus-init tag="method"}

 Initialize the reader.

 > #### Example
 >
 > ```python
-> from spacy.training import JsonlTexts
+> from spacy.training import JsonlCorpus
 >
-> corpus = JsonlTexts("./data/texts.jsonl")
+> corpus = JsonlCorpus("./data/texts.jsonl")
 > ```
 >
 > ```ini
 > ### Example config
 > [corpora.pretrain]
-> @readers = "spacy.JsonlReader.v1"
+> @readers = "spacy.JsonlCorpus.v1"
 > path = "corpus/raw_text.jsonl"
 > min_length = 0
 > max_length = 0
@ -156,17 +156,17 @@ Initialize the reader.
 | `max_length`   | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~        |
 | `limit`        | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~                                  |

-### JsonlTexts.\_\_call\_\_ {#jsonltexts-call tag="method"}
+### JsonlCorpus.\_\_call\_\_ {#jsonlcorpus-call tag="method"}

 Yield examples from the data.

 > #### Example
 >
 > ```python
-> from spacy.training import JsonlTexts
+> from spacy.training import JsonlCorpus
 > import spacy
 >
-> corpus = JsonlTexts("./texts.jsonl")
+> corpus = JsonlCorpus("./texts.jsonl")
 > nlp = spacy.blank("en")
 > data = corpus(nlp)
 > ```
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@ -135,7 +135,7 @@ $ python -m spacy train config.cfg --paths.train ./corpus/train.spacy
 > path = ${paths:dev}
 >
 > [corpora.pretrain]
-> @readers = "spacy.JsonlReader.v1"
+> @readers = "spacy.JsonlCorpus.v1"
 > path = ${paths.raw}
 >
 > [corpora.my_custom_data]
@ -146,7 +146,7 @@ This section defines a **dictionary** mapping of string keys to functions. Each
 function takes an `nlp` object and yields [`Example`](/api/example) objects. By
 default, the two keys `train` and `dev` are specified and each refer to a
 [`Corpus`](/api/top-level#Corpus). When pretraining, an additional `pretrain`
-section is added that defaults to a [`JsonlReader`](/api/top-level#JsonlReader).
+section is added that defaults to a [`JsonlCorpus`](/api/top-level#jsonlcorpus).
 You can also register custom functions that return a callable.

 | Name       | Description                                                                                                                                                                 |
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@ -327,7 +327,7 @@ factories.
 | `losses`          | Registry for functions that create [losses](https://thinc.ai/docs/api-loss).                                                                                                                                                                       |
 | `misc`            | Registry for miscellaneous functions that return data assets, knowledge bases or anything else you may need.                                                                                                                                       |
 | `optimizers`      | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers).                                                                                                                                                             |
-| `readers`         | Registry for training and evaluation data readers like [`Corpus`](/api/corpus).                                                                                                                                                                    |
+| `readers`         | Registry for file and data readers, including training and evaluation data readers like [`Corpus`](/api/corpus).                                                                                                                                   |
 | `schedules`       | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules).                                                                                                                                                               |
 | `tokenizers`      | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable.                                                                   |

@ -470,7 +470,65 @@ logging the results.

 </Project>

-## Readers {#readers source="spacy/training/corpus.py" new="3"}
+## Readers {#readers}
+
+### File readers {#file-readers source="github.com/explosion/srsly" new="3"}
+
+The following file readers are provided by our serialization library
+[`srsly`](https://github.com/explosion/srsly). All registered functions take one
+argument `path`, pointing to the file path to load.
+
+> #### Example config
+>
+> ```ini
+> [corpora.train.augmenter.orth_variants]
+> @readers = "srsly.read_json.v1"
+> path = "corpus/en_orth_variants.json"
+> ```
+
+| Name                    | Description                                           |
+| ----------------------- | ----------------------------------------------------- |
+| `srsly.read_json.v1`    | Read data from a JSON file.                           |
+| `srsly.read_jsonl.v1`   | Read data from a JSONL (newline-delimited JSON) file. |
+| `srsly.read_yaml.v1`    | Read data from a YAML file.                           |
+| `srsly.read_msgpack.v1` | Read data from a binary MessagePack file.             |
+
+<Infobox title="Important note" variant="warning">
+
+Since the file readers expect a local path, you should only use them in config
+blocks that are **not executed at runtime** – for example, in `[training]` and
+`[corpora]` (to load data or resources like data augmentation tables) or in
+`[initialize]` (to pass data to pipeline components).
+
+</Infobox>
+
+#### spacy.read_labels.v1 {#read_labels tag="registered function"}
+
+Read a JSON-formatted labels file generated with
+[`init labels`](/api/cli#init-labels). Typically used in the
+[`[initialize]`](/api/data-formats#config-initialize) block of the training
+config to speed up the model initialization process and provide pre-generated
+label sets.
+
+> #### Example config
+>
+> ```ini
+> [initialize.components]
+>
+> [initialize.components.ner]
+>
+> [initialize.components.ner.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/ner.json"
+> ```
+
+| Name        | Description                                                                                                                                                                                                               |
+| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `path`      | The path to the labels file generated with [`init labels`](/api/cli#init-labels). ~~Path~~                                                                                                                                |
+| `require`   | Whether to require the file to exist. If set to `False` and the labels file doesn't exist, the loader will return `None` and the `initialize` method will extract the labels from the data. Defaults to `False`. ~~bool~~ |
+| **CREATES** | The labels loaded from the file, or `None` if the file doesn't exist and `require` is `False`.                                                                                                                           |
+
+### Corpus readers {#corpus-readers source="spacy/training/corpus.py" new="3"}

 Corpus readers are registered functions that load data and return a function
 that takes the current `nlp` object and yields [`Example`](/api/example) objects
@ -480,7 +538,7 @@ with your own registered function in the
 [`@readers` registry](/api/top-level#registry) to customize the data loading and
 streaming.

-### spacy.Corpus.v1 {#corpus tag="registered function"}
+#### spacy.Corpus.v1 {#corpus tag="registered function"}

 The `Corpus` reader manages annotated corpora and can be used for training and
 development datasets in the [DocBin](/api/docbin) (`.spacy`) format. Also see
@ -509,12 +567,12 @@ the [`Corpus`](/api/corpus) class.
 | `augmenter`     | Apply some simple data augmentation, where we replace tokens with variations. This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart-quotes, or only have smart quotes, etc. Defaults to `None`. ~~Optional[Callable]~~ |
 | **CREATES**     | The corpus reader. ~~Corpus~~                                                                                                                                                                                                                                                            |

-### spacy.JsonlReader.v1 {#jsonlreader tag="registered function"}
+#### spacy.JsonlCorpus.v1 {#jsonlcorpus tag="registered function"}

 Create [`Example`](/api/example) objects from a JSONL (newline-delimited JSON)
 file of texts keyed by `"text"`. Can be used to read the raw text corpus for
 language model [pretraining](/usage/embeddings-transformers#pretraining) from a
-JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
+JSONL file. Also see the [`JsonlCorpus`](/api/corpus#jsonlcorpus) class.

 > #### Example config
 >
@ -523,7 +581,7 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
 > pretrain = "corpus/raw_text.jsonl"
 >
 > [corpora.pretrain]
-> @readers = "spacy.JsonlReader.v1"
+> @readers = "spacy.JsonlCorpus.v1"
 > path = ${paths.pretrain}
 > min_length = 0
 > max_length = 0
@ -536,33 +594,7 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
 | `min_length` | Minimum document length (in tokens). Shorter documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~       |
 | `max_length` | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~        |
 | `limit`      | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~                                  |
-| **CREATES**  | The corpus reader. ~~JsonlTexts~~                                                                                                |
-
-### spacy.read_labels.v1 {#read_labels tag="registered function"}
-
-Read a JSON-formatted labels file generated with
-[`init labels`](/api/cli#init-labels). Typically used in the
-[`[initialize]`](/api/data-formats#config-initialize) block of the training
-config to speed up the model initialization process and provide pre-generated
-label sets.
-
-> #### Example config
->
-> ```ini
-> [initialize.components]
->
-> [initialize.components.ner]
->
-> [initialize.components.ner.labels]
-> @readers = "spacy.read_labels.v1"
-> path = "corpus/labels/ner.json"
-> ```
-
-| Name        | Description                                                                                                                                                                                                               |
-| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `path`      | The path to the labels file generated with [`init labels`](/api/cli#init-labels). ~~Path~~                                                                                                                                |
-| `require`   | Whether to require the file to exist. If set to `False` and the labels file doesn't exist, the loader will return `None` and the `initialize` method will extract the labels from the data. Defaults to `False`. ~~bool~~ |
-| **CREATES** | The                                                                                                                                                                                                                       |
+| **CREATES**  | The corpus reader. ~~JsonlCorpus~~                                                                                               |

 ## Batchers {#batchers source="spacy/training/batchers.py" new="3"}

@ -664,7 +696,10 @@ sequences in the batch.
 > @augmenters = "spacy.orth_variants.v1"
 > level = 0.1
 > lower = 0.5
-> lookups = null
+>
+> [corpora.train.augmenter.orth_variants]
+> @readers = "srsly.read_json.v1"
+> path = "corpus/en_orth_variants.json"
 > ```

 Create a data augmentation callback that uses orth-variant replacement. The
@ -673,10 +708,10 @@ is especially useful for punctuation and case replacement, to help generalize
 beyond corpora that don't have smart quotes, or only have smart quotes etc.

 | Name            | Description                                                                                                                                                                                                                                                                                               |
-| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `level`         | The percentage of texts that will be augmented. ~~float~~                                                                                                                                                                                                                                                 |
 | `lower`         | The percentage of texts that will be lowercased. ~~float~~                                                                                                                                                                                                                                                |
-| `lookups`   | Lookups table containing the orth variants to use. See [`orth_variants.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_orth_variants.json) for an example. If not set, tables from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) are used if available and added in the [`[initialize]`](/api/data-formats#config-initialize) block of the config. If no orth variants are found, spaCy will raise an error. Defaults to `None`. ~~Optional[Lookups]~~ |
+| `orth_variants` | A dictionary containing the single and paired orth variants. Typically loaded from a JSON file. See [`en_orth_variants.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_orth_variants.json) for an example. ~~Dict[str, List[Dict[str, List[Union[str, List[str]]]]]]~~ |
 | **CREATES**     | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~                                                                                                                              |

 ## Training data and alignment {#gold source="spacy/training"}
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@ -622,7 +622,7 @@ that are familiar from the training block: the `[pretraining.batcher]`,
 `[pretraining.optimizer]` and `[pretraining.corpus]` all work the same way and
 expect the same types of objects, although for pretraining your corpus does not
 need to have any annotations, so you will often use a different reader, such as
-the [`JsonlReader`](/api/top-level#jsonlreader).
+the [`JsonlCorpus`](/api/top-level#jsonlcorpus).

 > #### Raw text format
 >
--- a/website/meta/type-annotations.json
+++ b/website/meta/type-annotations.json
@ -24,7 +24,7 @@
    "TransformerData": "/api/transformer#transformerdata",
    "FullTransformerBatch": "/api/transformer#fulltransformerbatch",
    "Corpus": "/api/corpus",
-    "JsonlTexts": "/api/corpus#jsonltexts",
+    "JsonlCorpus": "/api/corpus#jsonlcorpus",
    "LexemeC": "/api/cython-structs#lexemec",
    "TokenC": "/api/cython-structs#tokenc",
    "Config": "https://thinc.ai/docs/api-config#config",