From 01c1538c720f529f433163d495c351ecbd13ccc2 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 2 Oct 2020 01:36:06 +0200 Subject: [PATCH] Integrate file readers --- pyproject.toml | 2 +- requirements.txt | 4 +- setup.cfg | 6 +- spacy/default_config_pretraining.cfg | 2 +- spacy/errors.py | 6 - spacy/tests/training/test_training.py | 6 +- spacy/training/augment.py | 40 +++--- spacy/training/corpus.py | 10 +- spacy/util.py | 4 - website/docs/api/corpus.md | 16 +-- website/docs/api/data-formats.md | 4 +- website/docs/api/top-level.md | 115 ++++++++++++------ website/docs/usage/embeddings-transformers.md | 2 +- website/meta/type-annotations.json | 2 +- 14 files changed, 126 insertions(+), 93 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e88ba7db9..611a95d27 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.0.0a42,<8.0.0a50", + "thinc>=8.0.0a43,<8.0.0a50", "blis>=0.4.0,<0.5.0", "pytokenizations", "pathy" diff --git a/requirements.txt b/requirements.txt index 064efed42..44dad38e3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,12 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.0.0a42,<8.0.0a50 +thinc>=8.0.0a43,<8.0.0a50 blis>=0.4.0,<0.5.0 ml_datasets==0.2.0a0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.8.0,<1.1.0 -srsly>=2.1.0,<3.0.0 +srsly>=2.3.0,<3.0.0 catalogue>=2.0.1,<2.1.0 typer>=0.3.0,<0.4.0 pathy diff --git a/setup.cfg b/setup.cfg index 36ab64bd9..7a3a2cb30 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,16 +34,16 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.0.0a42,<8.0.0a50 + thinc>=8.0.0a43,<8.0.0a50 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.0.0a42,<8.0.0a50 + thinc>=8.0.0a43,<8.0.0a50 blis>=0.4.0,<0.5.0 wasabi>=0.8.0,<1.1.0 - srsly>=2.1.0,<3.0.0 + srsly>=2.3.0,<3.0.0 
catalogue>=2.0.1,<2.1.0 typer>=0.3.0,<0.4.0 pathy diff --git a/spacy/default_config_pretraining.cfg b/spacy/default_config_pretraining.cfg index 4011159a4..66987171a 100644 --- a/spacy/default_config_pretraining.cfg +++ b/spacy/default_config_pretraining.cfg @@ -34,7 +34,7 @@ learn_rate = 0.001 [corpora] [corpora.pretrain] -@readers = "spacy.JsonlReader.v1" +@readers = "spacy.JsonlCorpus.v1" path = ${paths.raw_text} min_length = 5 max_length = 500 diff --git a/spacy/errors.py b/spacy/errors.py index 5236992e9..881a697f6 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -477,12 +477,6 @@ class Errors: E201 = ("Span index out of range.") # TODO: fix numbering after merging develop into master - E912 = ("No orth_variants lookups table for data augmentation available for " - "language '{lang}'. If orth_variants are available in " - "spacy-lookups-data, make sure the package is installed and the " - "table is loaded in the [initialize.lookups] block of your config. " - "Alternatively, you can provide your own Lookups object with a " - "table orth_variants as the argument 'lookuos' of the augmenter.") E913 = ("Corpus path can't be None. Maybe you forgot to define it in your " "config.cfg or override it on the CLI?") E914 = ("Executing {name} callback failed. 
Expected the function to " diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 405801f62..c53042ef1 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -504,9 +504,9 @@ def test_make_orth_variants(doc): {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]}, ] } - lookups = Lookups() - lookups.add_table("orth_variants", orth_variants) - augmenter = create_orth_variants_augmenter(level=0.2, lower=0.5, lookups=lookups) + augmenter = create_orth_variants_augmenter( + level=0.2, lower=0.5, orth_variants=orth_variants + ) with make_tempdir() as tmpdir: output_file = tmpdir / "roundtrip.spacy" DocBin(docs=[doc]).to_disk(output_file) diff --git a/spacy/training/augment.py b/spacy/training/augment.py index 176530a1c..8965c5457 100644 --- a/spacy/training/augment.py +++ b/spacy/training/augment.py @@ -1,27 +1,43 @@ -from typing import Callable, Iterator, Dict, List, Tuple, Optional, TYPE_CHECKING +from typing import Callable, Iterator, Dict, List, Tuple, TYPE_CHECKING import random import itertools import copy from functools import partial +from pydantic import BaseModel, StrictStr from ..util import registry, logger from ..tokens import Doc from .example import Example -from ..lookups import Lookups -from ..errors import Errors if TYPE_CHECKING: from ..language import Language # noqa: F401 +class OrthVariantsSingle(BaseModel): + tags: List[StrictStr] + variants: List[StrictStr] + + +class OrthVariantsPaired(BaseModel): + tags: List[StrictStr] + variants: List[List[StrictStr]] + + +class OrthVariants(BaseModel): + paired: List[OrthVariantsPaired] = {} + single: List[OrthVariantsSingle] = {} + + @registry.augmenters("spacy.orth_variants.v1") def create_orth_variants_augmenter( - level: float, lower: float, lookups: Optional[Lookups] = None, + level: float, lower: float, orth_variants: OrthVariants, ) -> Callable[["Language", Example], Iterator[Example]]: """Create a data 
augmentation callback that uses orth-variant replacement. The callback can be added to a corpus or other data iterator during training. """ - return partial(orth_variants_augmenter, level=level, lower=lower, lookups=lookups) + return partial( + orth_variants_augmenter, orth_variants=orth_variants, level=level, lower=lower + ) def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]: @@ -31,20 +47,11 @@ def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]: def orth_variants_augmenter( nlp: "Language", example: Example, + orth_variants: dict, *, level: float = 0.0, lower: float = 0.0, - lookups: Optional[Lookups] = None, ) -> Iterator[Example]: - table_name = "orth_variants" - if lookups is not None: - orth_variants = lookups.get_table(table_name, {}) - logger.debug("Using data augmentation orth variants from provided lookups") - else: - orth_variants = nlp.vocab.lookups.get_table(table_name, {}) - logger.debug("Using data augmentation orth variants from default vocab lookups") - if not orth_variants: - raise ValueError(Errors.E912.format(lang=nlp.lang)) if random.random() >= level: yield example else: @@ -74,13 +81,14 @@ def make_orth_variants( nlp: "Language", raw: str, token_dict: Dict[str, List[str]], - orth_variants: Dict[str, list], + orth_variants: Dict[str, List[Dict[str, List[str]]]], *, lower: bool = False, ) -> Tuple[str, Dict[str, List[str]]]: orig_token_dict = copy.deepcopy(token_dict) ndsv = orth_variants.get("single", []) ndpv = orth_variants.get("paired", []) + logger.debug(f"Data augmentation: {len(ndsv)} single / {len(ndpv)} paired variants") words = token_dict.get("words", []) tags = token_dict.get("tags", []) # keep unmodified if words or tags are not defined diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index 57787cf76..b3ff30e66 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -38,11 +38,11 @@ def create_docbin_reader( ) -@util.registry.readers("spacy.JsonlReader.v1") 
+@util.registry.readers("spacy.JsonlCorpus.v1") def create_jsonl_reader( path: Path, min_length: int = 0, max_length: int = 0, limit: int = 0 ) -> Callable[["Language"], Iterable[Doc]]: - return JsonlTexts(path, min_length=min_length, max_length=max_length, limit=limit) + return JsonlCorpus(path, min_length=min_length, max_length=max_length, limit=limit) @util.registry.readers("spacy.read_labels.v1") @@ -193,7 +193,7 @@ class Corpus: break -class JsonlTexts: +class JsonlCorpus: """Iterate Doc objects from a file or directory of jsonl formatted raw text files. @@ -206,7 +206,7 @@ class JsonlTexts: limit (int): Limit corpus to a subset of examples, e.g. for debugging. Defaults to 0, which indicates no limit. - DOCS: https://nightly.spacy.io/api/corpus#jsonltexts + DOCS: https://nightly.spacy.io/api/corpus#jsonlcorpus """ file_type = "jsonl" @@ -230,7 +230,7 @@ class JsonlTexts: nlp (Language): The current nlp object. YIELDS (Example): The example objects. - DOCS: https://nightly.spacy.io/api/corpus#jsonltexts-call + DOCS: https://nightly.spacy.io/api/corpus#jsonlcorpus-call """ for loc in walk_corpus(self.path, ".jsonl"): records = srsly.read_jsonl(loc) diff --git a/spacy/util.py b/spacy/util.py index 8a96ba4fe..f234927d6 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -103,10 +103,6 @@ class registry(thinc.registry): cli = catalogue.create("spacy", "cli", entry_points=True) -# We want json loading in the registry, so manually register srsly.read_json. -registry.readers("srsly.read_json.v0", srsly.read_json) - - class SimpleFrozenDict(dict): """Simplified implementation of a frozen dict, mainly used as default function or method argument (for arguments that should default to empty diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md index 58006a19b..986c6f458 100644 --- a/website/docs/api/corpus.md +++ b/website/docs/api/corpus.md @@ -100,7 +100,7 @@ Yield examples from the data. | `nlp` | The current `nlp` object. 
~~Language~~ | | **YIELDS** | The examples. ~~Example~~ | -## JsonlTexts {#jsonltexts tag="class"} +## JsonlCorpus {#jsonlcorpus tag="class"} Iterate Doc objects from a file or directory of JSONL (newline-delimited JSON) formatted raw text files. Can be used to read the raw text corpus for language @@ -126,22 +126,22 @@ file. {"text": "My cynical view on this is that it will never be free to the public. Reason: what would be the draw of joining the military? Right now their selling point is free Healthcare and Education. Ironically both are run horribly and most, that I've talked to, come out wishing they never went in."} ``` -### JsonlTexts.\_\init\_\_ {#jsonltexts-init tag="method"} +### JsonlCorpus.\_\_init\_\_ {#jsonlcorpus-init tag="method"} Initialize the reader. > #### Example > > ```python -> from spacy.training import JsonlTexts +> from spacy.training import JsonlCorpus > -> corpus = JsonlTexts("./data/texts.jsonl") +> corpus = JsonlCorpus("./data/texts.jsonl") > ``` > > ```ini > ### Example config > [corpora.pretrain] -> @readers = "spacy.JsonlReader.v1" +> @readers = "spacy.JsonlCorpus.v1" > path = "corpus/raw_text.jsonl" > min_length = 0 > max_length = 0 @@ -156,17 +156,17 @@ Initialize the reader. | `max_length` | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ | | `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | -### JsonlTexts.\_\_call\_\_ {#jsonltexts-call tag="method"} +### JsonlCorpus.\_\_call\_\_ {#jsonlcorpus-call tag="method"} Yield examples from the data. 
> #### Example > > ```python -> from spacy.training import JsonlTexts +> from spacy.training import JsonlCorpus > import spacy > -> corpus = JsonlTexts("./texts.jsonl") +> corpus = JsonlCorpus("./texts.jsonl") > nlp = spacy.blank("en") > data = corpus(nlp) > ``` diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 22a0076cd..c1b9bfef4 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -135,7 +135,7 @@ $ python -m spacy train config.cfg --paths.train ./corpus/train.spacy > path = ${paths:dev} > > [corpora.pretrain] -> @readers = "spacy.JsonlReader.v1" +> @readers = "spacy.JsonlCorpus.v1" > path = ${paths.raw} > > [corpora.my_custom_data] @@ -146,7 +146,7 @@ This section defines a **dictionary** mapping of string keys to functions. Each function takes an `nlp` object and yields [`Example`](/api/example) objects. By default, the two keys `train` and `dev` are specified and each refer to a [`Corpus`](/api/top-level#Corpus). When pretraining, an additional `pretrain` -section is added that defaults to a [`JsonlReader`](/api/top-level#JsonlReader). +section is added that defaults to a [`JsonlCorpus`](/api/top-level#jsonlcorpus). You can also register custom functions that return a callable. | Name | Description | diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 22de0ea83..876006774 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -327,7 +327,7 @@ factories. | `losses` | Registry for functions that create [losses](https://thinc.ai/docs/api-loss). | | `misc` | Registry for miscellaneous functions that return data assets, knowledge bases or anything else you may need. | | `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). | -| `readers` | Registry for training and evaluation data readers like [`Corpus`](/api/corpus). 
| +| `readers` | Registry for file and data readers, including training and evaluation data readers like [`Corpus`](/api/corpus). | | `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). | | `tokenizers` | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. | @@ -470,7 +470,65 @@ logging the results. -## Readers {#readers source="spacy/training/corpus.py" new="3"} +## Readers {#readers} + +### File readers {#file-readers source="github.com/explosion/srsly" new="3"} + +The following file readers are provided by our serialization library +[`srsly`](https://github.com/explosion/srsly). All registered functions take one +argument `path`, pointing to the file path to load. + +> #### Example config +> +> ```ini +> [corpora.train.augmenter.orth_variants] +> @readers = "srsly.read_json.v1" +> path = "corpus/en_orth_variants.json" +> ``` + +| Name | Description | +| ----------------------- | ----------------------------------------------------- | +| `srsly.read_json.v1` | Read data from a JSON file. | +| `srsly.read_jsonl.v1` | Read data from a JSONL (newline-delimited JSON) file. | +| `srsly.read_yaml.v1` | Read data from a YAML file. | +| `srsly.read_msgpack.v1` | Read data from a binary MessagePack file. | + + + +Since the file readers expect a local path, you should only use them in config +blocks that are **not executed at runtime** – for example, in `[training]` and +`[corpora]` (to load data or resources like data augmentation tables) or in +`[initialize]` (to pass data to pipeline components). + + + +#### spacy.read_labels.v1 {#read_labels tag="registered function"} + +Read a JSON-formatted labels file generated with +[`init labels`](/api/cli#init-labels). 
Typically used in the +[`[initialize]`](/api/data-formats#config-initialize) block of the training +config to speed up the model initialization process and provide pre-generated +label sets. + +> #### Example config +> +> ```ini +> [initialize.components] +> +> [initialize.components.ner] +> +> [initialize.components.ner.labels] +> @readers = "spacy.read_labels.v1" +> path = "corpus/labels/ner.json" +> ``` + +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `path` | The path to the labels file generated with [`init labels`](/api/cli#init-labels). ~~Path~~ | +| `require` | Whether to require the file to exist. If set to `False` and the labels file doesn't exist, the loader will return `None` and the `initialize` method will extract the labels from the data. Defaults to `False`. ~~bool~~ | +| **CREATES** | The | + +### Corpus readers {#corpus-readers source="spacy/training/corpus.py" new="3"} Corpus readers are registered functions that load data and return a function that takes the current `nlp` object and yields [`Example`](/api/example) objects @@ -480,7 +538,7 @@ with your own registered function in the [`@readers` registry](/api/top-level#registry) to customize the data loading and streaming. -### spacy.Corpus.v1 {#corpus tag="registered function"} +#### spacy.Corpus.v1 {#corpus tag="registered function"} The `Corpus` reader manages annotated corpora and can be used for training and development datasets in the [DocBin](/api/docbin) (`.spacy`) format. Also see @@ -509,12 +567,12 @@ the [`Corpus`](/api/corpus) class. | `augmenter` | Apply some simply data augmentation, where we replace tokens with variations. 
This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart-quotes, or only have smart quotes, etc. Defaults to `None`. ~~Optional[Callable]~~ | | **CREATES** | The corpus reader. ~~Corpus~~ | -### spacy.JsonlReader.v1 {#jsonlreader tag="registered function"} +#### spacy.JsonlCorpus.v1 {#jsonlcorpus tag="registered function"} Create [`Example`](/api/example) objects from a JSONL (newline-delimited JSON) file of texts keyed by `"text"`. Can be used to read the raw text corpus for language model [pretraining](/usage/embeddings-transformers#pretraining) from a -JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class. +JSONL file. Also see the [`JsonlCorpus`](/api/corpus#jsonlcorpus) class. > #### Example config > @@ -523,7 +581,7 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class. > pretrain = "corpus/raw_text.jsonl" > > [corpora.pretrain] -> @readers = "spacy.JsonlReader.v1" +> @readers = "spacy.JsonlCorpus.v1" > path = ${paths.pretrain} > min_length = 0 > max_length = 0 @@ -536,33 +594,7 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class. | `min_length` | Minimum document length (in tokens). Shorter documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ | | `max_length` | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ | | `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | -| **CREATES** | The corpus reader. ~~JsonlTexts~~ | - -### spacy.read_labels.v1 {#read_labels tag="registered function"} - -Read a JSON-formatted labels file generated with -[`init labels`](/api/cli#init-labels). Typically used in the -[`[initialize]`](/api/data-formats#config-initialize) block of the training -config to speed up the model initialization process and provide pre-generated -label sets. 
- -> #### Example config -> -> ```ini -> [initialize.components] -> -> [initialize.components.ner] -> -> [initialize.components.ner.labels] -> @readers = "spacy.read_labels.v1" -> path = "corpus/labels/ner.json" -> ``` - -| Name | Description | -| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `path` | The path to the labels file generated with [`init labels`](/api/cli#init-labels). ~~Path~~ | -| `require` | Whether to require the file to exist. If set to `False` and the labels file doesn't exist, the loader will return `None` and the `initialize` method will extract the labels from the data. Defaults to `False`. ~~bool~~ | -| **CREATES** | The | +| **CREATES** | The corpus reader. ~~JsonlCorpus~~ | ## Batchers {#batchers source="spacy/training/batchers.py" new="3"} @@ -664,7 +696,10 @@ sequences in the batch. > @augmenters = "spacy.orth_variants.v1" > level = 0.1 > lower = 0.5 -> lookups = null +> +> [corpora.train.augmenter.orth_variants] +> @readers = "srsly.read_json.v1" +> path = "corpus/en_orth_variants.json" > ``` Create a data augmentation callback that uses orth-variant replacement. The @@ -672,12 +707,12 @@ callback can be added to a corpus or other data iterator during training. This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart quotes, or only have smart quotes etc. 
-| Name | Description | -| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `level` | The percentage of texts that will be augmented. ~~float~~ | -| `lower` | The percentage of texts that will be lowercased. ~~float~~ | -| `lookups` | Lookups table containing the orth variants to use. See [`orth_variants.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_orth_variants.json) for an example. If not set, tables from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) are used if available and added in the [`[initialize]`](/api/data-formats#config-initialize) block of the config. If no orth variants are found, spaCy will raise an error. Defaults to `None`. ~~Optional[Lookups]~~ | -| **CREATES** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~ | +| Name | Description | +| --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `level` | The percentage of texts that will be augmented. ~~float~~ | +| `lower` | The percentage of texts that will be lowercased. ~~float~~ | +| `orth_variants` | A dictionary containing the single and paired orth variants. 
Typically loaded from a JSON file. See [`en_orth_variants.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_orth_variants.json) for an example. ~~Dict[str, List[Dict[str, Union[List[str], List[List[str]]]]]]~~ | +| **CREATES** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~ | ## Training data and alignment {#gold source="spacy/training"} diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 1b78b8dc5..c615097d6 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -622,7 +622,7 @@ that are familiar from the training block: the `[pretraining.batcher]`, `[pretraining.optimizer]` and `[pretraining.corpus]` all work the same way and expect the same types of objects, although for pretraining your corpus does not need to have any annotations, so you will often use a different reader, such as -the [`JsonlReader`](/api/top-level#jsonlreader). +the [`JsonlCorpus`](/api/top-level#jsonlcorpus). > #### Raw text format > diff --git a/website/meta/type-annotations.json b/website/meta/type-annotations.json index 43a524e93..acbc88ae2 100644 --- a/website/meta/type-annotations.json +++ b/website/meta/type-annotations.json @@ -24,7 +24,7 @@ "TransformerData": "/api/transformer#transformerdata", "FullTransformerBatch": "/api/transformer#fulltransformerbatch", "Corpus": "/api/corpus", - "JsonlTexts": "/api/corpus#jsonltexts", + "JsonlCorpus": "/api/corpus#jsonlcorpus", "LexemeC": "/api/cython-structs#lexemec", "TokenC": "/api/cython-structs#tokenc", "Config": "https://thinc.ai/docs/api-config#config",