Integrate file readers

This commit is contained in:
Ines Montani 2020-10-02 01:36:06 +02:00
parent af282ae732
commit 01c1538c72
14 changed files with 126 additions and 93 deletions

View File

@ -6,7 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.0a42,<8.0.0a50",
"thinc>=8.0.0a43,<8.0.0a50",
"blis>=0.4.0,<0.5.0",
"pytokenizations",
"pathy"

View File

@ -1,12 +1,12 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a42,<8.0.0a50
thinc>=8.0.0a43,<8.0.0a50
blis>=0.4.0,<0.5.0
ml_datasets==0.2.0a0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.8.0,<1.1.0
srsly>=2.1.0,<3.0.0
srsly>=2.3.0,<3.0.0
catalogue>=2.0.1,<2.1.0
typer>=0.3.0,<0.4.0
pathy

View File

@ -34,16 +34,16 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
thinc>=8.0.0a42,<8.0.0a50
thinc>=8.0.0a43,<8.0.0a50
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a42,<8.0.0a50
thinc>=8.0.0a43,<8.0.0a50
blis>=0.4.0,<0.5.0
wasabi>=0.8.0,<1.1.0
srsly>=2.1.0,<3.0.0
srsly>=2.3.0,<3.0.0
catalogue>=2.0.1,<2.1.0
typer>=0.3.0,<0.4.0
pathy

View File

@ -34,7 +34,7 @@ learn_rate = 0.001
[corpora]
[corpora.pretrain]
@readers = "spacy.JsonlReader.v1"
@readers = "spacy.JsonlCorpus.v1"
path = ${paths.raw_text}
min_length = 5
max_length = 500

View File

@ -477,12 +477,6 @@ class Errors:
E201 = ("Span index out of range.")
# TODO: fix numbering after merging develop into master
E912 = ("No orth_variants lookups table for data augmentation available for "
"language '{lang}'. If orth_variants are available in "
"spacy-lookups-data, make sure the package is installed and the "
"table is loaded in the [initialize.lookups] block of your config. "
"Alternatively, you can provide your own Lookups object with a "
"table orth_variants as the argument 'lookuos' of the augmenter.")
E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
"config.cfg or override it on the CLI?")
E914 = ("Executing {name} callback failed. Expected the function to "

View File

@ -504,9 +504,9 @@ def test_make_orth_variants(doc):
{"tags": [":"], "variants": ["-", "", "", "--", "---", "——"]},
]
}
lookups = Lookups()
lookups.add_table("orth_variants", orth_variants)
augmenter = create_orth_variants_augmenter(level=0.2, lower=0.5, lookups=lookups)
augmenter = create_orth_variants_augmenter(
level=0.2, lower=0.5, orth_variants=orth_variants
)
with make_tempdir() as tmpdir:
output_file = tmpdir / "roundtrip.spacy"
DocBin(docs=[doc]).to_disk(output_file)

View File

@ -1,27 +1,43 @@
from typing import Callable, Iterator, Dict, List, Tuple, Optional, TYPE_CHECKING
from typing import Callable, Iterator, Dict, List, Tuple, TYPE_CHECKING
import random
import itertools
import copy
from functools import partial
from pydantic import BaseModel, StrictStr
from ..util import registry, logger
from ..tokens import Doc
from .example import Example
from ..lookups import Lookups
from ..errors import Errors
if TYPE_CHECKING:
from ..language import Language # noqa: F401
class OrthVariantsSingle(BaseModel):
tags: List[StrictStr]
variants: List[StrictStr]
class OrthVariantsPaired(BaseModel):
tags: List[StrictStr]
variants: List[List[StrictStr]]
class OrthVariants(BaseModel):
    """Schema for the orth-variants data passed to the augmenter.

    paired: Entries whose variants come in pairs (each variant is a list of
        strings), validated as OrthVariantsPaired.
    single: Entries whose variants are single strings, validated as
        OrthVariantsSingle.
    """

    # Both fields are annotated as lists, so the "nothing provided" default
    # must be an empty list, not an empty dict.
    paired: List[OrthVariantsPaired] = []
    single: List[OrthVariantsSingle] = []
@registry.augmenters("spacy.orth_variants.v1")
def create_orth_variants_augmenter(
level: float, lower: float, lookups: Optional[Lookups] = None,
level: float, lower: float, orth_variants: OrthVariants,
) -> Callable[["Language", Example], Iterator[Example]]:
"""Create a data augmentation callback that uses orth-variant replacement.
The callback can be added to a corpus or other data iterator during training.
"""
return partial(orth_variants_augmenter, level=level, lower=lower, lookups=lookups)
return partial(
orth_variants_augmenter, orth_variants=orth_variants, level=level, lower=lower
)
def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:
@ -31,20 +47,11 @@ def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:
def orth_variants_augmenter(
nlp: "Language",
example: Example,
orth_variants: dict,
*,
level: float = 0.0,
lower: float = 0.0,
lookups: Optional[Lookups] = None,
) -> Iterator[Example]:
table_name = "orth_variants"
if lookups is not None:
orth_variants = lookups.get_table(table_name, {})
logger.debug("Using data augmentation orth variants from provided lookups")
else:
orth_variants = nlp.vocab.lookups.get_table(table_name, {})
logger.debug("Using data augmentation orth variants from default vocab lookups")
if not orth_variants:
raise ValueError(Errors.E912.format(lang=nlp.lang))
if random.random() >= level:
yield example
else:
@ -74,13 +81,14 @@ def make_orth_variants(
nlp: "Language",
raw: str,
token_dict: Dict[str, List[str]],
orth_variants: Dict[str, list],
orth_variants: Dict[str, List[Dict[str, List[str]]]],
*,
lower: bool = False,
) -> Tuple[str, Dict[str, List[str]]]:
orig_token_dict = copy.deepcopy(token_dict)
ndsv = orth_variants.get("single", [])
ndpv = orth_variants.get("paired", [])
logger.debug(f"Data augmentation: {len(ndsv)} single / {len(ndpv)} paired variants")
words = token_dict.get("words", [])
tags = token_dict.get("tags", [])
# keep unmodified if words or tags are not defined

View File

@ -38,11 +38,11 @@ def create_docbin_reader(
)
@util.registry.readers("spacy.JsonlReader.v1")
@util.registry.readers("spacy.JsonlCorpus.v1")
def create_jsonl_reader(
path: Path, min_length: int = 0, max_length: int = 0, limit: int = 0
) -> Callable[["Language"], Iterable[Doc]]:
return JsonlTexts(path, min_length=min_length, max_length=max_length, limit=limit)
return JsonlCorpus(path, min_length=min_length, max_length=max_length, limit=limit)
@util.registry.readers("spacy.read_labels.v1")
@ -193,7 +193,7 @@ class Corpus:
break
class JsonlTexts:
class JsonlCorpus:
"""Iterate Doc objects from a file or directory of jsonl
formatted raw text files.
@ -206,7 +206,7 @@ class JsonlTexts:
limit (int): Limit corpus to a subset of examples, e.g. for debugging.
Defaults to 0, which indicates no limit.
DOCS: https://nightly.spacy.io/api/corpus#jsonltexts
DOCS: https://nightly.spacy.io/api/corpus#jsonlcorpus
"""
file_type = "jsonl"
@ -230,7 +230,7 @@ class JsonlTexts:
nlp (Language): The current nlp object.
YIELDS (Example): The example objects.
DOCS: https://nightly.spacy.io/api/corpus#jsonltexts-call
DOCS: https://nightly.spacy.io/api/corpus#jsonlcorpus-call
"""
for loc in walk_corpus(self.path, ".jsonl"):
records = srsly.read_jsonl(loc)

View File

@ -103,10 +103,6 @@ class registry(thinc.registry):
cli = catalogue.create("spacy", "cli", entry_points=True)
# We want json loading in the registry, so manually register srsly.read_json.
registry.readers("srsly.read_json.v0", srsly.read_json)
class SimpleFrozenDict(dict):
"""Simplified implementation of a frozen dict, mainly used as default
function or method argument (for arguments that should default to empty

View File

@ -100,7 +100,7 @@ Yield examples from the data.
| `nlp` | The current `nlp` object. ~~Language~~ |
| **YIELDS** | The examples. ~~Example~~ |
## JsonlTexts {#jsonltexts tag="class"}
## JsonlCorpus {#jsonlcorpus tag="class"}
Iterate Doc objects from a file or directory of JSONL (newline-delimited JSON)
formatted raw text files. Can be used to read the raw text corpus for language
@ -126,22 +126,22 @@ file.
{"text": "My cynical view on this is that it will never be free to the public. Reason: what would be the draw of joining the military? Right now their selling point is free Healthcare and Education. Ironically both are run horribly and most, that I've talked to, come out wishing they never went in."}
```
### JsonlTexts.\_\init\_\_ {#jsonltexts-init tag="method"}
### JsonlCorpus.\_\_init\_\_ {#jsonlcorpus-init tag="method"}
Initialize the reader.
> #### Example
>
> ```python
> from spacy.training import JsonlTexts
> from spacy.training import JsonlCorpus
>
> corpus = JsonlTexts("./data/texts.jsonl")
> corpus = JsonlCorpus("./data/texts.jsonl")
> ```
>
> ```ini
> ### Example config
> [corpora.pretrain]
> @readers = "spacy.JsonlReader.v1"
> @readers = "spacy.JsonlCorpus.v1"
> path = "corpus/raw_text.jsonl"
> min_length = 0
> max_length = 0
@ -156,17 +156,17 @@ Initialize the reader.
| `max_length` | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ |
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
### JsonlTexts.\_\_call\_\_ {#jsonltexts-call tag="method"}
### JsonlCorpus.\_\_call\_\_ {#jsonlcorpus-call tag="method"}
Yield examples from the data.
> #### Example
>
> ```python
> from spacy.training import JsonlTexts
> from spacy.training import JsonlCorpus
> import spacy
>
> corpus = JsonlTexts("./texts.jsonl")
> corpus = JsonlCorpus("./texts.jsonl")
> nlp = spacy.blank("en")
> data = corpus(nlp)
> ```

View File

@ -135,7 +135,7 @@ $ python -m spacy train config.cfg --paths.train ./corpus/train.spacy
> path = ${paths:dev}
>
> [corpora.pretrain]
> @readers = "spacy.JsonlReader.v1"
> @readers = "spacy.JsonlCorpus.v1"
> path = ${paths.raw}
>
> [corpora.my_custom_data]
@ -146,7 +146,7 @@ This section defines a **dictionary** mapping of string keys to functions. Each
function takes an `nlp` object and yields [`Example`](/api/example) objects. By
default, the two keys `train` and `dev` are specified and each refers to a
[`Corpus`](/api/top-level#Corpus). When pretraining, an additional `pretrain`
section is added that defaults to a [`JsonlReader`](/api/top-level#JsonlReader).
section is added that defaults to a [`JsonlCorpus`](/api/top-level#jsonlcorpus).
You can also register custom functions that return a callable.
| Name | Description |

View File

@ -327,7 +327,7 @@ factories.
| `losses` | Registry for functions that create [losses](https://thinc.ai/docs/api-loss). |
| `misc` | Registry for miscellaneous functions that return data assets, knowledge bases or anything else you may need. |
| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
| `readers` | Registry for training and evaluation data readers like [`Corpus`](/api/corpus). |
| `readers` | Registry for file and data readers, including training and evaluation data readers like [`Corpus`](/api/corpus). |
| `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
| `tokenizers` | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. |
@ -470,7 +470,65 @@ logging the results.
</Project>
## Readers {#readers source="spacy/training/corpus.py" new="3"}
## Readers {#readers}
### File readers {#file-readers source="github.com/explosion/srsly" new="3"}
The following file readers are provided by our serialization library
[`srsly`](https://github.com/explosion/srsly). All registered functions take one
argument `path`, pointing to the file path to load.
> #### Example config
>
> ```ini
> [corpora.train.augmenter.orth_variants]
> @readers = "srsly.read_json.v1"
> path = "corpus/en_orth_variants.json"
> ```
| Name | Description |
| ----------------------- | ----------------------------------------------------- |
| `srsly.read_json.v1` | Read data from a JSON file. |
| `srsly.read_jsonl.v1` | Read data from a JSONL (newline-delimited JSON) file. |
| `srsly.read_yaml.v1` | Read data from a YAML file. |
| `srsly.read_msgpack.v1` | Read data from a binary MessagePack file. |
<Infobox title="Important note" variant="warning">
Since the file readers expect a local path, you should only use them in config
blocks that are **not executed at runtime** – for example, in `[training]` and
`[corpora]` (to load data or resources like data augmentation tables) or in
`[initialize]` (to pass data to pipeline components).
</Infobox>
#### spacy.read_labels.v1 {#read_labels tag="registered function"}
Read a JSON-formatted labels file generated with
[`init labels`](/api/cli#init-labels). Typically used in the
[`[initialize]`](/api/data-formats#config-initialize) block of the training
config to speed up the model initialization process and provide pre-generated
label sets.
> #### Example config
>
> ```ini
> [initialize.components]
>
> [initialize.components.ner]
>
> [initialize.components.ner.labels]
> @readers = "spacy.read_labels.v1"
> path = "corpus/labels/ner.json"
> ```
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `path` | The path to the labels file generated with [`init labels`](/api/cli#init-labels). ~~Path~~ |
| `require` | Whether to require the file to exist. If set to `False` and the labels file doesn't exist, the loader will return `None` and the `initialize` method will extract the labels from the data. Defaults to `False`. ~~bool~~ |
| **CREATES** | The labels read from the file, or `None` if the file doesn't exist and `require` is `False`. |
### Corpus readers {#corpus-readers source="spacy/training/corpus.py" new="3"}
Corpus readers are registered functions that load data and return a function
that takes the current `nlp` object and yields [`Example`](/api/example) objects
@ -480,7 +538,7 @@ with your own registered function in the
[`@readers` registry](/api/top-level#registry) to customize the data loading and
streaming.
### spacy.Corpus.v1 {#corpus tag="registered function"}
#### spacy.Corpus.v1 {#corpus tag="registered function"}
The `Corpus` reader manages annotated corpora and can be used for training and
development datasets in the [DocBin](/api/docbin) (`.spacy`) format. Also see
@ -509,12 +567,12 @@ the [`Corpus`](/api/corpus) class.
| `augmenter` | Apply some simple data augmentation, where we replace tokens with variations. This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart quotes, or only have smart quotes, etc. Defaults to `None`. ~~Optional[Callable]~~ |
| **CREATES** | The corpus reader. ~~Corpus~~ |
### spacy.JsonlReader.v1 {#jsonlreader tag="registered function"}
#### spacy.JsonlCorpus.v1 {#jsonlcorpus tag="registered function"}
Create [`Example`](/api/example) objects from a JSONL (newline-delimited JSON)
file of texts keyed by `"text"`. Can be used to read the raw text corpus for
language model [pretraining](/usage/embeddings-transformers#pretraining) from a
JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
JSONL file. Also see the [`JsonlCorpus`](/api/corpus#jsonlcorpus) class.
> #### Example config
>
@ -523,7 +581,7 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
> pretrain = "corpus/raw_text.jsonl"
>
> [corpora.pretrain]
> @readers = "spacy.JsonlReader.v1"
> @readers = "spacy.JsonlCorpus.v1"
> path = ${paths.pretrain}
> min_length = 0
> max_length = 0
@ -536,33 +594,7 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
| `min_length` | Minimum document length (in tokens). Shorter documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ |
| `max_length` | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ |
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
| **CREATES** | The corpus reader. ~~JsonlTexts~~ |
### spacy.read_labels.v1 {#read_labels tag="registered function"}
Read a JSON-formatted labels file generated with
[`init labels`](/api/cli#init-labels). Typically used in the
[`[initialize]`](/api/data-formats#config-initialize) block of the training
config to speed up the model initialization process and provide pre-generated
label sets.
> #### Example config
>
> ```ini
> [initialize.components]
>
> [initialize.components.ner]
>
> [initialize.components.ner.labels]
> @readers = "spacy.read_labels.v1"
> path = "corpus/labels/ner.json"
> ```
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `path` | The path to the labels file generated with [`init labels`](/api/cli#init-labels). ~~Path~~ |
| `require` | Whether to require the file to exist. If set to `False` and the labels file doesn't exist, the loader will return `None` and the `initialize` method will extract the labels from the data. Defaults to `False`. ~~bool~~ |
| **CREATES** | The |
| **CREATES** | The corpus reader. ~~JsonlCorpus~~ |
## Batchers {#batchers source="spacy/training/batchers.py" new="3"}
@ -664,7 +696,10 @@ sequences in the batch.
> @augmenters = "spacy.orth_variants.v1"
> level = 0.1
> lower = 0.5
> lookups = null
>
> [corpora.train.augmenter.orth_variants]
> @readers = "srsly.read_json.v1"
> path = "corpus/en_orth_variants.json"
> ```
Create a data augmentation callback that uses orth-variant replacement. The
@ -672,12 +707,12 @@ callback can be added to a corpus or other data iterator during training. This
is especially useful for punctuation and case replacement, to help generalize
beyond corpora that don't have smart quotes, or only have smart quotes etc.
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `level` | The percentage of texts that will be augmented. ~~float~~ |
| `lower` | The percentage of texts that will be lowercased. ~~float~~ |
| `lookups` | Lookups table containing the orth variants to use. See [`orth_variants.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_orth_variants.json) for an example. If not set, tables from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) are used if available and added in the [`[initialize]`](/api/data-formats#config-initialize) block of the config. If no orth variants are found, spaCy will raise an error. Defaults to `None`. ~~Optional[Lookups]~~ |
| **CREATES** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~ |
| Name | Description |
| --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `level` | The percentage of texts that will be augmented. ~~float~~ |
| `lower` | The percentage of texts that will be lowercased. ~~float~~ |
| `orth_variants` | A dictionary containing the single and paired orth variants. Typically loaded from a JSON file. See [`en_orth_variants.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_orth_variants.json) for an example. ~~Dict[str, List[Dict[str, List[Union[str, List[str]]]]]]~~ |
| **CREATES** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~ |
## Training data and alignment {#gold source="spacy/training"}

View File

@ -622,7 +622,7 @@ that are familiar from the training block: the `[pretraining.batcher]`,
`[pretraining.optimizer]` and `[pretraining.corpus]` all work the same way and
expect the same types of objects, although for pretraining your corpus does not
need to have any annotations, so you will often use a different reader, such as
the [`JsonlReader`](/api/top-level#jsonlreader).
the [`JsonlCorpus`](/api/top-level#jsonlcorpus).
> #### Raw text format
>

View File

@ -24,7 +24,7 @@
"TransformerData": "/api/transformer#transformerdata",
"FullTransformerBatch": "/api/transformer#fulltransformerbatch",
"Corpus": "/api/corpus",
"JsonlTexts": "/api/corpus#jsonltexts",
"JsonlCorpus": "/api/corpus#jsonlcorpus",
"LexemeC": "/api/cython-structs#lexemec",
"TokenC": "/api/cython-structs#tokenc",
"Config": "https://thinc.ai/docs/api-config#config",