Update augmenter lookups and docs

Ines Montani 2020-09-30 23:03:47 +02:00
parent 5128298964
commit a103ab5f1a
6 changed files with 131 additions and 22 deletions

View File

@ -477,6 +477,12 @@ class Errors:
E201 = ("Span index out of range.")
# TODO: fix numbering after merging develop into master
E912 = ("No orth_variants lookups table for data augmentation available for "
"language '{lang}'. If orth_variants are available in "
"spacy-lookups-data, make sure the package is installed and the "
"table is loaded in the [initialize.lookups] block of your config. "
"Alternatively, you can provide your own Lookups object with a "
"table orth_variants as the argument 'lookuos' of the augmenter.")
E913 = ("Corpus path can't be None. Maybe you forgot to define it in your " E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
"config.cfg or override it on the CLI?") "config.cfg or override it on the CLI?")
E914 = ("Executing {name} callback failed. Expected the function to " E914 = ("Executing {name} callback failed. Expected the function to "

View File

@ -7,6 +7,7 @@ from spacy.training.converters import json_to_docs
from spacy.training.augment import create_orth_variants_augmenter
from spacy.lang.en import English
from spacy.tokens import Doc, DocBin
from spacy.lookups import Lookups
from spacy.util import get_words_and_spaces, minibatch
from thinc.api import compounding
import pytest
@ -492,13 +493,20 @@ def test_roundtrip_docs_to_docbin(doc):
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_make_orth_variants(doc):
nlp = English()
orth_variants = {
"single": [
{"tags": ["NFP"], "variants": ["", "..."]},
{"tags": [":"], "variants": ["-", "", "", "--", "---", "——"]},
]
}
lookups = Lookups()
lookups.add_table("orth_variants", orth_variants)
augmenter = create_orth_variants_augmenter(level=0.2, lower=0.5, lookups=lookups)
with make_tempdir() as tmpdir:
output_file = tmpdir / "roundtrip.spacy"
DocBin(docs=[doc]).to_disk(output_file)
# due to randomness, test only that this runs with no errors for now
reader = Corpus(
output_file, augmenter=create_orth_variants_augmenter(level=0.2, lower=0.5)
)
reader = Corpus(output_file, augmenter=augmenter)
list(reader(nlp))

View File

@ -1,30 +1,50 @@
from typing import Callable
from typing import Callable, Iterator, Dict, List, Tuple, Optional, TYPE_CHECKING
import random
import itertools
import copy
from functools import partial
from ..util import registry
from ..util import registry, logger
from ..tokens import Doc
from .example import Example
from ..lookups import Lookups
from ..errors import Errors
@registry.augmenters("spacy.dont_augment.v1")
def create_null_augmenter():
return dont_augment
if TYPE_CHECKING:
from ..language import Language # noqa: F401
@registry.augmenters("spacy.orth_variants.v1") @registry.augmenters("spacy.orth_variants.v1")
def create_orth_variants_augmenter(level: float, lower: float) -> Callable: def create_orth_variants_augmenter(
level: float, lower: float, lookups: Optional[Lookups] = None,
) -> Callable[["Language", Example], Iterator[Example]]:
"""Create a data augmentation callback that uses orth-variant replacement. """Create a data augmentation callback that uses orth-variant replacement.
The callback can be added to a corpus or other data iterator during training. The callback can be added to a corpus or other data iterator during training.
""" """
return partial(orth_variants_augmenter, level=level, lower=lower) return partial(orth_variants_augmenter, level=level, lower=lower, lookups=lookups)
def dont_augment(nlp, example):
def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:
yield example
def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float = 0.0):
def orth_variants_augmenter(
nlp: "Language",
example: Example,
*,
level: float = 0.0,
lower: float = 0.0,
lookups: Optional[Lookups] = None,
) -> Iterator[Example]:
table_name = "orth_variants"
if lookups is not None:
orth_variants = lookups.get_table(table_name, {})
logger.debug("Using data augmentation orth variants from provided lookups")
else:
orth_variants = nlp.vocab.lookups.get_table(table_name, {})
logger.debug("Using data augmentation orth variants from default vocab lookups")
if not orth_variants:
raise ValueError(Errors.E912.format(lang=nlp.lang))
if random.random() >= level:
yield example
else:
@ -37,6 +57,7 @@ def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float =
nlp,
raw_text,
orig_dict["token_annotation"],
orth_variants,
lower=raw_text is not None and random.random() < lower,
)
if variant_text:
@ -49,9 +70,15 @@ def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float =
yield example.from_dict(doc, orig_dict)
def make_orth_variants(nlp, raw, token_dict, *, lower: bool = False):
def make_orth_variants(
nlp: "Language",
raw: str,
token_dict: Dict[str, List[str]],
orth_variants: Dict[str, list],
*,
lower: bool = False,
) -> Tuple[str, Dict[str, List[str]]]:
orig_token_dict = copy.deepcopy(token_dict)
orth_variants = nlp.vocab.lookups.get_table("orth_variants", {})
ndsv = orth_variants.get("single", []) ndsv = orth_variants.get("single", [])
ndpv = orth_variants.get("paired", []) ndpv = orth_variants.get("paired", [])
words = token_dict.get("words", []) words = token_dict.get("words", [])

View File

@ -7,9 +7,11 @@ new: 3
---
This class manages annotated corpora and can be used for training and
development datasets in the [DocBin](/api/docbin) (`.spacy`) format. To
development datasets in the [`DocBin`](/api/docbin) (`.spacy`) format. To
customize the data loading during training, you can register your own
[data readers and batchers](/usage/training#custom-code-readers-batchers).
[data readers and batchers](/usage/training#custom-code-readers-batchers). Also
see the usage guide on [data utilities](/usage/training#data) for more details
and examples.
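
A minimal usage sketch in Python (the `./corpus/train.spacy` path is just a
placeholder for a directory or file of serialized docs):

```python
import spacy
from spacy.training import Corpus

nlp = spacy.blank("en")
# Corpus streams Example objects from one or more DocBin (.spacy) files
reader = Corpus("./corpus/train.spacy")
train_examples = list(reader(nlp))
```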
## Config and implementation {#config}

View File

@ -7,7 +7,8 @@ menu:
- ['Loggers', 'loggers']
- ['Readers', 'readers']
- ['Batchers', 'batchers']
- ['Data & Alignment', 'gold']
- ['Augmenters', 'augmenters']
- ['Training & Alignment', 'gold']
- ['Utility Functions', 'util']
---
@ -313,6 +314,7 @@ factories.
| Registry name | Description |
| ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. |
| `augmenters` | Registry for functions that create [data augmentation](#augmenters) callbacks for corpora and other training data iterators. |
| `batchers` | Registry for training and evaluation [data batchers](#batchers). |
| `callbacks` | Registry for custom callbacks to [modify the `nlp` object](/usage/training#custom-code-nlp-callbacks) before training. |
| `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). |
@ -618,6 +620,34 @@ sequences in the batch.
| `discard_oversize` | Whether to discard sequences that are by themselves longer than the largest padded batch size. ~~bool~~ |
| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ |
## Augmenters {#augmenters source="spacy/training/augment.py" new="3"}
<!-- TODO: intro, explain data augmentation concept -->
### orth_variants {#orth_variants tag="registered function"}
> #### Example config
>
> ```ini
> [corpora.train.augmenter]
> @augmenters = "spacy.orth_variants.v1"
> level = 0.0
> lower = 0.0
> lookups = null
> ```
Create a data augmentation callback that uses orth-variant replacement. The
callback can be added to a corpus or other data iterator during training. This
is especially useful for punctuation and case replacement, to help generalize
beyond corpora that don't have smart quotes, or only have smart quotes, etc.
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `level` | The percentage of texts that will be augmented. ~~float~~ |
| `lower` | The percentage of augmented texts that will be lowercased. ~~float~~ |
| `lookups` | Lookups table containing the orth variants to use. See [`orth_variants.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_orth_variants.json) for an example. If not set, tables from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) are used if available and added in the [`[initialize]`](/api/data-formats#config-initialize) block of the config. If no orth variants are found, spaCy will raise an error. Defaults to `None`. ~~Optional[Lookups]~~ |
| **RETURNS** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~ |
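
A minimal Python sketch of the same setup with a custom `Lookups` object passed
directly to the augmenter (the table contents are just an illustrative subset
of the `orth_variants` format):

```python
from spacy.lookups import Lookups
from spacy.training.augment import create_orth_variants_augmenter

lookups = Lookups()
# "single" variants are interchangeable single-token orth forms, grouped by tag
lookups.add_table(
    "orth_variants",
    {"single": [{"tags": [":"], "variants": ["-", "–", "—", "--", "---"]}]},
)
augmenter = create_orth_variants_augmenter(level=0.2, lower=0.5, lookups=lookups)
```

The returned callback can then be passed as the `augmenter` argument of a
[`Corpus`](/api/corpus) or another data iterator during training.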
## Training data and alignment {#gold source="spacy/training"}
### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}

View File

@ -805,15 +805,30 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
## Data utilities {#data}
spaCy includes various features and utilities to make it easy to train from your
own data. If you have training data in a standard format like `.conll` or
`.conllu`, the easiest way to convert it for use with spaCy is to run
[`spacy convert`](/api/cli#convert) and pass it a file and an output directory:
spaCy includes various features and utilities to make it easy to train models
using your own data, manage training and evaluation corpora, convert existing
annotations and configure data augmentation strategies for more robust models.
### Converting existing corpora and annotations {#data-convert}
If you have training data in a standard format like `.conll` or `.conllu`, the
easiest way to convert it for use with spaCy is to run
[`spacy convert`](/api/cli#convert) and pass it a file and an output directory.
By default, the command will pick the converter based on the file extension.
```cli
$ python -m spacy convert ./train.gold.conll ./corpus
```
> #### 💡 Tip: Converting from Prodigy
>
> If you're using the [Prodigy](https://prodi.gy) annotation tool to create
> training data, you can run the
> [`data-to-spacy` command](https://prodi.gy/docs/recipes#data-to-spacy) to
> merge and export multiple datasets for use with
> [`spacy train`](/api/cli#train). Different types of annotations on the same
> text will be combined, giving you one corpus to train multiple components.
<Infobox title="Tip: Manage multi-step workflows with projects" emoji="💡"> <Infobox title="Tip: Manage multi-step workflows with projects" emoji="💡">
Training workflows often consist of multiple steps, from preprocessing the data Training workflows often consist of multiple steps, from preprocessing the data
@ -823,6 +838,27 @@ data assets, track changes and share your end-to-end processes with your team.
</Infobox>
The binary `.spacy` format is a serialized [`DocBin`](/api/docbin) containing
one or more [`Doc`](/api/doc) objects. It's extremely **efficient in
storage**, especially when packing multiple documents together. You can also
create `Doc` objects manually, so you can write your own custom logic to convert
and store existing annotations for use in spaCy.
```python
### Training data from Doc objects {highlight="6-9"}
import spacy
from spacy.tokens import Doc, DocBin
nlp = spacy.blank("en")
docbin = DocBin()
words = ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "."]
spaces = [True, True, True, True, True, True, True, False]
ents = [("ORG", 0, 1), ("GPE", 5, 6)]
doc = Doc(nlp.vocab, words=words, spaces=spaces, ents=ents)
docbin.add(doc)
docbin.to_disk("./train.spacy")
```
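
To sanity-check the stored annotations, the saved file can be loaded back with
`DocBin.from_disk`, as in this minimal sketch:

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
# load the DocBin and rehydrate the Doc objects with a vocab
doc_bin = DocBin().from_disk("./train.spacy")
for doc in doc_bin.get_docs(nlp.vocab):
    print([(ent.text, ent.label_) for ent in doc.ents])
```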
### Working with corpora {#data-corpora}
> #### Example