Mirror of https://github.com/explosion/spaCy.git

commit a103ab5f1a (parent 5128298964)

Update augmenter lookups and docs
@@ -477,6 +477,12 @@ class Errors:
     E201 = ("Span index out of range.")

     # TODO: fix numbering after merging develop into master
+    E912 = ("No orth_variants lookups table for data augmentation available for "
+            "language '{lang}'. If orth_variants are available in "
+            "spacy-lookups-data, make sure the package is installed and the "
+            "table is loaded in the [initialize.lookups] block of your config. "
+            "Alternatively, you can provide your own Lookups object with a "
+            "table orth_variants as the argument 'lookups' of the augmenter.")
     E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
             "config.cfg or override it on the CLI?")
     E914 = ("Executing {name} callback failed. Expected the function to "
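Note (editor illustration, not part of the diff): the new E912 error surfaces lazily, when the augmenter callback actually runs and finds no `orth_variants` table, either in the `Lookups` object passed to it or in `nlp.vocab.lookups`. A minimal sketch, assuming a spaCy 3.x build that includes this change:

```python
# Sketch: triggering E912 with a blank pipeline whose vocab has no
# "orth_variants" table. The error is raised when the callback runs,
# not when it is created.
import spacy
from spacy.tokens import Doc
from spacy.training import Example
from spacy.training.augment import create_orth_variants_augmenter

nlp = spacy.blank("xx")  # multi-language blank; no orth_variants shipped
augmenter = create_orth_variants_augmenter(level=0.5, lower=0.5)
doc = Doc(nlp.vocab, words=["Hello", "world"])
example = Example.from_dict(doc, {"words": ["Hello", "world"]})
try:
    list(augmenter(nlp, example))
except ValueError as err:
    print(err)  # E912: no orth_variants table available for language 'xx'
```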
@@ -7,6 +7,7 @@ from spacy.training.converters import json_to_docs
 from spacy.training.augment import create_orth_variants_augmenter
 from spacy.lang.en import English
 from spacy.tokens import Doc, DocBin
+from spacy.lookups import Lookups
 from spacy.util import get_words_and_spaces, minibatch
 from thinc.api import compounding
 import pytest
@@ -492,13 +493,20 @@ def test_roundtrip_docs_to_docbin(doc):
 @pytest.mark.filterwarnings("ignore::UserWarning")
 def test_make_orth_variants(doc):
     nlp = English()
+    orth_variants = {
+        "single": [
+            {"tags": ["NFP"], "variants": ["…", "..."]},
+            {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
+        ]
+    }
+    lookups = Lookups()
+    lookups.add_table("orth_variants", orth_variants)
+    augmenter = create_orth_variants_augmenter(level=0.2, lower=0.5, lookups=lookups)
     with make_tempdir() as tmpdir:
         output_file = tmpdir / "roundtrip.spacy"
         DocBin(docs=[doc]).to_disk(output_file)
         # due to randomness, test only that this runs with no errors for now
-        reader = Corpus(
-            output_file, augmenter=create_orth_variants_augmenter(level=0.2, lower=0.5)
-        )
+        reader = Corpus(output_file, augmenter=augmenter)
         list(reader(nlp))
@@ -1,30 +1,50 @@
-from typing import Callable
+from typing import Callable, Iterator, Dict, List, Tuple, Optional, TYPE_CHECKING
 import random
 import itertools
 import copy
 from functools import partial
-from ..util import registry
+from ..util import registry, logger
 from ..tokens import Doc
+from .example import Example
+from ..lookups import Lookups
+from ..errors import Errors
+
+if TYPE_CHECKING:
+    from ..language import Language  # noqa: F401


-@registry.augmenters("spacy.dont_augment.v1")
-def create_null_augmenter():
-    return dont_augment


 @registry.augmenters("spacy.orth_variants.v1")
-def create_orth_variants_augmenter(level: float, lower: float) -> Callable:
+def create_orth_variants_augmenter(
+    level: float, lower: float, lookups: Optional[Lookups] = None,
+) -> Callable[["Language", Example], Iterator[Example]]:
     """Create a data augmentation callback that uses orth-variant replacement.
     The callback can be added to a corpus or other data iterator during training.
     """
-    return partial(orth_variants_augmenter, level=level, lower=lower)
+    return partial(orth_variants_augmenter, level=level, lower=lower, lookups=lookups)


-def dont_augment(nlp, example):
+def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:
     yield example


-def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float = 0.0):
+def orth_variants_augmenter(
+    nlp: "Language",
+    example: Example,
+    *,
+    level: float = 0.0,
+    lower: float = 0.0,
+    lookups: Optional[Lookups] = None,
+) -> Iterator[Example]:
+    table_name = "orth_variants"
+    if lookups is not None:
+        orth_variants = lookups.get_table(table_name, {})
+        logger.debug("Using data augmentation orth variants from provided lookups")
+    else:
+        orth_variants = nlp.vocab.lookups.get_table(table_name, {})
+        logger.debug("Using data augmentation orth variants from default vocab lookups")
+    if not orth_variants:
+        raise ValueError(Errors.E912.format(lang=nlp.lang))
     if random.random() >= level:
         yield example
     else:
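Note (illustration, not part of the diff): the callback above resolves the table from the `lookups` argument first and falls back to `nlp.vocab.lookups`. A sketch of feeding both paths, assuming `spacy-lookups-data` is installed and ships an `orth_variants` table for English:

```python
# Sketch: two ways to supply the orth_variants table to the augmenter.
from spacy.lang.en import English
from spacy.lookups import load_lookups
from spacy.training.augment import create_orth_variants_augmenter

nlp = English()

# Path 1: pass a Lookups object with the table directly to the augmenter
lookups = load_lookups(nlp.lang, ["orth_variants"])
augmenter = create_orth_variants_augmenter(level=0.2, lower=0.5, lookups=lookups)

# Path 2: add the table to the vocab lookups; with no lookups argument,
# the augmenter falls back to nlp.vocab.lookups at runtime
nlp.vocab.lookups.add_table("orth_variants", lookups.get_table("orth_variants"))
augmenter = create_orth_variants_augmenter(level=0.2, lower=0.5)
```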
@@ -37,6 +57,7 @@ def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float =
             nlp,
             raw_text,
             orig_dict["token_annotation"],
+            orth_variants,
             lower=raw_text is not None and random.random() < lower,
         )
         if variant_text:
@@ -49,9 +70,15 @@ def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float =
             yield example.from_dict(doc, orig_dict)


-def make_orth_variants(nlp, raw, token_dict, *, lower: bool = False):
+def make_orth_variants(
+    nlp: "Language",
+    raw: str,
+    token_dict: Dict[str, List[str]],
+    orth_variants: Dict[str, list],
+    *,
+    lower: bool = False,
+) -> Tuple[str, Dict[str, List[str]]]:
     orig_token_dict = copy.deepcopy(token_dict)
-    orth_variants = nlp.vocab.lookups.get_table("orth_variants", {})
     ndsv = orth_variants.get("single", [])
     ndpv = orth_variants.get("paired", [])
     words = token_dict.get("words", [])
@@ -7,9 +7,11 @@ new: 3
 ---

 This class manages annotated corpora and can be used for training and
-development datasets in the [DocBin](/api/docbin) (`.spacy`) format. To
+development datasets in the [`DocBin`](/api/docbin) (`.spacy`) format. To
 customize the data loading during training, you can register your own
-[data readers and batchers](/usage/training#custom-code-readers-batchers).
+[data readers and batchers](/usage/training#custom-code-readers-batchers). Also
+see the usage guide on [data utilities](/usage/training#data) for more details
+and examples.

 ## Config and implementation {#config}
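For context (not part of the diff), a minimal sketch of the documented usage, assuming a `.spacy` file exists at the given path:

```python
# Sketch: reading training examples from a .spacy file with Corpus.
from spacy.lang.en import English
from spacy.training import Corpus

nlp = English()
corpus = Corpus("./train.spacy")
train_examples = list(corpus(nlp))  # a Corpus is called with the nlp object
```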
@@ -7,7 +7,8 @@ menu:
   - ['Loggers', 'loggers']
   - ['Readers', 'readers']
   - ['Batchers', 'batchers']
-  - ['Data & Alignment', 'gold']
+  - ['Augmenters', 'augmenters']
+  - ['Training & Alignment', 'gold']
   - ['Utility Functions', 'util']
 ---
@@ -313,6 +314,7 @@ factories.
 | Registry name     | Description |
 | ----------------- | ----------- |
 | `architectures`   | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. |
+| `augmenters`      | Registry for functions that create [data augmentation](#augmenters) callbacks for corpora and other training data iterators. |
 | `batchers`        | Registry for training and evaluation [data batchers](#batchers). |
 | `callbacks`       | Registry for custom callbacks to [modify the `nlp` object](/usage/training#custom-code-nlp-callbacks) before training. |
 | `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). |
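Note (illustration, not part of the diff): the new `augmenters` registry lets you plug in your own callbacks. Below is a sketch of a custom augmenter following the same shape as `spacy.orth_variants.v1`; the registry name `my_lowercase_augmenter.v1` and the lowercasing logic are hypothetical, not from this commit:

```python
# Sketch: registering a custom augmenter. An augmenter creator returns a
# callback that takes the nlp object and an Example and yields Examples.
import random
from spacy.util import registry

@registry.augmenters("my_lowercase_augmenter.v1")  # hypothetical name
def create_lowercase_augmenter(level: float):
    def augment(nlp, example):
        if random.random() >= level:
            yield example
        else:
            # yield a lowercased copy of the reference annotations
            example_dict = example.to_dict()
            doc = nlp.make_doc(example.text.lower())
            example_dict["token_annotation"]["ORTH"] = [
                token.lower_ for token in example.reference
            ]
            yield example.from_dict(doc, example_dict)
    return augment
```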
@@ -618,6 +620,34 @@ sequences in the batch.
 | `discard_oversize` | Whether to discard sequences that are by themselves longer than the largest padded batch size. ~~bool~~ |
 | `get_length`       | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ |

+## Augmenters {#augmenters source="spacy/training/augment.py" new="3"}
+
+<!-- TODO: intro, explain data augmentation concept -->
+
+### orth_variants {#orth_variants tag="registered function"}
+
+> #### Example config
+>
+> ```ini
+> [corpora.train.augmenter]
+> @augmenters = "spacy.orth_variants.v1"
+> level = 0.0
+> lower = 0.0
+> lookups = null
+> ```
+
+Create a data augmentation callback that uses orth-variant replacement. The
+callback can be added to a corpus or other data iterator during training. This
+is especially useful for punctuation and case replacement, to help generalize
+beyond corpora that don't have smart quotes, or only have smart quotes, etc.
+
+| Name        | Description |
+| ----------- | ----------- |
+| `level`     | The percentage of texts that will be augmented. ~~float~~ |
+| `lower`     | The percentage of texts that will be lowercased. ~~float~~ |
+| `lookups`   | Lookups table containing the orth variants to use. See [`orth_variants.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_orth_variants.json) for an example. If not set, tables from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) are used if available and added in the [`[initialize]`](/api/data-formats#config-initialize) block of the config. If no orth variants are found, spaCy will raise an error. Defaults to `None`. ~~Optional[Lookups]~~ |
+| **RETURNS** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~ |

 ## Training data and alignment {#gold source="spacy/training"}

 ### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}
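Note (illustration, not part of the diff): the config block above corresponds to creating the augmenter in Python and passing it to a `Corpus`, as the updated test in this commit does. A sketch, assuming an `orth_variants` table is available via the `lookups` argument or `nlp.vocab.lookups`:

```python
# Sketch: attaching the registered augmenter to a Corpus reader, the
# Python equivalent of setting [corpora.train.augmenter] in the config.
from spacy.lang.en import English
from spacy.training import Corpus
from spacy.training.augment import create_orth_variants_augmenter

nlp = English()
augmenter = create_orth_variants_augmenter(level=0.1, lower=0.5)
corpus = Corpus("./corpus/train.spacy", augmenter=augmenter)
augmented_examples = list(corpus(nlp))
```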
@@ -805,15 +805,30 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:

 ## Data utilities {#data}

-spaCy includes various features and utilities to make it easy to train from your
-own data. If you have training data in a standard format like `.conll` or
-`.conllu`, the easiest way to convert it for use with spaCy is to run
-[`spacy convert`](/api/cli#convert) and pass it a file and an output directory:
+spaCy includes various features and utilities to make it easy to train models
+using your own data, manage training and evaluation corpora, convert existing
+annotations and configure data augmentation strategies for more robust models.
+
+### Converting existing corpora and annotations {#data-convert}
+
+If you have training data in a standard format like `.conll` or `.conllu`, the
+easiest way to convert it for use with spaCy is to run
+[`spacy convert`](/api/cli#convert) and pass it a file and an output directory.
+By default, the command will pick the converter based on the file extension.

 ```cli
 $ python -m spacy convert ./train.gold.conll ./corpus
 ```

+> #### 💡 Tip: Converting from Prodigy
+>
+> If you're using the [Prodigy](https://prodi.gy) annotation tool to create
+> training data, you can run the
+> [`data-to-spacy` command](https://prodi.gy/docs/recipes#data-to-spacy) to
+> merge and export multiple datasets for use with
+> [`spacy train`](/api/cli#train). Different types of annotations on the same
+> text will be combined, giving you one corpus to train multiple components.
+
 <Infobox title="Tip: Manage multi-step workflows with projects" emoji="💡">

 Training workflows often consist of multiple steps, from preprocessing the data
@@ -823,6 +838,27 @@ data assets, track changes and share your end-to-end processes with your team.

 </Infobox>

+The binary `.spacy` format is a serialized [`DocBin`](/api/docbin) containing
+one or more [`Doc`](/api/doc) objects. It's extremely **efficient in
+storage**, especially when packing multiple documents together. You can also
+create `Doc` objects manually, so you can write your own custom logic to convert
+and store existing annotations for use in spaCy.
+
+```python
+### Training data from Doc objects {highlight="6-9"}
+import spacy
+from spacy.tokens import Doc, DocBin
+
+nlp = spacy.blank("en")
+docbin = DocBin(nlp.vocab)
+words = ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "."]
+spaces = [True, True, True, True, True, True, True, False]
+ents = [("ORG", 0, 1), ("GPE", 5, 6)]
+doc = Doc(nlp.vocab, words=words, spaces=spaces, ents=ents)
+docbin.add(doc)
+docbin.to_disk("./train.spacy")
+```
+
 ### Working with corpora {#data-corpora}

 > #### Example