Update augmenter lookups and docs

Ines Montani 2020-09-30 23:03:47 +02:00
parent 5128298964
commit a103ab5f1a
6 changed files with 131 additions and 22 deletions


@@ -477,6 +477,12 @@ class Errors:
    E201 = ("Span index out of range.")
    # TODO: fix numbering after merging develop into master
    E912 = ("No orth_variants lookups table for data augmentation available for "
            "language '{lang}'. If orth_variants are available in "
            "spacy-lookups-data, make sure the package is installed and the "
            "table is loaded in the [initialize.lookups] block of your config. "
            "Alternatively, you can provide your own Lookups object with a "
            "table 'orth_variants' as the argument 'lookups' of the augmenter.")
    E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
            "config.cfg or override it on the CLI?")
    E914 = ("Executing {name} callback failed. Expected the function to "

@@ -7,6 +7,7 @@ from spacy.training.converters import json_to_docs
from spacy.training.augment import create_orth_variants_augmenter
from spacy.lang.en import English
from spacy.tokens import Doc, DocBin
from spacy.lookups import Lookups
from spacy.util import get_words_and_spaces, minibatch
from thinc.api import compounding
import pytest
@@ -492,13 +493,20 @@ def test_roundtrip_docs_to_docbin(doc):
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_make_orth_variants(doc):
    nlp = English()
    orth_variants = {
        "single": [
            {"tags": ["NFP"], "variants": ["…", "..."]},
            {"tags": [":"], "variants": ["-", "–", "—", "--", "---", "——"]},
        ]
    }
    lookups = Lookups()
    lookups.add_table("orth_variants", orth_variants)
    augmenter = create_orth_variants_augmenter(level=0.2, lower=0.5, lookups=lookups)
    with make_tempdir() as tmpdir:
        output_file = tmpdir / "roundtrip.spacy"
        DocBin(docs=[doc]).to_disk(output_file)
        # due to randomness, test only that this runs with no errors for now
        reader = Corpus(
            output_file, augmenter=create_orth_variants_augmenter(level=0.2, lower=0.5)
        )
        reader = Corpus(output_file, augmenter=augmenter)
        list(reader(nlp))


@@ -1,30 +1,50 @@
from typing import Callable
from typing import Callable, Iterator, Dict, List, Tuple, Optional, TYPE_CHECKING
import random
import itertools
import copy
from functools import partial
from ..util import registry
from ..util import registry, logger
from ..tokens import Doc
from .example import Example
from ..lookups import Lookups
from ..errors import Errors


@registry.augmenters("spacy.dont_augment.v1")
def create_null_augmenter():
    return dont_augment


if TYPE_CHECKING:
    from ..language import Language  # noqa: F401


@registry.augmenters("spacy.orth_variants.v1")
def create_orth_variants_augmenter(level: float, lower: float) -> Callable:
def create_orth_variants_augmenter(
    level: float, lower: float, lookups: Optional[Lookups] = None,
) -> Callable[["Language", Example], Iterator[Example]]:
    """Create a data augmentation callback that uses orth-variant replacement.
    The callback can be added to a corpus or other data iterator during training.
    """
    return partial(orth_variants_augmenter, level=level, lower=lower)
    return partial(orth_variants_augmenter, level=level, lower=lower, lookups=lookups)


def dont_augment(nlp, example):
def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:
    yield example


def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float = 0.0):
def orth_variants_augmenter(
    nlp: "Language",
    example: Example,
    *,
    level: float = 0.0,
    lower: float = 0.0,
    lookups: Optional[Lookups] = None,
) -> Iterator[Example]:
    table_name = "orth_variants"
    if lookups is not None:
        orth_variants = lookups.get_table(table_name, {})
        logger.debug("Using data augmentation orth variants from provided lookups")
    else:
        orth_variants = nlp.vocab.lookups.get_table(table_name, {})
        logger.debug("Using data augmentation orth variants from default vocab lookups")
    if not orth_variants:
        raise ValueError(Errors.E912.format(lang=nlp.lang))
    if random.random() >= level:
        yield example
    else:
@@ -37,6 +57,7 @@ def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float =
            nlp,
            raw_text,
            orig_dict["token_annotation"],
            orth_variants,
            lower=raw_text is not None and random.random() < lower,
        )
        if variant_text:
@@ -49,9 +70,15 @@ def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float =
        yield example.from_dict(doc, orig_dict)


def make_orth_variants(nlp, raw, token_dict, *, lower: bool = False):
def make_orth_variants(
    nlp: "Language",
    raw: str,
    token_dict: Dict[str, List[str]],
    orth_variants: Dict[str, list],
    *,
    lower: bool = False,
) -> Tuple[str, Dict[str, List[str]]]:
    orig_token_dict = copy.deepcopy(token_dict)
    orth_variants = nlp.vocab.lookups.get_table("orth_variants", {})
    ndsv = orth_variants.get("single", [])
    ndpv = orth_variants.get("paired", [])
    words = token_dict.get("words", [])
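The registration pattern above also works for user code. As a hedged sketch (not part of this commit; the name `my_oversample.v1` and the oversampling behavior are made up for illustration), a custom augmenter only needs to satisfy the `Callable[[Language, Example], Iterator[Example]]` contract:

```python
import random
from functools import partial
from typing import Iterator
from spacy.training import Example
from spacy.util import registry


@registry.augmenters("my_oversample.v1")
def create_oversampling_augmenter(level: float):
    return partial(oversampling_augmenter, level=level)


def oversampling_augmenter(nlp, example: Example, *, level: float) -> Iterator[Example]:
    # always keep the original example ...
    yield example
    # ... and occasionally yield it a second time, as a stand-in for real
    # augmentation logic that would yield a modified copy instead
    if random.random() < level:
        yield example
```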


@@ -7,9 +7,11 @@ new: 3
---
This class manages annotated corpora and can be used for training and
development datasets in the [DocBin](/api/docbin) (`.spacy`) format. To
development datasets in the [`DocBin`](/api/docbin) (`.spacy`) format. To
customize the data loading during training, you can register your own
[data readers and batchers](/usage/training#custom-code-readers-batchers).
[data readers and batchers](/usage/training#custom-code-readers-batchers). Also
see the usage guide on [data utilities](/usage/training#data) for more details
and examples.
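As a hedged sketch of that hook, a custom reader is just a registered function returning a callable that maps the `nlp` object to an iterable of [`Example`](/api/example) objects — the registry name `my_line_reader.v1` and the one-text-per-line format below are illustrative assumptions:

```python
from typing import Callable, Iterator
from spacy.language import Language
from spacy.training import Example
from spacy.util import registry


@registry.readers("my_line_reader.v1")
def create_line_reader(path: str) -> Callable[[Language], Iterator[Example]]:
    def read(nlp: Language) -> Iterator[Example]:
        # one raw text per line, with no reference annotations
        with open(path, encoding="utf8") as file_:
            for line in file_:
                doc = nlp.make_doc(line.strip())
                yield Example(doc, nlp.make_doc(line.strip()))
    return read
```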
## Config and implementation {#config}


@@ -7,7 +7,8 @@ menu:
- ['Loggers', 'loggers']
- ['Readers', 'readers']
- ['Batchers', 'batchers']
- ['Data & Alignment', 'gold']
- ['Augmenters', 'augmenters']
- ['Training & Alignment', 'gold']
- ['Utility Functions', 'util']
---
@@ -313,6 +314,7 @@ factories.
| Registry name | Description |
| ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. |
| `augmenters` | Registry for functions that create [data augmentation](#augmenters) callbacks for corpora and other training data iterators. |
| `batchers` | Registry for training and evaluation [data batchers](#batchers). |
| `callbacks` | Registry for custom callbacks to [modify the `nlp` object](/usage/training#custom-code-nlp-callbacks) before training. |
| `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). |
@@ -618,6 +620,34 @@ sequences in the batch.
| `discard_oversize` | Whether to discard sequences that are by themselves longer than the largest padded batch size. ~~bool~~ |
| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ |
## Augmenters {#augmenters source="spacy/training/augment.py" new="3"}
<!-- TODO: intro, explain data augmentation concept -->
### orth_variants {#orth_variants tag="registered function"}
> #### Example config
>
> ```ini
> [corpora.train.augmenter]
> @augmenters = "spacy.orth_variants.v1"
> level = 0.0
> lower = 0.0
> lookups = null
> ```
Create a data augmentation callback that uses orth-variant replacement. The
callback can be added to a corpus or other data iterator during training. This
is especially useful for punctuation and case replacement, to help generalize
beyond corpora that don't have smart quotes, or only have smart quotes, etc.
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `level` | The percentage of texts that will be augmented. ~~float~~ |
| `lower` | The percentage of texts that will be lowercased. ~~float~~ |
| `lookups` | Lookups table containing the orth variants to use. See [`orth_variants.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_orth_variants.json) for an example. If not set, tables from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) are used if available and added in the [`[initialize]`](/api/data-formats#config-initialize) block of the config. If no orth variants are found, spaCy will raise an error. Defaults to `None`. ~~Optional[Lookups]~~ |
| **RETURNS** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~ |
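In Python, the resulting callback can be passed straight to a [`Corpus`](/api/corpus) reader, mirroring the test added in this commit (the path is a placeholder):

```python
from spacy.training import Corpus
from spacy.training.augment import create_orth_variants_augmenter

augmenter = create_orth_variants_augmenter(level=0.1, lower=0.5)
corpus = Corpus("./train.spacy", augmenter=augmenter)
# examples are augmented on the fly as the corpus is read:
# examples = list(corpus(nlp))
```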
## Training data and alignment {#gold source="spacy/training"}
### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}


@@ -805,15 +805,30 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
## Data utilities {#data}
spaCy includes various features and utilities to make it easy to train from your
own data. If you have training data in a standard format like `.conll` or
`.conllu`, the easiest way to convert it for use with spaCy is to run
[`spacy convert`](/api/cli#convert) and pass it a file and an output directory:
spaCy includes various features and utilities to make it easy to train models
using your own data, manage training and evaluation corpora, convert existing
annotations and configure data augmentation strategies for more robust models.
### Converting existing corpora and annotations {#data-convert}
If you have training data in a standard format like `.conll` or `.conllu`, the
easiest way to convert it for use with spaCy is to run
[`spacy convert`](/api/cli#convert) and pass it a file and an output directory.
By default, the command will pick the converter based on the file extension.
```cli
$ python -m spacy convert ./train.gold.conll ./corpus
```
> #### 💡 Tip: Converting from Prodigy
>
> If you're using the [Prodigy](https://prodi.gy) annotation tool to create
> training data, you can run the
> [`data-to-spacy` command](https://prodi.gy/docs/recipes#data-to-spacy) to
> merge and export multiple datasets for use with
> [`spacy train`](/api/cli#train). Different types of annotations on the same
> text will be combined, giving you one corpus to train multiple components.
<Infobox title="Tip: Manage multi-step workflows with projects" emoji="💡">
Training workflows often consist of multiple steps, from preprocessing the data
@@ -823,6 +838,27 @@ data assets, track changes and share your end-to-end processes with your team.
</Infobox>
The binary `.spacy` format is a serialized [`DocBin`](/api/docbin) containing
one or more [`Doc`](/api/doc) objects. It's extremely **efficient in
storage**, especially when packing multiple documents together. You can also
create `Doc` objects manually, so you can write your own custom logic to convert
and store existing annotations for use in spaCy.
```python
### Training data from Doc objects {highlight="6-9"}
import spacy
from spacy.tokens import Doc, DocBin
nlp = spacy.blank("en")
docbin = DocBin()
words = ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "."]
spaces = [True, True, True, True, True, True, True, False]
ents = ["B-ORG", "O", "O", "O", "O", "B-GPE", "O", "O"]
doc = Doc(nlp.vocab, words=words, spaces=spaces, ents=ents)
docbin.add(doc)
docbin.to_disk("./train.spacy")
```
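To sanity-check the result, the stored docs can be loaded back from disk — a small sketch assuming the `train.spacy` file written above:

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc_bin = DocBin().from_disk("./train.spacy")
docs = list(doc_bin.get_docs(nlp.vocab))
print([(ent.text, ent.label_) for ent in docs[0].ents])
# [('Apple', 'ORG'), ('U.K.', 'GPE')]
```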
### Working with corpora {#data-corpora}
> #### Example