mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
Update augmenter lookups and docs
This commit is contained in:
parent
5128298964
commit
a103ab5f1a
|
@ -477,6 +477,12 @@ class Errors:
|
|||
E201 = ("Span index out of range.")
|
||||
|
||||
# TODO: fix numbering after merging develop into master
|
||||
E912 = ("No orth_variants lookups table for data augmentation available for "
|
||||
"language '{lang}'. If orth_variants are available in "
|
||||
"spacy-lookups-data, make sure the package is installed and the "
|
||||
"table is loaded in the [initialize.lookups] block of your config. "
|
||||
"Alternatively, you can provide your own Lookups object with a "
|
||||
"table orth_variants as the argument 'lookups' of the augmenter.")
|
||||
E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
|
||||
"config.cfg or override it on the CLI?")
|
||||
E914 = ("Executing {name} callback failed. Expected the function to "
|
||||
|
|
|
@ -7,6 +7,7 @@ from spacy.training.converters import json_to_docs
|
|||
from spacy.training.augment import create_orth_variants_augmenter
|
||||
from spacy.lang.en import English
|
||||
from spacy.tokens import Doc, DocBin
|
||||
from spacy.lookups import Lookups
|
||||
from spacy.util import get_words_and_spaces, minibatch
|
||||
from thinc.api import compounding
|
||||
import pytest
|
||||
|
@ -492,13 +493,20 @@ def test_roundtrip_docs_to_docbin(doc):
|
|||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||
def test_make_orth_variants(doc):
|
||||
nlp = English()
|
||||
orth_variants = {
|
||||
"single": [
|
||||
{"tags": ["NFP"], "variants": ["…", "..."]},
|
||||
{"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
|
||||
]
|
||||
}
|
||||
lookups = Lookups()
|
||||
lookups.add_table("orth_variants", orth_variants)
|
||||
augmenter = create_orth_variants_augmenter(level=0.2, lower=0.5, lookups=lookups)
|
||||
with make_tempdir() as tmpdir:
|
||||
output_file = tmpdir / "roundtrip.spacy"
|
||||
DocBin(docs=[doc]).to_disk(output_file)
|
||||
# due to randomness, test only that this runs with no errors for now
|
||||
reader = Corpus(
|
||||
output_file, augmenter=create_orth_variants_augmenter(level=0.2, lower=0.5)
|
||||
)
|
||||
reader = Corpus(output_file, augmenter=augmenter)
|
||||
list(reader(nlp))
|
||||
|
||||
|
||||
|
|
|
@ -1,30 +1,50 @@
|
|||
from typing import Callable
|
||||
from typing import Callable, Iterator, Dict, List, Tuple, Optional, TYPE_CHECKING
|
||||
import random
|
||||
import itertools
|
||||
import copy
|
||||
from functools import partial
|
||||
from ..util import registry
|
||||
|
||||
from ..util import registry, logger
|
||||
from ..tokens import Doc
|
||||
from .example import Example
|
||||
from ..lookups import Lookups
|
||||
from ..errors import Errors
|
||||
|
||||
|
||||
@registry.augmenters("spacy.dont_augment.v1")
|
||||
def create_null_augmenter():
|
||||
return dont_augment
|
||||
if TYPE_CHECKING:
|
||||
from ..language import Language # noqa: F401
|
||||
|
||||
|
||||
@registry.augmenters("spacy.orth_variants.v1")
|
||||
def create_orth_variants_augmenter(level: float, lower: float) -> Callable:
|
||||
def create_orth_variants_augmenter(
|
||||
level: float, lower: float, lookups: Optional[Lookups] = None,
|
||||
) -> Callable[["Language", Example], Iterator[Example]]:
|
||||
"""Create a data augmentation callback that uses orth-variant replacement.
|
||||
The callback can be added to a corpus or other data iterator during training.
|
||||
"""
|
||||
return partial(orth_variants_augmenter, level=level, lower=lower)
|
||||
return partial(orth_variants_augmenter, level=level, lower=lower, lookups=lookups)
|
||||
|
||||
|
||||
def dont_augment(nlp, example):
|
||||
def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:
|
||||
yield example
|
||||
|
||||
|
||||
def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float = 0.0):
|
||||
def orth_variants_augmenter(
|
||||
nlp: "Language",
|
||||
example: Example,
|
||||
*,
|
||||
level: float = 0.0,
|
||||
lower: float = 0.0,
|
||||
lookups: Optional[Lookups] = None,
|
||||
) -> Iterator[Example]:
|
||||
table_name = "orth_variants"
|
||||
if lookups is not None:
|
||||
orth_variants = lookups.get_table(table_name, {})
|
||||
logger.debug("Using data augmentation orth variants from provided lookups")
|
||||
else:
|
||||
orth_variants = nlp.vocab.lookups.get_table(table_name, {})
|
||||
logger.debug("Using data augmentation orth variants from default vocab lookups")
|
||||
if not orth_variants:
|
||||
raise ValueError(Errors.E912.format(lang=nlp.lang))
|
||||
if random.random() >= level:
|
||||
yield example
|
||||
else:
|
||||
|
@ -37,6 +57,7 @@ def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float =
|
|||
nlp,
|
||||
raw_text,
|
||||
orig_dict["token_annotation"],
|
||||
orth_variants,
|
||||
lower=raw_text is not None and random.random() < lower,
|
||||
)
|
||||
if variant_text:
|
||||
|
@ -49,9 +70,15 @@ def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float =
|
|||
yield example.from_dict(doc, orig_dict)
|
||||
|
||||
|
||||
def make_orth_variants(nlp, raw, token_dict, *, lower: bool = False):
|
||||
def make_orth_variants(
|
||||
nlp: "Language",
|
||||
raw: str,
|
||||
token_dict: Dict[str, List[str]],
|
||||
orth_variants: Dict[str, list],
|
||||
*,
|
||||
lower: bool = False,
|
||||
) -> Tuple[str, Dict[str, List[str]]]:
|
||||
orig_token_dict = copy.deepcopy(token_dict)
|
||||
orth_variants = nlp.vocab.lookups.get_table("orth_variants", {})
|
||||
ndsv = orth_variants.get("single", [])
|
||||
ndpv = orth_variants.get("paired", [])
|
||||
words = token_dict.get("words", [])
|
||||
|
|
|
@ -7,9 +7,11 @@ new: 3
|
|||
---
|
||||
|
||||
This class manages annotated corpora and can be used for training and
|
||||
development datasets in the [DocBin](/api/docbin) (`.spacy`) format. To
|
||||
development datasets in the [`DocBin`](/api/docbin) (`.spacy`) format. To
|
||||
customize the data loading during training, you can register your own
|
||||
[data readers and batchers](/usage/training#custom-code-readers-batchers).
|
||||
[data readers and batchers](/usage/training#custom-code-readers-batchers). Also
|
||||
see the usage guide on [data utilities](/usage/training#data) for more details
|
||||
and examples.
|
||||
|
||||
## Config and implementation {#config}
|
||||
|
||||
|
|
|
@ -7,7 +7,8 @@ menu:
|
|||
- ['Loggers', 'loggers']
|
||||
- ['Readers', 'readers']
|
||||
- ['Batchers', 'batchers']
|
||||
- ['Data & Alignment', 'gold']
|
||||
- ['Augmenters', 'augmenters']
|
||||
- ['Training & Alignment', 'gold']
|
||||
- ['Utility Functions', 'util']
|
||||
---
|
||||
|
||||
|
@ -313,6 +314,7 @@ factories.
|
|||
| Registry name | Description |
|
||||
| ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. |
|
||||
| `augmenters` | Registry for functions that create [data augmentation](#augmenters) callbacks for corpora and other training data iterators. |
|
||||
| `batchers` | Registry for training and evaluation [data batchers](#batchers). |
|
||||
| `callbacks` | Registry for custom callbacks to [modify the `nlp` object](/usage/training#custom-code-nlp-callbacks) before training. |
|
||||
| `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). |
|
||||
|
@ -618,6 +620,34 @@ sequences in the batch.
|
|||
| `discard_oversize` | Whether to discard sequences that are by themselves longer than the largest padded batch size. ~~bool~~ |
|
||||
| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ |
|
||||
|
||||
## Augmenters {#augmenters source="spacy/training/augment.py" new="3"}
|
||||
|
||||
<!-- TODO: intro, explain data augmentation concept -->
|
||||
|
||||
### orth_variants {#orth_variants tag="registered function"}
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [corpora.train.augmenter]
|
||||
> @augmenters = "spacy.orth_variants.v1"
|
||||
> level = 0.0
|
||||
> lower = 0.0
|
||||
> lookups = null
|
||||
> ```
|
||||
|
||||
Create a data augmentation callback that uses orth-variant replacement. The
|
||||
callback can be added to a corpus or other data iterator during training. This
|
||||
is especially useful for punctuation and case replacement, to help generalize
|
||||
beyond corpora that don't have smart quotes, or only have smart quotes, etc.
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `level`     | The proportion of examples that will be augmented, as a value between `0.0` and `1.0`. ~~float~~ |
|
||||
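| `lower`     | The proportion of augmented examples that will additionally be lowercased, as a value between `0.0` and `1.0`. ~~float~~ |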
|
||||
| `lookups` | Lookups table containing the orth variants to use. See [`orth_variants.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_orth_variants.json) for an example. If not set, tables from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) are used if available and added in the [`[initialize]`](/api/data-formats#config-initialize) block of the config. If no orth variants are found, spaCy will raise an error. Defaults to `None`. ~~Optional[Lookups]~~ |
|
||||
| **RETURNS** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~ |
|
||||
|
||||
## Training data and alignment {#gold source="spacy/training"}
|
||||
|
||||
### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}
|
||||
|
|
|
@ -805,15 +805,30 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
|
|||
|
||||
## Data utilities {#data}
|
||||
|
||||
spaCy includes various features and utilities to make it easy to train from your
|
||||
own data. If you have training data in a standard format like `.conll` or
|
||||
`.conllu`, the easiest way to convert it for use with spaCy is to run
|
||||
[`spacy convert`](/api/cli#convert) and pass it a file and an output directory:
|
||||
spaCy includes various features and utilities to make it easy to train models
|
||||
using your own data, manage training and evaluation corpora, convert existing
|
||||
annotations and configure data augmentation strategies for more robust models.
|
||||
|
||||
### Converting existing corpora and annotations {#data-convert}
|
||||
|
||||
If you have training data in a standard format like `.conll` or `.conllu`, the
|
||||
easiest way to convert it for use with spaCy is to run
|
||||
[`spacy convert`](/api/cli#convert) and pass it a file and an output directory.
|
||||
By default, the command will pick the converter based on the file extension.
|
||||
|
||||
```cli
|
||||
$ python -m spacy convert ./train.gold.conll ./corpus
|
||||
```
|
||||
|
||||
> #### 💡 Tip: Converting from Prodigy
|
||||
>
|
||||
> If you're using the [Prodigy](https://prodi.gy) annotation tool to create
|
||||
> training data, you can run the
|
||||
> [`data-to-spacy` command](https://prodi.gy/docs/recipes#data-to-spacy) to
|
||||
> merge and export multiple datasets for use with
|
||||
> [`spacy train`](/api/cli#train). Different types of annotations on the same
|
||||
> text will be combined, giving you one corpus to train multiple components.
|
||||
|
||||
<Infobox title="Tip: Manage multi-step workflows with projects" emoji="💡">
|
||||
|
||||
Training workflows often consist of multiple steps, from preprocessing the data
|
||||
|
@ -823,6 +838,27 @@ data assets, track changes and share your end-to-end processes with your team.
|
|||
|
||||
</Infobox>
|
||||
|
||||
The binary `.spacy` format is a serialized [`DocBin`](/api/docbin) containing
|
||||
one or more [`Doc`](/api/doc) objects. It's extremely **efficient in
|
||||
storage**, especially when packing multiple documents together. You can also
|
||||
create `Doc` objects manually, so you can write your own custom logic to convert
|
||||
and store existing annotations for use in spaCy.
|
||||
|
||||
```python
|
||||
### Training data from Doc objects {highlight="6-9"}
|
||||
import spacy
|
||||
from spacy.tokens import Doc, DocBin
|
||||
|
||||
nlp = spacy.blank("en")
|
||||
docbin = DocBin(nlp.vocab)
|
||||
words = ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "."]
|
||||
spaces = [True, True, True, True, True, True, True, False]
|
||||
ents = [("ORG", 0, 1), ("GPE", 5, 6)]
|
||||
doc = Doc(nlp.vocab, words=words, spaces=spaces, ents=ents)
|
||||
docbin.add(doc)
|
||||
docbin.to_disk("./train.spacy")
|
||||
```
|
||||
|
||||
### Working with corpora {#data-corpora}
|
||||
|
||||
> #### Example
|
||||
|
|
Loading…
Reference in New Issue
Block a user