mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-31 18:39:49 +03:00
In order to support Python 3.13, we had to migrate to Cython 3.0. This caused some tricky interaction with our Pydantic usage, because Cython 3 uses the from __future__ import annotations semantics, which causes type annotations to be saved as strings. The end result is that we can't have Language.factory decorated functions in Cython modules anymore, as the Language.factory decorator expects to inspect the signature of the functions and build a Pydantic model. If the function is implemented in Cython, an error is raised because the type is not resolved. To address this I've moved the factory functions into a new module, spacy.pipeline.factories. I've added __getattr__ importlib hooks to the previous locations, in case anyone was importing these functions directly. The change should have no backwards compatibility implications. Along the way I've also refactored the registration of functions for the config. Previously these ran as import-time side-effects, using the registry decorator. Instead, I've created a new module, spacy.registrations. When the registry is accessed it calls a function ensure_populated(), which causes the registrations to occur. I've made a similar change to the Language.factory registrations in the new spacy.pipeline.factories module. I want to remove these import-time side-effects so that we can speed up the loading time of the library, which can be especially painful on the CLI. I also find that I'm often working to track down the implementations of functions referenced by strings in the config. Having the registrations all happen in one place will make this easier. With these changes I've fortunately avoided the need to migrate to Pydantic v2 properly --- we're still using the v1 compatibility shim. We might not be able to hold out forever though: Pydantic (reasonably) aren't actively supporting the v1 shims. I put a lot of work into v2 migration when investigating the 3.13 support, and it's definitely challenging. 
In any case, it's a relief that we don't have to do the v2 migration at the same time as the Cython 3.0/Python 3.13 support.
346 lines
13 KiB
Python
346 lines
13 KiB
Python
import itertools
|
|
import random
|
|
from functools import partial
|
|
from typing import TYPE_CHECKING, Callable, Dict, Iterator, List, Optional, Tuple
|
|
|
|
from ..util import registry
|
|
from .example import Example
|
|
from .iob_utils import _doc_to_biluo_tags_with_partial, split_bilu_label
|
|
|
|
if TYPE_CHECKING:
|
|
from ..language import Language # noqa: F401
|
|
|
|
|
|
def create_combined_augmenter(
    lower_level: float,
    orth_level: float,
    orth_variants: Optional[Dict[str, List[Dict]]],
    whitespace_level: float,
    whitespace_per_token: float,
    whitespace_variants: Optional[List[str]],
) -> Callable[["Language", Example], Iterator[Example]]:
    """Create a data augmentation callback that combines lowercasing,
    orth-variant replacement and whitespace insertion.

    The returned callable is `combined_augmenter` with all of the settings
    below pre-bound as keyword arguments, so it only needs to be called with
    `(nlp, example)`.

    lower_level (float): The percentage of texts that will be lowercased.
    orth_level (float): The percentage of texts that will be augmented with
        orth variants.
    orth_variants (Optional[Dict[str, List[Dict]]]): A dictionary containing
        the single and paired orth variants. Typically loaded from a JSON
        file.
    whitespace_level (float): The percentage of texts that will have
        whitespace tokens inserted.
    whitespace_per_token (float): The number of whitespace tokens to insert
        in the modified doc as a percentage of the doc length.
    whitespace_variants (Optional[List[str]]): The whitespace token texts.
    RETURNS (Callable[[Language, Example], Iterator[Example]]): The augmenter.
    """
    # Collect the settings first so the binding step is a single call.
    settings = {
        "lower_level": lower_level,
        "orth_level": orth_level,
        "orth_variants": orth_variants,
        "whitespace_level": whitespace_level,
        "whitespace_per_token": whitespace_per_token,
        "whitespace_variants": whitespace_variants,
    }
    return partial(combined_augmenter, **settings)
|
|
|
|
|
|
def combined_augmenter(
    nlp: "Language",
    example: Example,
    *,
    lower_level: float = 0.0,
    orth_level: float = 0.0,
    orth_variants: Optional[Dict[str, List[Dict]]] = None,
    whitespace_level: float = 0.0,
    whitespace_per_token: float = 0.0,
    whitespace_variants: Optional[List[str]] = None,
) -> Iterator[Example]:
    """Run an example through up to three augmentation stages in sequence:
    lowercasing, orth-variant replacement and whitespace insertion.

    Each stage is gated by its own `random.random()` draw compared against the
    corresponding level. The orth-variant and whitespace stages are skipped
    without drawing when their variants argument is empty or None.

    nlp (Language): Pipeline used to tokenize modified texts (`nlp.make_doc`).
    example (Example): The example to augment.
    lower_level (float): Threshold for applying the lowercasing stage.
    orth_level (float): Threshold for applying the orth-variant stage.
    orth_variants (Optional[Dict[str, List[Dict]]]): Single and paired orth
        variants.
    whitespace_level (float): Threshold for applying the whitespace stage.
    whitespace_per_token (float): Number of whitespace insertions as a
        fraction of the reference doc length (truncated to an int).
    whitespace_variants (Optional[List[str]]): Candidate whitespace texts.
    YIELDS (Example): Exactly one example: the (possibly) augmented one.
    """
    # Stage 1: lowercasing. The draw always happens, even at level 0.0.
    lowercase_draw = random.random()
    if lowercase_draw < lower_level:
        example = make_lowercase_variant(nlp, example)

    # Stage 2: orth variants (never lowercases here; stage 1 handles that).
    if orth_variants and random.random() < orth_level:
        source_text = example.text
        annotations = example.to_dict()
        annotations["doc_annotation"]["entities"] = _doc_to_biluo_tags_with_partial(
            example.reference
        )
        new_text, new_token_annot = make_orth_variants(
            nlp,
            source_text,
            annotations["token_annotation"],
            orth_variants,
            lower=False,
        )
        annotations["token_annotation"] = new_token_annot
        example = example.from_dict(nlp.make_doc(new_text), annotations)

    # Stage 3: whitespace. The number of insertions is fixed up front from the
    # doc length before any whitespace token has been added.
    if whitespace_variants and random.random() < whitespace_level:
        insertion_count = int(len(example.reference) * whitespace_per_token)
        for _ in range(insertion_count):
            space_text = random.choice(whitespace_variants)
            offset = random.randrange(0, len(example.reference))
            example = make_whitespace_variant(nlp, example, space_text, offset)

    yield example
|
|
|
|
|
|
def create_orth_variants_augmenter(
    level: float, lower: float, orth_variants: Dict[str, List[Dict]]
) -> Callable[["Language", Example], Iterator[Example]]:
    """Build an augmenter callback that performs orth-variant replacement.

    The callback is `orth_variants_augmenter` with the settings below bound
    as keyword arguments. It can be added to a corpus or other data iterator
    during training.

    level (float): The percentage of texts that will be augmented.
    lower (float): The percentage of augmented texts that will also be
        lowercased.
    orth_variants (Dict[str, List[Dict]]): A dictionary containing the single
        and paired orth variants. Typically loaded from a JSON file.
    RETURNS (Callable[[Language, Example], Iterator[Example]]): The augmenter.
    """
    bound_settings = {
        "orth_variants": orth_variants,
        "level": level,
        "lower": lower,
    }
    return partial(orth_variants_augmenter, **bound_settings)
|
|
|
|
|
|
def create_lower_casing_augmenter(
    level: float,
) -> Callable[["Language", Example], Iterator[Example]]:
    """Build an augmenter callback that lowercases documents.

    The callback is `lower_casing_augmenter` with `level` pre-bound. It can
    be added to a corpus or other data iterator during training.

    level (float): The percentage of texts that will be augmented.
    RETURNS (Callable[[Language, Example], Iterator[Example]]): The augmenter.
    """
    augmenter = partial(lower_casing_augmenter, level=level)
    return augmenter
|
|
|
|
|
|
def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:
    """No-op augmenter: yield the given example once, unchanged.

    nlp (Language): Unused; accepted so the call signature matches the other
        augmenters in this module.
    example (Example): The example to pass through.
    YIELDS (Example): The same example object that was passed in.
    """
    yield from (example,)
|
|
|
|
|
|
def lower_casing_augmenter(
    nlp: "Language", example: Example, *, level: float
) -> Iterator[Example]:
    """Yield either the original example or a lowercased variant of it.

    nlp (Language): Pipeline passed on to `make_lowercase_variant`.
    example (Example): The example to augment.
    level (float): Threshold for lowercasing: the variant is produced when a
        `random.random()` draw falls below this value.
    YIELDS (Example): Exactly one example.
    """
    keep_original = random.random() >= level
    if keep_original:
        yield example
        return
    yield make_lowercase_variant(nlp, example)
|
|
|
|
|
|
def make_lowercase_variant(nlp: "Language", example: Example):
    """Build a copy of the example whose text and token texts are lowercased.

    The annotations come from `example.to_dict()`, with the entities replaced
    by the BILUO tags computed from the reference doc and the ORTH values
    replaced by each reference token's `lower_`.

    nlp (Language): Pipeline used to tokenize the lowercased text.
    example (Example): The example to lowercase.
    RETURNS: The result of `example.from_dict` on the lowercased doc and the
        updated annotations.
    """
    annotations = example.to_dict()
    biluo_tags = _doc_to_biluo_tags_with_partial(example.reference)
    annotations["doc_annotation"]["entities"] = biluo_tags
    lowered_doc = nlp.make_doc(example.text.lower())
    lowered_words = [token.lower_ for token in example.reference]
    annotations["token_annotation"]["ORTH"] = lowered_words
    return example.from_dict(lowered_doc, annotations)
|
|
|
|
|
|
def orth_variants_augmenter(
    nlp: "Language",
    example: Example,
    orth_variants: Dict[str, List[Dict]],
    *,
    level: float = 0.0,
    lower: float = 0.0,
) -> Iterator[Example]:
    """Yield either the original example or an orth-variant version of it.

    nlp (Language): Pipeline used to tokenize the modified text.
    example (Example): The example to augment.
    orth_variants (Dict[str, List[Dict]]): Single and paired orth variants.
    level (float): Threshold for augmenting: the example is modified when a
        `random.random()` draw falls below this value.
    lower (float): Threshold for additionally lowercasing an example that is
        being augmented.
    YIELDS (Example): Exactly one example.
    """
    if random.random() >= level:
        # Not selected for augmentation: pass the example through untouched.
        yield example
        return

    source_text = example.text
    annotations = example.to_dict()
    annotations["doc_annotation"]["entities"] = _doc_to_biluo_tags_with_partial(
        example.reference
    )
    token_annot = annotations["token_annotation"]
    # The lowercasing draw is only made when there is a text to lowercase.
    also_lowercase = source_text is not None and random.random() < lower
    new_text, new_token_annot = make_orth_variants(
        nlp, source_text, token_annot, orth_variants, lower=also_lowercase
    )
    annotations["token_annotation"] = new_token_annot
    yield example.from_dict(nlp.make_doc(new_text), annotations)
|
|
|
|
|
|
def make_orth_variants(
    nlp: "Language",
    raw: str,
    token_dict: Dict[str, List[str]],
    orth_variants: Dict[str, List[Dict[str, List[str]]]],
    *,
    lower: bool = False,
) -> Tuple[str, Dict[str, List[str]]]:
    """Swap tokens for randomly chosen orthographic variants.

    `token_dict` is modified in place and also returned. One replacement is
    drawn per variant rule up front, so every matching token within a call is
    mapped to the same choice for that rule.

    nlp (Language): Not used by this function.
    raw (str): The original text.
    token_dict (Dict[str, List[str]]): Token annotation; "ORTH" and "TAG" are
        read here, and the raw text is rebuilt from it by
        `construct_modified_raw_text`.
    orth_variants (Dict): Rules under the keys "single" and "paired", each
        rule providing "tags" and "variants".
    lower (bool): Whether to lowercase the words and raw text first.
    RETURNS (Tuple[str, Dict[str, List[str]]]): The modified raw text and the
        token annotation.
    """
    words = token_dict.get("ORTH", [])
    tags = token_dict.get("TAG", [])

    # Nothing to do without words: hand everything back unmodified.
    if not words:
        return raw, token_dict

    if lower:
        words = [word.lower() for word in words]
        raw = raw.lower()

    # Without tags no variant rule can match, so only lowercasing applies.
    if not tags:
        token_dict["ORTH"] = words
        return raw, token_dict

    # Single variants: one replacement is picked per rule before scanning.
    single_rules = orth_variants.get("single", [])
    single_picks = [random.choice(rule["variants"]) for rule in single_rules]
    for idx in range(len(words)):
        for rule, pick in zip(single_rules, single_picks):
            if tags[idx] in rule["tags"] and words[idx] in rule["variants"]:
                words[idx] = pick

    # Paired variants: each pick is a pair; decide which side to use.
    paired_rules = orth_variants.get("paired", [])
    paired_picks = [random.choice(rule["variants"]) for rule in paired_rules]
    for idx in range(len(words)):
        for rule, pick in zip(paired_rules, paired_picks):
            if tags[idx] not in rule["tags"]:
                continue
            if words[idx] not in itertools.chain.from_iterable(rule["variants"]):
                continue
            # backup option: random left vs. right from pair
            side = random.choice([0, 1])
            if len(rule["tags"]) == 2:
                # best option: rely on paired POS tags like `` / ''
                side = rule["tags"].index(tags[idx])
            else:
                # next best option: rely on position in variants
                # (may not be unambiguous, so order of variants matters;
                # the last pair containing the word wins)
                for pair in rule["variants"]:
                    if words[idx] in pair:
                        side = pair.index(words[idx])
            words[idx] = pick[side]

    token_dict["ORTH"] = words
    return construct_modified_raw_text(token_dict), token_dict
|
|
|
|
|
|
def make_whitespace_variant(
    nlp: "Language",
    example: Example,
    whitespace: str,
    position: int,
) -> Example:
    """Insert the whitespace token at the specified token offset in the doc.
    This is primarily intended for v2-compatible training data that doesn't
    include links or spans. If the document includes links, spans, or partial
    dependency annotation, it is returned without modifications.

    The augmentation follows the basics of the v2 space attachment policy, but
    without a distinction between "real" and other tokens, so space tokens
    may be attached to space tokens:
    - at the beginning of the doc (position 0) attach the space token to the
      following token
    - otherwise attach the space token to the preceding token

    The augmenter does not attempt to consolidate adjacent whitespace in the
    same way that the tokenizer would.

    The following annotation is used for the space token:
    TAG: "_SP"
    MORPH: ""
    POS: "SPACE"
    LEMMA: ORTH
    DEP: "dep"
    SENT_START: False

    The annotation for each attribute is only set for the space token if there
    is already at least partial annotation for that attribute in the original
    example. Otherwise the attribute is dropped from the annotation dict.

    nlp (Language): Pipeline used to tokenize the modified raw text.
    example (Example): The example to modify.
    whitespace (str): The text of the whitespace token to insert.
    position (int): Token index at which to insert the whitespace token. Must
        satisfy 0 <= position <= number of words (checked with an assert).
    RETURNS (Example): Example with one additional space token, or the
        original example if it cannot be modified.
    """
    example_dict = example.to_dict()
    reference = example.reference
    example_dict["doc_annotation"]["entities"] = _doc_to_biluo_tags_with_partial(
        reference
    )
    doc_dict = example_dict.get("doc_annotation", {})
    token_dict = example_dict.get("token_annotation", {})
    # returned unmodified if:
    # - doc is empty
    # - words are not defined
    # - links are defined (only character-based offsets, which is more a quirk
    #   of Example.to_dict than a technical constraint)
    # - spans are defined
    # - there are partial dependencies
    if (
        len(reference) == 0
        or "ORTH" not in token_dict
        or len(doc_dict.get("links", [])) > 0
        or len(reference.spans) > 0
        or (
            reference.has_annotation("DEP")
            and not reference.has_annotation("DEP", require_complete=True)
        )
    ):
        return example
    words = token_dict.get("ORTH", [])
    length = len(words)
    assert 0 <= position <= length

    if reference.has_annotation("ENT_TYPE"):
        # I-ENTITY if between B/I-ENTITY and I/L-ENTITY otherwise O
        entity = "O"
        # A neighbour on both sides exists for 0 < position < length. The
        # lower bound used to be `position > 1`, which tagged a space inserted
        # between tokens 0 and 1 as "O" even inside an entity, splitting it.
        if 0 < position < length:
            ent_prev = doc_dict["entities"][position - 1]
            ent_next = doc_dict["entities"][position]
            if "-" in ent_prev and "-" in ent_next:
                ent_iob_prev, ent_type_prev = split_bilu_label(ent_prev)
                ent_iob_next, ent_type_next = split_bilu_label(ent_next)
                if (
                    ent_iob_prev in ("B", "I")
                    and ent_iob_next in ("I", "L")
                    and ent_type_prev == ent_type_next
                ):
                    entity = f"I-{ent_type_prev}"
        doc_dict["entities"].insert(position, entity)
    else:
        del doc_dict["entities"]

    token_dict["ORTH"].insert(position, whitespace)
    # The inserted token itself is never followed by a space.
    token_dict["SPACY"].insert(position, False)

    # Per-token attributes that just take a fixed filler value for the space
    # token when annotated, and are dropped entirely otherwise.
    fillers = (
        ("TAG", "_SP"),
        ("LEMMA", whitespace),
        ("POS", "SPACE"),
        ("MORPH", ""),
    )
    for attr, filler in fillers:
        if reference.has_annotation(attr):
            token_dict[attr].insert(position, filler)
        else:
            del token_dict[attr]

    if reference.has_annotation("DEP", require_complete=True):
        heads = token_dict["HEAD"]
        # Attach to the preceding token, or at position 0 to the following
        # one: the 0 inserted here is shifted to 1 by the loop below.
        heads.insert(position, 0 if position == 0 else position - 1)
        # Every head at or after the insertion point moves one to the right.
        for i in range(len(heads)):
            if heads[i] >= position:
                heads[i] += 1
        token_dict["DEP"].insert(position, "dep")
    else:
        del token_dict["HEAD"]
        del token_dict["DEP"]

    if reference.has_annotation("SENT_START"):
        token_dict["SENT_START"].insert(position, False)
    else:
        del token_dict["SENT_START"]

    raw = construct_modified_raw_text(token_dict)
    return Example.from_dict(nlp.make_doc(raw), example_dict)
|
|
|
|
|
|
def construct_modified_raw_text(token_dict: Dict[str, List]) -> str:
    """Construct modified raw text from words and spaces.

    token_dict (Dict[str, List]): Token annotation with the keys "ORTH" (the
        token texts) and "SPACY" (per-token flags; a truthy flag appends a
        single space after that token). The two lists are paired with `zip`,
        so extra items in the longer list are ignored.
    RETURNS (str): The concatenated text.
    """
    # Build the pieces lazily and join once rather than growing a string
    # with repeated `+=`.
    return "".join(
        (orth + " ") if spacy else orth
        for orth, spacy in zip(token_dict["ORTH"], token_dict["SPACY"])
    )
|