From 89f974d4f54fc9c24fd2cf244ed783631f191181 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 2 Feb 2023 22:13:38 +0900 Subject: [PATCH] Cleanup/remove backwards compat overwrite settings (#11888) * Remove backwards-compatible overwrite from Entity Linker This also adds a docstring about overwrite, since it wasn't present. * Fix docstring * Remove backward compat settings in Morphologizer This also needed a docstring added. For this component it's less clear what the right overwrite settings are. * Remove backward compat from sentencizer This was simple * Remove backward compat from senter Another simple one * Remove backward compat setting from tagger * Add docstrings * Update spacy/pipeline/morphologizer.pyx Co-authored-by: Adriane Boyd * Update docs --------- Co-authored-by: Adriane Boyd --- spacy/pipeline/entity_linker.py | 8 +++----- spacy/pipeline/morphologizer.pyx | 10 ++++------ spacy/pipeline/sentencizer.pyx | 6 ++---- spacy/pipeline/senter.pyx | 5 ++--- spacy/pipeline/tagger.pyx | 6 ++---- website/docs/api/entitylinker.mdx | 2 +- website/docs/api/morphologizer.mdx | 2 +- 7 files changed, 15 insertions(+), 24 deletions(-) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 6fe322b62..63d5cccc2 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -27,9 +27,6 @@ ActivationsT = Dict[str, Union[List[Ragged], List[str]]] KNOWLEDGE_BASE_IDS = "kb_ids" -# See #9050 -BACKWARD_OVERWRITE = True - default_model_config = """ [model] @architectures = "spacy.EntityLinker.v2" @@ -60,7 +57,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] "entity_vector_length": 64, "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"}, - "overwrite": True, + "overwrite": False, "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, "use_gold_ents": True, "candidates_batch_size": 1, @@ -191,7 +188,7 @@ class 
EntityLinker(TrainablePipe): get_candidates_batch: Callable[ [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] ], - overwrite: bool = BACKWARD_OVERWRITE, + overwrite: bool = False, scorer: Optional[Callable] = entity_linker_score, use_gold_ents: bool, candidates_batch_size: int, @@ -215,6 +212,7 @@ class EntityLinker(TrainablePipe): Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]] ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. + overwrite (bool): Whether to overwrite existing non-empty annotations. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links. use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another component must provide entity annotations. diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 293add9e1..fabc51fee 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -21,10 +21,6 @@ from ..scorer import Scorer from ..training import validate_examples, validate_get_examples from ..util import registry -# See #9050 -BACKWARD_OVERWRITE = True -BACKWARD_EXTEND = False - default_model_config = """ [model] @architectures = "spacy.Tagger.v2" @@ -102,8 +98,8 @@ class Morphologizer(Tagger): model: Model, name: str = "morphologizer", *, - overwrite: bool = BACKWARD_OVERWRITE, - extend: bool = BACKWARD_EXTEND, + overwrite: bool = False, + extend: bool = False, scorer: Optional[Callable] = morphologizer_score, save_activations: bool = False, ): @@ -113,6 +109,8 @@ class Morphologizer(Tagger): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. + overwrite (bool): Whether to overwrite existing annotations. + extend (bool): Whether to extend existing annotations. scorer (Optional[Callable]): The scoring method. 
Defaults to Scorer.score_token_attr for the attributes "pos" and "morph" and Scorer.score_token_attr_per_feat for the attribute "morph". diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 77f4e8adb..6c2565170 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -10,9 +10,6 @@ from ..language import Language from ..scorer import Scorer from .. import util -# see #9050 -BACKWARD_OVERWRITE = False - @Language.factory( "sentencizer", assigns=["token.is_sent_start", "doc.sents"], @@ -52,13 +49,14 @@ class Sentencizer(Pipe): name="sentencizer", *, punct_chars=None, - overwrite=BACKWARD_OVERWRITE, + overwrite=False, scorer=senter_score, ): """Initialize the sentencizer. punct_chars (list): Punctuation characters to split on. Will be serialized with the nlp object. + overwrite (bool): Whether to overwrite existing annotations. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_spans for the attribute "sents". diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 42feeb277..a7d263e94 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -18,8 +18,6 @@ from ..training import validate_examples, validate_get_examples from ..util import registry from .. import util -# See #9050 -BACKWARD_OVERWRITE = False default_model_config = """ [model] @@ -83,7 +81,7 @@ class SentenceRecognizer(Tagger): model, name="senter", *, - overwrite=BACKWARD_OVERWRITE, + overwrite=False, scorer=senter_score, save_activations: bool = False, ): @@ -93,6 +91,7 @@ class SentenceRecognizer(Tagger): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. + overwrite (bool): Whether to overwrite existing annotations. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_spans for the attribute "sents". 
save_activations (bool): save model activations in Doc when annotating. diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index a6be51c3c..101d8bcea 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -27,9 +27,6 @@ from .. import util ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]] -# See #9050 -BACKWARD_OVERWRITE = False - default_model_config = """ [model] @architectures = "spacy.Tagger.v2" @@ -99,7 +96,7 @@ class Tagger(TrainablePipe): model, name="tagger", *, - overwrite=BACKWARD_OVERWRITE, + overwrite=False, scorer=tagger_score, neg_prefix="!", save_activations: bool = False, @@ -110,6 +107,7 @@ class Tagger(TrainablePipe): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. + overwrite (bool): Whether to overwrite existing annotations. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_token_attr for the attribute "tag". save_activations (bool): save model activations in Doc when annotating. diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx index 238b62a2e..12b2f6bef 100644 --- a/website/docs/api/entitylinker.mdx +++ b/website/docs/api/entitylinker.mdx @@ -63,7 +63,7 @@ architectures and their arguments and hyperparameters. | `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | | `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~ | | `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB.
~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | -| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | | `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | | `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ | | `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx index 4660ec312..9514bc773 100644 --- a/website/docs/api/morphologizer.mdx +++ b/website/docs/api/morphologizer.mdx @@ -45,7 +45,7 @@ architectures and their arguments and hyperparameters. | Setting | Description | | ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | -| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ | +| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `False`. ~~bool~~ | | `extend` 3.2 | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ | | `scorer` 3.2 | The scoring method.
Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ | | `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~ |