From c62b9dac0b285418705b4b09ba4ac00366b6091f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 19 May 2025 16:25:33 +0200 Subject: [PATCH] Register factories in spacy.registrations, to avoid import-time side-effects --- spacy/pipeline/attributeruler.py | 7 - spacy/pipeline/dep_parser.pyx | 43 --- spacy/pipeline/edit_tree_lemmatizer.py | 14 - spacy/pipeline/entity_linker.py | 26 -- spacy/pipeline/entityruler.py | 18 - spacy/pipeline/functions.py | 9 - spacy/pipeline/lemmatizer.py | 11 - spacy/pipeline/morphologizer.pyx | 7 - spacy/pipeline/multitask.pyx | 4 - spacy/pipeline/ner.pyx | 28 -- spacy/pipeline/sentencizer.pyx | 6 - spacy/pipeline/senter.pyx | 6 - spacy/pipeline/span_finder.py | 17 - spacy/pipeline/span_ruler.py | 42 --- spacy/pipeline/spancat.py | 26 -- spacy/pipeline/tagger.pyx | 6 - spacy/pipeline/textcat.py | 21 -- spacy/pipeline/textcat_multilabel.py | 21 -- spacy/pipeline/tok2vec.py | 3 - spacy/registrations.py | 438 ++++++++++++++++++++++++- 20 files changed, 433 insertions(+), 320 deletions(-) diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index 8ac74d92b..5b5de78ef 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -22,13 +22,6 @@ TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]] MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]] -@Language.factory( - "attribute_ruler", - default_config={ - "validate": False, - "scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"}, - }, -) def make_attribute_ruler( nlp: Language, name: str, validate: bool, scorer: Optional[Callable] ): diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index 18a220bd6..42d50dde6 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -39,26 +39,6 @@ subword_features = true DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"] -@Language.factory( - "parser", - assigns=["token.dep", 
"token.head", "token.is_sent_start", "doc.sents"], - default_config={ - "moves": None, - "update_with_oracle_cut_size": 100, - "learn_tokens": False, - "min_action_freq": 30, - "model": DEFAULT_PARSER_MODEL, - "scorer": {"@scorers": "spacy.parser_scorer.v1"}, - }, - default_score_weights={ - "dep_uas": 0.5, - "dep_las": 0.5, - "dep_las_per_type": None, - "sents_p": None, - "sents_r": None, - "sents_f": 0.0, - }, -) def make_parser( nlp: Language, name: str, @@ -125,29 +105,6 @@ def make_parser( ) -@Language.factory( - "beam_parser", - assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"], - default_config={ - "beam_width": 8, - "beam_density": 0.01, - "beam_update_prob": 0.5, - "moves": None, - "update_with_oracle_cut_size": 100, - "learn_tokens": False, - "min_action_freq": 30, - "model": DEFAULT_PARSER_MODEL, - "scorer": {"@scorers": "spacy.parser_scorer.v1"}, - }, - default_score_weights={ - "dep_uas": 0.5, - "dep_las": 0.5, - "dep_las_per_type": None, - "sents_p": None, - "sents_r": None, - "sents_f": 0.0, - }, -) def make_beam_parser( nlp: Language, name: str, diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py index 4a6174bc3..f8ae2cba3 100644 --- a/spacy/pipeline/edit_tree_lemmatizer.py +++ b/spacy/pipeline/edit_tree_lemmatizer.py @@ -39,20 +39,6 @@ subword_features = true DEFAULT_EDIT_TREE_LEMMATIZER_MODEL = Config().from_str(default_model_config)["model"] -@Language.factory( - "trainable_lemmatizer", - assigns=["token.lemma"], - requires=[], - default_config={ - "model": DEFAULT_EDIT_TREE_LEMMATIZER_MODEL, - "backoff": "orth", - "min_tree_freq": 3, - "overwrite": False, - "top_k": 1, - "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, - }, - default_score_weights={"lemma_acc": 1.0}, -) def make_edit_tree_lemmatizer( nlp: Language, name: str, diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 40a9c8a79..65293a301 100644 --- a/spacy/pipeline/entity_linker.py +++ 
b/spacy/pipeline/entity_linker.py @@ -40,32 +40,6 @@ subword_features = true DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] -@Language.factory( - "entity_linker", - requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"], - assigns=["token.ent_kb_id"], - default_config={ - "model": DEFAULT_NEL_MODEL, - "labels_discard": [], - "n_sents": 0, - "incl_prior": True, - "incl_context": True, - "entity_vector_length": 64, - "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, - "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"}, - "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"}, - "overwrite": True, - "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, - "use_gold_ents": True, - "candidates_batch_size": 1, - "threshold": None, - }, - default_score_weights={ - "nel_micro_f": 1.0, - "nel_micro_r": None, - "nel_micro_p": None, - }, -) def make_entity_linker( nlp: Language, name: str, diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 189c7b378..22df8065d 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -19,24 +19,6 @@ DEFAULT_ENT_ID_SEP = "||" PatternType = Dict[str, Union[str, List[Dict[str, Any]]]] -@Language.factory( - "entity_ruler", - assigns=["doc.ents", "token.ent_type", "token.ent_iob"], - default_config={ - "phrase_matcher_attr": None, - "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"}, - "validate": False, - "overwrite_ents": False, - "ent_id_sep": DEFAULT_ENT_ID_SEP, - "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"}, - }, - default_score_weights={ - "ents_f": 1.0, - "ents_p": 0.0, - "ents_r": 0.0, - "ents_per_type": None, - }, -) def make_entity_ruler( nlp: Language, name: str, diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py index 2bf0437d5..e788979cf 100644 --- a/spacy/pipeline/functions.py +++ b/spacy/pipeline/functions.py @@ -73,11 +73,6 @@ def merge_subtokens(doc: Doc, label: str = 
"subtok") -> Doc: return doc -@Language.factory( - "token_splitter", - default_config={"min_length": 25, "split_length": 10}, - retokenizes=True, -) def make_token_splitter( nlp: Language, name: str, *, min_length: int = 0, split_length: int = 0 ): @@ -141,10 +136,6 @@ class TokenSplitter: util.from_disk(path, serializers, []) -@Language.factory( - "doc_cleaner", - default_config={"attrs": {"tensor": None, "_.trf_data": None}, "silent": True}, -) def make_doc_cleaner(nlp: Language, name: str, *, attrs: Dict[str, Any], silent: bool): return DocCleaner(attrs, silent=silent) diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py index bf556627a..f737b84b5 100644 --- a/spacy/pipeline/lemmatizer.py +++ b/spacy/pipeline/lemmatizer.py @@ -16,17 +16,6 @@ from ..vocab import Vocab from .pipe import Pipe -@Language.factory( - "lemmatizer", - assigns=["token.lemma"], - default_config={ - "model": None, - "mode": "lookup", - "overwrite": False, - "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, - }, - default_score_weights={"lemma_acc": 1.0}, -) def make_lemmatizer( nlp: Language, model: Optional[Model], diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index d415ae43c..937bd00da 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -47,13 +47,6 @@ maxout_pieces = 3 DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"] -@Language.factory( - "morphologizer", - assigns=["token.morph", "token.pos"], - default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False, - "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, "label_smoothing": 0.0}, - default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None}, -) def make_morphologizer( nlp: Language, model: Model, diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx index f33a90fde..a7fdbd9b4 100644 --- a/spacy/pipeline/multitask.pyx +++ b/spacy/pipeline/multitask.pyx 
@@ -30,10 +30,6 @@ subword_features = true DEFAULT_MT_MODEL = Config().from_str(default_model_config)["model"] -@Language.factory( - "nn_labeller", - default_config={"labels": None, "target": "dep_tag_offset", "model": DEFAULT_MT_MODEL} -) def make_nn_labeller(nlp: Language, name: str, model: Model, labels: Optional[dict], target: str): return MultitaskObjective(nlp.vocab, model, name) diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index b8663937b..548f4b966 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -36,19 +36,6 @@ subword_features = true DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"] -@Language.factory( - "ner", - assigns=["doc.ents", "token.ent_iob", "token.ent_type"], - default_config={ - "moves": None, - "update_with_oracle_cut_size": 100, - "model": DEFAULT_NER_MODEL, - "incorrect_spans_key": None, - "scorer": {"@scorers": "spacy.ner_scorer.v1"}, - }, - default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, - -) def make_ner( nlp: Language, name: str, @@ -101,21 +88,6 @@ def make_ner( ) -@Language.factory( - "beam_ner", - assigns=["doc.ents", "token.ent_iob", "token.ent_type"], - default_config={ - "moves": None, - "update_with_oracle_cut_size": 100, - "model": DEFAULT_NER_MODEL, - "beam_density": 0.01, - "beam_update_prob": 0.5, - "beam_width": 32, - "incorrect_spans_key": None, - "scorer": None, - }, - default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, -) def make_beam_ner( nlp: Language, name: str, diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 08ba9d989..9669caf7e 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -14,12 +14,6 @@ from .senter import senter_score BACKWARD_OVERWRITE = False -@Language.factory( - "sentencizer", - assigns=["token.is_sent_start", "doc.sents"], - default_config={"punct_chars": None, "overwrite": False, "scorer": 
{"@scorers": "spacy.senter_scorer.v1"}}, - default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, -) def make_sentencizer( nlp: Language, name: str, diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 5bffc356b..c8e09c5ab 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -34,12 +34,6 @@ subword_features = true DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"] -@Language.factory( - "senter", - assigns=["token.is_sent_start"], - default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}}, - default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, -) def make_senter(nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]): return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer) diff --git a/spacy/pipeline/span_finder.py b/spacy/pipeline/span_finder.py index 29d1730ab..709a67b7f 100644 --- a/spacy/pipeline/span_finder.py +++ b/spacy/pipeline/span_finder.py @@ -41,23 +41,6 @@ depth = 4 DEFAULT_SPAN_FINDER_MODEL = Config().from_str(span_finder_default_config)["model"] -@Language.factory( - "span_finder", - assigns=["doc.spans"], - default_config={ - "threshold": 0.5, - "model": DEFAULT_SPAN_FINDER_MODEL, - "spans_key": DEFAULT_SPANS_KEY, - "max_length": 25, - "min_length": None, - "scorer": {"@scorers": "spacy.span_finder_scorer.v1"}, - }, - default_score_weights={ - f"spans_{DEFAULT_SPANS_KEY}_f": 1.0, - f"spans_{DEFAULT_SPANS_KEY}_p": 0.0, - f"spans_{DEFAULT_SPANS_KEY}_r": 0.0, - }, -) def make_span_finder( nlp: Language, name: str, diff --git a/spacy/pipeline/span_ruler.py b/spacy/pipeline/span_ruler.py index 2a5e2179a..1f9ab2622 100644 --- a/spacy/pipeline/span_ruler.py +++ b/spacy/pipeline/span_ruler.py @@ -32,24 +32,6 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]] DEFAULT_SPANS_KEY = "ruler" -@Language.factory( - "future_entity_ruler", - 
assigns=["doc.ents"], - default_config={ - "phrase_matcher_attr": None, - "validate": False, - "overwrite_ents": False, - "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"}, - "ent_id_sep": "__unused__", - "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"}, - }, - default_score_weights={ - "ents_f": 1.0, - "ents_p": 0.0, - "ents_r": 0.0, - "ents_per_type": None, - }, -) def make_entity_ruler( nlp: Language, name: str, @@ -79,30 +61,6 @@ def make_entity_ruler( ) -@Language.factory( - "span_ruler", - assigns=["doc.spans"], - default_config={ - "spans_key": DEFAULT_SPANS_KEY, - "spans_filter": None, - "annotate_ents": False, - "ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"}, - "phrase_matcher_attr": None, - "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"}, - "validate": False, - "overwrite": True, - "scorer": { - "@scorers": "spacy.overlapping_labeled_spans_scorer.v1", - "spans_key": DEFAULT_SPANS_KEY, - }, - }, - default_score_weights={ - f"spans_{DEFAULT_SPANS_KEY}_f": 1.0, - f"spans_{DEFAULT_SPANS_KEY}_p": 0.0, - f"spans_{DEFAULT_SPANS_KEY}_r": 0.0, - f"spans_{DEFAULT_SPANS_KEY}_per_type": None, - }, -) def make_span_ruler( nlp: Language, name: str, diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 0556d2657..3d49bef2a 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -157,19 +157,6 @@ def build_preset_spans_suggester(spans_key: str) -> Suggester: return partial(preset_spans_suggester, spans_key=spans_key) -@Language.factory( - "spancat", - assigns=["doc.spans"], - default_config={ - "threshold": 0.5, - "spans_key": DEFAULT_SPANS_KEY, - "max_positive": None, - "model": DEFAULT_SPANCAT_MODEL, - "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, - "scorer": {"@scorers": "spacy.spancat_scorer.v1"}, - }, - default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, -) def make_spancat( nlp: Language, name: str, @@ -222,19 +209,6 
@@ def make_spancat( ) -@Language.factory( - "spancat_singlelabel", - assigns=["doc.spans"], - default_config={ - "spans_key": DEFAULT_SPANS_KEY, - "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL, - "negative_weight": 1.0, - "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, - "scorer": {"@scorers": "spacy.spancat_scorer.v1"}, - "allow_overlap": True, - }, - default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, -) def make_spancat_singlelabel( nlp: Language, name: str, diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 28d4c6e7f..f0085d3ff 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -35,12 +35,6 @@ subword_features = true DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"] -@Language.factory( - "tagger", - assigns=["token.tag"], - default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!", "label_smoothing": 0.0}, - default_score_weights={"tag_acc": 1.0}, -) def make_tagger( nlp: Language, name: str, diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 3fcbe9870..98393355f 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -74,27 +74,6 @@ subword_features = true """ -@Language.factory( - "textcat", - assigns=["doc.cats"], - default_config={ - "threshold": 0.0, - "model": DEFAULT_SINGLE_TEXTCAT_MODEL, - "scorer": {"@scorers": "spacy.textcat_scorer.v2"}, - }, - default_score_weights={ - "cats_score": 1.0, - "cats_score_desc": None, - "cats_micro_p": None, - "cats_micro_r": None, - "cats_micro_f": None, - "cats_macro_p": None, - "cats_macro_r": None, - "cats_macro_f": None, - "cats_macro_auc": None, - "cats_f_per_type": None, - }, -) def make_textcat( nlp: Language, name: str, diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py index 06910aee4..f1306f92c 100644 --- 
a/spacy/pipeline/textcat_multilabel.py +++ b/spacy/pipeline/textcat_multilabel.py @@ -72,27 +72,6 @@ subword_features = true """ -@Language.factory( - "textcat_multilabel", - assigns=["doc.cats"], - default_config={ - "threshold": 0.5, - "model": DEFAULT_MULTI_TEXTCAT_MODEL, - "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"}, - }, - default_score_weights={ - "cats_score": 1.0, - "cats_score_desc": None, - "cats_micro_p": None, - "cats_micro_r": None, - "cats_micro_f": None, - "cats_macro_p": None, - "cats_macro_r": None, - "cats_macro_f": None, - "cats_macro_auc": None, - "cats_f_per_type": None, - }, -) def make_multilabel_textcat( nlp: Language, name: str, diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 677f5eec1..22c30d548 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -24,9 +24,6 @@ subword_features = true DEFAULT_TOK2VEC_MODEL = Config().from_str(default_model_config)["model"] -@Language.factory( - "tok2vec", assigns=["doc.tensor"], default_config={"model": DEFAULT_TOK2VEC_MODEL} -) def make_tok2vec(nlp: Language, name: str, model: Model) -> "Tok2Vec": return Tok2Vec(nlp.vocab, model, name) diff --git a/spacy/registrations.py b/spacy/registrations.py index cc9171425..4a5f18293 100644 --- a/spacy/registrations.py +++ b/spacy/registrations.py @@ -4,11 +4,14 @@ This module centralizes registry decorations to prevent circular import issues with Cython annotation changes from __future__ import annotations. Functions remain in their original locations, but decoration is moved here. """ -from typing import Dict, Any +from typing import Dict, Any, Callable, Iterable, List, Optional, Union # Global flag to track if registry has been populated REGISTRY_POPULATED = False +# Global flag to track if factories have been registered +FACTORIES_REGISTERED = False + def populate_registry() -> None: """Populate the registry with all necessary components. 
@@ -43,9 +46,6 @@ def populate_registry() -> None: # Need to get references to the existing functions in registry by importing the function that is there # For the registry that was previously decorated - # Import functions for use in registry - from .scorer import get_ner_prf # Used for entity_ruler_scorer - # Import ML components that use registry from .ml.models.tok2vec import tok2vec_listener_v1, build_hash_embed_cnn_tok2vec, build_Tok2Vec_model, MultiHashEmbed, CharacterEmbed, MaxoutWindowEncoder, MishWindowEncoder, BiLSTMEncoder @@ -74,5 +74,433 @@ def populate_registry() -> None: registry.architectures("spacy.MishWindowEncoder.v2")(MishWindowEncoder) registry.architectures("spacy.TorchBiLSTMEncoder.v1")(BiLSTMEncoder) + # Register factory components + register_factories() + # Set the flag to indicate that the registry has been populated - REGISTRY_POPULATED = True \ No newline at end of file + REGISTRY_POPULATED = True + + +def register_factories() -> None: + """Register all factories with the registry. + + This function registers all pipeline component factories, centralizing + the registrations that were previously done with @Language.factory decorators. 
+ """ + global FACTORIES_REGISTERED + if FACTORIES_REGISTERED: + return + + from .language import Language + + # Import factory default configurations + from .pipeline.entity_linker import DEFAULT_NEL_MODEL + from .pipeline.entityruler import DEFAULT_ENT_ID_SEP + from .pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL + from .pipeline.senter import DEFAULT_SENTER_MODEL + from .pipeline.morphologizer import DEFAULT_MORPH_MODEL + from .pipeline.spancat import DEFAULT_SPANCAT_MODEL, DEFAULT_SPANCAT_SINGLELABEL_MODEL, DEFAULT_SPANS_KEY + from .pipeline.span_ruler import DEFAULT_SPANS_KEY as SPAN_RULER_DEFAULT_SPANS_KEY + from .pipeline.edit_tree_lemmatizer import DEFAULT_EDIT_TREE_LEMMATIZER_MODEL + from .pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL + from .pipeline.span_finder import DEFAULT_SPAN_FINDER_MODEL + from .pipeline.ner import DEFAULT_NER_MODEL + from .pipeline.dep_parser import DEFAULT_PARSER_MODEL + from .pipeline.tagger import DEFAULT_TAGGER_MODEL + from .pipeline.multitask import DEFAULT_MT_MODEL + + # Import all factory functions + from .pipeline.attributeruler import make_attribute_ruler + from .pipeline.entity_linker import make_entity_linker + from .pipeline.entityruler import make_entity_ruler + from .pipeline.lemmatizer import make_lemmatizer + from .pipeline.textcat import make_textcat, DEFAULT_SINGLE_TEXTCAT_MODEL + from .pipeline.functions import make_token_splitter, make_doc_cleaner + from .pipeline.tok2vec import make_tok2vec + from .pipeline.senter import make_senter + from .pipeline.morphologizer import make_morphologizer + from .pipeline.spancat import make_spancat, make_spancat_singlelabel + from .pipeline.span_ruler import make_entity_ruler as make_span_entity_ruler, make_span_ruler + from .pipeline.edit_tree_lemmatizer import make_edit_tree_lemmatizer + from .pipeline.textcat_multilabel import make_multilabel_textcat + from .pipeline.span_finder import make_span_finder + from .pipeline.ner import make_ner, make_beam_ner + 
from .pipeline.dep_parser import make_parser, make_beam_parser + from .pipeline.tagger import make_tagger + from .pipeline.multitask import make_nn_labeller + from .pipeline.sentencizer import make_sentencizer + + # Register factories using the same pattern as Language.factory decorator + # We use Language.factory()() pattern which exactly mimics the decorator + + # attributeruler + Language.factory( + "attribute_ruler", + default_config={ + "validate": False, + "scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"}, + }, + )(make_attribute_ruler) + + # entity_linker + Language.factory( + "entity_linker", + requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"], + assigns=["token.ent_kb_id"], + default_config={ + "model": DEFAULT_NEL_MODEL, + "labels_discard": [], + "n_sents": 0, + "incl_prior": True, + "incl_context": True, + "entity_vector_length": 64, + "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, + "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"}, + "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"}, + "overwrite": True, + "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, + "use_gold_ents": True, + "candidates_batch_size": 1, + "threshold": None, + }, + default_score_weights={ + "nel_micro_f": 1.0, + "nel_micro_r": None, + "nel_micro_p": None, + }, + )(make_entity_linker) + + # entity_ruler + Language.factory( + "entity_ruler", + assigns=["doc.ents", "token.ent_type", "token.ent_iob"], + default_config={ + "phrase_matcher_attr": None, + "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"}, + "validate": False, + "overwrite_ents": False, + "ent_id_sep": DEFAULT_ENT_ID_SEP, + "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"}, + }, + default_score_weights={ + "ents_f": 1.0, + "ents_p": 0.0, + "ents_r": 0.0, + "ents_per_type": None, + }, + )(make_entity_ruler) + + # lemmatizer + Language.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={ + "model": None, + "mode": 
"lookup", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, + default_score_weights={"lemma_acc": 1.0}, + )(make_lemmatizer) + + # textcat + Language.factory( + "textcat", + assigns=["doc.cats"], + default_config={ + "threshold": 0.0, + "model": DEFAULT_SINGLE_TEXTCAT_MODEL, + "scorer": {"@scorers": "spacy.textcat_scorer.v2"}, + }, + default_score_weights={ + "cats_score": 1.0, + "cats_score_desc": None, + "cats_micro_p": None, + "cats_micro_r": None, + "cats_micro_f": None, + "cats_macro_p": None, + "cats_macro_r": None, + "cats_macro_f": None, + "cats_macro_auc": None, + "cats_f_per_type": None, + }, + )(make_textcat) + + # token_splitter + Language.factory( + "token_splitter", + default_config={"min_length": 25, "split_length": 10}, + retokenizes=True, + )(make_token_splitter) + + # doc_cleaner + Language.factory( + "doc_cleaner", + default_config={"attrs": {"tensor": None, "_.trf_data": None}, "silent": True}, + )(make_doc_cleaner) + + # tok2vec + Language.factory( + "tok2vec", + assigns=["doc.tensor"], + default_config={"model": DEFAULT_TOK2VEC_MODEL} + )(make_tok2vec) + + # senter + Language.factory( + "senter", + assigns=["token.is_sent_start"], + default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}}, + default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, + )(make_senter) + + # morphologizer + Language.factory( + "morphologizer", + assigns=["token.morph", "token.pos"], + default_config={ + "model": DEFAULT_MORPH_MODEL, + "overwrite": True, + "extend": False, + "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, + "label_smoothing": 0.0 + }, + default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None}, + )(make_morphologizer) + + # spancat + Language.factory( + "spancat", + assigns=["doc.spans"], + default_config={ + "threshold": 0.5, + "spans_key": DEFAULT_SPANS_KEY, + "max_positive": None, + "model": 
DEFAULT_SPANCAT_MODEL, + "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, + "scorer": {"@scorers": "spacy.spancat_scorer.v1"}, + }, + default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, + )(make_spancat) + + # spancat_singlelabel + Language.factory( + "spancat_singlelabel", + assigns=["doc.spans"], + default_config={ + "spans_key": DEFAULT_SPANS_KEY, + "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL, + "negative_weight": 1.0, + "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, + "scorer": {"@scorers": "spacy.spancat_scorer.v1"}, + "allow_overlap": True, + }, + default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, + )(make_spancat_singlelabel) + + # future_entity_ruler + Language.factory( + "future_entity_ruler", + assigns=["doc.ents"], + default_config={ + "phrase_matcher_attr": None, + "validate": False, + "overwrite_ents": False, + "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"}, + "ent_id_sep": "__unused__", + "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"}, + }, + default_score_weights={ + "ents_f": 1.0, + "ents_p": 0.0, + "ents_r": 0.0, + "ents_per_type": None, + }, + )(make_span_entity_ruler) + + # span_ruler + Language.factory( + "span_ruler", + assigns=["doc.spans"], + default_config={ + "spans_key": SPAN_RULER_DEFAULT_SPANS_KEY, + "spans_filter": None, + "annotate_ents": False, + "ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"}, + "phrase_matcher_attr": None, + "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"}, + "validate": False, + "overwrite": True, + "scorer": { + "@scorers": "spacy.overlapping_labeled_spans_scorer.v1", + "spans_key": SPAN_RULER_DEFAULT_SPANS_KEY, + }, + }, + default_score_weights={ + f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_f": 1.0, + f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_p": 0.0, + f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_r": 0.0, + f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_per_type": 
None, + }, + )(make_span_ruler) + + # trainable_lemmatizer + Language.factory( + "trainable_lemmatizer", + assigns=["token.lemma"], + requires=[], + default_config={ + "model": DEFAULT_EDIT_TREE_LEMMATIZER_MODEL, + "backoff": "orth", + "min_tree_freq": 3, + "overwrite": False, + "top_k": 1, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, + default_score_weights={"lemma_acc": 1.0}, + )(make_edit_tree_lemmatizer) + + # textcat_multilabel + Language.factory( + "textcat_multilabel", + assigns=["doc.cats"], + default_config={ + "threshold": 0.5, + "model": DEFAULT_MULTI_TEXTCAT_MODEL, + "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"}, + }, + default_score_weights={ + "cats_score": 1.0, + "cats_score_desc": None, + "cats_micro_p": None, + "cats_micro_r": None, + "cats_micro_f": None, + "cats_macro_p": None, + "cats_macro_r": None, + "cats_macro_f": None, + "cats_macro_auc": None, + "cats_f_per_type": None, + }, + )(make_multilabel_textcat) + + # span_finder + Language.factory( + "span_finder", + assigns=["doc.spans"], + default_config={ + "threshold": 0.5, + "model": DEFAULT_SPAN_FINDER_MODEL, + "spans_key": DEFAULT_SPANS_KEY, + "max_length": 25, + "min_length": None, + "scorer": {"@scorers": "spacy.span_finder_scorer.v1"}, + }, + default_score_weights={ + f"spans_{DEFAULT_SPANS_KEY}_f": 1.0, + f"spans_{DEFAULT_SPANS_KEY}_p": 0.0, + f"spans_{DEFAULT_SPANS_KEY}_r": 0.0, + }, + )(make_span_finder) + + # ner + Language.factory( + "ner", + assigns=["doc.ents", "token.ent_iob", "token.ent_type"], + default_config={ + "moves": None, + "update_with_oracle_cut_size": 100, + "model": DEFAULT_NER_MODEL, + "incorrect_spans_key": None, + "scorer": {"@scorers": "spacy.ner_scorer.v1"}, + }, + default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, + )(make_ner) + + # beam_ner + Language.factory( + "beam_ner", + assigns=["doc.ents", "token.ent_iob", "token.ent_type"], + default_config={ + "moves": None, + 
"update_with_oracle_cut_size": 100, + "model": DEFAULT_NER_MODEL, + "beam_density": 0.01, + "beam_update_prob": 0.5, + "beam_width": 32, + "incorrect_spans_key": None, + "scorer": None,  # no default scorer, unlike "ner" + }, + default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, + )(make_beam_ner) + + # parser + Language.factory( + "parser", + assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"], + default_config={ + "moves": None, + "update_with_oracle_cut_size": 100, + "learn_tokens": False, + "min_action_freq": 30, + "model": DEFAULT_PARSER_MODEL, + "scorer": {"@scorers": "spacy.parser_scorer.v1"}, + }, + default_score_weights={ + "dep_uas": 0.5, + "dep_las": 0.5, + "dep_las_per_type": None, + "sents_p": None, + "sents_r": None, + "sents_f": 0.0, + }, + )(make_parser) + + # beam_parser + Language.factory( + "beam_parser", + assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"], + default_config={ + "moves": None, + "update_with_oracle_cut_size": 100, + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 8, + "beam_density": 0.01, + "beam_update_prob": 0.5, + "model": DEFAULT_PARSER_MODEL, + "scorer": {"@scorers": "spacy.parser_scorer.v1"}, + }, + default_score_weights={ + "dep_uas": 0.5, + "dep_las": 0.5, + "dep_las_per_type": None, + "sents_p": None, + "sents_r": None, + "sents_f": 0.0, + }, + )(make_beam_parser) + + # tagger + Language.factory( + "tagger", + assigns=["token.tag"], + default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!", "label_smoothing": 0.0}, + default_score_weights={"tag_acc": 1.0}, + )(make_tagger) + + # nn_labeller + Language.factory( + "nn_labeller", + default_config={"labels": None, "target": "dep_tag_offset", "model": DEFAULT_MT_MODEL} + )(make_nn_labeller) + + # sentencizer + 
Language.factory( + "sentencizer", + assigns=["token.is_sent_start", "doc.sents"], + default_config={"punct_chars": None, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}}, + default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, + )(make_sentencizer) + + # Set the flag to indicate that all factories have been registered + FACTORIES_REGISTERED = True