From bd2d707773e106e350e9683edd7df05d974d63b3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 21 May 2025 15:34:01 +0200 Subject: [PATCH] Move factories to their own file --- spacy/registrations.py | 928 +---------------------------------------- 1 file changed, 3 insertions(+), 925 deletions(-) diff --git a/spacy/registrations.py b/spacy/registrations.py index 5e33a4e27..5d8aea3c0 100644 --- a/spacy/registrations.py +++ b/spacy/registrations.py @@ -1,69 +1,14 @@ -"""Centralized registry population for spaCy components. +"""Centralized registry population for spaCy config This module centralizes registry decorations to prevent circular import issues with Cython annotation changes from __future__ import annotations. Functions remain in their original locations, but decoration is moved here. + +Component definitions and registrations are in spacy/pipeline/factories.py """ -from typing import Dict, Any, Callable, Iterable, List, Optional, Union, Tuple -from thinc.api import Model -from thinc.types import Floats2d, Ragged -from .tokens.doc import Doc -from .tokens.span import Span -from .kb import KnowledgeBase, Candidate -from .vocab import Vocab -from .pipeline.textcat import TextCategorizer -from .pipeline.tok2vec import Tok2Vec -from .pipeline.spancat import SpanCategorizer, Suggester -from .pipeline.textcat_multilabel import MultiLabel_TextCategorizer -from .pipeline.entityruler import EntityRuler -from .pipeline.span_finder import SpanFinder -from .pipeline.ner import EntityRecognizer -from .pipeline._parser_internals.transition_system import TransitionSystem -from .pipeline.dep_parser import DependencyParser -from .pipeline.tagger import Tagger -from .pipeline.multitask import MultitaskObjective -from .pipeline.senter import SentenceRecognizer -from .language import Language -from .pipeline.sentencizer import Sentencizer - -# Import factory default configurations -from .pipeline.entity_linker import DEFAULT_NEL_MODEL -from .pipeline.entityruler import DEFAULT_ENT_ID_SEP -from .pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL -from .pipeline.senter import DEFAULT_SENTER_MODEL -from .pipeline.morphologizer import DEFAULT_MORPH_MODEL -from .pipeline.spancat import ( - DEFAULT_SPANCAT_MODEL, - DEFAULT_SPANCAT_SINGLELABEL_MODEL, - DEFAULT_SPANS_KEY, -) -from .pipeline.span_ruler import DEFAULT_SPANS_KEY as SPAN_RULER_DEFAULT_SPANS_KEY -from .pipeline.edit_tree_lemmatizer import DEFAULT_EDIT_TREE_LEMMATIZER_MODEL -from .pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL -from .pipeline.span_finder import DEFAULT_SPAN_FINDER_MODEL -from .pipeline.ner import DEFAULT_NER_MODEL -from .pipeline.dep_parser import DEFAULT_PARSER_MODEL -from .pipeline.tagger import DEFAULT_TAGGER_MODEL -from .pipeline.multitask import DEFAULT_MT_MODEL -from .pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL -from .pipeline.entity_linker import EntityLinker, EntityLinker_v1 -from .pipeline.attributeruler import AttributeRuler -from .pipeline.spancat import SpanCategorizer -from .pipeline.lemmatizer import Lemmatizer -from .pipeline.functions import TokenSplitter -from .pipeline.functions import DocCleaner -from .pipeline.span_ruler import SpanRuler, prioritize_new_ents_filter, prioritize_existing_ents_filter -from .pipeline.span_ruler import SpanRuler -from .pipeline.edit_tree_lemmatizer import EditTreeLemmatizer -from .pipeline.morphologizer import Morphologizer - - # Global flag to track if registry has been populated REGISTRY_POPULATED = False -# Global flag to track if factories have been registered -FACTORIES_REGISTERED = False - def populate_registry() -> None: """Populate the registry with all necessary components. @@ -149,872 +94,5 @@ def populate_registry() -> None: registry.architectures("spacy.MishWindowEncoder.v2")(MishWindowEncoder) registry.architectures("spacy.TorchBiLSTMEncoder.v1")(BiLSTMEncoder) - # Register factory components - register_factories() - # Set the flag to indicate that the registry has been populated REGISTRY_POPULATED = True - - -def register_factories() -> None: - """Register all factories with the registry. - - This function registers all pipeline component factories, centralizing - the registrations that were previously done with @Language.factory decorators. - """ - global FACTORIES_REGISTERED - - if FACTORIES_REGISTERED: - return - - # Register factories using the same pattern as Language.factory decorator - # We use Language.factory()() pattern which exactly mimics the decorator - - # attributeruler - Language.factory( - "attribute_ruler", - default_config={ - "validate": False, - "scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"}, - }, - )(make_attribute_ruler) - - # entity_linker - Language.factory( - "entity_linker", - requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"], - assigns=["token.ent_kb_id"], - default_config={ - "model": DEFAULT_NEL_MODEL, - "labels_discard": [], - "n_sents": 0, - "incl_prior": True, - "incl_context": True, - "entity_vector_length": 64, - "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, - "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"}, - "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"}, - "overwrite": True, - "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, - "use_gold_ents": True, - "candidates_batch_size": 1, - "threshold": None, - }, - default_score_weights={ - "nel_micro_f": 1.0, - "nel_micro_r": None, - "nel_micro_p": None, - }, - )(make_entity_linker) - - # entity_ruler - Language.factory( - "entity_ruler", - assigns=["doc.ents", "token.ent_type", "token.ent_iob"], - default_config={ - "phrase_matcher_attr": None, - "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"}, - "validate": False, - "overwrite_ents": False, - "ent_id_sep": DEFAULT_ENT_ID_SEP, - "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"}, - }, - default_score_weights={ - "ents_f": 1.0, - "ents_p": 0.0, - "ents_r": 0.0, - "ents_per_type": None, - }, - )(make_entity_ruler) - - # lemmatizer - Language.factory( - "lemmatizer", - assigns=["token.lemma"], - default_config={ - "model": None, - "mode": "lookup", - "overwrite": False, - "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, - }, - default_score_weights={"lemma_acc": 1.0}, - )(make_lemmatizer) - - # textcat - Language.factory( - "textcat", - assigns=["doc.cats"], - default_config={ - "threshold": 0.0, - "model": DEFAULT_SINGLE_TEXTCAT_MODEL, - "scorer": {"@scorers": "spacy.textcat_scorer.v2"}, - }, - default_score_weights={ - "cats_score": 1.0, - "cats_score_desc": None, - "cats_micro_p": None, - "cats_micro_r": None, - "cats_micro_f": None, - "cats_macro_p": None, - "cats_macro_r": None, - "cats_macro_f": None, - "cats_macro_auc": None, - "cats_f_per_type": None, - }, - )(make_textcat) - - # token_splitter - Language.factory( - "token_splitter", - default_config={"min_length": 25, "split_length": 10}, - retokenizes=True, - )(make_token_splitter) - - # doc_cleaner - Language.factory( - "doc_cleaner", - default_config={"attrs": {"tensor": None, "_.trf_data": None}, "silent": True}, - )(make_doc_cleaner) - - # tok2vec - Language.factory( - "tok2vec", - assigns=["doc.tensor"], - default_config={"model": DEFAULT_TOK2VEC_MODEL}, - )(make_tok2vec) - - # senter - Language.factory( - "senter", - assigns=["token.is_sent_start"], - default_config={ - "model": DEFAULT_SENTER_MODEL, - "overwrite": False, - "scorer": {"@scorers": "spacy.senter_scorer.v1"}, - }, - default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, - )(make_senter) - - # morphologizer - Language.factory( - "morphologizer", - assigns=["token.morph", "token.pos"], - default_config={ - "model": DEFAULT_MORPH_MODEL, - "overwrite": True, - "extend": False, - "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, - "label_smoothing": 0.0, - }, - default_score_weights={ - "pos_acc": 0.5, - "morph_acc": 0.5, - "morph_per_feat": None, - }, - )(make_morphologizer) - - # spancat - Language.factory( - "spancat", - assigns=["doc.spans"], - default_config={ - "threshold": 0.5, - "spans_key": DEFAULT_SPANS_KEY, - "max_positive": None, - "model": DEFAULT_SPANCAT_MODEL, - "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, - "scorer": {"@scorers": "spacy.spancat_scorer.v1"}, - }, - default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, - )(make_spancat) - - # spancat_singlelabel - Language.factory( - "spancat_singlelabel", - assigns=["doc.spans"], - default_config={ - "spans_key": DEFAULT_SPANS_KEY, - "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL, - "negative_weight": 1.0, - "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, - "scorer": {"@scorers": "spacy.spancat_scorer.v1"}, - "allow_overlap": True, - }, - default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, - )(make_spancat_singlelabel) - - # future_entity_ruler - Language.factory( - "future_entity_ruler", - assigns=["doc.ents"], - default_config={ - "phrase_matcher_attr": None, - "validate": False, - "overwrite_ents": False, - "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"}, - "ent_id_sep": "__unused__", - "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"}, - }, - default_score_weights={ - "ents_f": 1.0, - "ents_p": 0.0, - "ents_r": 0.0, - "ents_per_type": None, - }, - )(make_future_entity_ruler) - - # span_ruler - Language.factory( - "span_ruler", - assigns=["doc.spans"], - default_config={ - "spans_key": SPAN_RULER_DEFAULT_SPANS_KEY, - "spans_filter": None, - "annotate_ents": False, - "ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"}, - "phrase_matcher_attr": None, - "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"}, - "validate": False, - "overwrite": True, - "scorer": { - "@scorers": "spacy.overlapping_labeled_spans_scorer.v1", - "spans_key": SPAN_RULER_DEFAULT_SPANS_KEY, - }, - }, - default_score_weights={ - f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_f": 1.0, - f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_p": 0.0, - f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_r": 0.0, - f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_per_type": None, - }, - )(make_span_ruler) - - # trainable_lemmatizer - Language.factory( - "trainable_lemmatizer", - assigns=["token.lemma"], - requires=[], - default_config={ - "model": DEFAULT_EDIT_TREE_LEMMATIZER_MODEL, - "backoff": "orth", - "min_tree_freq": 3, - "overwrite": False, - "top_k": 1, - "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, - }, - default_score_weights={"lemma_acc": 1.0}, - )(make_edit_tree_lemmatizer) - - # textcat_multilabel - Language.factory( - "textcat_multilabel", - assigns=["doc.cats"], - default_config={ - "threshold": 0.5, - "model": DEFAULT_MULTI_TEXTCAT_MODEL, - "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"}, - }, - default_score_weights={ - "cats_score": 1.0, - "cats_score_desc": None, - "cats_micro_p": None, - "cats_micro_r": None, - "cats_micro_f": None, - "cats_macro_p": None, - "cats_macro_r": None, - "cats_macro_f": None, - "cats_macro_auc": None, - "cats_f_per_type": None, - }, - )(make_multilabel_textcat) - - # span_finder - Language.factory( - "span_finder", - assigns=["doc.spans"], - default_config={ - "threshold": 0.5, - "model": DEFAULT_SPAN_FINDER_MODEL, - "spans_key": DEFAULT_SPANS_KEY, - "max_length": 25, - "min_length": None, - "scorer": {"@scorers": "spacy.span_finder_scorer.v1"}, - }, - default_score_weights={ - f"spans_{DEFAULT_SPANS_KEY}_f": 1.0, - f"spans_{DEFAULT_SPANS_KEY}_p": 0.0, - f"spans_{DEFAULT_SPANS_KEY}_r": 0.0, - }, - )(make_span_finder) - - # ner - Language.factory( - "ner", - assigns=["doc.ents", "token.ent_iob", "token.ent_type"], - default_config={ - "moves": None, - "update_with_oracle_cut_size": 100, - "model": DEFAULT_NER_MODEL, - "incorrect_spans_key": None, - "scorer": {"@scorers": "spacy.ner_scorer.v1"}, - }, - default_score_weights={ - "ents_f": 1.0, - "ents_p": 0.0, - "ents_r": 0.0, - "ents_per_type": None, - }, - )(make_ner) - - # beam_ner - Language.factory( - "beam_ner", - assigns=["doc.ents", "token.ent_iob", "token.ent_type"], - default_config={ - "moves": None, - "update_with_oracle_cut_size": 100, - "model": DEFAULT_NER_MODEL, - "beam_density": 0.01, - "beam_update_prob": 0.5, - "beam_width": 32, - "incorrect_spans_key": None, - "scorer": {"@scorers": "spacy.ner_scorer.v1"}, - }, - default_score_weights={ - "ents_f": 1.0, - "ents_p": 0.0, - "ents_r": 0.0, - "ents_per_type": None, - }, - )(make_beam_ner) - - # parser - Language.factory( - "parser", - assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"], - default_config={ - "moves": None, - "update_with_oracle_cut_size": 100, - "learn_tokens": False, - "min_action_freq": 30, - "model": DEFAULT_PARSER_MODEL, - "scorer": {"@scorers": "spacy.parser_scorer.v1"}, - }, - default_score_weights={ - "dep_uas": 0.5, - "dep_las": 0.5, - "dep_las_per_type": None, - "sents_p": None, - "sents_r": None, - "sents_f": 0.0, - }, - )(make_parser) - - # beam_parser - Language.factory( - "beam_parser", - assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"], - default_config={ - "moves": None, - "update_with_oracle_cut_size": 100, - "learn_tokens": False, - "min_action_freq": 30, - "beam_width": 8, - "beam_density": 0.0001, - "beam_update_prob": 0.5, - "model": DEFAULT_PARSER_MODEL, - "scorer": {"@scorers": "spacy.parser_scorer.v1"}, - }, - default_score_weights={ - "dep_uas": 0.5, - "dep_las": 0.5, - "dep_las_per_type": None, - "sents_p": None, - "sents_r": None, - "sents_f": 0.0, - }, - )(make_beam_parser) - - # tagger - Language.factory( - "tagger", - assigns=["token.tag"], - default_config={ - "model": DEFAULT_TAGGER_MODEL, - "overwrite": False, - "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, - "neg_prefix": "!", - "label_smoothing": 0.0, - }, - default_score_weights={ - "tag_acc": 1.0, - "pos_acc": 0.0, - "tag_micro_p": None, - "tag_micro_r": None, - "tag_micro_f": None, - }, - )(make_tagger) - - # nn_labeller - Language.factory( - "nn_labeller", - default_config={ - "labels": None, - "target": "dep_tag_offset", - "model": DEFAULT_MT_MODEL, - }, - )(make_nn_labeller) - - # sentencizer - Language.factory( - "sentencizer", - assigns=["token.is_sent_start", "doc.sents"], - default_config={ - "punct_chars": None, - "overwrite": False, - "scorer": {"@scorers": "spacy.senter_scorer.v1"}, - }, - default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, - )(make_sentencizer) - - # Set the flag to indicate that all factories have been registered - FACTORIES_REGISTERED = True - - -# We can't have function implementations for these factories in Cython, because -# we need to build a Pydantic model for them dynamically, reading their argument -# structure from the signature. In Cython 3, this doesn't work because the -# from __future__ import annotations semantics are used, which means the types -# are stored as strings. -def make_sentencizer( - nlp: Language, - name: str, - punct_chars: Optional[List[str]], - overwrite: bool, - scorer: Optional[Callable], -): - return Sentencizer( - name, punct_chars=punct_chars, overwrite=overwrite, scorer=scorer - ) - -def make_attribute_ruler( - nlp: Language, name: str, validate: bool, scorer: Optional[Callable] -): - return AttributeRuler(nlp.vocab, name, validate=validate, scorer=scorer) - -def make_entity_linker( - nlp: Language, - name: str, - model: Model, - *, - labels_discard: Iterable[str], - n_sents: int, - incl_prior: bool, - incl_context: bool, - entity_vector_length: int, - get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], - get_candidates_batch: Callable[ - [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] - ], - generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], - overwrite: bool, - scorer: Optional[Callable], - use_gold_ents: bool, - candidates_batch_size: int, - threshold: Optional[float] = None, -): - - if not model.attrs.get("include_span_maker", False): - # The only difference in arguments here is that use_gold_ents and threshold aren't available. - return EntityLinker_v1( - nlp.vocab, - model, - name, - labels_discard=labels_discard, - n_sents=n_sents, - incl_prior=incl_prior, - incl_context=incl_context, - entity_vector_length=entity_vector_length, - get_candidates=get_candidates, - overwrite=overwrite, - scorer=scorer, - ) - return EntityLinker( - nlp.vocab, - model, - name, - labels_discard=labels_discard, - n_sents=n_sents, - incl_prior=incl_prior, - incl_context=incl_context, - entity_vector_length=entity_vector_length, - get_candidates=get_candidates, - get_candidates_batch=get_candidates_batch, - generate_empty_kb=generate_empty_kb, - overwrite=overwrite, - scorer=scorer, - use_gold_ents=use_gold_ents, - candidates_batch_size=candidates_batch_size, - threshold=threshold, - ) - -def make_lemmatizer( - nlp: Language, - model: Optional[Model], - name: str, - mode: str, - overwrite: bool, - scorer: Optional[Callable], -): - return Lemmatizer( - nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer - ) - -def make_textcat( - nlp: Language, - name: str, - model: Model[List[Doc], List[Floats2d]], - threshold: float, - scorer: Optional[Callable], -) -> TextCategorizer: - return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer) - -def make_token_splitter( - nlp: Language, name: str, *, min_length: int = 0, split_length: int = 0 -): - return TokenSplitter(min_length=min_length, split_length=split_length) - -def make_doc_cleaner(nlp: Language, name: str, *, attrs: Dict[str, Any], silent: bool): - return DocCleaner(attrs, silent=silent) - -def make_tok2vec(nlp: Language, name: str, model: Model) -> Tok2Vec: - return Tok2Vec(nlp.vocab, model, name) - -def make_spancat( - nlp: Language, - name: str, - suggester: Suggester, - model: Model[Tuple[List[Doc], Ragged], Floats2d], - spans_key: str, - scorer: Optional[Callable], - threshold: float, - max_positive: Optional[int], -) -> SpanCategorizer: - return SpanCategorizer( - nlp.vocab, - model=model, - suggester=suggester, - name=name, - spans_key=spans_key, - negative_weight=None, - allow_overlap=True, - max_positive=max_positive, - threshold=threshold, - scorer=scorer, - add_negative_label=False, - ) - -def make_spancat_singlelabel( - nlp: Language, - name: str, - suggester: Suggester, - model: Model[Tuple[List[Doc], Ragged], Floats2d], - spans_key: str, - negative_weight: float, - allow_overlap: bool, - scorer: Optional[Callable], -) -> SpanCategorizer: - return SpanCategorizer( - nlp.vocab, - model=model, - suggester=suggester, - name=name, - spans_key=spans_key, - negative_weight=negative_weight, - allow_overlap=allow_overlap, - max_positive=1, - add_negative_label=True, - threshold=None, - scorer=scorer, - ) - -def make_future_entity_ruler( - nlp: Language, - name: str, - phrase_matcher_attr: Optional[Union[int, str]], - matcher_fuzzy_compare: Callable, - validate: bool, - overwrite_ents: bool, - scorer: Optional[Callable], - ent_id_sep: str, -): - if overwrite_ents: - ents_filter = prioritize_new_ents_filter - else: - ents_filter = prioritize_existing_ents_filter - return SpanRuler( - nlp, - name, - spans_key=None, - spans_filter=None, - annotate_ents=True, - ents_filter=ents_filter, - phrase_matcher_attr=phrase_matcher_attr, - matcher_fuzzy_compare=matcher_fuzzy_compare, - validate=validate, - overwrite=False, - scorer=scorer, - ) - -def make_entity_ruler( - nlp: Language, - name: str, - phrase_matcher_attr: Optional[Union[int, str]], - matcher_fuzzy_compare: Callable, - validate: bool, - overwrite_ents: bool, - ent_id_sep: str, - scorer: Optional[Callable], -): - return EntityRuler( - nlp, - name, - phrase_matcher_attr=phrase_matcher_attr, - matcher_fuzzy_compare=matcher_fuzzy_compare, - validate=validate, - overwrite_ents=overwrite_ents, - ent_id_sep=ent_id_sep, - scorer=scorer, - ) - -def make_span_ruler( - nlp: Language, - name: str, - spans_key: Optional[str], - spans_filter: Optional[Callable[[Iterable[Span], Iterable[Span]], Iterable[Span]]], - annotate_ents: bool, - ents_filter: Callable[[Iterable[Span], Iterable[Span]], Iterable[Span]], - phrase_matcher_attr: Optional[Union[int, str]], - matcher_fuzzy_compare: Callable, - validate: bool, - overwrite: bool, - scorer: Optional[Callable], -): - return SpanRuler( - nlp, - name, - spans_key=spans_key, - spans_filter=spans_filter, - annotate_ents=annotate_ents, - ents_filter=ents_filter, - phrase_matcher_attr=phrase_matcher_attr, - matcher_fuzzy_compare=matcher_fuzzy_compare, - validate=validate, - overwrite=overwrite, - scorer=scorer, - ) - -def make_edit_tree_lemmatizer( - nlp: Language, - name: str, - model: Model, - backoff: Optional[str], - min_tree_freq: int, - overwrite: bool, - top_k: int, - scorer: Optional[Callable], -): - return EditTreeLemmatizer( - nlp.vocab, - model, - name, - backoff=backoff, - min_tree_freq=min_tree_freq, - overwrite=overwrite, - top_k=top_k, - scorer=scorer, - ) - -def make_multilabel_textcat( - nlp: Language, - name: str, - model: Model[List[Doc], List[Floats2d]], - threshold: float, - scorer: Optional[Callable], -) -> MultiLabel_TextCategorizer: - return MultiLabel_TextCategorizer( - nlp.vocab, model, name, threshold=threshold, scorer=scorer - ) - -def make_span_finder( - nlp: Language, - name: str, - model: Model[Iterable[Doc], Floats2d], - spans_key: str, - threshold: float, - max_length: Optional[int], - min_length: Optional[int], - scorer: Optional[Callable], -) -> SpanFinder: - return SpanFinder( - nlp, - model=model, - threshold=threshold, - name=name, - scorer=scorer, - max_length=max_length, - min_length=min_length, - spans_key=spans_key, - ) - -def make_ner( - nlp: Language, - name: str, - model: Model, - moves: Optional[TransitionSystem], - update_with_oracle_cut_size: int, - incorrect_spans_key: Optional[str], - scorer: Optional[Callable], -): - return EntityRecognizer( - nlp.vocab, - model, - name=name, - moves=moves, - update_with_oracle_cut_size=update_with_oracle_cut_size, - incorrect_spans_key=incorrect_spans_key, - scorer=scorer, - ) - -def make_beam_ner( - nlp: Language, - name: str, - model: Model, - moves: Optional[TransitionSystem], - update_with_oracle_cut_size: int, - beam_width: int, - beam_density: float, - beam_update_prob: float, - incorrect_spans_key: Optional[str], - scorer: Optional[Callable], -): - return EntityRecognizer( - nlp.vocab, - model, - name=name, - moves=moves, - update_with_oracle_cut_size=update_with_oracle_cut_size, - beam_width=beam_width, - beam_density=beam_density, - beam_update_prob=beam_update_prob, - incorrect_spans_key=incorrect_spans_key, - scorer=scorer, - ) - -def make_parser( - nlp: Language, - name: str, - model: Model, - moves: Optional[TransitionSystem], - update_with_oracle_cut_size: int, - learn_tokens: bool, - min_action_freq: int, - scorer: Optional[Callable], -): - return DependencyParser( - nlp.vocab, - model, - name=name, - moves=moves, - update_with_oracle_cut_size=update_with_oracle_cut_size, - learn_tokens=learn_tokens, - min_action_freq=min_action_freq, - scorer=scorer, - ) - -def make_beam_parser( - nlp: Language, - name: str, - model: Model, - moves: Optional[TransitionSystem], - update_with_oracle_cut_size: int, - learn_tokens: bool, - min_action_freq: int, - beam_width: int, - beam_density: float, - beam_update_prob: float, - scorer: Optional[Callable], -): - return DependencyParser( - nlp.vocab, - model, - name=name, - moves=moves, - update_with_oracle_cut_size=update_with_oracle_cut_size, - learn_tokens=learn_tokens, - min_action_freq=min_action_freq, - beam_width=beam_width, - beam_density=beam_density, - beam_update_prob=beam_update_prob, - scorer=scorer, - ) - -def make_tagger( - nlp: Language, - name: str, - model: Model, - overwrite: bool, - scorer: Optional[Callable], - neg_prefix: str, - label_smoothing: float, -): - return Tagger( - nlp.vocab, - model, - name=name, - overwrite=overwrite, - scorer=scorer, - neg_prefix=neg_prefix, - label_smoothing=label_smoothing, - ) - -def make_nn_labeller( - nlp: Language, - name: str, - model: Model, - labels: Optional[dict], - target: str -): - return MultitaskObjective(nlp.vocab, model, name, target=target) - -def make_morphologizer( - nlp: Language, - model: Model, - name: str, - overwrite: bool, - extend: bool, - label_smoothing: float, - scorer: Optional[Callable], -): - return Morphologizer( - nlp.vocab, model, name, - overwrite=overwrite, - extend=extend, - label_smoothing=label_smoothing, - scorer=scorer - ) - -def make_senter( - nlp: Language, - name: str, - model: Model, - overwrite: bool, - scorer: Optional[Callable] -): - return SentenceRecognizer( - nlp.vocab, model, name, - overwrite=overwrite, - scorer=scorer - ) - -