mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-13 09:42:26 +03:00
Move factories to their own file
This commit is contained in:
parent
7dd064a089
commit
bd2d707773
|
@ -1,69 +1,14 @@
|
||||||
"""Centralized registry population for spaCy components.
|
"""Centralized registry population for spaCy config
|
||||||
|
|
||||||
This module centralizes registry decorations to prevent circular import issues
|
This module centralizes registry decorations to prevent circular import issues
|
||||||
with Cython annotation changes from __future__ import annotations. Functions
|
with Cython annotation changes from __future__ import annotations. Functions
|
||||||
remain in their original locations, but decoration is moved here.
|
remain in their original locations, but decoration is moved here.
|
||||||
|
|
||||||
|
Component definitions and registrations are in spacy/pipeline/factories.py
|
||||||
"""
|
"""
|
||||||
from typing import Dict, Any, Callable, Iterable, List, Optional, Union, Tuple
|
|
||||||
from thinc.api import Model
|
|
||||||
from thinc.types import Floats2d, Ragged
|
|
||||||
from .tokens.doc import Doc
|
|
||||||
from .tokens.span import Span
|
|
||||||
from .kb import KnowledgeBase, Candidate
|
|
||||||
from .vocab import Vocab
|
|
||||||
from .pipeline.textcat import TextCategorizer
|
|
||||||
from .pipeline.tok2vec import Tok2Vec
|
|
||||||
from .pipeline.spancat import SpanCategorizer, Suggester
|
|
||||||
from .pipeline.textcat_multilabel import MultiLabel_TextCategorizer
|
|
||||||
from .pipeline.entityruler import EntityRuler
|
|
||||||
from .pipeline.span_finder import SpanFinder
|
|
||||||
from .pipeline.ner import EntityRecognizer
|
|
||||||
from .pipeline._parser_internals.transition_system import TransitionSystem
|
|
||||||
from .pipeline.dep_parser import DependencyParser
|
|
||||||
from .pipeline.tagger import Tagger
|
|
||||||
from .pipeline.multitask import MultitaskObjective
|
|
||||||
from .pipeline.senter import SentenceRecognizer
|
|
||||||
from .language import Language
|
|
||||||
from .pipeline.sentencizer import Sentencizer
|
|
||||||
|
|
||||||
# Import factory default configurations
|
|
||||||
from .pipeline.entity_linker import DEFAULT_NEL_MODEL
|
|
||||||
from .pipeline.entityruler import DEFAULT_ENT_ID_SEP
|
|
||||||
from .pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
|
|
||||||
from .pipeline.senter import DEFAULT_SENTER_MODEL
|
|
||||||
from .pipeline.morphologizer import DEFAULT_MORPH_MODEL
|
|
||||||
from .pipeline.spancat import (
|
|
||||||
DEFAULT_SPANCAT_MODEL,
|
|
||||||
DEFAULT_SPANCAT_SINGLELABEL_MODEL,
|
|
||||||
DEFAULT_SPANS_KEY,
|
|
||||||
)
|
|
||||||
from .pipeline.span_ruler import DEFAULT_SPANS_KEY as SPAN_RULER_DEFAULT_SPANS_KEY
|
|
||||||
from .pipeline.edit_tree_lemmatizer import DEFAULT_EDIT_TREE_LEMMATIZER_MODEL
|
|
||||||
from .pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL
|
|
||||||
from .pipeline.span_finder import DEFAULT_SPAN_FINDER_MODEL
|
|
||||||
from .pipeline.ner import DEFAULT_NER_MODEL
|
|
||||||
from .pipeline.dep_parser import DEFAULT_PARSER_MODEL
|
|
||||||
from .pipeline.tagger import DEFAULT_TAGGER_MODEL
|
|
||||||
from .pipeline.multitask import DEFAULT_MT_MODEL
|
|
||||||
from .pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL
|
|
||||||
from .pipeline.entity_linker import EntityLinker, EntityLinker_v1
|
|
||||||
from .pipeline.attributeruler import AttributeRuler
|
|
||||||
from .pipeline.spancat import SpanCategorizer
|
|
||||||
from .pipeline.lemmatizer import Lemmatizer
|
|
||||||
from .pipeline.functions import TokenSplitter
|
|
||||||
from .pipeline.functions import DocCleaner
|
|
||||||
from .pipeline.span_ruler import SpanRuler, prioritize_new_ents_filter, prioritize_existing_ents_filter
|
|
||||||
from .pipeline.span_ruler import SpanRuler
|
|
||||||
from .pipeline.edit_tree_lemmatizer import EditTreeLemmatizer
|
|
||||||
from .pipeline.morphologizer import Morphologizer
|
|
||||||
|
|
||||||
|
|
||||||
# Global flag to track if registry has been populated
|
# Global flag to track if registry has been populated
|
||||||
REGISTRY_POPULATED = False
|
REGISTRY_POPULATED = False
|
||||||
|
|
||||||
# Global flag to track if factories have been registered
|
|
||||||
FACTORIES_REGISTERED = False
|
|
||||||
|
|
||||||
|
|
||||||
def populate_registry() -> None:
|
def populate_registry() -> None:
|
||||||
"""Populate the registry with all necessary components.
|
"""Populate the registry with all necessary components.
|
||||||
|
@ -149,872 +94,5 @@ def populate_registry() -> None:
|
||||||
registry.architectures("spacy.MishWindowEncoder.v2")(MishWindowEncoder)
|
registry.architectures("spacy.MishWindowEncoder.v2")(MishWindowEncoder)
|
||||||
registry.architectures("spacy.TorchBiLSTMEncoder.v1")(BiLSTMEncoder)
|
registry.architectures("spacy.TorchBiLSTMEncoder.v1")(BiLSTMEncoder)
|
||||||
|
|
||||||
# Register factory components
|
|
||||||
register_factories()
|
|
||||||
|
|
||||||
# Set the flag to indicate that the registry has been populated
|
# Set the flag to indicate that the registry has been populated
|
||||||
REGISTRY_POPULATED = True
|
REGISTRY_POPULATED = True
|
||||||
|
|
||||||
|
|
||||||
def register_factories() -> None:
    """Register the built-in pipeline component factories on ``Language``.

    Every component is registered by calling ``Language.factory(name, ...)``
    and applying the returned decorator to the matching ``make_*`` function,
    which mirrors what the ``@Language.factory`` decorator syntax does. The
    module-level ``FACTORIES_REGISTERED`` flag guards the body: once it has
    been set to ``True`` by a completed call, later calls return immediately.
    """
    global FACTORIES_REGISTERED

    # Nothing to do when a previous call already ran to completion.
    if FACTORIES_REGISTERED:
        return

    def _register(factory_name, factory_func, **meta):
        # Equivalent to decorating ``factory_func`` with
        # ``@Language.factory(factory_name, **meta)``.
        Language.factory(factory_name, **meta)(factory_func)

    _register(
        "attribute_ruler",
        make_attribute_ruler,
        default_config={
            "validate": False,
            "scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"},
        },
    )

    _register(
        "entity_linker",
        make_entity_linker,
        requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
        assigns=["token.ent_kb_id"],
        default_config={
            "model": DEFAULT_NEL_MODEL,
            "labels_discard": [],
            "n_sents": 0,
            "incl_prior": True,
            "incl_context": True,
            "entity_vector_length": 64,
            "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
            "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
            "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
            "overwrite": True,
            "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
            "use_gold_ents": True,
            "candidates_batch_size": 1,
            "threshold": None,
        },
        default_score_weights={
            "nel_micro_f": 1.0,
            "nel_micro_r": None,
            "nel_micro_p": None,
        },
    )

    _register(
        "entity_ruler",
        make_entity_ruler,
        assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
        default_config={
            "phrase_matcher_attr": None,
            "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
            "validate": False,
            "overwrite_ents": False,
            "ent_id_sep": DEFAULT_ENT_ID_SEP,
            "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
        },
        default_score_weights={
            "ents_f": 1.0,
            "ents_p": 0.0,
            "ents_r": 0.0,
            "ents_per_type": None,
        },
    )

    _register(
        "lemmatizer",
        make_lemmatizer,
        assigns=["token.lemma"],
        default_config={
            "model": None,
            "mode": "lookup",
            "overwrite": False,
            "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
        },
        default_score_weights={"lemma_acc": 1.0},
    )

    _register(
        "textcat",
        make_textcat,
        assigns=["doc.cats"],
        default_config={
            "threshold": 0.0,
            "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
            "scorer": {"@scorers": "spacy.textcat_scorer.v2"},
        },
        default_score_weights={
            "cats_score": 1.0,
            "cats_score_desc": None,
            "cats_micro_p": None,
            "cats_micro_r": None,
            "cats_micro_f": None,
            "cats_macro_p": None,
            "cats_macro_r": None,
            "cats_macro_f": None,
            "cats_macro_auc": None,
            "cats_f_per_type": None,
        },
    )

    _register(
        "token_splitter",
        make_token_splitter,
        default_config={"min_length": 25, "split_length": 10},
        retokenizes=True,
    )

    _register(
        "doc_cleaner",
        make_doc_cleaner,
        default_config={"attrs": {"tensor": None, "_.trf_data": None}, "silent": True},
    )

    _register(
        "tok2vec",
        make_tok2vec,
        assigns=["doc.tensor"],
        default_config={"model": DEFAULT_TOK2VEC_MODEL},
    )

    _register(
        "senter",
        make_senter,
        assigns=["token.is_sent_start"],
        default_config={
            "model": DEFAULT_SENTER_MODEL,
            "overwrite": False,
            "scorer": {"@scorers": "spacy.senter_scorer.v1"},
        },
        default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
    )

    _register(
        "morphologizer",
        make_morphologizer,
        assigns=["token.morph", "token.pos"],
        default_config={
            "model": DEFAULT_MORPH_MODEL,
            "overwrite": True,
            "extend": False,
            "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
            "label_smoothing": 0.0,
        },
        default_score_weights={
            "pos_acc": 0.5,
            "morph_acc": 0.5,
            "morph_per_feat": None,
        },
    )

    _register(
        "spancat",
        make_spancat,
        assigns=["doc.spans"],
        default_config={
            "threshold": 0.5,
            "spans_key": DEFAULT_SPANS_KEY,
            "max_positive": None,
            "model": DEFAULT_SPANCAT_MODEL,
            "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
            "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
        },
        default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
    )

    _register(
        "spancat_singlelabel",
        make_spancat_singlelabel,
        assigns=["doc.spans"],
        default_config={
            "spans_key": DEFAULT_SPANS_KEY,
            "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
            "negative_weight": 1.0,
            "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
            "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
            "allow_overlap": True,
        },
        default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
    )

    _register(
        "future_entity_ruler",
        make_future_entity_ruler,
        assigns=["doc.ents"],
        default_config={
            "phrase_matcher_attr": None,
            "validate": False,
            "overwrite_ents": False,
            "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
            "ent_id_sep": "__unused__",
            "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
        },
        default_score_weights={
            "ents_f": 1.0,
            "ents_p": 0.0,
            "ents_r": 0.0,
            "ents_per_type": None,
        },
    )

    _register(
        "span_ruler",
        make_span_ruler,
        assigns=["doc.spans"],
        default_config={
            "spans_key": SPAN_RULER_DEFAULT_SPANS_KEY,
            "spans_filter": None,
            "annotate_ents": False,
            "ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"},
            "phrase_matcher_attr": None,
            "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
            "validate": False,
            "overwrite": True,
            "scorer": {
                "@scorers": "spacy.overlapping_labeled_spans_scorer.v1",
                "spans_key": SPAN_RULER_DEFAULT_SPANS_KEY,
            },
        },
        default_score_weights={
            f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_f": 1.0,
            f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_p": 0.0,
            f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_r": 0.0,
            f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_per_type": None,
        },
    )

    _register(
        "trainable_lemmatizer",
        make_edit_tree_lemmatizer,
        assigns=["token.lemma"],
        requires=[],
        default_config={
            "model": DEFAULT_EDIT_TREE_LEMMATIZER_MODEL,
            "backoff": "orth",
            "min_tree_freq": 3,
            "overwrite": False,
            "top_k": 1,
            "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
        },
        default_score_weights={"lemma_acc": 1.0},
    )

    _register(
        "textcat_multilabel",
        make_multilabel_textcat,
        assigns=["doc.cats"],
        default_config={
            "threshold": 0.5,
            "model": DEFAULT_MULTI_TEXTCAT_MODEL,
            "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"},
        },
        default_score_weights={
            "cats_score": 1.0,
            "cats_score_desc": None,
            "cats_micro_p": None,
            "cats_micro_r": None,
            "cats_micro_f": None,
            "cats_macro_p": None,
            "cats_macro_r": None,
            "cats_macro_f": None,
            "cats_macro_auc": None,
            "cats_f_per_type": None,
        },
    )

    _register(
        "span_finder",
        make_span_finder,
        assigns=["doc.spans"],
        default_config={
            "threshold": 0.5,
            "model": DEFAULT_SPAN_FINDER_MODEL,
            "spans_key": DEFAULT_SPANS_KEY,
            "max_length": 25,
            "min_length": None,
            "scorer": {"@scorers": "spacy.span_finder_scorer.v1"},
        },
        default_score_weights={
            f"spans_{DEFAULT_SPANS_KEY}_f": 1.0,
            f"spans_{DEFAULT_SPANS_KEY}_p": 0.0,
            f"spans_{DEFAULT_SPANS_KEY}_r": 0.0,
        },
    )

    _register(
        "ner",
        make_ner,
        assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
        default_config={
            "moves": None,
            "update_with_oracle_cut_size": 100,
            "model": DEFAULT_NER_MODEL,
            "incorrect_spans_key": None,
            "scorer": {"@scorers": "spacy.ner_scorer.v1"},
        },
        default_score_weights={
            "ents_f": 1.0,
            "ents_p": 0.0,
            "ents_r": 0.0,
            "ents_per_type": None,
        },
    )

    _register(
        "beam_ner",
        make_beam_ner,
        assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
        default_config={
            "moves": None,
            "update_with_oracle_cut_size": 100,
            "model": DEFAULT_NER_MODEL,
            "beam_density": 0.01,
            "beam_update_prob": 0.5,
            "beam_width": 32,
            "incorrect_spans_key": None,
            "scorer": {"@scorers": "spacy.ner_scorer.v1"},
        },
        default_score_weights={
            "ents_f": 1.0,
            "ents_p": 0.0,
            "ents_r": 0.0,
            "ents_per_type": None,
        },
    )

    _register(
        "parser",
        make_parser,
        assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
        default_config={
            "moves": None,
            "update_with_oracle_cut_size": 100,
            "learn_tokens": False,
            "min_action_freq": 30,
            "model": DEFAULT_PARSER_MODEL,
            "scorer": {"@scorers": "spacy.parser_scorer.v1"},
        },
        default_score_weights={
            "dep_uas": 0.5,
            "dep_las": 0.5,
            "dep_las_per_type": None,
            "sents_p": None,
            "sents_r": None,
            "sents_f": 0.0,
        },
    )

    _register(
        "beam_parser",
        make_beam_parser,
        assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
        default_config={
            "moves": None,
            "update_with_oracle_cut_size": 100,
            "learn_tokens": False,
            "min_action_freq": 30,
            "beam_width": 8,
            "beam_density": 0.0001,
            "beam_update_prob": 0.5,
            "model": DEFAULT_PARSER_MODEL,
            "scorer": {"@scorers": "spacy.parser_scorer.v1"},
        },
        default_score_weights={
            "dep_uas": 0.5,
            "dep_las": 0.5,
            "dep_las_per_type": None,
            "sents_p": None,
            "sents_r": None,
            "sents_f": 0.0,
        },
    )

    _register(
        "tagger",
        make_tagger,
        assigns=["token.tag"],
        default_config={
            "model": DEFAULT_TAGGER_MODEL,
            "overwrite": False,
            "scorer": {"@scorers": "spacy.tagger_scorer.v1"},
            "neg_prefix": "!",
            "label_smoothing": 0.0,
        },
        default_score_weights={
            "tag_acc": 1.0,
            "pos_acc": 0.0,
            "tag_micro_p": None,
            "tag_micro_r": None,
            "tag_micro_f": None,
        },
    )

    _register(
        "nn_labeller",
        make_nn_labeller,
        default_config={
            "labels": None,
            "target": "dep_tag_offset",
            "model": DEFAULT_MT_MODEL,
        },
    )

    _register(
        "sentencizer",
        make_sentencizer,
        assigns=["token.is_sent_start", "doc.sents"],
        default_config={
            "punct_chars": None,
            "overwrite": False,
            "scorer": {"@scorers": "spacy.senter_scorer.v1"},
        },
        default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
    )

    # Only reached when every registration above completed without raising.
    FACTORIES_REGISTERED = True
|
|
||||||
|
|
||||||
|
|
||||||
# We can't have function implementations for these factories in Cython, because
|
|
||||||
# we need to build a Pydantic model for them dynamically, reading their argument
|
|
||||||
# structure from the signature. In Cython 3, this doesn't work because the
|
|
||||||
# from __future__ import annotations semantics are used, which means the types
|
|
||||||
# are stored as strings.
|
|
||||||
def make_sentencizer(
    nlp: Language,
    name: str,
    punct_chars: Optional[List[str]],
    overwrite: bool,
    scorer: Optional[Callable],
):
    """Factory function for the "sentencizer" component.

    Forwards ``name`` and the config values to ``Sentencizer``. ``nlp`` is
    part of the factory signature but is not used here.
    """
    settings = {
        "punct_chars": punct_chars,
        "overwrite": overwrite,
        "scorer": scorer,
    }
    return Sentencizer(name, **settings)
|
|
||||||
|
|
||||||
def make_attribute_ruler(
    nlp: Language, name: str, validate: bool, scorer: Optional[Callable]
):
    """Factory function for the "attribute_ruler" component.

    Builds an ``AttributeRuler`` on ``nlp.vocab`` with the given name,
    validation flag and scorer.
    """
    ruler = AttributeRuler(nlp.vocab, name, validate=validate, scorer=scorer)
    return ruler
|
|
||||||
|
|
||||||
def make_entity_linker(
    nlp: Language,
    name: str,
    model: Model,
    *,
    labels_discard: Iterable[str],
    n_sents: int,
    incl_prior: bool,
    incl_context: bool,
    entity_vector_length: int,
    get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
    get_candidates_batch: Callable[
        [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
    ],
    generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
    overwrite: bool,
    scorer: Optional[Callable],
    use_gold_ents: bool,
    candidates_batch_size: int,
    threshold: Optional[float] = None,
):
    """Factory function for the "entity_linker" component.

    Returns an ``EntityLinker`` when ``model.attrs`` has a truthy
    ``"include_span_maker"`` entry, and an ``EntityLinker_v1`` otherwise.
    The v1 class is constructed without ``get_candidates_batch``,
    ``generate_empty_kb``, ``use_gold_ents``, ``candidates_batch_size`` and
    ``threshold``.
    """
    # Keyword arguments passed to both implementations.
    shared = {
        "labels_discard": labels_discard,
        "n_sents": n_sents,
        "incl_prior": incl_prior,
        "incl_context": incl_context,
        "entity_vector_length": entity_vector_length,
        "get_candidates": get_candidates,
        "overwrite": overwrite,
        "scorer": scorer,
    }

    if model.attrs.get("include_span_maker", False):
        return EntityLinker(
            nlp.vocab,
            model,
            name,
            get_candidates_batch=get_candidates_batch,
            generate_empty_kb=generate_empty_kb,
            use_gold_ents=use_gold_ents,
            candidates_batch_size=candidates_batch_size,
            threshold=threshold,
            **shared,
        )

    # No span maker on the model: use the v1 implementation.
    return EntityLinker_v1(nlp.vocab, model, name, **shared)
|
|
||||||
|
|
||||||
def make_lemmatizer(
    nlp: Language,
    model: Optional[Model],
    name: str,
    mode: str,
    overwrite: bool,
    scorer: Optional[Callable],
):
    """Factory function for the "lemmatizer" component.

    Builds a ``Lemmatizer`` on ``nlp.vocab`` from the model, name, mode,
    overwrite flag and scorer.
    """
    options = {"mode": mode, "overwrite": overwrite, "scorer": scorer}
    return Lemmatizer(nlp.vocab, model, name, **options)
|
|
||||||
|
|
||||||
def make_textcat(
    nlp: Language,
    name: str,
    model: Model[List[Doc], List[Floats2d]],
    threshold: float,
    scorer: Optional[Callable],
) -> TextCategorizer:
    """Factory function for the "textcat" component.

    Builds a ``TextCategorizer`` on ``nlp.vocab`` with the given model, name,
    threshold and scorer.
    """
    categorizer = TextCategorizer(
        nlp.vocab,
        model,
        name,
        threshold=threshold,
        scorer=scorer,
    )
    return categorizer
|
|
||||||
|
|
||||||
def make_token_splitter(
    nlp: Language, name: str, *, min_length: int = 0, split_length: int = 0
):
    """Factory function for the "token_splitter" component.

    Builds a ``TokenSplitter`` from the two length settings. ``nlp`` and
    ``name`` are part of the factory signature but are not used here.
    """
    lengths = {"min_length": min_length, "split_length": split_length}
    return TokenSplitter(**lengths)
|
|
||||||
|
|
||||||
def make_doc_cleaner(nlp: Language, name: str, *, attrs: Dict[str, Any], silent: bool):
    """Factory function for the "doc_cleaner" component.

    Builds a ``DocCleaner`` from ``attrs`` and ``silent``. ``nlp`` and
    ``name`` are part of the factory signature but are not used here.
    """
    cleaner = DocCleaner(attrs, silent=silent)
    return cleaner
|
|
||||||
|
|
||||||
def make_tok2vec(nlp: Language, name: str, model: Model) -> Tok2Vec:
    """Factory function for the "tok2vec" component.

    Builds a ``Tok2Vec`` on ``nlp.vocab`` with the given model and name.
    """
    vocab = nlp.vocab
    return Tok2Vec(vocab, model, name)
|
|
||||||
|
|
||||||
def make_spancat(
    nlp: Language,
    name: str,
    suggester: Suggester,
    model: Model[Tuple[List[Doc], Ragged], Floats2d],
    spans_key: str,
    scorer: Optional[Callable],
    threshold: float,
    max_positive: Optional[int],
) -> SpanCategorizer:
    """Factory function for the "spancat" component.

    Builds a ``SpanCategorizer`` with the configurable values passed through
    and three settings fixed by this factory: ``negative_weight=None``,
    ``allow_overlap=True`` and ``add_negative_label=False``.
    """
    # Settings this factory does not expose in its config.
    fixed = {
        "negative_weight": None,
        "allow_overlap": True,
        "add_negative_label": False,
    }
    return SpanCategorizer(
        nlp.vocab,
        model=model,
        suggester=suggester,
        name=name,
        spans_key=spans_key,
        max_positive=max_positive,
        threshold=threshold,
        scorer=scorer,
        **fixed,
    )
|
|
||||||
|
|
||||||
def make_spancat_singlelabel(
    nlp: Language,
    name: str,
    suggester: Suggester,
    model: Model[Tuple[List[Doc], Ragged], Floats2d],
    spans_key: str,
    negative_weight: float,
    allow_overlap: bool,
    scorer: Optional[Callable],
) -> SpanCategorizer:
    """Factory function for the "spancat_singlelabel" component.

    Builds a ``SpanCategorizer`` with the configurable values passed through
    and three settings fixed by this factory: ``max_positive=1``,
    ``add_negative_label=True`` and ``threshold=None``.
    """
    # Settings this factory does not expose in its config.
    fixed = {
        "max_positive": 1,
        "add_negative_label": True,
        "threshold": None,
    }
    return SpanCategorizer(
        nlp.vocab,
        model=model,
        suggester=suggester,
        name=name,
        spans_key=spans_key,
        negative_weight=negative_weight,
        allow_overlap=allow_overlap,
        scorer=scorer,
        **fixed,
    )
|
|
||||||
|
|
||||||
def make_future_entity_ruler(
    nlp: Language,
    name: str,
    phrase_matcher_attr: Optional[Union[int, str]],
    matcher_fuzzy_compare: Callable,
    validate: bool,
    overwrite_ents: bool,
    scorer: Optional[Callable],
    ent_id_sep: str,
):
    """Factory function for the "future_entity_ruler" component.

    Builds a ``SpanRuler`` that annotates entities (``annotate_ents=True``)
    with ``spans_key=None``, ``spans_filter=None`` and ``overwrite=False``.
    ``overwrite_ents`` selects the entity filter: new entities are
    prioritized when it is true, existing ones otherwise. ``ent_id_sep`` is
    accepted but not forwarded to ``SpanRuler``.
    """
    chosen_filter = (
        prioritize_new_ents_filter
        if overwrite_ents
        else prioritize_existing_ents_filter
    )
    ruler = SpanRuler(
        nlp,
        name,
        spans_key=None,
        spans_filter=None,
        annotate_ents=True,
        ents_filter=chosen_filter,
        phrase_matcher_attr=phrase_matcher_attr,
        matcher_fuzzy_compare=matcher_fuzzy_compare,
        validate=validate,
        overwrite=False,
        scorer=scorer,
    )
    return ruler
|
|
||||||
|
|
||||||
def make_entity_ruler(
    nlp: Language,
    name: str,
    phrase_matcher_attr: Optional[Union[int, str]],
    matcher_fuzzy_compare: Callable,
    validate: bool,
    overwrite_ents: bool,
    ent_id_sep: str,
    scorer: Optional[Callable],
):
    """Factory function for the "entity_ruler" component.

    Builds an ``EntityRuler`` from ``nlp`` and ``name``, forwarding every
    config value as a keyword argument of the same name.
    """
    options = {
        "phrase_matcher_attr": phrase_matcher_attr,
        "matcher_fuzzy_compare": matcher_fuzzy_compare,
        "validate": validate,
        "overwrite_ents": overwrite_ents,
        "ent_id_sep": ent_id_sep,
        "scorer": scorer,
    }
    return EntityRuler(nlp, name, **options)
|
|
||||||
|
|
||||||
def make_span_ruler(
    nlp: Language,
    name: str,
    spans_key: Optional[str],
    spans_filter: Optional[Callable[[Iterable[Span], Iterable[Span]], Iterable[Span]]],
    annotate_ents: bool,
    ents_filter: Callable[[Iterable[Span], Iterable[Span]], Iterable[Span]],
    phrase_matcher_attr: Optional[Union[int, str]],
    matcher_fuzzy_compare: Callable,
    validate: bool,
    overwrite: bool,
    scorer: Optional[Callable],
):
    """Factory function for the "span_ruler" component.

    Builds a ``SpanRuler`` from ``nlp`` and ``name``, forwarding every config
    value as a keyword argument of the same name.
    """
    options = {
        "spans_key": spans_key,
        "spans_filter": spans_filter,
        "annotate_ents": annotate_ents,
        "ents_filter": ents_filter,
        "phrase_matcher_attr": phrase_matcher_attr,
        "matcher_fuzzy_compare": matcher_fuzzy_compare,
        "validate": validate,
        "overwrite": overwrite,
        "scorer": scorer,
    }
    return SpanRuler(nlp, name, **options)
|
|
||||||
|
|
||||||
def make_edit_tree_lemmatizer(
    nlp: Language,
    name: str,
    model: Model,
    backoff: Optional[str],
    min_tree_freq: int,
    overwrite: bool,
    top_k: int,
    scorer: Optional[Callable],
):
    """Factory function registered as "trainable_lemmatizer".

    Builds an ``EditTreeLemmatizer`` on ``nlp.vocab`` with the given model
    and name, forwarding the remaining config values as keyword arguments.
    """
    options = {
        "backoff": backoff,
        "min_tree_freq": min_tree_freq,
        "overwrite": overwrite,
        "top_k": top_k,
        "scorer": scorer,
    }
    return EditTreeLemmatizer(nlp.vocab, model, name, **options)
|
|
||||||
|
|
||||||
def make_multilabel_textcat(
    nlp: Language,
    name: str,
    model: Model[List[Doc], List[Floats2d]],
    threshold: float,
    scorer: Optional[Callable],
) -> MultiLabel_TextCategorizer:
    """Factory function for the "textcat_multilabel" component.

    Builds a ``MultiLabel_TextCategorizer`` on ``nlp.vocab`` with the given
    model, name, threshold and scorer.
    """
    categorizer = MultiLabel_TextCategorizer(
        nlp.vocab,
        model,
        name,
        threshold=threshold,
        scorer=scorer,
    )
    return categorizer
|
|
||||||
|
|
||||||
def make_span_finder(
    nlp: Language,
    name: str,
    model: Model[Iterable[Doc], Floats2d],
    spans_key: str,
    threshold: float,
    max_length: Optional[int],
    min_length: Optional[int],
    scorer: Optional[Callable],
) -> SpanFinder:
    """Factory function for the "span_finder" component.

    Builds a ``SpanFinder`` from ``nlp`` (passed as the first positional
    argument), forwarding every other value as a keyword argument.
    """
    options = {
        "model": model,
        "threshold": threshold,
        "name": name,
        "scorer": scorer,
        "max_length": max_length,
        "min_length": min_length,
        "spans_key": spans_key,
    }
    return SpanFinder(nlp, **options)
|
|
||||||
|
|
||||||
def make_ner(
    nlp: Language,
    name: str,
    model: Model,
    moves: Optional[TransitionSystem],
    update_with_oracle_cut_size: int,
    incorrect_spans_key: Optional[str],
    scorer: Optional[Callable],
):
    """Factory function for the "ner" component.

    Builds an ``EntityRecognizer`` on ``nlp.vocab`` with the given model. No
    beam settings are passed by this factory.
    """
    options = {
        "name": name,
        "moves": moves,
        "update_with_oracle_cut_size": update_with_oracle_cut_size,
        "incorrect_spans_key": incorrect_spans_key,
        "scorer": scorer,
    }
    return EntityRecognizer(nlp.vocab, model, **options)
|
|
||||||
|
|
||||||
def make_beam_ner(
    nlp: Language,
    name: str,
    model: Model,
    moves: Optional[TransitionSystem],
    update_with_oracle_cut_size: int,
    beam_width: int,
    beam_density: float,
    beam_update_prob: float,
    incorrect_spans_key: Optional[str],
    scorer: Optional[Callable],
):
    """Factory function for the "beam_ner" component.

    Builds an ``EntityRecognizer`` on ``nlp.vocab`` with the given model and
    additionally forwards ``beam_width``, ``beam_density`` and
    ``beam_update_prob``.
    """
    beam_options = {
        "beam_width": beam_width,
        "beam_density": beam_density,
        "beam_update_prob": beam_update_prob,
    }
    return EntityRecognizer(
        nlp.vocab,
        model,
        name=name,
        moves=moves,
        update_with_oracle_cut_size=update_with_oracle_cut_size,
        incorrect_spans_key=incorrect_spans_key,
        scorer=scorer,
        **beam_options,
    )
|
|
||||||
|
|
||||||
def make_parser(
    nlp: Language,
    name: str,
    model: Model,
    moves: Optional[TransitionSystem],
    update_with_oracle_cut_size: int,
    learn_tokens: bool,
    min_action_freq: int,
    scorer: Optional[Callable],
):
    """Factory function for the "parser" component.

    Builds a ``DependencyParser`` on ``nlp.vocab`` with the given model. No
    beam settings are passed by this factory.
    """
    options = {
        "name": name,
        "moves": moves,
        "update_with_oracle_cut_size": update_with_oracle_cut_size,
        "learn_tokens": learn_tokens,
        "min_action_freq": min_action_freq,
        "scorer": scorer,
    }
    return DependencyParser(nlp.vocab, model, **options)
|
|
||||||
|
|
||||||
def make_beam_parser(
    nlp: Language,
    name: str,
    model: Model,
    moves: Optional[TransitionSystem],
    update_with_oracle_cut_size: int,
    learn_tokens: bool,
    min_action_freq: int,
    beam_width: int,
    beam_density: float,
    beam_update_prob: float,
    scorer: Optional[Callable],
):
    """Factory function for the "beam_parser" component.

    Builds a ``DependencyParser`` on ``nlp.vocab`` with the given model and
    additionally forwards ``beam_width``, ``beam_density`` and
    ``beam_update_prob``.
    """
    beam_options = {
        "beam_width": beam_width,
        "beam_density": beam_density,
        "beam_update_prob": beam_update_prob,
    }
    return DependencyParser(
        nlp.vocab,
        model,
        name=name,
        moves=moves,
        update_with_oracle_cut_size=update_with_oracle_cut_size,
        learn_tokens=learn_tokens,
        min_action_freq=min_action_freq,
        scorer=scorer,
        **beam_options,
    )
|
|
||||||
|
|
||||||
def make_tagger(
    nlp: Language,
    name: str,
    model: Model,
    overwrite: bool,
    scorer: Optional[Callable],
    neg_prefix: str,
    label_smoothing: float,
):
    """Factory function for the "tagger" component.

    Builds a ``Tagger`` on ``nlp.vocab`` with the given model, forwarding the
    remaining values as keyword arguments of the same name.
    """
    options = {
        "name": name,
        "overwrite": overwrite,
        "scorer": scorer,
        "neg_prefix": neg_prefix,
        "label_smoothing": label_smoothing,
    }
    return Tagger(nlp.vocab, model, **options)
|
|
||||||
|
|
||||||
def make_nn_labeller(
    nlp: Language,
    name: str,
    model: Model,
    labels: Optional[dict],
    target: str
):
    """Factory function for the "nn_labeller" component.

    Builds a ``MultitaskObjective`` on ``nlp.vocab`` with the given model,
    name and target. ``labels`` is accepted but not forwarded to the
    constructor.
    """
    objective = MultitaskObjective(nlp.vocab, model, name, target=target)
    return objective
|
|
||||||
|
|
||||||
def make_morphologizer(
    nlp: Language,
    model: Model,
    name: str,
    overwrite: bool,
    extend: bool,
    label_smoothing: float,
    scorer: Optional[Callable],
):
    """Factory function for the "morphologizer" component.

    Builds a ``Morphologizer`` on ``nlp.vocab`` with the given model and
    name, forwarding the remaining config values as keyword arguments.
    """
    options = {
        "overwrite": overwrite,
        "extend": extend,
        "label_smoothing": label_smoothing,
        "scorer": scorer,
    }
    return Morphologizer(nlp.vocab, model, name, **options)
|
|
||||||
|
|
||||||
def make_senter(
    nlp: Language,
    name: str,
    model: Model,
    overwrite: bool,
    scorer: Optional[Callable]
):
    """Factory function for the "senter" component.

    Builds a ``SentenceRecognizer`` on ``nlp.vocab`` with the given model,
    name, overwrite flag and scorer.
    """
    recognizer = SentenceRecognizer(
        nlp.vocab,
        model,
        name,
        overwrite=overwrite,
        scorer=scorer,
    )
    return recognizer
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user