In order to support Python 3.13, we had to migrate to Cython 3.0. This caused some tricky interactions with our Pydantic usage, because Cython 3 uses the from __future__ import annotations semantics, which causes type annotations to be stored as strings. The end result is that we can't have Language.factory-decorated functions in Cython modules anymore: the Language.factory decorator expects to inspect the signature of the function and build a Pydantic model from it, and if the function is implemented in Cython, an error is raised because the types can't be resolved.

To address this, I've moved the factory functions into a new module, spacy.pipeline.factories. I've added __getattr__ importlib hooks to the previous locations, in case anyone was importing these functions directly (see the sketch below). The change should have no backwards-compatibility implications.

Along the way I've also refactored the registration of functions for the config. Previously these registrations ran as import-time side effects, using the registry decorator. Instead, I've created a new module, spacy.registrations: when the registry is accessed, it calls a function ensure_populated(), which causes the registrations to occur (also sketched below). I've made a similar change to the Language.factory registrations in the new spacy.pipeline.factories module. I want to remove these import-time side effects so that we can speed up the loading time of the library, which can be especially painful on the CLI. I also find that I'm often working to track down the implementations of functions referenced by strings in the config; having the registrations all happen in one place will make this easier.

With these changes I've fortunately avoided the need to migrate to Pydantic v2 properly --- we're still using the v1 compatibility shim. We might not be able to hold out forever, though: Pydantic (reasonably) aren't actively supporting the v1 shims. I put a lot of work into v2 migration when investigating the 3.13 support, and it's definitely challenging. In any case, it's a relief that we don't have to do the v2 migration at the same time as the Cython 3.0/Python 3.13 support.
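For the backwards-compatibility hooks, the mechanism is the module-level __getattr__ from PEP 562. A minimal sketch of what such a hook could look like in one of the old module locations; the names here are illustrative, not the exact code shipped:

def __getattr__(name):
    # Hypothetical PEP 562 hook: lazily redirect old-style direct imports
    # (e.g. of make_tok2vec) to the new spacy.pipeline.factories module.
    if name == "make_tok2vec":
        from spacy.pipeline.factories import make_tok2vec

        return make_tok2vec
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")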
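The lazy registration pattern is the same in spacy.registrations and in spacy.pipeline.factories below: a module-level flag guards a populate function, so registration happens on first access rather than at import time. A rough sketch, with illustrative names rather than spaCy's exact internals:

REGISTRY_POPULATED = False

def ensure_populated() -> None:
    # Called whenever the registry is accessed. All registrations that used
    # to run as import-time side effects of the registry decorator happen
    # here instead, exactly once, in one place.
    global REGISTRY_POPULATED
    if REGISTRY_POPULATED:
        return
    REGISTRY_POPULATED = True
    # ... perform the actual registrations (scorers, misc functions, etc.) ...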
930 lines · 26 KiB · Python
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union

from thinc.api import Model
from thinc.types import Floats2d, Ragged

from ..kb import Candidate, KnowledgeBase
from ..language import Language
from ..pipeline._parser_internals.transition_system import TransitionSystem
from ..pipeline.attributeruler import AttributeRuler
from ..pipeline.dep_parser import DEFAULT_PARSER_MODEL, DependencyParser
from ..pipeline.edit_tree_lemmatizer import (
    DEFAULT_EDIT_TREE_LEMMATIZER_MODEL,
    EditTreeLemmatizer,
)

# Import factory default configurations
from ..pipeline.entity_linker import DEFAULT_NEL_MODEL, EntityLinker, EntityLinker_v1
from ..pipeline.entityruler import DEFAULT_ENT_ID_SEP, EntityRuler
from ..pipeline.functions import DocCleaner, TokenSplitter
from ..pipeline.lemmatizer import Lemmatizer
from ..pipeline.morphologizer import DEFAULT_MORPH_MODEL, Morphologizer
from ..pipeline.multitask import DEFAULT_MT_MODEL, MultitaskObjective
from ..pipeline.ner import DEFAULT_NER_MODEL, EntityRecognizer
from ..pipeline.sentencizer import Sentencizer
from ..pipeline.senter import DEFAULT_SENTER_MODEL, SentenceRecognizer
from ..pipeline.span_finder import DEFAULT_SPAN_FINDER_MODEL, SpanFinder
from ..pipeline.span_ruler import DEFAULT_SPANS_KEY as SPAN_RULER_DEFAULT_SPANS_KEY
from ..pipeline.span_ruler import (
    SpanRuler,
    prioritize_existing_ents_filter,
    prioritize_new_ents_filter,
)
from ..pipeline.spancat import (
    DEFAULT_SPANCAT_MODEL,
    DEFAULT_SPANCAT_SINGLELABEL_MODEL,
    DEFAULT_SPANS_KEY,
    SpanCategorizer,
    Suggester,
)
from ..pipeline.tagger import DEFAULT_TAGGER_MODEL, Tagger
from ..pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL, TextCategorizer
from ..pipeline.textcat_multilabel import (
    DEFAULT_MULTI_TEXTCAT_MODEL,
    MultiLabel_TextCategorizer,
)
from ..pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL, Tok2Vec
from ..tokens.doc import Doc
from ..tokens.span import Span
from ..vocab import Vocab

# Global flag to track if factories have been registered
FACTORIES_REGISTERED = False


def register_factories() -> None:
    """Register all factories with the registry.

    This function registers all pipeline component factories, centralizing
    the registrations that were previously done with @Language.factory decorators.
    """
    global FACTORIES_REGISTERED

    if FACTORIES_REGISTERED:
        return

    # Register factories using the same pattern as Language.factory decorator
    # We use Language.factory()() pattern which exactly mimics the decorator

    # attributeruler
    Language.factory(
        "attribute_ruler",
        default_config={
            "validate": False,
            "scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"},
        },
    )(make_attribute_ruler)

    # entity_linker
    Language.factory(
        "entity_linker",
        requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
        assigns=["token.ent_kb_id"],
        default_config={
            "model": DEFAULT_NEL_MODEL,
            "labels_discard": [],
            "n_sents": 0,
            "incl_prior": True,
            "incl_context": True,
            "entity_vector_length": 64,
            "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
            "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
            "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
            "overwrite": True,
            "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
            "use_gold_ents": True,
            "candidates_batch_size": 1,
            "threshold": None,
        },
        default_score_weights={
            "nel_micro_f": 1.0,
            "nel_micro_r": None,
            "nel_micro_p": None,
        },
    )(make_entity_linker)

    # entity_ruler
    Language.factory(
        "entity_ruler",
        assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
        default_config={
            "phrase_matcher_attr": None,
            "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
            "validate": False,
            "overwrite_ents": False,
            "ent_id_sep": DEFAULT_ENT_ID_SEP,
            "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
        },
        default_score_weights={
            "ents_f": 1.0,
            "ents_p": 0.0,
            "ents_r": 0.0,
            "ents_per_type": None,
        },
    )(make_entity_ruler)

    # lemmatizer
    Language.factory(
        "lemmatizer",
        assigns=["token.lemma"],
        default_config={
            "model": None,
            "mode": "lookup",
            "overwrite": False,
            "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
        },
        default_score_weights={"lemma_acc": 1.0},
    )(make_lemmatizer)

    # textcat
    Language.factory(
        "textcat",
        assigns=["doc.cats"],
        default_config={
            "threshold": 0.0,
            "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
            "scorer": {"@scorers": "spacy.textcat_scorer.v2"},
        },
        default_score_weights={
            "cats_score": 1.0,
            "cats_score_desc": None,
            "cats_micro_p": None,
            "cats_micro_r": None,
            "cats_micro_f": None,
            "cats_macro_p": None,
            "cats_macro_r": None,
            "cats_macro_f": None,
            "cats_macro_auc": None,
            "cats_f_per_type": None,
        },
    )(make_textcat)

    # token_splitter
    Language.factory(
        "token_splitter",
        default_config={"min_length": 25, "split_length": 10},
        retokenizes=True,
    )(make_token_splitter)

    # doc_cleaner
    Language.factory(
        "doc_cleaner",
        default_config={"attrs": {"tensor": None, "_.trf_data": None}, "silent": True},
    )(make_doc_cleaner)

    # tok2vec
    Language.factory(
        "tok2vec",
        assigns=["doc.tensor"],
        default_config={"model": DEFAULT_TOK2VEC_MODEL},
    )(make_tok2vec)

    # senter
    Language.factory(
        "senter",
        assigns=["token.is_sent_start"],
        default_config={
            "model": DEFAULT_SENTER_MODEL,
            "overwrite": False,
            "scorer": {"@scorers": "spacy.senter_scorer.v1"},
        },
        default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
    )(make_senter)

    # morphologizer
    Language.factory(
        "morphologizer",
        assigns=["token.morph", "token.pos"],
        default_config={
            "model": DEFAULT_MORPH_MODEL,
            "overwrite": True,
            "extend": False,
            "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
            "label_smoothing": 0.0,
        },
        default_score_weights={
            "pos_acc": 0.5,
            "morph_acc": 0.5,
            "morph_per_feat": None,
        },
    )(make_morphologizer)

    # spancat
    Language.factory(
        "spancat",
        assigns=["doc.spans"],
        default_config={
            "threshold": 0.5,
            "spans_key": DEFAULT_SPANS_KEY,
            "max_positive": None,
            "model": DEFAULT_SPANCAT_MODEL,
            "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
            "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
        },
        default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
    )(make_spancat)

    # spancat_singlelabel
    Language.factory(
        "spancat_singlelabel",
        assigns=["doc.spans"],
        default_config={
            "spans_key": DEFAULT_SPANS_KEY,
            "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
            "negative_weight": 1.0,
            "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
            "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
            "allow_overlap": True,
        },
        default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
    )(make_spancat_singlelabel)

    # future_entity_ruler
    Language.factory(
        "future_entity_ruler",
        assigns=["doc.ents"],
        default_config={
            "phrase_matcher_attr": None,
            "validate": False,
            "overwrite_ents": False,
            "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
            "ent_id_sep": "__unused__",
            "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
        },
        default_score_weights={
            "ents_f": 1.0,
            "ents_p": 0.0,
            "ents_r": 0.0,
            "ents_per_type": None,
        },
    )(make_future_entity_ruler)

    # span_ruler
    Language.factory(
        "span_ruler",
        assigns=["doc.spans"],
        default_config={
            "spans_key": SPAN_RULER_DEFAULT_SPANS_KEY,
            "spans_filter": None,
            "annotate_ents": False,
            "ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"},
            "phrase_matcher_attr": None,
            "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
            "validate": False,
            "overwrite": True,
            "scorer": {
                "@scorers": "spacy.overlapping_labeled_spans_scorer.v1",
                "spans_key": SPAN_RULER_DEFAULT_SPANS_KEY,
            },
        },
        default_score_weights={
            f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_f": 1.0,
            f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_p": 0.0,
            f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_r": 0.0,
            f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_per_type": None,
        },
    )(make_span_ruler)

    # trainable_lemmatizer
    Language.factory(
        "trainable_lemmatizer",
        assigns=["token.lemma"],
        requires=[],
        default_config={
            "model": DEFAULT_EDIT_TREE_LEMMATIZER_MODEL,
            "backoff": "orth",
            "min_tree_freq": 3,
            "overwrite": False,
            "top_k": 1,
            "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
        },
        default_score_weights={"lemma_acc": 1.0},
    )(make_edit_tree_lemmatizer)

    # textcat_multilabel
    Language.factory(
        "textcat_multilabel",
        assigns=["doc.cats"],
        default_config={
            "threshold": 0.5,
            "model": DEFAULT_MULTI_TEXTCAT_MODEL,
            "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"},
        },
        default_score_weights={
            "cats_score": 1.0,
            "cats_score_desc": None,
            "cats_micro_p": None,
            "cats_micro_r": None,
            "cats_micro_f": None,
            "cats_macro_p": None,
            "cats_macro_r": None,
            "cats_macro_f": None,
            "cats_macro_auc": None,
            "cats_f_per_type": None,
        },
    )(make_multilabel_textcat)

    # span_finder
    Language.factory(
        "span_finder",
        assigns=["doc.spans"],
        default_config={
            "threshold": 0.5,
            "model": DEFAULT_SPAN_FINDER_MODEL,
            "spans_key": DEFAULT_SPANS_KEY,
            "max_length": 25,
            "min_length": None,
            "scorer": {"@scorers": "spacy.span_finder_scorer.v1"},
        },
        default_score_weights={
            f"spans_{DEFAULT_SPANS_KEY}_f": 1.0,
            f"spans_{DEFAULT_SPANS_KEY}_p": 0.0,
            f"spans_{DEFAULT_SPANS_KEY}_r": 0.0,
        },
    )(make_span_finder)

    # ner
    Language.factory(
        "ner",
        assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
        default_config={
            "moves": None,
            "update_with_oracle_cut_size": 100,
            "model": DEFAULT_NER_MODEL,
            "incorrect_spans_key": None,
            "scorer": {"@scorers": "spacy.ner_scorer.v1"},
        },
        default_score_weights={
            "ents_f": 1.0,
            "ents_p": 0.0,
            "ents_r": 0.0,
            "ents_per_type": None,
        },
    )(make_ner)

    # beam_ner
    Language.factory(
        "beam_ner",
        assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
        default_config={
            "moves": None,
            "update_with_oracle_cut_size": 100,
            "model": DEFAULT_NER_MODEL,
            "beam_density": 0.01,
            "beam_update_prob": 0.5,
            "beam_width": 32,
            "incorrect_spans_key": None,
            "scorer": {"@scorers": "spacy.ner_scorer.v1"},
        },
        default_score_weights={
            "ents_f": 1.0,
            "ents_p": 0.0,
            "ents_r": 0.0,
            "ents_per_type": None,
        },
    )(make_beam_ner)

    # parser
    Language.factory(
        "parser",
        assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
        default_config={
            "moves": None,
            "update_with_oracle_cut_size": 100,
            "learn_tokens": False,
            "min_action_freq": 30,
            "model": DEFAULT_PARSER_MODEL,
            "scorer": {"@scorers": "spacy.parser_scorer.v1"},
        },
        default_score_weights={
            "dep_uas": 0.5,
            "dep_las": 0.5,
            "dep_las_per_type": None,
            "sents_p": None,
            "sents_r": None,
            "sents_f": 0.0,
        },
    )(make_parser)

    # beam_parser
    Language.factory(
        "beam_parser",
        assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
        default_config={
            "moves": None,
            "update_with_oracle_cut_size": 100,
            "learn_tokens": False,
            "min_action_freq": 30,
            "beam_width": 8,
            "beam_density": 0.0001,
            "beam_update_prob": 0.5,
            "model": DEFAULT_PARSER_MODEL,
            "scorer": {"@scorers": "spacy.parser_scorer.v1"},
        },
        default_score_weights={
            "dep_uas": 0.5,
            "dep_las": 0.5,
            "dep_las_per_type": None,
            "sents_p": None,
            "sents_r": None,
            "sents_f": 0.0,
        },
    )(make_beam_parser)

    # tagger
    Language.factory(
        "tagger",
        assigns=["token.tag"],
        default_config={
            "model": DEFAULT_TAGGER_MODEL,
            "overwrite": False,
            "scorer": {"@scorers": "spacy.tagger_scorer.v1"},
            "neg_prefix": "!",
            "label_smoothing": 0.0,
        },
        default_score_weights={
            "tag_acc": 1.0,
            "pos_acc": 0.0,
            "tag_micro_p": None,
            "tag_micro_r": None,
            "tag_micro_f": None,
        },
    )(make_tagger)

    # nn_labeller
    Language.factory(
        "nn_labeller",
        default_config={
            "labels": None,
            "target": "dep_tag_offset",
            "model": DEFAULT_MT_MODEL,
        },
    )(make_nn_labeller)

    # sentencizer
    Language.factory(
        "sentencizer",
        assigns=["token.is_sent_start", "doc.sents"],
        default_config={
            "punct_chars": None,
            "overwrite": False,
            "scorer": {"@scorers": "spacy.senter_scorer.v1"},
        },
        default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
    )(make_sentencizer)

    # Set the flag to indicate that all factories have been registered
    FACTORIES_REGISTERED = True


# We can't have function implementations for these factories in Cython, because
# we need to build a Pydantic model for them dynamically, reading their argument
# structure from the signature. In Cython 3, this doesn't work because the
# from __future__ import annotations semantics are used, which means the types
# are stored as strings.
def make_sentencizer(
    nlp: Language,
    name: str,
    punct_chars: Optional[List[str]],
    overwrite: bool,
    scorer: Optional[Callable],
):
    return Sentencizer(
        name, punct_chars=punct_chars, overwrite=overwrite, scorer=scorer
    )


def make_attribute_ruler(
    nlp: Language, name: str, validate: bool, scorer: Optional[Callable]
):
    return AttributeRuler(nlp.vocab, name, validate=validate, scorer=scorer)


def make_entity_linker(
    nlp: Language,
    name: str,
    model: Model,
    *,
    labels_discard: Iterable[str],
    n_sents: int,
    incl_prior: bool,
    incl_context: bool,
    entity_vector_length: int,
    get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
    get_candidates_batch: Callable[
        [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
    ],
    generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
    overwrite: bool,
    scorer: Optional[Callable],
    use_gold_ents: bool,
    candidates_batch_size: int,
    threshold: Optional[float] = None,
):

    if not model.attrs.get("include_span_maker", False):
        # The only difference in arguments here is that use_gold_ents and threshold aren't available.
        return EntityLinker_v1(
            nlp.vocab,
            model,
            name,
            labels_discard=labels_discard,
            n_sents=n_sents,
            incl_prior=incl_prior,
            incl_context=incl_context,
            entity_vector_length=entity_vector_length,
            get_candidates=get_candidates,
            overwrite=overwrite,
            scorer=scorer,
        )
    return EntityLinker(
        nlp.vocab,
        model,
        name,
        labels_discard=labels_discard,
        n_sents=n_sents,
        incl_prior=incl_prior,
        incl_context=incl_context,
        entity_vector_length=entity_vector_length,
        get_candidates=get_candidates,
        get_candidates_batch=get_candidates_batch,
        generate_empty_kb=generate_empty_kb,
        overwrite=overwrite,
        scorer=scorer,
        use_gold_ents=use_gold_ents,
        candidates_batch_size=candidates_batch_size,
        threshold=threshold,
    )


def make_lemmatizer(
    nlp: Language,
    model: Optional[Model],
    name: str,
    mode: str,
    overwrite: bool,
    scorer: Optional[Callable],
):
    return Lemmatizer(
        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
    )


def make_textcat(
    nlp: Language,
    name: str,
    model: Model[List[Doc], List[Floats2d]],
    threshold: float,
    scorer: Optional[Callable],
) -> TextCategorizer:
    return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer)


def make_token_splitter(
    nlp: Language, name: str, *, min_length: int = 0, split_length: int = 0
):
    return TokenSplitter(min_length=min_length, split_length=split_length)


def make_doc_cleaner(nlp: Language, name: str, *, attrs: Dict[str, Any], silent: bool):
    return DocCleaner(attrs, silent=silent)


def make_tok2vec(nlp: Language, name: str, model: Model) -> Tok2Vec:
    return Tok2Vec(nlp.vocab, model, name)


def make_spancat(
    nlp: Language,
    name: str,
    suggester: Suggester,
    model: Model[Tuple[List[Doc], Ragged], Floats2d],
    spans_key: str,
    scorer: Optional[Callable],
    threshold: float,
    max_positive: Optional[int],
) -> SpanCategorizer:
    return SpanCategorizer(
        nlp.vocab,
        model=model,
        suggester=suggester,
        name=name,
        spans_key=spans_key,
        negative_weight=None,
        allow_overlap=True,
        max_positive=max_positive,
        threshold=threshold,
        scorer=scorer,
        add_negative_label=False,
    )


def make_spancat_singlelabel(
    nlp: Language,
    name: str,
    suggester: Suggester,
    model: Model[Tuple[List[Doc], Ragged], Floats2d],
    spans_key: str,
    negative_weight: float,
    allow_overlap: bool,
    scorer: Optional[Callable],
) -> SpanCategorizer:
    return SpanCategorizer(
        nlp.vocab,
        model=model,
        suggester=suggester,
        name=name,
        spans_key=spans_key,
        negative_weight=negative_weight,
        allow_overlap=allow_overlap,
        max_positive=1,
        add_negative_label=True,
        threshold=None,
        scorer=scorer,
    )


def make_future_entity_ruler(
    nlp: Language,
    name: str,
    phrase_matcher_attr: Optional[Union[int, str]],
    matcher_fuzzy_compare: Callable,
    validate: bool,
    overwrite_ents: bool,
    scorer: Optional[Callable],
    ent_id_sep: str,
):
    if overwrite_ents:
        ents_filter = prioritize_new_ents_filter
    else:
        ents_filter = prioritize_existing_ents_filter
    return SpanRuler(
        nlp,
        name,
        spans_key=None,
        spans_filter=None,
        annotate_ents=True,
        ents_filter=ents_filter,
        phrase_matcher_attr=phrase_matcher_attr,
        matcher_fuzzy_compare=matcher_fuzzy_compare,
        validate=validate,
        overwrite=False,
        scorer=scorer,
    )


def make_entity_ruler(
    nlp: Language,
    name: str,
    phrase_matcher_attr: Optional[Union[int, str]],
    matcher_fuzzy_compare: Callable,
    validate: bool,
    overwrite_ents: bool,
    ent_id_sep: str,
    scorer: Optional[Callable],
):
    return EntityRuler(
        nlp,
        name,
        phrase_matcher_attr=phrase_matcher_attr,
        matcher_fuzzy_compare=matcher_fuzzy_compare,
        validate=validate,
        overwrite_ents=overwrite_ents,
        ent_id_sep=ent_id_sep,
        scorer=scorer,
    )


def make_span_ruler(
    nlp: Language,
    name: str,
    spans_key: Optional[str],
    spans_filter: Optional[Callable[[Iterable[Span], Iterable[Span]], Iterable[Span]]],
    annotate_ents: bool,
    ents_filter: Callable[[Iterable[Span], Iterable[Span]], Iterable[Span]],
    phrase_matcher_attr: Optional[Union[int, str]],
    matcher_fuzzy_compare: Callable,
    validate: bool,
    overwrite: bool,
    scorer: Optional[Callable],
):
    return SpanRuler(
        nlp,
        name,
        spans_key=spans_key,
        spans_filter=spans_filter,
        annotate_ents=annotate_ents,
        ents_filter=ents_filter,
        phrase_matcher_attr=phrase_matcher_attr,
        matcher_fuzzy_compare=matcher_fuzzy_compare,
        validate=validate,
        overwrite=overwrite,
        scorer=scorer,
    )


def make_edit_tree_lemmatizer(
    nlp: Language,
    name: str,
    model: Model,
    backoff: Optional[str],
    min_tree_freq: int,
    overwrite: bool,
    top_k: int,
    scorer: Optional[Callable],
):
    return EditTreeLemmatizer(
        nlp.vocab,
        model,
        name,
        backoff=backoff,
        min_tree_freq=min_tree_freq,
        overwrite=overwrite,
        top_k=top_k,
        scorer=scorer,
    )


def make_multilabel_textcat(
    nlp: Language,
    name: str,
    model: Model[List[Doc], List[Floats2d]],
    threshold: float,
    scorer: Optional[Callable],
) -> MultiLabel_TextCategorizer:
    return MultiLabel_TextCategorizer(
        nlp.vocab, model, name, threshold=threshold, scorer=scorer
    )


def make_span_finder(
    nlp: Language,
    name: str,
    model: Model[Iterable[Doc], Floats2d],
    spans_key: str,
    threshold: float,
    max_length: Optional[int],
    min_length: Optional[int],
    scorer: Optional[Callable],
) -> SpanFinder:
    return SpanFinder(
        nlp,
        model=model,
        threshold=threshold,
        name=name,
        scorer=scorer,
        max_length=max_length,
        min_length=min_length,
        spans_key=spans_key,
    )


def make_ner(
    nlp: Language,
    name: str,
    model: Model,
    moves: Optional[TransitionSystem],
    update_with_oracle_cut_size: int,
    incorrect_spans_key: Optional[str],
    scorer: Optional[Callable],
):
    return EntityRecognizer(
        nlp.vocab,
        model,
        name=name,
        moves=moves,
        update_with_oracle_cut_size=update_with_oracle_cut_size,
        incorrect_spans_key=incorrect_spans_key,
        scorer=scorer,
    )


def make_beam_ner(
    nlp: Language,
    name: str,
    model: Model,
    moves: Optional[TransitionSystem],
    update_with_oracle_cut_size: int,
    beam_width: int,
    beam_density: float,
    beam_update_prob: float,
    incorrect_spans_key: Optional[str],
    scorer: Optional[Callable],
):
    return EntityRecognizer(
        nlp.vocab,
        model,
        name=name,
        moves=moves,
        update_with_oracle_cut_size=update_with_oracle_cut_size,
        beam_width=beam_width,
        beam_density=beam_density,
        beam_update_prob=beam_update_prob,
        incorrect_spans_key=incorrect_spans_key,
        scorer=scorer,
    )


def make_parser(
    nlp: Language,
    name: str,
    model: Model,
    moves: Optional[TransitionSystem],
    update_with_oracle_cut_size: int,
    learn_tokens: bool,
    min_action_freq: int,
    scorer: Optional[Callable],
):
    return DependencyParser(
        nlp.vocab,
        model,
        name=name,
        moves=moves,
        update_with_oracle_cut_size=update_with_oracle_cut_size,
        learn_tokens=learn_tokens,
        min_action_freq=min_action_freq,
        scorer=scorer,
    )


def make_beam_parser(
    nlp: Language,
    name: str,
    model: Model,
    moves: Optional[TransitionSystem],
    update_with_oracle_cut_size: int,
    learn_tokens: bool,
    min_action_freq: int,
    beam_width: int,
    beam_density: float,
    beam_update_prob: float,
    scorer: Optional[Callable],
):
    return DependencyParser(
        nlp.vocab,
        model,
        name=name,
        moves=moves,
        update_with_oracle_cut_size=update_with_oracle_cut_size,
        learn_tokens=learn_tokens,
        min_action_freq=min_action_freq,
        beam_width=beam_width,
        beam_density=beam_density,
        beam_update_prob=beam_update_prob,
        scorer=scorer,
    )


def make_tagger(
    nlp: Language,
    name: str,
    model: Model,
    overwrite: bool,
    scorer: Optional[Callable],
    neg_prefix: str,
    label_smoothing: float,
):
    return Tagger(
        nlp.vocab,
        model,
        name=name,
        overwrite=overwrite,
        scorer=scorer,
        neg_prefix=neg_prefix,
        label_smoothing=label_smoothing,
    )


def make_nn_labeller(
    nlp: Language, name: str, model: Model, labels: Optional[dict], target: str
):
    return MultitaskObjective(nlp.vocab, model, name, target=target)


def make_morphologizer(
    nlp: Language,
    model: Model,
    name: str,
    overwrite: bool,
    extend: bool,
    label_smoothing: float,
    scorer: Optional[Callable],
):
    return Morphologizer(
        nlp.vocab,
        model,
        name,
        overwrite=overwrite,
        extend=extend,
        label_smoothing=label_smoothing,
        scorer=scorer,
    )


def make_senter(
    nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]
):
    return SentenceRecognizer(
        nlp.vocab, model, name, overwrite=overwrite, scorer=scorer
    )
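As a usage note: once register_factories() has run, the string names registered above resolve through the normal pipeline API. A small illustrative example, assuming a spaCy install where this module is importable; in practice spaCy triggers the registration itself, so calling it explicitly is only for demonstration:

import spacy
from spacy.pipeline.factories import register_factories

register_factories()  # idempotent, thanks to the FACTORIES_REGISTERED flag
nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")  # resolves the "tagger" factory registered above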