The functions can't be in Cython anymore, as we can't read the types off the signatures in Cython 3. To avoid having some in the file and some not, I've moved the Python ones as well. We'll need to do a re-import of these functions into the files that had them before to maintain backwards compatibility. This might require some import trickery to avoid circular imports.
"""Centralized registry population for spaCy components.
|
|
|
|
This module centralizes registry decorations to prevent circular import issues
|
|
with Cython annotation changes from __future__ import annotations. Functions
|
|
remain in their original locations, but decoration is moved here.
|
|
"""
|
|
from typing import Dict, Any, Callable, Iterable, List, Optional, Union, Tuple
|
|
from thinc.api import Model
|
|
from thinc.types import Floats2d, Ragged
|
|
from .tokens.doc import Doc
|
|
from .tokens.span import Span
|
|
from .kb import KnowledgeBase, Candidate
|
|
from .vocab import Vocab
|
|
from .pipeline.textcat import TextCategorizer
|
|
from .pipeline.tok2vec import Tok2Vec
|
|
from .pipeline.spancat import SpanCategorizer, Suggester
|
|
from .pipeline.textcat_multilabel import MultiLabel_TextCategorizer
|
|
from .pipeline.entityruler import EntityRuler
|
|
from .pipeline.span_finder import SpanFinder
|
|
from .pipeline.ner import EntityRecognizer
|
|
from .pipeline._parser_internals.transition_system import TransitionSystem
|
|
from .pipeline.ner import EntityRecognizer
|
|
from .pipeline.dep_parser import DependencyParser
|
|
from .pipeline.dep_parser import DependencyParser
|
|
from .pipeline.tagger import Tagger
|
|
from .pipeline.multitask import MultitaskObjective
|
|
from .pipeline.senter import SentenceRecognizer
|
|
|
|
# Global flag to track whether the registry has been populated
REGISTRY_POPULATED = False

# Global flag to track whether the factories have been registered
FACTORIES_REGISTERED = False


def populate_registry() -> None:
    """Populate the registry with all necessary components.

    This function should be called before accessing the registry, to ensure
    it's populated. The function uses a global flag to prevent repopulation.
    """
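    # Usage sketch (illustrative): callers are expected to run this once before
    # looking anything up in the registry, for example:
    #
    #     from spacy.util import registry
    #
    #     populate_registry()
    #     make_scorer = registry.scorers.get("spacy.tagger_scorer.v1")
    #     tagger_scorer = make_scorer()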
    global REGISTRY_POPULATED
    if REGISTRY_POPULATED:
        return

    # Import all necessary modules
    from .util import registry, make_first_longest_spans_filter

    # Import all pipeline components that were using registry decorators
    from .pipeline.tagger import make_tagger_scorer
    from .pipeline.ner import make_ner_scorer
    from .pipeline.lemmatizer import make_lemmatizer_scorer
    from .pipeline.span_finder import make_span_finder_scorer
    from .pipeline.spancat import (
        make_spancat_scorer,
        build_ngram_suggester,
        build_ngram_range_suggester,
        build_preset_spans_suggester,
    )
    from .pipeline.entityruler import (
        make_entity_ruler_scorer as make_entityruler_scorer,
    )
    from .pipeline.sentencizer import senter_score as make_sentencizer_scorer
    from .pipeline.senter import make_senter_scorer
    from .pipeline.textcat import make_textcat_scorer
    from .pipeline.textcat_multilabel import make_textcat_multilabel_scorer

    # Register miscellaneous components
    registry.misc("spacy.first_longest_spans_filter.v1")(
        make_first_longest_spans_filter
    )
    registry.misc("spacy.ngram_suggester.v1")(build_ngram_suggester)
    registry.misc("spacy.ngram_range_suggester.v1")(build_ngram_range_suggester)
    registry.misc("spacy.preset_spans_suggester.v1")(build_preset_spans_suggester)

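    # For reference, configs refer to these registered functions by name; a minimal
    # sketch of resolving one through the registry (illustrative):
    #
    #     resolved = registry.resolve(
    #         {"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}}
    #     )
    #     ngram_suggester = resolved["suggester"]
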
    # Functions that were previously registered via decorators are imported here
    # so that we can take references to them and register them explicitly.

    # Import ML components that use registry
    from .ml.models.tok2vec import (
        tok2vec_listener_v1,
        build_hash_embed_cnn_tok2vec,
        build_Tok2Vec_model,
        MultiHashEmbed,
        CharacterEmbed,
        MaxoutWindowEncoder,
        MishWindowEncoder,
        BiLSTMEncoder,
    )

    # Register scorers
    registry.scorers("spacy.tagger_scorer.v1")(make_tagger_scorer)
    registry.scorers("spacy.ner_scorer.v1")(make_ner_scorer)
    # span_ruler_scorer removed as it's not in span_ruler.py
    registry.scorers("spacy.entity_ruler_scorer.v1")(make_entityruler_scorer)
    registry.scorers("spacy.sentencizer_scorer.v1")(make_sentencizer_scorer)
    registry.scorers("spacy.senter_scorer.v1")(make_senter_scorer)
    registry.scorers("spacy.textcat_scorer.v1")(make_textcat_scorer)
    registry.scorers("spacy.textcat_scorer.v2")(make_textcat_scorer)
    registry.scorers("spacy.textcat_multilabel_scorer.v1")(
        make_textcat_multilabel_scorer
    )
    registry.scorers("spacy.textcat_multilabel_scorer.v2")(
        make_textcat_multilabel_scorer
    )
    registry.scorers("spacy.lemmatizer_scorer.v1")(make_lemmatizer_scorer)
    registry.scorers("spacy.span_finder_scorer.v1")(make_span_finder_scorer)
    registry.scorers("spacy.spancat_scorer.v1")(make_spancat_scorer)

    # Register tok2vec architectures we've modified
    registry.architectures("spacy.Tok2VecListener.v1")(tok2vec_listener_v1)
    registry.architectures("spacy.HashEmbedCNN.v2")(build_hash_embed_cnn_tok2vec)
    registry.architectures("spacy.Tok2Vec.v2")(build_Tok2Vec_model)
    registry.architectures("spacy.MultiHashEmbed.v2")(MultiHashEmbed)
    registry.architectures("spacy.CharacterEmbed.v2")(CharacterEmbed)
    registry.architectures("spacy.MaxoutWindowEncoder.v2")(MaxoutWindowEncoder)
    registry.architectures("spacy.MishWindowEncoder.v2")(MishWindowEncoder)
    registry.architectures("spacy.TorchBiLSTMEncoder.v1")(BiLSTMEncoder)

    # Register factory components
    register_factories()

    # Set the flag to indicate that the registry has been populated
    REGISTRY_POPULATED = True


def register_factories() -> None:
    """Register all factories with the registry.

    This function registers all pipeline component factories, centralizing
    the registrations that were previously done with @Language.factory decorators.
    """
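    # Once this has run, the factories below back the usual pipeline API, e.g.
    # (illustrative):
    #
    #     import spacy
    #
    #     nlp = spacy.blank("en")
    #     nlp.add_pipe("ner")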
    global FACTORIES_REGISTERED

    if FACTORIES_REGISTERED:
        return

    from .language import Language
    from .pipeline.sentencizer import Sentencizer

    # Import factory default configurations
    from .pipeline.entity_linker import DEFAULT_NEL_MODEL
    from .pipeline.entityruler import DEFAULT_ENT_ID_SEP
    from .pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
    from .pipeline.senter import DEFAULT_SENTER_MODEL
    from .pipeline.morphologizer import DEFAULT_MORPH_MODEL
    from .pipeline.spancat import (
        DEFAULT_SPANCAT_MODEL,
        DEFAULT_SPANCAT_SINGLELABEL_MODEL,
        DEFAULT_SPANS_KEY,
    )
    from .pipeline.span_ruler import DEFAULT_SPANS_KEY as SPAN_RULER_DEFAULT_SPANS_KEY
    from .pipeline.edit_tree_lemmatizer import DEFAULT_EDIT_TREE_LEMMATIZER_MODEL
    from .pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL
    from .pipeline.span_finder import DEFAULT_SPAN_FINDER_MODEL
    from .pipeline.ner import DEFAULT_NER_MODEL
    from .pipeline.dep_parser import DEFAULT_PARSER_MODEL
    from .pipeline.tagger import DEFAULT_TAGGER_MODEL
    from .pipeline.multitask import DEFAULT_MT_MODEL
    from .pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL

    # We can't have function implementations for these factories in Cython, because
    # we need to build a Pydantic model for them dynamically, reading their argument
    # structure from the signature. In Cython 3, this doesn't work because the
    # from __future__ import annotations semantics are used, which means the types
    # are stored as strings.
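    # For context (PEP 563 behavior, illustrative): with that future import in
    # effect, annotations are stored as strings rather than evaluated types:
    #
    #     def f(x: int) -> None: ...
    #     f.__annotations__   # {'x': 'int', 'return': 'None'}
    #
    # Keeping these wrappers in plain Python sidesteps that problem.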
    def make_sentencizer(
        nlp: Language,
        name: str,
        punct_chars: Optional[List[str]],
        overwrite: bool,
        scorer: Optional[Callable],
    ):
        return Sentencizer(
            name, punct_chars=punct_chars, overwrite=overwrite, scorer=scorer
        )

    def make_attribute_ruler(
        nlp: Language, name: str, validate: bool, scorer: Optional[Callable]
    ):
        from .pipeline.attributeruler import AttributeRuler

        return AttributeRuler(nlp.vocab, name, validate=validate, scorer=scorer)

    def make_entity_linker(
        nlp: Language,
        name: str,
        model: Model,
        *,
        labels_discard: Iterable[str],
        n_sents: int,
        incl_prior: bool,
        incl_context: bool,
        entity_vector_length: int,
        get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
        get_candidates_batch: Callable[
            [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
        ],
        generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
        overwrite: bool,
        scorer: Optional[Callable],
        use_gold_ents: bool,
        candidates_batch_size: int,
        threshold: Optional[float] = None,
    ):
        from .pipeline.entity_linker import EntityLinker, EntityLinker_v1

        if not model.attrs.get("include_span_maker", False):
            # The only difference in arguments here is that use_gold_ents and
            # threshold aren't available.
            return EntityLinker_v1(
                nlp.vocab,
                model,
                name,
                labels_discard=labels_discard,
                n_sents=n_sents,
                incl_prior=incl_prior,
                incl_context=incl_context,
                entity_vector_length=entity_vector_length,
                get_candidates=get_candidates,
                overwrite=overwrite,
                scorer=scorer,
            )
        return EntityLinker(
            nlp.vocab,
            model,
            name,
            labels_discard=labels_discard,
            n_sents=n_sents,
            incl_prior=incl_prior,
            incl_context=incl_context,
            entity_vector_length=entity_vector_length,
            get_candidates=get_candidates,
            get_candidates_batch=get_candidates_batch,
            generate_empty_kb=generate_empty_kb,
            overwrite=overwrite,
            scorer=scorer,
            use_gold_ents=use_gold_ents,
            candidates_batch_size=candidates_batch_size,
            threshold=threshold,
        )

    def make_lemmatizer(
        nlp: Language,
        model: Optional[Model],
        name: str,
        mode: str,
        overwrite: bool,
        scorer: Optional[Callable],
    ):
        from .pipeline.lemmatizer import Lemmatizer

        return Lemmatizer(
            nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
        )

    def make_textcat(
        nlp: Language,
        name: str,
        model: Model[List[Doc], List[Floats2d]],
        threshold: float,
        scorer: Optional[Callable],
    ) -> TextCategorizer:
        return TextCategorizer(
            nlp.vocab, model, name, threshold=threshold, scorer=scorer
        )

    def make_token_splitter(
        nlp: Language, name: str, *, min_length: int = 0, split_length: int = 0
    ):
        from .pipeline.functions import TokenSplitter

        return TokenSplitter(min_length=min_length, split_length=split_length)

    def make_doc_cleaner(
        nlp: Language, name: str, *, attrs: Dict[str, Any], silent: bool
    ):
        from .pipeline.functions import DocCleaner

        return DocCleaner(attrs, silent=silent)

    def make_tok2vec(nlp: Language, name: str, model: Model) -> Tok2Vec:
        return Tok2Vec(nlp.vocab, model, name)

    def make_spancat(
        nlp: Language,
        name: str,
        suggester: Suggester,
        model: Model[Tuple[List[Doc], Ragged], Floats2d],
        spans_key: str,
        scorer: Optional[Callable],
        threshold: float,
        max_positive: Optional[int],
    ) -> SpanCategorizer:
        return SpanCategorizer(
            nlp.vocab,
            model=model,
            suggester=suggester,
            name=name,
            spans_key=spans_key,
            negative_weight=None,
            allow_overlap=True,
            max_positive=max_positive,
            threshold=threshold,
            scorer=scorer,
            add_negative_label=False,
        )

    def make_spancat_singlelabel(
        nlp: Language,
        name: str,
        suggester: Suggester,
        model: Model[Tuple[List[Doc], Ragged], Floats2d],
        spans_key: str,
        negative_weight: float,
        allow_overlap: bool,
        scorer: Optional[Callable],
    ) -> "SpanCategorizer":
        from .pipeline.spancat import SpanCategorizer

        return SpanCategorizer(
            nlp.vocab,
            model=model,
            suggester=suggester,
            name=name,
            spans_key=spans_key,
            negative_weight=negative_weight,
            allow_overlap=allow_overlap,
            max_positive=1,
            add_negative_label=True,
            threshold=None,
            scorer=scorer,
        )

    def make_future_entity_ruler(
        nlp: Language,
        name: str,
        phrase_matcher_attr: Optional[Union[int, str]],
        matcher_fuzzy_compare: Callable,
        validate: bool,
        overwrite_ents: bool,
        scorer: Optional[Callable],
        ent_id_sep: str,
    ):
        from .pipeline.span_ruler import (
            SpanRuler,
            prioritize_existing_ents_filter,
            prioritize_new_ents_filter,
        )

        if overwrite_ents:
            ents_filter = prioritize_new_ents_filter
        else:
            ents_filter = prioritize_existing_ents_filter
        return SpanRuler(
            nlp,
            name,
            spans_key=None,
            spans_filter=None,
            annotate_ents=True,
            ents_filter=ents_filter,
            phrase_matcher_attr=phrase_matcher_attr,
            matcher_fuzzy_compare=matcher_fuzzy_compare,
            validate=validate,
            overwrite=False,
            scorer=scorer,
        )

    def make_entity_ruler(
        nlp: Language,
        name: str,
        phrase_matcher_attr: Optional[Union[int, str]],
        matcher_fuzzy_compare: Callable,
        validate: bool,
        overwrite_ents: bool,
        ent_id_sep: str,
        scorer: Optional[Callable],
    ):
        return EntityRuler(
            nlp,
            name,
            phrase_matcher_attr=phrase_matcher_attr,
            matcher_fuzzy_compare=matcher_fuzzy_compare,
            validate=validate,
            overwrite_ents=overwrite_ents,
            ent_id_sep=ent_id_sep,
            scorer=scorer,
        )

    def make_span_ruler(
        nlp: Language,
        name: str,
        spans_key: Optional[str],
        spans_filter: Optional[
            Callable[[Iterable[Span], Iterable[Span]], Iterable[Span]]
        ],
        annotate_ents: bool,
        ents_filter: Callable[[Iterable[Span], Iterable[Span]], Iterable[Span]],
        phrase_matcher_attr: Optional[Union[int, str]],
        matcher_fuzzy_compare: Callable,
        validate: bool,
        overwrite: bool,
        scorer: Optional[Callable],
    ):
        from .pipeline.span_ruler import SpanRuler

        return SpanRuler(
            nlp,
            name,
            spans_key=spans_key,
            spans_filter=spans_filter,
            annotate_ents=annotate_ents,
            ents_filter=ents_filter,
            phrase_matcher_attr=phrase_matcher_attr,
            matcher_fuzzy_compare=matcher_fuzzy_compare,
            validate=validate,
            overwrite=overwrite,
            scorer=scorer,
        )

    def make_edit_tree_lemmatizer(
        nlp: Language,
        name: str,
        model: Model,
        backoff: Optional[str],
        min_tree_freq: int,
        overwrite: bool,
        top_k: int,
        scorer: Optional[Callable],
    ):
        from .pipeline.edit_tree_lemmatizer import EditTreeLemmatizer

        return EditTreeLemmatizer(
            nlp.vocab,
            model,
            name,
            backoff=backoff,
            min_tree_freq=min_tree_freq,
            overwrite=overwrite,
            top_k=top_k,
            scorer=scorer,
        )

    def make_multilabel_textcat(
        nlp: Language,
        name: str,
        model: Model[List[Doc], List[Floats2d]],
        threshold: float,
        scorer: Optional[Callable],
    ) -> MultiLabel_TextCategorizer:
        return MultiLabel_TextCategorizer(
            nlp.vocab, model, name, threshold=threshold, scorer=scorer
        )

    def make_span_finder(
        nlp: Language,
        name: str,
        model: Model[Iterable[Doc], Floats2d],
        spans_key: str,
        threshold: float,
        max_length: Optional[int],
        min_length: Optional[int],
        scorer: Optional[Callable],
    ) -> SpanFinder:
        return SpanFinder(
            nlp,
            model=model,
            threshold=threshold,
            name=name,
            scorer=scorer,
            max_length=max_length,
            min_length=min_length,
            spans_key=spans_key,
        )

    def make_ner(
        nlp: Language,
        name: str,
        model: Model,
        moves: Optional[TransitionSystem],
        update_with_oracle_cut_size: int,
        incorrect_spans_key: Optional[str],
        scorer: Optional[Callable],
    ):
        return EntityRecognizer(
            nlp.vocab,
            model,
            name=name,
            moves=moves,
            update_with_oracle_cut_size=update_with_oracle_cut_size,
            incorrect_spans_key=incorrect_spans_key,
            scorer=scorer,
        )

    def make_beam_ner(
        nlp: Language,
        name: str,
        model: Model,
        moves: Optional[TransitionSystem],
        update_with_oracle_cut_size: int,
        beam_width: int,
        beam_density: float,
        beam_update_prob: float,
        incorrect_spans_key: Optional[str],
        scorer: Optional[Callable],
    ):
        return EntityRecognizer(
            nlp.vocab,
            model,
            name=name,
            moves=moves,
            update_with_oracle_cut_size=update_with_oracle_cut_size,
            beam_width=beam_width,
            beam_density=beam_density,
            beam_update_prob=beam_update_prob,
            incorrect_spans_key=incorrect_spans_key,
            scorer=scorer,
        )

    def make_parser(
        nlp: Language,
        name: str,
        model: Model,
        moves: Optional[TransitionSystem],
        update_with_oracle_cut_size: int,
        learn_tokens: bool,
        min_action_freq: int,
        scorer: Optional[Callable],
    ):
        return DependencyParser(
            nlp.vocab,
            model,
            name=name,
            moves=moves,
            update_with_oracle_cut_size=update_with_oracle_cut_size,
            learn_tokens=learn_tokens,
            min_action_freq=min_action_freq,
            scorer=scorer,
        )

    def make_beam_parser(
        nlp: Language,
        name: str,
        model: Model,
        moves: Optional[TransitionSystem],
        update_with_oracle_cut_size: int,
        learn_tokens: bool,
        min_action_freq: int,
        beam_width: int,
        beam_density: float,
        beam_update_prob: float,
        scorer: Optional[Callable],
    ):
        return DependencyParser(
            nlp.vocab,
            model,
            name=name,
            moves=moves,
            update_with_oracle_cut_size=update_with_oracle_cut_size,
            learn_tokens=learn_tokens,
            min_action_freq=min_action_freq,
            beam_width=beam_width,
            beam_density=beam_density,
            beam_update_prob=beam_update_prob,
            scorer=scorer,
        )

    def make_tagger(
        nlp: Language,
        name: str,
        model: Model,
        overwrite: bool,
        scorer: Optional[Callable],
        neg_prefix: str,
        label_smoothing: float,
    ):
        return Tagger(
            nlp.vocab,
            model,
            name=name,
            overwrite=overwrite,
            scorer=scorer,
            neg_prefix=neg_prefix,
            label_smoothing=label_smoothing,
        )

    def make_nn_labeller(
        nlp: Language,
        name: str,
        model: Model,
        labels: Optional[dict],
        target: str,
    ):
        return MultitaskObjective(nlp.vocab, model, name, target=target)

    def make_morphologizer(
        nlp: Language,
        model: Model,
        name: str,
        overwrite: bool,
        extend: bool,
        label_smoothing: float,
        scorer: Optional[Callable],
    ):
        from .pipeline.morphologizer import Morphologizer

        return Morphologizer(
            nlp.vocab,
            model,
            name,
            overwrite=overwrite,
            extend=extend,
            label_smoothing=label_smoothing,
            scorer=scorer,
        )

    def make_senter(
        nlp: Language,
        name: str,
        model: Model,
        overwrite: bool,
        scorer: Optional[Callable],
    ):
        return SentenceRecognizer(
            nlp.vocab,
            model,
            name,
            overwrite=overwrite,
            scorer=scorer,
        )

    # Register the factories using the same pattern as the Language.factory decorator.
    # The Language.factory(...)(...) call form exactly mimics the decorator.
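    #
    # For illustration, registering make_attribute_ruler this way is assumed to be
    # equivalent to decorating it at definition time:
    #
    #     @Language.factory("attribute_ruler", default_config={...})
    #     def make_attribute_ruler(nlp, name, validate, scorer):
    #         ...
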
    # attribute_ruler
    Language.factory(
        "attribute_ruler",
        default_config={
            "validate": False,
            "scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"},
        },
    )(make_attribute_ruler)

    # entity_linker
    Language.factory(
        "entity_linker",
        requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
        assigns=["token.ent_kb_id"],
        default_config={
            "model": DEFAULT_NEL_MODEL,
            "labels_discard": [],
            "n_sents": 0,
            "incl_prior": True,
            "incl_context": True,
            "entity_vector_length": 64,
            "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
            "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
            "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
            "overwrite": True,
            "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
            "use_gold_ents": True,
            "candidates_batch_size": 1,
            "threshold": None,
        },
        default_score_weights={
            "nel_micro_f": 1.0,
            "nel_micro_r": None,
            "nel_micro_p": None,
        },
    )(make_entity_linker)

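    # For illustration, these defaults can be overridden per pipe when the factory
    # is used, e.g.:
    #
    #     nlp.add_pipe("entity_linker", config={"incl_prior": False})
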
    # entity_ruler
    Language.factory(
        "entity_ruler",
        assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
        default_config={
            "phrase_matcher_attr": None,
            "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
            "validate": False,
            "overwrite_ents": False,
            "ent_id_sep": DEFAULT_ENT_ID_SEP,
            "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
        },
        default_score_weights={
            "ents_f": 1.0,
            "ents_p": 0.0,
            "ents_r": 0.0,
            "ents_per_type": None,
        },
    )(make_entity_ruler)

    # lemmatizer
    Language.factory(
        "lemmatizer",
        assigns=["token.lemma"],
        default_config={
            "model": None,
            "mode": "lookup",
            "overwrite": False,
            "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
        },
        default_score_weights={"lemma_acc": 1.0},
    )(make_lemmatizer)

    # textcat
    Language.factory(
        "textcat",
        assigns=["doc.cats"],
        default_config={
            "threshold": 0.0,
            "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
            "scorer": {"@scorers": "spacy.textcat_scorer.v2"},
        },
        default_score_weights={
            "cats_score": 1.0,
            "cats_score_desc": None,
            "cats_micro_p": None,
            "cats_micro_r": None,
            "cats_micro_f": None,
            "cats_macro_p": None,
            "cats_macro_r": None,
            "cats_macro_f": None,
            "cats_macro_auc": None,
            "cats_f_per_type": None,
        },
    )(make_textcat)

    # token_splitter
    Language.factory(
        "token_splitter",
        default_config={"min_length": 25, "split_length": 10},
        retokenizes=True,
    )(make_token_splitter)

    # doc_cleaner
    Language.factory(
        "doc_cleaner",
        default_config={"attrs": {"tensor": None, "_.trf_data": None}, "silent": True},
    )(make_doc_cleaner)

    # tok2vec
    Language.factory(
        "tok2vec",
        assigns=["doc.tensor"],
        default_config={"model": DEFAULT_TOK2VEC_MODEL},
    )(make_tok2vec)

    # senter
    Language.factory(
        "senter",
        assigns=["token.is_sent_start"],
        default_config={
            "model": DEFAULT_SENTER_MODEL,
            "overwrite": False,
            "scorer": {"@scorers": "spacy.senter_scorer.v1"},
        },
        default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
    )(make_senter)

    # morphologizer
    Language.factory(
        "morphologizer",
        assigns=["token.morph", "token.pos"],
        default_config={
            "model": DEFAULT_MORPH_MODEL,
            "overwrite": True,
            "extend": False,
            "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
            "label_smoothing": 0.0,
        },
        default_score_weights={
            "pos_acc": 0.5,
            "morph_acc": 0.5,
            "morph_per_feat": None,
        },
    )(make_morphologizer)

    # spancat
    Language.factory(
        "spancat",
        assigns=["doc.spans"],
        default_config={
            "threshold": 0.5,
            "spans_key": DEFAULT_SPANS_KEY,
            "max_positive": None,
            "model": DEFAULT_SPANCAT_MODEL,
            "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
            "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
        },
        default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
    )(make_spancat)

    # spancat_singlelabel
    Language.factory(
        "spancat_singlelabel",
        assigns=["doc.spans"],
        default_config={
            "spans_key": DEFAULT_SPANS_KEY,
            "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
            "negative_weight": 1.0,
            "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
            "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
            "allow_overlap": True,
        },
        default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
    )(make_spancat_singlelabel)

    # future_entity_ruler
    Language.factory(
        "future_entity_ruler",
        assigns=["doc.ents"],
        default_config={
            "phrase_matcher_attr": None,
            "validate": False,
            "overwrite_ents": False,
            "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
            "ent_id_sep": "__unused__",
            "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
        },
        default_score_weights={
            "ents_f": 1.0,
            "ents_p": 0.0,
            "ents_r": 0.0,
            "ents_per_type": None,
        },
    )(make_future_entity_ruler)

    # span_ruler
    Language.factory(
        "span_ruler",
        assigns=["doc.spans"],
        default_config={
            "spans_key": SPAN_RULER_DEFAULT_SPANS_KEY,
            "spans_filter": None,
            "annotate_ents": False,
            "ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"},
            "phrase_matcher_attr": None,
            "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
            "validate": False,
            "overwrite": True,
            "scorer": {
                "@scorers": "spacy.overlapping_labeled_spans_scorer.v1",
                "spans_key": SPAN_RULER_DEFAULT_SPANS_KEY,
            },
        },
        default_score_weights={
            f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_f": 1.0,
            f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_p": 0.0,
            f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_r": 0.0,
            f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_per_type": None,
        },
    )(make_span_ruler)

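    # Note (illustrative): the f-string keys above expand with span_ruler's default
    # spans key, e.g. "spans_ruler_f" if that key is "ruler", mirroring the literal
    # "spans_sc_*" keys used for spancat above.
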
    # trainable_lemmatizer
    Language.factory(
        "trainable_lemmatizer",
        assigns=["token.lemma"],
        requires=[],
        default_config={
            "model": DEFAULT_EDIT_TREE_LEMMATIZER_MODEL,
            "backoff": "orth",
            "min_tree_freq": 3,
            "overwrite": False,
            "top_k": 1,
            "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
        },
        default_score_weights={"lemma_acc": 1.0},
    )(make_edit_tree_lemmatizer)

    # textcat_multilabel
    Language.factory(
        "textcat_multilabel",
        assigns=["doc.cats"],
        default_config={
            "threshold": 0.5,
            "model": DEFAULT_MULTI_TEXTCAT_MODEL,
            "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"},
        },
        default_score_weights={
            "cats_score": 1.0,
            "cats_score_desc": None,
            "cats_micro_p": None,
            "cats_micro_r": None,
            "cats_micro_f": None,
            "cats_macro_p": None,
            "cats_macro_r": None,
            "cats_macro_f": None,
            "cats_macro_auc": None,
            "cats_f_per_type": None,
        },
    )(make_multilabel_textcat)

    # span_finder
    Language.factory(
        "span_finder",
        assigns=["doc.spans"],
        default_config={
            "threshold": 0.5,
            "model": DEFAULT_SPAN_FINDER_MODEL,
            "spans_key": DEFAULT_SPANS_KEY,
            "max_length": 25,
            "min_length": None,
            "scorer": {"@scorers": "spacy.span_finder_scorer.v1"},
        },
        default_score_weights={
            f"spans_{DEFAULT_SPANS_KEY}_f": 1.0,
            f"spans_{DEFAULT_SPANS_KEY}_p": 0.0,
            f"spans_{DEFAULT_SPANS_KEY}_r": 0.0,
        },
    )(make_span_finder)

    # ner
    Language.factory(
        "ner",
        assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
        default_config={
            "moves": None,
            "update_with_oracle_cut_size": 100,
            "model": DEFAULT_NER_MODEL,
            "incorrect_spans_key": None,
            "scorer": {"@scorers": "spacy.ner_scorer.v1"},
        },
        default_score_weights={
            "ents_f": 1.0,
            "ents_p": 0.0,
            "ents_r": 0.0,
            "ents_per_type": None,
        },
    )(make_ner)

    # beam_ner
    Language.factory(
        "beam_ner",
        assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
        default_config={
            "moves": None,
            "update_with_oracle_cut_size": 100,
            "model": DEFAULT_NER_MODEL,
            "beam_density": 0.01,
            "beam_update_prob": 0.5,
            "beam_width": 32,
            "incorrect_spans_key": None,
            "scorer": {"@scorers": "spacy.ner_scorer.v1"},
        },
        default_score_weights={
            "ents_f": 1.0,
            "ents_p": 0.0,
            "ents_r": 0.0,
            "ents_per_type": None,
        },
    )(make_beam_ner)

    # parser
    Language.factory(
        "parser",
        assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
        default_config={
            "moves": None,
            "update_with_oracle_cut_size": 100,
            "learn_tokens": False,
            "min_action_freq": 30,
            "model": DEFAULT_PARSER_MODEL,
            "scorer": {"@scorers": "spacy.parser_scorer.v1"},
        },
        default_score_weights={
            "dep_uas": 0.5,
            "dep_las": 0.5,
            "dep_las_per_type": None,
            "sents_p": None,
            "sents_r": None,
            "sents_f": 0.0,
        },
    )(make_parser)

    # beam_parser
    Language.factory(
        "beam_parser",
        assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
        default_config={
            "moves": None,
            "update_with_oracle_cut_size": 100,
            "learn_tokens": False,
            "min_action_freq": 30,
            "beam_width": 8,
            "beam_density": 0.0001,
            "beam_update_prob": 0.5,
            "model": DEFAULT_PARSER_MODEL,
            "scorer": {"@scorers": "spacy.parser_scorer.v1"},
        },
        default_score_weights={
            "dep_uas": 0.5,
            "dep_las": 0.5,
            "dep_las_per_type": None,
            "sents_p": None,
            "sents_r": None,
            "sents_f": 0.0,
        },
    )(make_beam_parser)

    # tagger
    Language.factory(
        "tagger",
        assigns=["token.tag"],
        default_config={
            "model": DEFAULT_TAGGER_MODEL,
            "overwrite": False,
            "scorer": {"@scorers": "spacy.tagger_scorer.v1"},
            "neg_prefix": "!",
            "label_smoothing": 0.0,
        },
        default_score_weights={
            "tag_acc": 1.0,
            "pos_acc": 0.0,
            "tag_micro_p": None,
            "tag_micro_r": None,
            "tag_micro_f": None,
        },
    )(make_tagger)

    # nn_labeller
    Language.factory(
        "nn_labeller",
        default_config={
            "labels": None,
            "target": "dep_tag_offset",
            "model": DEFAULT_MT_MODEL,
        },
    )(make_nn_labeller)

    # sentencizer
    Language.factory(
        "sentencizer",
        assigns=["token.is_sent_start", "doc.sents"],
        default_config={
            "punct_chars": None,
            "overwrite": False,
            "scorer": {"@scorers": "spacy.senter_scorer.v1"},
        },
        default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
    )(make_sentencizer)

    # Set the flag to indicate that all factories have been registered
    FACTORIES_REGISTERED = True