mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-13 01:32:32 +03:00
WIP
This commit is contained in:
parent
cda2bd01d4
commit
c3f9fab5e8
|
@ -12,43 +12,62 @@ REGISTRY_POPULATED = False
|
||||||
# Global flag to track if factories have been registered
|
# Global flag to track if factories have been registered
|
||||||
FACTORIES_REGISTERED = False
|
FACTORIES_REGISTERED = False
|
||||||
|
|
||||||
|
|
||||||
def populate_registry() -> None:
|
def populate_registry() -> None:
|
||||||
"""Populate the registry with all necessary components.
|
"""Populate the registry with all necessary components.
|
||||||
|
|
||||||
This function should be called before accessing the registry, to ensure
|
This function should be called before accessing the registry, to ensure
|
||||||
it's populated. The function uses a global flag to prevent repopulation.
|
it's populated. The function uses a global flag to prevent repopulation.
|
||||||
"""
|
"""
|
||||||
global REGISTRY_POPULATED
|
global REGISTRY_POPULATED
|
||||||
if REGISTRY_POPULATED:
|
if REGISTRY_POPULATED:
|
||||||
return
|
return
|
||||||
|
|
||||||
# Import all necessary modules
|
# Import all necessary modules
|
||||||
from .util import registry, make_first_longest_spans_filter
|
from .util import registry, make_first_longest_spans_filter
|
||||||
|
|
||||||
# Import all pipeline components that were using registry decorators
|
# Import all pipeline components that were using registry decorators
|
||||||
from .pipeline.tagger import make_tagger_scorer
|
from .pipeline.tagger import make_tagger_scorer
|
||||||
from .pipeline.ner import make_ner_scorer
|
from .pipeline.ner import make_ner_scorer
|
||||||
from .pipeline.lemmatizer import make_lemmatizer_scorer
|
from .pipeline.lemmatizer import make_lemmatizer_scorer
|
||||||
from .pipeline.span_finder import make_span_finder_scorer
|
from .pipeline.span_finder import make_span_finder_scorer
|
||||||
from .pipeline.spancat import make_spancat_scorer, build_ngram_suggester, build_ngram_range_suggester, build_preset_spans_suggester
|
from .pipeline.spancat import (
|
||||||
from .pipeline.entityruler import make_entity_ruler_scorer as make_entityruler_scorer
|
make_spancat_scorer,
|
||||||
|
build_ngram_suggester,
|
||||||
|
build_ngram_range_suggester,
|
||||||
|
build_preset_spans_suggester,
|
||||||
|
)
|
||||||
|
from .pipeline.entityruler import (
|
||||||
|
make_entity_ruler_scorer as make_entityruler_scorer,
|
||||||
|
)
|
||||||
from .pipeline.sentencizer import senter_score as make_sentencizer_scorer
|
from .pipeline.sentencizer import senter_score as make_sentencizer_scorer
|
||||||
from .pipeline.senter import make_senter_scorer
|
from .pipeline.senter import make_senter_scorer
|
||||||
from .pipeline.textcat import make_textcat_scorer
|
from .pipeline.textcat import make_textcat_scorer
|
||||||
from .pipeline.textcat_multilabel import make_textcat_multilabel_scorer
|
from .pipeline.textcat_multilabel import make_textcat_multilabel_scorer
|
||||||
|
|
||||||
# Register miscellaneous components
|
# Register miscellaneous components
|
||||||
registry.misc("spacy.first_longest_spans_filter.v1")(make_first_longest_spans_filter)
|
registry.misc("spacy.first_longest_spans_filter.v1")(
|
||||||
|
make_first_longest_spans_filter
|
||||||
|
)
|
||||||
registry.misc("spacy.ngram_suggester.v1")(build_ngram_suggester)
|
registry.misc("spacy.ngram_suggester.v1")(build_ngram_suggester)
|
||||||
registry.misc("spacy.ngram_range_suggester.v1")(build_ngram_range_suggester)
|
registry.misc("spacy.ngram_range_suggester.v1")(build_ngram_range_suggester)
|
||||||
registry.misc("spacy.preset_spans_suggester.v1")(build_preset_spans_suggester)
|
registry.misc("spacy.preset_spans_suggester.v1")(build_preset_spans_suggester)
|
||||||
|
|
||||||
# Need to get references to the existing functions in registry by importing the function that is there
|
# Need to get references to the existing functions in registry by importing the function that is there
|
||||||
# For the registry that was previously decorated
|
# For the registry that was previously decorated
|
||||||
|
|
||||||
# Import ML components that use registry
|
# Import ML components that use registry
|
||||||
from .ml.models.tok2vec import tok2vec_listener_v1, build_hash_embed_cnn_tok2vec, build_Tok2Vec_model, MultiHashEmbed, CharacterEmbed, MaxoutWindowEncoder, MishWindowEncoder, BiLSTMEncoder
|
from .ml.models.tok2vec import (
|
||||||
|
tok2vec_listener_v1,
|
||||||
|
build_hash_embed_cnn_tok2vec,
|
||||||
|
build_Tok2Vec_model,
|
||||||
|
MultiHashEmbed,
|
||||||
|
CharacterEmbed,
|
||||||
|
MaxoutWindowEncoder,
|
||||||
|
MishWindowEncoder,
|
||||||
|
BiLSTMEncoder,
|
||||||
|
)
|
||||||
|
|
||||||
# Register scorers
|
# Register scorers
|
||||||
registry.scorers("spacy.tagger_scorer.v1")(make_tagger_scorer)
|
registry.scorers("spacy.tagger_scorer.v1")(make_tagger_scorer)
|
||||||
registry.scorers("spacy.ner_scorer.v1")(make_ner_scorer)
|
registry.scorers("spacy.ner_scorer.v1")(make_ner_scorer)
|
||||||
|
@ -58,12 +77,16 @@ def populate_registry() -> None:
|
||||||
registry.scorers("spacy.senter_scorer.v1")(make_senter_scorer)
|
registry.scorers("spacy.senter_scorer.v1")(make_senter_scorer)
|
||||||
registry.scorers("spacy.textcat_scorer.v1")(make_textcat_scorer)
|
registry.scorers("spacy.textcat_scorer.v1")(make_textcat_scorer)
|
||||||
registry.scorers("spacy.textcat_scorer.v2")(make_textcat_scorer)
|
registry.scorers("spacy.textcat_scorer.v2")(make_textcat_scorer)
|
||||||
registry.scorers("spacy.textcat_multilabel_scorer.v1")(make_textcat_multilabel_scorer)
|
registry.scorers("spacy.textcat_multilabel_scorer.v1")(
|
||||||
registry.scorers("spacy.textcat_multilabel_scorer.v2")(make_textcat_multilabel_scorer)
|
make_textcat_multilabel_scorer
|
||||||
|
)
|
||||||
|
registry.scorers("spacy.textcat_multilabel_scorer.v2")(
|
||||||
|
make_textcat_multilabel_scorer
|
||||||
|
)
|
||||||
registry.scorers("spacy.lemmatizer_scorer.v1")(make_lemmatizer_scorer)
|
registry.scorers("spacy.lemmatizer_scorer.v1")(make_lemmatizer_scorer)
|
||||||
registry.scorers("spacy.span_finder_scorer.v1")(make_span_finder_scorer)
|
registry.scorers("spacy.span_finder_scorer.v1")(make_span_finder_scorer)
|
||||||
registry.scorers("spacy.spancat_scorer.v1")(make_spancat_scorer)
|
registry.scorers("spacy.spancat_scorer.v1")(make_spancat_scorer)
|
||||||
|
|
||||||
# Register tok2vec architectures we've modified
|
# Register tok2vec architectures we've modified
|
||||||
registry.architectures("spacy.Tok2VecListener.v1")(tok2vec_listener_v1)
|
registry.architectures("spacy.Tok2VecListener.v1")(tok2vec_listener_v1)
|
||||||
registry.architectures("spacy.HashEmbedCNN.v2")(build_hash_embed_cnn_tok2vec)
|
registry.architectures("spacy.HashEmbedCNN.v2")(build_hash_embed_cnn_tok2vec)
|
||||||
|
@ -73,33 +96,52 @@ def populate_registry() -> None:
|
||||||
registry.architectures("spacy.MaxoutWindowEncoder.v2")(MaxoutWindowEncoder)
|
registry.architectures("spacy.MaxoutWindowEncoder.v2")(MaxoutWindowEncoder)
|
||||||
registry.architectures("spacy.MishWindowEncoder.v2")(MishWindowEncoder)
|
registry.architectures("spacy.MishWindowEncoder.v2")(MishWindowEncoder)
|
||||||
registry.architectures("spacy.TorchBiLSTMEncoder.v1")(BiLSTMEncoder)
|
registry.architectures("spacy.TorchBiLSTMEncoder.v1")(BiLSTMEncoder)
|
||||||
|
|
||||||
# Register factory components
|
# Register factory components
|
||||||
register_factories()
|
register_factories()
|
||||||
|
|
||||||
# Set the flag to indicate that the registry has been populated
|
# Set the flag to indicate that the registry has been populated
|
||||||
REGISTRY_POPULATED = True
|
REGISTRY_POPULATED = True
|
||||||
|
|
||||||
|
|
||||||
def register_factories() -> None:
|
def register_factories() -> None:
|
||||||
"""Register all factories with the registry.
|
"""Register all factories with the registry.
|
||||||
|
|
||||||
This function registers all pipeline component factories, centralizing
|
This function registers all pipeline component factories, centralizing
|
||||||
the registrations that were previously done with @Language.factory decorators.
|
the registrations that were previously done with @Language.factory decorators.
|
||||||
"""
|
"""
|
||||||
global FACTORIES_REGISTERED
|
global FACTORIES_REGISTERED
|
||||||
|
|
||||||
|
from .language import Language
|
||||||
|
from .pipeline.sentencizer import Sentencizer
|
||||||
|
|
||||||
if FACTORIES_REGISTERED:
|
if FACTORIES_REGISTERED:
|
||||||
return
|
return
|
||||||
|
|
||||||
from .language import Language
|
# TODO: We seem to still get cycle problems with these functions defined in Cython. We need
|
||||||
|
# a Python _factories module maybe?
|
||||||
|
def make_sentencizer(
|
||||||
|
nlp: Language,
|
||||||
|
name: str,
|
||||||
|
punct_chars: Optional[List[str]],
|
||||||
|
overwrite: bool,
|
||||||
|
scorer: Optional[Callable],
|
||||||
|
):
|
||||||
|
return Sentencizer(
|
||||||
|
name, punct_chars=punct_chars, overwrite=overwrite, scorer=scorer
|
||||||
|
)
|
||||||
|
|
||||||
# Import factory default configurations
|
# Import factory default configurations
|
||||||
from .pipeline.entity_linker import DEFAULT_NEL_MODEL
|
from .pipeline.entity_linker import DEFAULT_NEL_MODEL
|
||||||
from .pipeline.entityruler import DEFAULT_ENT_ID_SEP
|
from .pipeline.entityruler import DEFAULT_ENT_ID_SEP
|
||||||
from .pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
|
from .pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
|
||||||
from .pipeline.senter import DEFAULT_SENTER_MODEL
|
from .pipeline.senter import DEFAULT_SENTER_MODEL
|
||||||
from .pipeline.morphologizer import DEFAULT_MORPH_MODEL
|
from .pipeline.morphologizer import DEFAULT_MORPH_MODEL
|
||||||
from .pipeline.spancat import DEFAULT_SPANCAT_MODEL, DEFAULT_SPANCAT_SINGLELABEL_MODEL, DEFAULT_SPANS_KEY
|
from .pipeline.spancat import (
|
||||||
|
DEFAULT_SPANCAT_MODEL,
|
||||||
|
DEFAULT_SPANCAT_SINGLELABEL_MODEL,
|
||||||
|
DEFAULT_SPANS_KEY,
|
||||||
|
)
|
||||||
from .pipeline.span_ruler import DEFAULT_SPANS_KEY as SPAN_RULER_DEFAULT_SPANS_KEY
|
from .pipeline.span_ruler import DEFAULT_SPANS_KEY as SPAN_RULER_DEFAULT_SPANS_KEY
|
||||||
from .pipeline.edit_tree_lemmatizer import DEFAULT_EDIT_TREE_LEMMATIZER_MODEL
|
from .pipeline.edit_tree_lemmatizer import DEFAULT_EDIT_TREE_LEMMATIZER_MODEL
|
||||||
from .pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL
|
from .pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL
|
||||||
|
@ -108,7 +150,7 @@ def register_factories() -> None:
|
||||||
from .pipeline.dep_parser import DEFAULT_PARSER_MODEL
|
from .pipeline.dep_parser import DEFAULT_PARSER_MODEL
|
||||||
from .pipeline.tagger import DEFAULT_TAGGER_MODEL
|
from .pipeline.tagger import DEFAULT_TAGGER_MODEL
|
||||||
from .pipeline.multitask import DEFAULT_MT_MODEL
|
from .pipeline.multitask import DEFAULT_MT_MODEL
|
||||||
|
|
||||||
# Import all factory functions
|
# Import all factory functions
|
||||||
from .pipeline.attributeruler import make_attribute_ruler
|
from .pipeline.attributeruler import make_attribute_ruler
|
||||||
from .pipeline.entity_linker import make_entity_linker
|
from .pipeline.entity_linker import make_entity_linker
|
||||||
|
@ -120,7 +162,10 @@ def register_factories() -> None:
|
||||||
from .pipeline.senter import make_senter
|
from .pipeline.senter import make_senter
|
||||||
from .pipeline.morphologizer import make_morphologizer
|
from .pipeline.morphologizer import make_morphologizer
|
||||||
from .pipeline.spancat import make_spancat, make_spancat_singlelabel
|
from .pipeline.spancat import make_spancat, make_spancat_singlelabel
|
||||||
from .pipeline.span_ruler import make_entity_ruler as make_span_entity_ruler, make_span_ruler
|
from .pipeline.span_ruler import (
|
||||||
|
make_entity_ruler as make_span_entity_ruler,
|
||||||
|
make_span_ruler,
|
||||||
|
)
|
||||||
from .pipeline.edit_tree_lemmatizer import make_edit_tree_lemmatizer
|
from .pipeline.edit_tree_lemmatizer import make_edit_tree_lemmatizer
|
||||||
from .pipeline.textcat_multilabel import make_multilabel_textcat
|
from .pipeline.textcat_multilabel import make_multilabel_textcat
|
||||||
from .pipeline.span_finder import make_span_finder
|
from .pipeline.span_finder import make_span_finder
|
||||||
|
@ -128,11 +173,12 @@ def register_factories() -> None:
|
||||||
from .pipeline.dep_parser import make_parser, make_beam_parser
|
from .pipeline.dep_parser import make_parser, make_beam_parser
|
||||||
from .pipeline.tagger import make_tagger
|
from .pipeline.tagger import make_tagger
|
||||||
from .pipeline.multitask import make_nn_labeller
|
from .pipeline.multitask import make_nn_labeller
|
||||||
from .pipeline.sentencizer import make_sentencizer
|
|
||||||
|
# from .pipeline.sentencizer import make_sentencizer
|
||||||
|
|
||||||
# Register factories using the same pattern as Language.factory decorator
|
# Register factories using the same pattern as Language.factory decorator
|
||||||
# We use Language.factory()() pattern which exactly mimics the decorator
|
# We use Language.factory()() pattern which exactly mimics the decorator
|
||||||
|
|
||||||
# attributeruler
|
# attributeruler
|
||||||
Language.factory(
|
Language.factory(
|
||||||
"attribute_ruler",
|
"attribute_ruler",
|
||||||
|
@ -141,7 +187,7 @@ def register_factories() -> None:
|
||||||
"scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"},
|
"scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"},
|
||||||
},
|
},
|
||||||
)(make_attribute_ruler)
|
)(make_attribute_ruler)
|
||||||
|
|
||||||
# entity_linker
|
# entity_linker
|
||||||
Language.factory(
|
Language.factory(
|
||||||
"entity_linker",
|
"entity_linker",
|
||||||
|
@ -169,7 +215,7 @@ def register_factories() -> None:
|
||||||
"nel_micro_p": None,
|
"nel_micro_p": None,
|
||||||
},
|
},
|
||||||
)(make_entity_linker)
|
)(make_entity_linker)
|
||||||
|
|
||||||
# entity_ruler
|
# entity_ruler
|
||||||
Language.factory(
|
Language.factory(
|
||||||
"entity_ruler",
|
"entity_ruler",
|
||||||
|
@ -189,7 +235,7 @@ def register_factories() -> None:
|
||||||
"ents_per_type": None,
|
"ents_per_type": None,
|
||||||
},
|
},
|
||||||
)(make_entity_ruler)
|
)(make_entity_ruler)
|
||||||
|
|
||||||
# lemmatizer
|
# lemmatizer
|
||||||
Language.factory(
|
Language.factory(
|
||||||
"lemmatizer",
|
"lemmatizer",
|
||||||
|
@ -202,7 +248,7 @@ def register_factories() -> None:
|
||||||
},
|
},
|
||||||
default_score_weights={"lemma_acc": 1.0},
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
)(make_lemmatizer)
|
)(make_lemmatizer)
|
||||||
|
|
||||||
# textcat
|
# textcat
|
||||||
Language.factory(
|
Language.factory(
|
||||||
"textcat",
|
"textcat",
|
||||||
|
@ -225,49 +271,57 @@ def register_factories() -> None:
|
||||||
"cats_f_per_type": None,
|
"cats_f_per_type": None,
|
||||||
},
|
},
|
||||||
)(make_textcat)
|
)(make_textcat)
|
||||||
|
|
||||||
# token_splitter
|
# token_splitter
|
||||||
Language.factory(
|
Language.factory(
|
||||||
"token_splitter",
|
"token_splitter",
|
||||||
default_config={"min_length": 25, "split_length": 10},
|
default_config={"min_length": 25, "split_length": 10},
|
||||||
retokenizes=True,
|
retokenizes=True,
|
||||||
)(make_token_splitter)
|
)(make_token_splitter)
|
||||||
|
|
||||||
# doc_cleaner
|
# doc_cleaner
|
||||||
Language.factory(
|
Language.factory(
|
||||||
"doc_cleaner",
|
"doc_cleaner",
|
||||||
default_config={"attrs": {"tensor": None, "_.trf_data": None}, "silent": True},
|
default_config={"attrs": {"tensor": None, "_.trf_data": None}, "silent": True},
|
||||||
)(make_doc_cleaner)
|
)(make_doc_cleaner)
|
||||||
|
|
||||||
# tok2vec
|
# tok2vec
|
||||||
Language.factory(
|
Language.factory(
|
||||||
"tok2vec",
|
"tok2vec",
|
||||||
assigns=["doc.tensor"],
|
assigns=["doc.tensor"],
|
||||||
default_config={"model": DEFAULT_TOK2VEC_MODEL}
|
default_config={"model": DEFAULT_TOK2VEC_MODEL},
|
||||||
)(make_tok2vec)
|
)(make_tok2vec)
|
||||||
|
|
||||||
# senter
|
# senter
|
||||||
Language.factory(
|
Language.factory(
|
||||||
"senter",
|
"senter",
|
||||||
assigns=["token.is_sent_start"],
|
assigns=["token.is_sent_start"],
|
||||||
default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
|
default_config={
|
||||||
|
"model": DEFAULT_SENTER_MODEL,
|
||||||
|
"overwrite": False,
|
||||||
|
"scorer": {"@scorers": "spacy.senter_scorer.v1"},
|
||||||
|
},
|
||||||
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
|
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
|
||||||
)(make_senter)
|
)(make_senter)
|
||||||
|
|
||||||
# morphologizer
|
# morphologizer
|
||||||
Language.factory(
|
Language.factory(
|
||||||
"morphologizer",
|
"morphologizer",
|
||||||
assigns=["token.morph", "token.pos"],
|
assigns=["token.morph", "token.pos"],
|
||||||
default_config={
|
default_config={
|
||||||
"model": DEFAULT_MORPH_MODEL,
|
"model": DEFAULT_MORPH_MODEL,
|
||||||
"overwrite": True,
|
"overwrite": True,
|
||||||
"extend": False,
|
"extend": False,
|
||||||
"scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
|
"scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
|
||||||
"label_smoothing": 0.0
|
"label_smoothing": 0.0,
|
||||||
|
},
|
||||||
|
default_score_weights={
|
||||||
|
"pos_acc": 0.5,
|
||||||
|
"morph_acc": 0.5,
|
||||||
|
"morph_per_feat": None,
|
||||||
},
|
},
|
||||||
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
|
|
||||||
)(make_morphologizer)
|
)(make_morphologizer)
|
||||||
|
|
||||||
# spancat
|
# spancat
|
||||||
Language.factory(
|
Language.factory(
|
||||||
"spancat",
|
"spancat",
|
||||||
|
@ -282,7 +336,7 @@ def register_factories() -> None:
|
||||||
},
|
},
|
||||||
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
|
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
|
||||||
)(make_spancat)
|
)(make_spancat)
|
||||||
|
|
||||||
# spancat_singlelabel
|
# spancat_singlelabel
|
||||||
Language.factory(
|
Language.factory(
|
||||||
"spancat_singlelabel",
|
"spancat_singlelabel",
|
||||||
|
@ -297,7 +351,7 @@ def register_factories() -> None:
|
||||||
},
|
},
|
||||||
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
|
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
|
||||||
)(make_spancat_singlelabel)
|
)(make_spancat_singlelabel)
|
||||||
|
|
||||||
# future_entity_ruler
|
# future_entity_ruler
|
||||||
Language.factory(
|
Language.factory(
|
||||||
"future_entity_ruler",
|
"future_entity_ruler",
|
||||||
|
@ -317,7 +371,7 @@ def register_factories() -> None:
|
||||||
"ents_per_type": None,
|
"ents_per_type": None,
|
||||||
},
|
},
|
||||||
)(make_span_entity_ruler)
|
)(make_span_entity_ruler)
|
||||||
|
|
||||||
# span_ruler
|
# span_ruler
|
||||||
Language.factory(
|
Language.factory(
|
||||||
"span_ruler",
|
"span_ruler",
|
||||||
|
@ -343,7 +397,7 @@ def register_factories() -> None:
|
||||||
f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_per_type": None,
|
f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_per_type": None,
|
||||||
},
|
},
|
||||||
)(make_span_ruler)
|
)(make_span_ruler)
|
||||||
|
|
||||||
# trainable_lemmatizer
|
# trainable_lemmatizer
|
||||||
Language.factory(
|
Language.factory(
|
||||||
"trainable_lemmatizer",
|
"trainable_lemmatizer",
|
||||||
|
@ -359,7 +413,7 @@ def register_factories() -> None:
|
||||||
},
|
},
|
||||||
default_score_weights={"lemma_acc": 1.0},
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
)(make_edit_tree_lemmatizer)
|
)(make_edit_tree_lemmatizer)
|
||||||
|
|
||||||
# textcat_multilabel
|
# textcat_multilabel
|
||||||
Language.factory(
|
Language.factory(
|
||||||
"textcat_multilabel",
|
"textcat_multilabel",
|
||||||
|
@ -382,7 +436,7 @@ def register_factories() -> None:
|
||||||
"cats_f_per_type": None,
|
"cats_f_per_type": None,
|
||||||
},
|
},
|
||||||
)(make_multilabel_textcat)
|
)(make_multilabel_textcat)
|
||||||
|
|
||||||
# span_finder
|
# span_finder
|
||||||
Language.factory(
|
Language.factory(
|
||||||
"span_finder",
|
"span_finder",
|
||||||
|
@ -401,7 +455,7 @@ def register_factories() -> None:
|
||||||
f"spans_{DEFAULT_SPANS_KEY}_r": 0.0,
|
f"spans_{DEFAULT_SPANS_KEY}_r": 0.0,
|
||||||
},
|
},
|
||||||
)(make_span_finder)
|
)(make_span_finder)
|
||||||
|
|
||||||
# ner
|
# ner
|
||||||
Language.factory(
|
Language.factory(
|
||||||
"ner",
|
"ner",
|
||||||
|
@ -413,9 +467,14 @@ def register_factories() -> None:
|
||||||
"incorrect_spans_key": None,
|
"incorrect_spans_key": None,
|
||||||
"scorer": {"@scorers": "spacy.ner_scorer.v1"},
|
"scorer": {"@scorers": "spacy.ner_scorer.v1"},
|
||||||
},
|
},
|
||||||
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
|
default_score_weights={
|
||||||
|
"ents_f": 1.0,
|
||||||
|
"ents_p": 0.0,
|
||||||
|
"ents_r": 0.0,
|
||||||
|
"ents_per_type": None,
|
||||||
|
},
|
||||||
)(make_ner)
|
)(make_ner)
|
||||||
|
|
||||||
# beam_ner
|
# beam_ner
|
||||||
Language.factory(
|
Language.factory(
|
||||||
"beam_ner",
|
"beam_ner",
|
||||||
|
@ -430,9 +489,14 @@ def register_factories() -> None:
|
||||||
"incorrect_spans_key": None,
|
"incorrect_spans_key": None,
|
||||||
"scorer": {"@scorers": "spacy.ner_scorer.v1"},
|
"scorer": {"@scorers": "spacy.ner_scorer.v1"},
|
||||||
},
|
},
|
||||||
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
|
default_score_weights={
|
||||||
|
"ents_f": 1.0,
|
||||||
|
"ents_p": 0.0,
|
||||||
|
"ents_r": 0.0,
|
||||||
|
"ents_per_type": None,
|
||||||
|
},
|
||||||
)(make_beam_ner)
|
)(make_beam_ner)
|
||||||
|
|
||||||
# parser
|
# parser
|
||||||
Language.factory(
|
Language.factory(
|
||||||
"parser",
|
"parser",
|
||||||
|
@ -454,7 +518,7 @@ def register_factories() -> None:
|
||||||
"sents_f": 0.0,
|
"sents_f": 0.0,
|
||||||
},
|
},
|
||||||
)(make_parser)
|
)(make_parser)
|
||||||
|
|
||||||
# beam_parser
|
# beam_parser
|
||||||
Language.factory(
|
Language.factory(
|
||||||
"beam_parser",
|
"beam_parser",
|
||||||
|
@ -479,28 +543,48 @@ def register_factories() -> None:
|
||||||
"sents_f": 0.0,
|
"sents_f": 0.0,
|
||||||
},
|
},
|
||||||
)(make_beam_parser)
|
)(make_beam_parser)
|
||||||
|
|
||||||
# tagger
|
# tagger
|
||||||
Language.factory(
|
Language.factory(
|
||||||
"tagger",
|
"tagger",
|
||||||
assigns=["token.tag"],
|
assigns=["token.tag"],
|
||||||
default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!", "label_smoothing": 0.0},
|
default_config={
|
||||||
default_score_weights={"tag_acc": 1.0, "pos_acc": 0.0, "tag_micro_p": None, "tag_micro_r": None, "tag_micro_f": None},
|
"model": DEFAULT_TAGGER_MODEL,
|
||||||
|
"overwrite": False,
|
||||||
|
"scorer": {"@scorers": "spacy.tagger_scorer.v1"},
|
||||||
|
"neg_prefix": "!",
|
||||||
|
"label_smoothing": 0.0,
|
||||||
|
},
|
||||||
|
default_score_weights={
|
||||||
|
"tag_acc": 1.0,
|
||||||
|
"pos_acc": 0.0,
|
||||||
|
"tag_micro_p": None,
|
||||||
|
"tag_micro_r": None,
|
||||||
|
"tag_micro_f": None,
|
||||||
|
},
|
||||||
)(make_tagger)
|
)(make_tagger)
|
||||||
|
|
||||||
# nn_labeller
|
# nn_labeller
|
||||||
Language.factory(
|
Language.factory(
|
||||||
"nn_labeller",
|
"nn_labeller",
|
||||||
default_config={"labels": None, "target": "dep_tag_offset", "model": DEFAULT_MT_MODEL}
|
default_config={
|
||||||
|
"labels": None,
|
||||||
|
"target": "dep_tag_offset",
|
||||||
|
"model": DEFAULT_MT_MODEL,
|
||||||
|
},
|
||||||
)(make_nn_labeller)
|
)(make_nn_labeller)
|
||||||
|
|
||||||
# sentencizer
|
# sentencizer
|
||||||
Language.factory(
|
Language.factory(
|
||||||
"sentencizer",
|
"sentencizer",
|
||||||
assigns=["token.is_sent_start", "doc.sents"],
|
assigns=["token.is_sent_start", "doc.sents"],
|
||||||
default_config={"punct_chars": None, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
|
default_config={
|
||||||
|
"punct_chars": None,
|
||||||
|
"overwrite": False,
|
||||||
|
"scorer": {"@scorers": "spacy.senter_scorer.v1"},
|
||||||
|
},
|
||||||
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
|
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
|
||||||
)(make_sentencizer)
|
)(make_sentencizer)
|
||||||
|
|
||||||
# Set the flag to indicate that all factories have been registered
|
# Set the flag to indicate that all factories have been registered
|
||||||
FACTORIES_REGISTERED = True
|
FACTORIES_REGISTERED = True
|
||||||
|
|
|
@ -479,3 +479,4 @@ NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
|
||||||
# (which is generating an enormous amount of C++ in Cython 0.24+)
|
# (which is generating an enormous amount of C++ in Cython 0.24+)
|
||||||
# We keep the enum cdef, and just make sure the names are available to Python
|
# We keep the enum cdef, and just make sure the names are available to Python
|
||||||
locals().update(IDS)
|
locals().update(IDS)
|
||||||
|
|
||||||
|
|
|
@ -87,7 +87,7 @@ def entity_linker():
|
||||||
|
|
||||||
|
|
||||||
objects_to_test = (
|
objects_to_test = (
|
||||||
[nlp(), vectors(), custom_pipe(), tagger(), entity_linker()],
|
[nlp, vectors, custom_pipe, tagger, entity_linker],
|
||||||
["nlp", "vectors", "custom_pipe", "tagger", "entity_linker"],
|
["nlp", "vectors", "custom_pipe", "tagger", "entity_linker"],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -101,8 +101,9 @@ def write_obj_and_catch_warnings(obj):
|
||||||
return list(filter(lambda x: isinstance(x, ResourceWarning), warnings_list))
|
return list(filter(lambda x: isinstance(x, ResourceWarning), warnings_list))
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("obj", objects_to_test[0], ids=objects_to_test[1])
|
@pytest.mark.parametrize("obj_factory", objects_to_test[0], ids=objects_to_test[1])
|
||||||
def test_to_disk_resource_warning(obj):
|
def test_to_disk_resource_warning(obj_factory):
|
||||||
|
obj = obj_factory()
|
||||||
warnings_list = write_obj_and_catch_warnings(obj)
|
warnings_list = write_obj_and_catch_warnings(obj)
|
||||||
assert len(warnings_list) == 0
|
assert len(warnings_list) == 0
|
||||||
|
|
||||||
|
@ -139,7 +140,7 @@ def test_save_and_load_knowledge_base():
|
||||||
|
|
||||||
class TestToDiskResourceWarningUnittest(TestCase):
|
class TestToDiskResourceWarningUnittest(TestCase):
|
||||||
def test_resource_warning(self):
|
def test_resource_warning(self):
|
||||||
scenarios = zip(*objects_to_test)
|
scenarios = zip(*[x() for x in objects_to_test]) # type: ignore
|
||||||
|
|
||||||
for scenario in scenarios:
|
for scenario in scenarios:
|
||||||
with self.subTest(msg=scenario[1]):
|
with self.subTest(msg=scenario[1]):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user