This commit is contained in:
Matthew Honnibal 2025-05-19 17:41:34 +02:00
parent cda2bd01d4
commit c3f9fab5e8
3 changed files with 157 additions and 71 deletions

View File

@ -12,6 +12,7 @@ REGISTRY_POPULATED = False
# Global flag to track if factories have been registered # Global flag to track if factories have been registered
FACTORIES_REGISTERED = False FACTORIES_REGISTERED = False
def populate_registry() -> None: def populate_registry() -> None:
"""Populate the registry with all necessary components. """Populate the registry with all necessary components.
@ -30,15 +31,24 @@ def populate_registry() -> None:
from .pipeline.ner import make_ner_scorer from .pipeline.ner import make_ner_scorer
from .pipeline.lemmatizer import make_lemmatizer_scorer from .pipeline.lemmatizer import make_lemmatizer_scorer
from .pipeline.span_finder import make_span_finder_scorer from .pipeline.span_finder import make_span_finder_scorer
from .pipeline.spancat import make_spancat_scorer, build_ngram_suggester, build_ngram_range_suggester, build_preset_spans_suggester from .pipeline.spancat import (
from .pipeline.entityruler import make_entity_ruler_scorer as make_entityruler_scorer make_spancat_scorer,
build_ngram_suggester,
build_ngram_range_suggester,
build_preset_spans_suggester,
)
from .pipeline.entityruler import (
make_entity_ruler_scorer as make_entityruler_scorer,
)
from .pipeline.sentencizer import senter_score as make_sentencizer_scorer from .pipeline.sentencizer import senter_score as make_sentencizer_scorer
from .pipeline.senter import make_senter_scorer from .pipeline.senter import make_senter_scorer
from .pipeline.textcat import make_textcat_scorer from .pipeline.textcat import make_textcat_scorer
from .pipeline.textcat_multilabel import make_textcat_multilabel_scorer from .pipeline.textcat_multilabel import make_textcat_multilabel_scorer
# Register miscellaneous components # Register miscellaneous components
registry.misc("spacy.first_longest_spans_filter.v1")(make_first_longest_spans_filter) registry.misc("spacy.first_longest_spans_filter.v1")(
make_first_longest_spans_filter
)
registry.misc("spacy.ngram_suggester.v1")(build_ngram_suggester) registry.misc("spacy.ngram_suggester.v1")(build_ngram_suggester)
registry.misc("spacy.ngram_range_suggester.v1")(build_ngram_range_suggester) registry.misc("spacy.ngram_range_suggester.v1")(build_ngram_range_suggester)
registry.misc("spacy.preset_spans_suggester.v1")(build_preset_spans_suggester) registry.misc("spacy.preset_spans_suggester.v1")(build_preset_spans_suggester)
@ -47,7 +57,16 @@ def populate_registry() -> None:
# For the registry that was previously decorated # For the registry that was previously decorated
# Import ML components that use registry # Import ML components that use registry
from .ml.models.tok2vec import tok2vec_listener_v1, build_hash_embed_cnn_tok2vec, build_Tok2Vec_model, MultiHashEmbed, CharacterEmbed, MaxoutWindowEncoder, MishWindowEncoder, BiLSTMEncoder from .ml.models.tok2vec import (
tok2vec_listener_v1,
build_hash_embed_cnn_tok2vec,
build_Tok2Vec_model,
MultiHashEmbed,
CharacterEmbed,
MaxoutWindowEncoder,
MishWindowEncoder,
BiLSTMEncoder,
)
# Register scorers # Register scorers
registry.scorers("spacy.tagger_scorer.v1")(make_tagger_scorer) registry.scorers("spacy.tagger_scorer.v1")(make_tagger_scorer)
@ -58,8 +77,12 @@ def populate_registry() -> None:
registry.scorers("spacy.senter_scorer.v1")(make_senter_scorer) registry.scorers("spacy.senter_scorer.v1")(make_senter_scorer)
registry.scorers("spacy.textcat_scorer.v1")(make_textcat_scorer) registry.scorers("spacy.textcat_scorer.v1")(make_textcat_scorer)
registry.scorers("spacy.textcat_scorer.v2")(make_textcat_scorer) registry.scorers("spacy.textcat_scorer.v2")(make_textcat_scorer)
registry.scorers("spacy.textcat_multilabel_scorer.v1")(make_textcat_multilabel_scorer) registry.scorers("spacy.textcat_multilabel_scorer.v1")(
registry.scorers("spacy.textcat_multilabel_scorer.v2")(make_textcat_multilabel_scorer) make_textcat_multilabel_scorer
)
registry.scorers("spacy.textcat_multilabel_scorer.v2")(
make_textcat_multilabel_scorer
)
registry.scorers("spacy.lemmatizer_scorer.v1")(make_lemmatizer_scorer) registry.scorers("spacy.lemmatizer_scorer.v1")(make_lemmatizer_scorer)
registry.scorers("spacy.span_finder_scorer.v1")(make_span_finder_scorer) registry.scorers("spacy.span_finder_scorer.v1")(make_span_finder_scorer)
registry.scorers("spacy.spancat_scorer.v1")(make_spancat_scorer) registry.scorers("spacy.spancat_scorer.v1")(make_spancat_scorer)
@ -88,10 +111,25 @@ def register_factories() -> None:
the registrations that were previously done with @Language.factory decorators. the registrations that were previously done with @Language.factory decorators.
""" """
global FACTORIES_REGISTERED global FACTORIES_REGISTERED
from .language import Language
from .pipeline.sentencizer import Sentencizer
if FACTORIES_REGISTERED: if FACTORIES_REGISTERED:
return return
from .language import Language # TODO: We seem to still get cycle problems with these functions defined in Cython. We need
# a Python _factories module maybe?
def make_sentencizer(
    nlp: Language,
    name: str,
    punct_chars: Optional[List[str]],
    overwrite: bool,
    scorer: Optional[Callable],
):
    """Factory function that builds a ``Sentencizer`` pipeline component.

    nlp: The pipeline object handed in by the factory machinery; it is not
        used when constructing the component.
    name: Passed to ``Sentencizer`` as its first positional argument.
    punct_chars: Forwarded unchanged as the ``punct_chars`` keyword.
    overwrite: Forwarded unchanged as the ``overwrite`` keyword.
    scorer: Forwarded unchanged as the ``scorer`` keyword.
    RETURNS: The newly constructed ``Sentencizer`` instance.
    """
    sentencizer = Sentencizer(
        name,
        punct_chars=punct_chars,
        overwrite=overwrite,
        scorer=scorer,
    )
    return sentencizer
# Import factory default configurations # Import factory default configurations
from .pipeline.entity_linker import DEFAULT_NEL_MODEL from .pipeline.entity_linker import DEFAULT_NEL_MODEL
@ -99,7 +137,11 @@ def register_factories() -> None:
from .pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from .pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
from .pipeline.senter import DEFAULT_SENTER_MODEL from .pipeline.senter import DEFAULT_SENTER_MODEL
from .pipeline.morphologizer import DEFAULT_MORPH_MODEL from .pipeline.morphologizer import DEFAULT_MORPH_MODEL
from .pipeline.spancat import DEFAULT_SPANCAT_MODEL, DEFAULT_SPANCAT_SINGLELABEL_MODEL, DEFAULT_SPANS_KEY from .pipeline.spancat import (
DEFAULT_SPANCAT_MODEL,
DEFAULT_SPANCAT_SINGLELABEL_MODEL,
DEFAULT_SPANS_KEY,
)
from .pipeline.span_ruler import DEFAULT_SPANS_KEY as SPAN_RULER_DEFAULT_SPANS_KEY from .pipeline.span_ruler import DEFAULT_SPANS_KEY as SPAN_RULER_DEFAULT_SPANS_KEY
from .pipeline.edit_tree_lemmatizer import DEFAULT_EDIT_TREE_LEMMATIZER_MODEL from .pipeline.edit_tree_lemmatizer import DEFAULT_EDIT_TREE_LEMMATIZER_MODEL
from .pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL from .pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL
@ -120,7 +162,10 @@ def register_factories() -> None:
from .pipeline.senter import make_senter from .pipeline.senter import make_senter
from .pipeline.morphologizer import make_morphologizer from .pipeline.morphologizer import make_morphologizer
from .pipeline.spancat import make_spancat, make_spancat_singlelabel from .pipeline.spancat import make_spancat, make_spancat_singlelabel
from .pipeline.span_ruler import make_entity_ruler as make_span_entity_ruler, make_span_ruler from .pipeline.span_ruler import (
make_entity_ruler as make_span_entity_ruler,
make_span_ruler,
)
from .pipeline.edit_tree_lemmatizer import make_edit_tree_lemmatizer from .pipeline.edit_tree_lemmatizer import make_edit_tree_lemmatizer
from .pipeline.textcat_multilabel import make_multilabel_textcat from .pipeline.textcat_multilabel import make_multilabel_textcat
from .pipeline.span_finder import make_span_finder from .pipeline.span_finder import make_span_finder
@ -128,7 +173,8 @@ def register_factories() -> None:
from .pipeline.dep_parser import make_parser, make_beam_parser from .pipeline.dep_parser import make_parser, make_beam_parser
from .pipeline.tagger import make_tagger from .pipeline.tagger import make_tagger
from .pipeline.multitask import make_nn_labeller from .pipeline.multitask import make_nn_labeller
from .pipeline.sentencizer import make_sentencizer
# from .pipeline.sentencizer import make_sentencizer
# Register factories using the same pattern as Language.factory decorator # Register factories using the same pattern as Language.factory decorator
# We use Language.factory()() pattern which exactly mimics the decorator # We use Language.factory()() pattern which exactly mimics the decorator
@ -243,14 +289,18 @@ def register_factories() -> None:
Language.factory( Language.factory(
"tok2vec", "tok2vec",
assigns=["doc.tensor"], assigns=["doc.tensor"],
default_config={"model": DEFAULT_TOK2VEC_MODEL} default_config={"model": DEFAULT_TOK2VEC_MODEL},
)(make_tok2vec) )(make_tok2vec)
# senter # senter
Language.factory( Language.factory(
"senter", "senter",
assigns=["token.is_sent_start"], assigns=["token.is_sent_start"],
default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}}, default_config={
"model": DEFAULT_SENTER_MODEL,
"overwrite": False,
"scorer": {"@scorers": "spacy.senter_scorer.v1"},
},
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)(make_senter) )(make_senter)
@ -263,9 +313,13 @@ def register_factories() -> None:
"overwrite": True, "overwrite": True,
"extend": False, "extend": False,
"scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
"label_smoothing": 0.0 "label_smoothing": 0.0,
},
default_score_weights={
"pos_acc": 0.5,
"morph_acc": 0.5,
"morph_per_feat": None,
}, },
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
)(make_morphologizer) )(make_morphologizer)
# spancat # spancat
@ -413,7 +467,12 @@ def register_factories() -> None:
"incorrect_spans_key": None, "incorrect_spans_key": None,
"scorer": {"@scorers": "spacy.ner_scorer.v1"}, "scorer": {"@scorers": "spacy.ner_scorer.v1"},
}, },
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, default_score_weights={
"ents_f": 1.0,
"ents_p": 0.0,
"ents_r": 0.0,
"ents_per_type": None,
},
)(make_ner) )(make_ner)
# beam_ner # beam_ner
@ -430,7 +489,12 @@ def register_factories() -> None:
"incorrect_spans_key": None, "incorrect_spans_key": None,
"scorer": {"@scorers": "spacy.ner_scorer.v1"}, "scorer": {"@scorers": "spacy.ner_scorer.v1"},
}, },
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, default_score_weights={
"ents_f": 1.0,
"ents_p": 0.0,
"ents_r": 0.0,
"ents_per_type": None,
},
)(make_beam_ner) )(make_beam_ner)
# parser # parser
@ -484,21 +548,41 @@ def register_factories() -> None:
Language.factory( Language.factory(
"tagger", "tagger",
assigns=["token.tag"], assigns=["token.tag"],
default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!", "label_smoothing": 0.0}, default_config={
default_score_weights={"tag_acc": 1.0, "pos_acc": 0.0, "tag_micro_p": None, "tag_micro_r": None, "tag_micro_f": None}, "model": DEFAULT_TAGGER_MODEL,
"overwrite": False,
"scorer": {"@scorers": "spacy.tagger_scorer.v1"},
"neg_prefix": "!",
"label_smoothing": 0.0,
},
default_score_weights={
"tag_acc": 1.0,
"pos_acc": 0.0,
"tag_micro_p": None,
"tag_micro_r": None,
"tag_micro_f": None,
},
)(make_tagger) )(make_tagger)
# nn_labeller # nn_labeller
Language.factory( Language.factory(
"nn_labeller", "nn_labeller",
default_config={"labels": None, "target": "dep_tag_offset", "model": DEFAULT_MT_MODEL} default_config={
"labels": None,
"target": "dep_tag_offset",
"model": DEFAULT_MT_MODEL,
},
)(make_nn_labeller) )(make_nn_labeller)
# sentencizer # sentencizer
Language.factory( Language.factory(
"sentencizer", "sentencizer",
assigns=["token.is_sent_start", "doc.sents"], assigns=["token.is_sent_start", "doc.sents"],
default_config={"punct_chars": None, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}}, default_config={
"punct_chars": None,
"overwrite": False,
"scorer": {"@scorers": "spacy.senter_scorer.v1"},
},
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)(make_sentencizer) )(make_sentencizer)

View File

@ -479,3 +479,4 @@ NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
# (which is generating an enormous amount of C++ in Cython 0.24+) # (which is generating an enormous amount of C++ in Cython 0.24+)
# We keep the enum cdef, and just make sure the names are available to Python # We keep the enum cdef, and just make sure the names are available to Python
locals().update(IDS) locals().update(IDS)

View File

@ -87,7 +87,7 @@ def entity_linker():
objects_to_test = ( objects_to_test = (
[nlp(), vectors(), custom_pipe(), tagger(), entity_linker()], [nlp, vectors, custom_pipe, tagger, entity_linker],
["nlp", "vectors", "custom_pipe", "tagger", "entity_linker"], ["nlp", "vectors", "custom_pipe", "tagger", "entity_linker"],
) )
@ -101,8 +101,9 @@ def write_obj_and_catch_warnings(obj):
return list(filter(lambda x: isinstance(x, ResourceWarning), warnings_list)) return list(filter(lambda x: isinstance(x, ResourceWarning), warnings_list))
@pytest.mark.parametrize("obj", objects_to_test[0], ids=objects_to_test[1]) @pytest.mark.parametrize("obj_factory", objects_to_test[0], ids=objects_to_test[1])
def test_to_disk_resource_warning(obj): def test_to_disk_resource_warning(obj_factory):
obj = obj_factory()
warnings_list = write_obj_and_catch_warnings(obj) warnings_list = write_obj_and_catch_warnings(obj)
assert len(warnings_list) == 0 assert len(warnings_list) == 0
@ -139,7 +140,7 @@ def test_save_and_load_knowledge_base():
class TestToDiskResourceWarningUnittest(TestCase): class TestToDiskResourceWarningUnittest(TestCase):
def test_resource_warning(self): def test_resource_warning(self):
scenarios = zip(*objects_to_test) scenarios = zip(*[x() for x in objects_to_test]) # type: ignore
for scenario in scenarios: for scenario in scenarios:
with self.subTest(msg=scenario[1]): with self.subTest(msg=scenario[1]):