Make factories top-level functions in registrations.py

This commit is contained in:
Matthew Honnibal 2025-05-21 14:03:11 +02:00
parent 5c331884c3
commit d8388aa591
2 changed files with 443 additions and 438 deletions

View File

@ -19,8 +19,6 @@ from .pipeline.entityruler import EntityRuler
from .pipeline.span_finder import SpanFinder
from .pipeline.ner import EntityRecognizer
from .pipeline._parser_internals.transition_system import TransitionSystem
from .pipeline.ner import EntityRecognizer
from .pipeline.dep_parser import DependencyParser
from .pipeline.dep_parser import DependencyParser
from .pipeline.tagger import Tagger
from .pipeline.multitask import MultitaskObjective
@ -169,6 +167,420 @@ def register_factories() -> None:
if FACTORIES_REGISTERED:
return
# Register each factory by calling Language.factory(...) and immediately
# applying the returned decorator to the factory function. This is equivalent
# to decorating the function with @Language.factory(...).
# attribute_ruler
Language.factory(
"attribute_ruler",
default_config={
"validate": False,
"scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"},
},
)(make_attribute_ruler)
# entity_linker
Language.factory(
"entity_linker",
requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
assigns=["token.ent_kb_id"],
default_config={
"model": DEFAULT_NEL_MODEL,
"labels_discard": [],
"n_sents": 0,
"incl_prior": True,
"incl_context": True,
"entity_vector_length": 64,
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
"get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
"generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
"overwrite": True,
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
"use_gold_ents": True,
"candidates_batch_size": 1,
"threshold": None,
},
default_score_weights={
"nel_micro_f": 1.0,
"nel_micro_r": None,
"nel_micro_p": None,
},
)(make_entity_linker)
# entity_ruler
Language.factory(
"entity_ruler",
assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
default_config={
"phrase_matcher_attr": None,
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
"validate": False,
"overwrite_ents": False,
"ent_id_sep": DEFAULT_ENT_ID_SEP,
"scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
},
default_score_weights={
"ents_f": 1.0,
"ents_p": 0.0,
"ents_r": 0.0,
"ents_per_type": None,
},
)(make_entity_ruler)
# lemmatizer
Language.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={
"model": None,
"mode": "lookup",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)(make_lemmatizer)
# textcat
Language.factory(
"textcat",
assigns=["doc.cats"],
default_config={
"threshold": 0.0,
"model": DEFAULT_SINGLE_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_scorer.v2"},
},
default_score_weights={
"cats_score": 1.0,
"cats_score_desc": None,
"cats_micro_p": None,
"cats_micro_r": None,
"cats_micro_f": None,
"cats_macro_p": None,
"cats_macro_r": None,
"cats_macro_f": None,
"cats_macro_auc": None,
"cats_f_per_type": None,
},
)(make_textcat)
# token_splitter
Language.factory(
"token_splitter",
default_config={"min_length": 25, "split_length": 10},
retokenizes=True,
)(make_token_splitter)
# doc_cleaner
Language.factory(
"doc_cleaner",
default_config={"attrs": {"tensor": None, "_.trf_data": None}, "silent": True},
)(make_doc_cleaner)
# tok2vec
Language.factory(
"tok2vec",
assigns=["doc.tensor"],
default_config={"model": DEFAULT_TOK2VEC_MODEL},
)(make_tok2vec)
# senter
Language.factory(
"senter",
assigns=["token.is_sent_start"],
default_config={
"model": DEFAULT_SENTER_MODEL,
"overwrite": False,
"scorer": {"@scorers": "spacy.senter_scorer.v1"},
},
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)(make_senter)
# morphologizer
Language.factory(
"morphologizer",
assigns=["token.morph", "token.pos"],
default_config={
"model": DEFAULT_MORPH_MODEL,
"overwrite": True,
"extend": False,
"scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
"label_smoothing": 0.0,
},
default_score_weights={
"pos_acc": 0.5,
"morph_acc": 0.5,
"morph_per_feat": None,
},
)(make_morphologizer)
# spancat
Language.factory(
"spancat",
assigns=["doc.spans"],
default_config={
"threshold": 0.5,
"spans_key": DEFAULT_SPANS_KEY,
"max_positive": None,
"model": DEFAULT_SPANCAT_MODEL,
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
},
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
)(make_spancat)
# spancat_singlelabel
Language.factory(
"spancat_singlelabel",
assigns=["doc.spans"],
default_config={
"spans_key": DEFAULT_SPANS_KEY,
"model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
"negative_weight": 1.0,
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
"allow_overlap": True,
},
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
)(make_spancat_singlelabel)
# future_entity_ruler
Language.factory(
"future_entity_ruler",
assigns=["doc.ents"],
default_config={
"phrase_matcher_attr": None,
"validate": False,
"overwrite_ents": False,
"scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
"ent_id_sep": "__unused__",
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
},
default_score_weights={
"ents_f": 1.0,
"ents_p": 0.0,
"ents_r": 0.0,
"ents_per_type": None,
},
)(make_future_entity_ruler)
# span_ruler
Language.factory(
"span_ruler",
assigns=["doc.spans"],
default_config={
"spans_key": SPAN_RULER_DEFAULT_SPANS_KEY,
"spans_filter": None,
"annotate_ents": False,
"ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"},
"phrase_matcher_attr": None,
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
"validate": False,
"overwrite": True,
"scorer": {
"@scorers": "spacy.overlapping_labeled_spans_scorer.v1",
"spans_key": SPAN_RULER_DEFAULT_SPANS_KEY,
},
},
default_score_weights={
f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_f": 1.0,
f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_p": 0.0,
f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_r": 0.0,
f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_per_type": None,
},
)(make_span_ruler)
# trainable_lemmatizer
Language.factory(
"trainable_lemmatizer",
assigns=["token.lemma"],
requires=[],
default_config={
"model": DEFAULT_EDIT_TREE_LEMMATIZER_MODEL,
"backoff": "orth",
"min_tree_freq": 3,
"overwrite": False,
"top_k": 1,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)(make_edit_tree_lemmatizer)
# textcat_multilabel
Language.factory(
"textcat_multilabel",
assigns=["doc.cats"],
default_config={
"threshold": 0.5,
"model": DEFAULT_MULTI_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"},
},
default_score_weights={
"cats_score": 1.0,
"cats_score_desc": None,
"cats_micro_p": None,
"cats_micro_r": None,
"cats_micro_f": None,
"cats_macro_p": None,
"cats_macro_r": None,
"cats_macro_f": None,
"cats_macro_auc": None,
"cats_f_per_type": None,
},
)(make_multilabel_textcat)
# span_finder
Language.factory(
"span_finder",
assigns=["doc.spans"],
default_config={
"threshold": 0.5,
"model": DEFAULT_SPAN_FINDER_MODEL,
"spans_key": DEFAULT_SPANS_KEY,
"max_length": 25,
"min_length": None,
"scorer": {"@scorers": "spacy.span_finder_scorer.v1"},
},
default_score_weights={
f"spans_{DEFAULT_SPANS_KEY}_f": 1.0,
f"spans_{DEFAULT_SPANS_KEY}_p": 0.0,
f"spans_{DEFAULT_SPANS_KEY}_r": 0.0,
},
)(make_span_finder)
# ner
Language.factory(
"ner",
assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
default_config={
"moves": None,
"update_with_oracle_cut_size": 100,
"model": DEFAULT_NER_MODEL,
"incorrect_spans_key": None,
"scorer": {"@scorers": "spacy.ner_scorer.v1"},
},
default_score_weights={
"ents_f": 1.0,
"ents_p": 0.0,
"ents_r": 0.0,
"ents_per_type": None,
},
)(make_ner)
# beam_ner
Language.factory(
"beam_ner",
assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
default_config={
"moves": None,
"update_with_oracle_cut_size": 100,
"model": DEFAULT_NER_MODEL,
"beam_density": 0.01,
"beam_update_prob": 0.5,
"beam_width": 32,
"incorrect_spans_key": None,
"scorer": {"@scorers": "spacy.ner_scorer.v1"},
},
default_score_weights={
"ents_f": 1.0,
"ents_p": 0.0,
"ents_r": 0.0,
"ents_per_type": None,
},
)(make_beam_ner)
# parser
Language.factory(
"parser",
assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
default_config={
"moves": None,
"update_with_oracle_cut_size": 100,
"learn_tokens": False,
"min_action_freq": 30,
"model": DEFAULT_PARSER_MODEL,
"scorer": {"@scorers": "spacy.parser_scorer.v1"},
},
default_score_weights={
"dep_uas": 0.5,
"dep_las": 0.5,
"dep_las_per_type": None,
"sents_p": None,
"sents_r": None,
"sents_f": 0.0,
},
)(make_parser)
# beam_parser
Language.factory(
"beam_parser",
assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
default_config={
"moves": None,
"update_with_oracle_cut_size": 100,
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 8,
"beam_density": 0.0001,
"beam_update_prob": 0.5,
"model": DEFAULT_PARSER_MODEL,
"scorer": {"@scorers": "spacy.parser_scorer.v1"},
},
default_score_weights={
"dep_uas": 0.5,
"dep_las": 0.5,
"dep_las_per_type": None,
"sents_p": None,
"sents_r": None,
"sents_f": 0.0,
},
)(make_beam_parser)
# tagger
Language.factory(
"tagger",
assigns=["token.tag"],
default_config={
"model": DEFAULT_TAGGER_MODEL,
"overwrite": False,
"scorer": {"@scorers": "spacy.tagger_scorer.v1"},
"neg_prefix": "!",
"label_smoothing": 0.0,
},
default_score_weights={
"tag_acc": 1.0,
"pos_acc": 0.0,
"tag_micro_p": None,
"tag_micro_r": None,
"tag_micro_f": None,
},
)(make_tagger)
# nn_labeller
Language.factory(
"nn_labeller",
default_config={
"labels": None,
"target": "dep_tag_offset",
"model": DEFAULT_MT_MODEL,
},
)(make_nn_labeller)
# sentencizer
Language.factory(
"sentencizer",
assigns=["token.is_sent_start", "doc.sents"],
default_config={
"punct_chars": None,
"overwrite": False,
"scorer": {"@scorers": "spacy.senter_scorer.v1"},
},
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)(make_sentencizer)
# Set the flag to indicate that all factories have been registered
FACTORIES_REGISTERED = True
# We can't have function implementations for these factories in Cython, because
# we need to build a Pydantic model for them dynamically, reading their argument
# structure from the signature. In Cython 3, this doesn't work because the
@ -605,415 +1017,4 @@ def register_factories() -> None:
scorer=scorer
)
# Register each factory by calling Language.factory(...) and immediately
# applying the returned decorator to the factory function. This is equivalent
# to decorating the function with @Language.factory(...).
# attribute_ruler
Language.factory(
"attribute_ruler",
default_config={
"validate": False,
"scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"},
},
)(make_attribute_ruler)
# entity_linker
Language.factory(
"entity_linker",
requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
assigns=["token.ent_kb_id"],
default_config={
"model": DEFAULT_NEL_MODEL,
"labels_discard": [],
"n_sents": 0,
"incl_prior": True,
"incl_context": True,
"entity_vector_length": 64,
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
"get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
"generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
"overwrite": True,
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
"use_gold_ents": True,
"candidates_batch_size": 1,
"threshold": None,
},
default_score_weights={
"nel_micro_f": 1.0,
"nel_micro_r": None,
"nel_micro_p": None,
},
)(make_entity_linker)
# entity_ruler
Language.factory(
"entity_ruler",
assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
default_config={
"phrase_matcher_attr": None,
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
"validate": False,
"overwrite_ents": False,
"ent_id_sep": DEFAULT_ENT_ID_SEP,
"scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
},
default_score_weights={
"ents_f": 1.0,
"ents_p": 0.0,
"ents_r": 0.0,
"ents_per_type": None,
},
)(make_entity_ruler)
# lemmatizer
Language.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={
"model": None,
"mode": "lookup",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)(make_lemmatizer)
# textcat
Language.factory(
"textcat",
assigns=["doc.cats"],
default_config={
"threshold": 0.0,
"model": DEFAULT_SINGLE_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_scorer.v2"},
},
default_score_weights={
"cats_score": 1.0,
"cats_score_desc": None,
"cats_micro_p": None,
"cats_micro_r": None,
"cats_micro_f": None,
"cats_macro_p": None,
"cats_macro_r": None,
"cats_macro_f": None,
"cats_macro_auc": None,
"cats_f_per_type": None,
},
)(make_textcat)
# token_splitter
Language.factory(
"token_splitter",
default_config={"min_length": 25, "split_length": 10},
retokenizes=True,
)(make_token_splitter)
# doc_cleaner
Language.factory(
"doc_cleaner",
default_config={"attrs": {"tensor": None, "_.trf_data": None}, "silent": True},
)(make_doc_cleaner)
# tok2vec
Language.factory(
"tok2vec",
assigns=["doc.tensor"],
default_config={"model": DEFAULT_TOK2VEC_MODEL},
)(make_tok2vec)
# senter
Language.factory(
"senter",
assigns=["token.is_sent_start"],
default_config={
"model": DEFAULT_SENTER_MODEL,
"overwrite": False,
"scorer": {"@scorers": "spacy.senter_scorer.v1"},
},
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)(make_senter)
# morphologizer
Language.factory(
"morphologizer",
assigns=["token.morph", "token.pos"],
default_config={
"model": DEFAULT_MORPH_MODEL,
"overwrite": True,
"extend": False,
"scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
"label_smoothing": 0.0,
},
default_score_weights={
"pos_acc": 0.5,
"morph_acc": 0.5,
"morph_per_feat": None,
},
)(make_morphologizer)
# spancat
Language.factory(
"spancat",
assigns=["doc.spans"],
default_config={
"threshold": 0.5,
"spans_key": DEFAULT_SPANS_KEY,
"max_positive": None,
"model": DEFAULT_SPANCAT_MODEL,
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
},
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
)(make_spancat)
# spancat_singlelabel
Language.factory(
"spancat_singlelabel",
assigns=["doc.spans"],
default_config={
"spans_key": DEFAULT_SPANS_KEY,
"model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
"negative_weight": 1.0,
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
"allow_overlap": True,
},
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
)(make_spancat_singlelabel)
# future_entity_ruler
Language.factory(
"future_entity_ruler",
assigns=["doc.ents"],
default_config={
"phrase_matcher_attr": None,
"validate": False,
"overwrite_ents": False,
"scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
"ent_id_sep": "__unused__",
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
},
default_score_weights={
"ents_f": 1.0,
"ents_p": 0.0,
"ents_r": 0.0,
"ents_per_type": None,
},
)(make_future_entity_ruler)
# span_ruler
Language.factory(
"span_ruler",
assigns=["doc.spans"],
default_config={
"spans_key": SPAN_RULER_DEFAULT_SPANS_KEY,
"spans_filter": None,
"annotate_ents": False,
"ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"},
"phrase_matcher_attr": None,
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
"validate": False,
"overwrite": True,
"scorer": {
"@scorers": "spacy.overlapping_labeled_spans_scorer.v1",
"spans_key": SPAN_RULER_DEFAULT_SPANS_KEY,
},
},
default_score_weights={
f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_f": 1.0,
f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_p": 0.0,
f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_r": 0.0,
f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_per_type": None,
},
)(make_span_ruler)
# trainable_lemmatizer
Language.factory(
"trainable_lemmatizer",
assigns=["token.lemma"],
requires=[],
default_config={
"model": DEFAULT_EDIT_TREE_LEMMATIZER_MODEL,
"backoff": "orth",
"min_tree_freq": 3,
"overwrite": False,
"top_k": 1,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)(make_edit_tree_lemmatizer)
# textcat_multilabel
Language.factory(
"textcat_multilabel",
assigns=["doc.cats"],
default_config={
"threshold": 0.5,
"model": DEFAULT_MULTI_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"},
},
default_score_weights={
"cats_score": 1.0,
"cats_score_desc": None,
"cats_micro_p": None,
"cats_micro_r": None,
"cats_micro_f": None,
"cats_macro_p": None,
"cats_macro_r": None,
"cats_macro_f": None,
"cats_macro_auc": None,
"cats_f_per_type": None,
},
)(make_multilabel_textcat)
# span_finder
Language.factory(
"span_finder",
assigns=["doc.spans"],
default_config={
"threshold": 0.5,
"model": DEFAULT_SPAN_FINDER_MODEL,
"spans_key": DEFAULT_SPANS_KEY,
"max_length": 25,
"min_length": None,
"scorer": {"@scorers": "spacy.span_finder_scorer.v1"},
},
default_score_weights={
f"spans_{DEFAULT_SPANS_KEY}_f": 1.0,
f"spans_{DEFAULT_SPANS_KEY}_p": 0.0,
f"spans_{DEFAULT_SPANS_KEY}_r": 0.0,
},
)(make_span_finder)
# ner
Language.factory(
"ner",
assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
default_config={
"moves": None,
"update_with_oracle_cut_size": 100,
"model": DEFAULT_NER_MODEL,
"incorrect_spans_key": None,
"scorer": {"@scorers": "spacy.ner_scorer.v1"},
},
default_score_weights={
"ents_f": 1.0,
"ents_p": 0.0,
"ents_r": 0.0,
"ents_per_type": None,
},
)(make_ner)
# beam_ner
Language.factory(
"beam_ner",
assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
default_config={
"moves": None,
"update_with_oracle_cut_size": 100,
"model": DEFAULT_NER_MODEL,
"beam_density": 0.01,
"beam_update_prob": 0.5,
"beam_width": 32,
"incorrect_spans_key": None,
"scorer": {"@scorers": "spacy.ner_scorer.v1"},
},
default_score_weights={
"ents_f": 1.0,
"ents_p": 0.0,
"ents_r": 0.0,
"ents_per_type": None,
},
)(make_beam_ner)
# parser
Language.factory(
"parser",
assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
default_config={
"moves": None,
"update_with_oracle_cut_size": 100,
"learn_tokens": False,
"min_action_freq": 30,
"model": DEFAULT_PARSER_MODEL,
"scorer": {"@scorers": "spacy.parser_scorer.v1"},
},
default_score_weights={
"dep_uas": 0.5,
"dep_las": 0.5,
"dep_las_per_type": None,
"sents_p": None,
"sents_r": None,
"sents_f": 0.0,
},
)(make_parser)
# beam_parser
Language.factory(
"beam_parser",
assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
default_config={
"moves": None,
"update_with_oracle_cut_size": 100,
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 8,
"beam_density": 0.0001,
"beam_update_prob": 0.5,
"model": DEFAULT_PARSER_MODEL,
"scorer": {"@scorers": "spacy.parser_scorer.v1"},
},
default_score_weights={
"dep_uas": 0.5,
"dep_las": 0.5,
"dep_las_per_type": None,
"sents_p": None,
"sents_r": None,
"sents_f": 0.0,
},
)(make_beam_parser)
# tagger
Language.factory(
"tagger",
assigns=["token.tag"],
default_config={
"model": DEFAULT_TAGGER_MODEL,
"overwrite": False,
"scorer": {"@scorers": "spacy.tagger_scorer.v1"},
"neg_prefix": "!",
"label_smoothing": 0.0,
},
default_score_weights={
"tag_acc": 1.0,
"pos_acc": 0.0,
"tag_micro_p": None,
"tag_micro_r": None,
"tag_micro_f": None,
},
)(make_tagger)
# nn_labeller
Language.factory(
"nn_labeller",
default_config={
"labels": None,
"target": "dep_tag_offset",
"model": DEFAULT_MT_MODEL,
},
)(make_nn_labeller)
# sentencizer
Language.factory(
"sentencizer",
assigns=["token.is_sent_start", "doc.sents"],
default_config={
"punct_chars": None,
"overwrite": False,
"scorer": {"@scorers": "spacy.senter_scorer.v1"},
},
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)(make_sentencizer)
# Set the flag to indicate that all factories have been registered
FACTORIES_REGISTERED = True

View File

@ -101,7 +101,11 @@ def test_cat_readers(reader, additional_config):
nlp = load_model_from_config(config, auto_fill=True)
T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
dot_names = [T["train_corpus"], T["dev_corpus"]]
print("T", T)
print("dot names", dot_names)
train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
data = list(train_corpus(nlp))
print(len(data))
optimizer = T["optimizer"]
# simulate a training loop
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)