diff --git a/README.md b/README.md index c692ce5f4..79db36daf 100644 --- a/README.md +++ b/README.md @@ -117,7 +117,7 @@ For detailed installation instructions, see the - **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual Studio) -- **Python version**: Python >=3.7, <=3.12 (only 64 bit) +- **Python version**: Python >=3.9, <3.13 (only 64 bit) - **Package managers**: [pip] · [conda] (via `conda-forge`) [pip]: https://pypi.org/project/spacy/ diff --git a/setup.cfg b/setup.cfg index bc7b6e9d7..5a0c04faa 100644 --- a/setup.cfg +++ b/setup.cfg @@ -30,7 +30,7 @@ project_urls = [options] zip_safe = false include_package_data = true -python_requires = >=3.9,<3.14 +python_requires = >=3.9,<3.13 # NOTE: This section is superseded by pyproject.toml and will be removed in # spaCy v4 setup_requires = diff --git a/spacy/__init__.py b/spacy/__init__.py index 1a18ad0d5..efc475dc9 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -19,6 +19,7 @@ from .glossary import explain # noqa: F401 from .language import Language from .util import logger, registry # noqa: F401 from .vocab import Vocab +from .registrations import populate_registry, REGISTRY_POPULATED if sys.maxunicode == 65535: raise SystemError(Errors.E130) diff --git a/spacy/about.py b/spacy/about.py index 1fcb06ec2..76689ba53 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,5 +1,5 @@ # fmt: off __title__ = "spacy" -__version__ = "3.8.5" +__version__ = "3.8.6" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 0edc89991..4dd23679e 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -29,7 +29,6 @@ from ..featureextractor import FeatureExtractor from ..staticvectors import StaticVectors -@registry.architectures("spacy.Tok2VecListener.v1") def
tok2vec_listener_v1(width: int, upstream: str = "*"): tok2vec = Tok2VecListener(upstream_name=upstream, width=width) return tok2vec @@ -46,7 +45,6 @@ def get_tok2vec_width(model: Model): return nO -@registry.architectures("spacy.HashEmbedCNN.v2") def build_hash_embed_cnn_tok2vec( *, width: int, @@ -102,7 +100,6 @@ def build_hash_embed_cnn_tok2vec( ) -@registry.architectures("spacy.Tok2Vec.v2") def build_Tok2Vec_model( embed: Model[List[Doc], List[Floats2d]], encode: Model[List[Floats2d], List[Floats2d]], @@ -123,7 +120,6 @@ def build_Tok2Vec_model( return tok2vec -@registry.architectures("spacy.MultiHashEmbed.v2") def MultiHashEmbed( width: int, attrs: List[Union[str, int]], @@ -201,7 +197,6 @@ def MultiHashEmbed( return model -@registry.architectures("spacy.CharacterEmbed.v2") def CharacterEmbed( width: int, rows: int, @@ -278,7 +273,6 @@ def CharacterEmbed( return model -@registry.architectures("spacy.MaxoutWindowEncoder.v2") def MaxoutWindowEncoder( width: int, window_size: int, maxout_pieces: int, depth: int ) -> Model[List[Floats2d], List[Floats2d]]: @@ -310,7 +304,6 @@ def MaxoutWindowEncoder( return with_array(model, pad=receptive_field) -@registry.architectures("spacy.MishWindowEncoder.v2") def MishWindowEncoder( width: int, window_size: int, depth: int ) -> Model[List[Floats2d], List[Floats2d]]: @@ -333,7 +326,6 @@ def MishWindowEncoder( return with_array(model) -@registry.architectures("spacy.TorchBiLSTMEncoder.v1") def BiLSTMEncoder( width: int, depth: int, dropout: float ) -> Model[List[Floats2d], List[Floats2d]]: diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index 8ac74d92b..5b5de78ef 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -22,13 +22,6 @@ TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]] MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]] -@Language.factory( - "attribute_ruler", - default_config={ - "validate": False, - "scorer": 
{"@scorers": "spacy.attribute_ruler_scorer.v1"}, - }, -) def make_attribute_ruler( nlp: Language, name: str, validate: bool, scorer: Optional[Callable] ): diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index 18a220bd6..42d50dde6 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -39,26 +39,6 @@ subword_features = true DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"] -@Language.factory( - "parser", - assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"], - default_config={ - "moves": None, - "update_with_oracle_cut_size": 100, - "learn_tokens": False, - "min_action_freq": 30, - "model": DEFAULT_PARSER_MODEL, - "scorer": {"@scorers": "spacy.parser_scorer.v1"}, - }, - default_score_weights={ - "dep_uas": 0.5, - "dep_las": 0.5, - "dep_las_per_type": None, - "sents_p": None, - "sents_r": None, - "sents_f": 0.0, - }, -) def make_parser( nlp: Language, name: str, @@ -125,29 +105,6 @@ def make_parser( ) -@Language.factory( - "beam_parser", - assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"], - default_config={ - "beam_width": 8, - "beam_density": 0.01, - "beam_update_prob": 0.5, - "moves": None, - "update_with_oracle_cut_size": 100, - "learn_tokens": False, - "min_action_freq": 30, - "model": DEFAULT_PARSER_MODEL, - "scorer": {"@scorers": "spacy.parser_scorer.v1"}, - }, - default_score_weights={ - "dep_uas": 0.5, - "dep_las": 0.5, - "dep_las_per_type": None, - "sents_p": None, - "sents_r": None, - "sents_f": 0.0, - }, -) def make_beam_parser( nlp: Language, name: str, diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py index 4a6174bc3..f8ae2cba3 100644 --- a/spacy/pipeline/edit_tree_lemmatizer.py +++ b/spacy/pipeline/edit_tree_lemmatizer.py @@ -39,20 +39,6 @@ subword_features = true DEFAULT_EDIT_TREE_LEMMATIZER_MODEL = Config().from_str(default_model_config)["model"] -@Language.factory( - "trainable_lemmatizer", 
- assigns=["token.lemma"], - requires=[], - default_config={ - "model": DEFAULT_EDIT_TREE_LEMMATIZER_MODEL, - "backoff": "orth", - "min_tree_freq": 3, - "overwrite": False, - "top_k": 1, - "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, - }, - default_score_weights={"lemma_acc": 1.0}, -) def make_edit_tree_lemmatizer( nlp: Language, name: str, diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 40a9c8a79..65293a301 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -40,32 +40,6 @@ subword_features = true DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] -@Language.factory( - "entity_linker", - requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"], - assigns=["token.ent_kb_id"], - default_config={ - "model": DEFAULT_NEL_MODEL, - "labels_discard": [], - "n_sents": 0, - "incl_prior": True, - "incl_context": True, - "entity_vector_length": 64, - "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, - "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"}, - "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"}, - "overwrite": True, - "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, - "use_gold_ents": True, - "candidates_batch_size": 1, - "threshold": None, - }, - default_score_weights={ - "nel_micro_f": 1.0, - "nel_micro_r": None, - "nel_micro_p": None, - }, -) def make_entity_linker( nlp: Language, name: str, diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 3683cfc02..22df8065d 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -19,24 +19,6 @@ DEFAULT_ENT_ID_SEP = "||" PatternType = Dict[str, Union[str, List[Dict[str, Any]]]] -@Language.factory( - "entity_ruler", - assigns=["doc.ents", "token.ent_type", "token.ent_iob"], - default_config={ - "phrase_matcher_attr": None, - "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"}, - "validate": False, - 
"overwrite_ents": False, - "ent_id_sep": DEFAULT_ENT_ID_SEP, - "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"}, - }, - default_score_weights={ - "ents_f": 1.0, - "ents_p": 0.0, - "ents_r": 0.0, - "ents_per_type": None, - }, -) def make_entity_ruler( nlp: Language, name: str, @@ -63,7 +45,6 @@ def entity_ruler_score(examples, **kwargs): return get_ner_prf(examples) -@registry.scorers("spacy.entity_ruler_scorer.v1") def make_entity_ruler_scorer(): return entity_ruler_score diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py index 2bf0437d5..e788979cf 100644 --- a/spacy/pipeline/functions.py +++ b/spacy/pipeline/functions.py @@ -73,11 +73,6 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc: return doc -@Language.factory( - "token_splitter", - default_config={"min_length": 25, "split_length": 10}, - retokenizes=True, -) def make_token_splitter( nlp: Language, name: str, *, min_length: int = 0, split_length: int = 0 ): @@ -141,10 +136,6 @@ class TokenSplitter: util.from_disk(path, serializers, []) -@Language.factory( - "doc_cleaner", - default_config={"attrs": {"tensor": None, "_.trf_data": None}, "silent": True}, -) def make_doc_cleaner(nlp: Language, name: str, *, attrs: Dict[str, Any], silent: bool): return DocCleaner(attrs, silent=silent) diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py index 09e501595..f737b84b5 100644 --- a/spacy/pipeline/lemmatizer.py +++ b/spacy/pipeline/lemmatizer.py @@ -16,17 +16,6 @@ from ..vocab import Vocab from .pipe import Pipe -@Language.factory( - "lemmatizer", - assigns=["token.lemma"], - default_config={ - "model": None, - "mode": "lookup", - "overwrite": False, - "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, - }, - default_score_weights={"lemma_acc": 1.0}, -) def make_lemmatizer( nlp: Language, model: Optional[Model], @@ -44,7 +33,6 @@ def lemmatizer_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: return Scorer.score_token_attr(examples, "lemma", 
**kwargs) -@registry.scorers("spacy.lemmatizer_scorer.v1") def make_lemmatizer_scorer(): return lemmatizer_score diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index d415ae43c..937bd00da 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -47,13 +47,6 @@ maxout_pieces = 3 DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"] -@Language.factory( - "morphologizer", - assigns=["token.morph", "token.pos"], - default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False, - "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, "label_smoothing": 0.0}, - default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None}, -) def make_morphologizer( nlp: Language, model: Model, diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx index f33a90fde..a7fdbd9b4 100644 --- a/spacy/pipeline/multitask.pyx +++ b/spacy/pipeline/multitask.pyx @@ -30,10 +30,6 @@ subword_features = true DEFAULT_MT_MODEL = Config().from_str(default_model_config)["model"] -@Language.factory( - "nn_labeller", - default_config={"labels": None, "target": "dep_tag_offset", "model": DEFAULT_MT_MODEL} -) def make_nn_labeller(nlp: Language, name: str, model: Model, labels: Optional[dict], target: str): return MultitaskObjective(nlp.vocab, model, name) diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index bb009dc7a..548f4b966 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -36,19 +36,6 @@ subword_features = true DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"] -@Language.factory( - "ner", - assigns=["doc.ents", "token.ent_iob", "token.ent_type"], - default_config={ - "moves": None, - "update_with_oracle_cut_size": 100, - "model": DEFAULT_NER_MODEL, - "incorrect_spans_key": None, - "scorer": {"@scorers": "spacy.ner_scorer.v1"}, - }, - default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": 
None}, - -) def make_ner( nlp: Language, name: str, @@ -101,21 +88,6 @@ def make_ner( ) -@Language.factory( - "beam_ner", - assigns=["doc.ents", "token.ent_iob", "token.ent_type"], - default_config={ - "moves": None, - "update_with_oracle_cut_size": 100, - "model": DEFAULT_NER_MODEL, - "beam_density": 0.01, - "beam_update_prob": 0.5, - "beam_width": 32, - "incorrect_spans_key": None, - "scorer": None, - }, - default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, -) def make_beam_ner( nlp: Language, name: str, @@ -183,7 +155,6 @@ def ner_score(examples, **kwargs): return get_ner_prf(examples, **kwargs) -@registry.scorers("spacy.ner_scorer.v1") def make_ner_scorer(): return ner_score diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 08ba9d989..9669caf7e 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -14,12 +14,6 @@ from .senter import senter_score BACKWARD_OVERWRITE = False -@Language.factory( - "sentencizer", - assigns=["token.is_sent_start", "doc.sents"], - default_config={"punct_chars": None, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}}, - default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, -) def make_sentencizer( nlp: Language, name: str, diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index df093baa9..c8e09c5ab 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -34,12 +34,6 @@ subword_features = true DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"] -@Language.factory( - "senter", - assigns=["token.is_sent_start"], - default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}}, - default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, -) def make_senter(nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]): return SentenceRecognizer(nlp.vocab, 
model, name, overwrite=overwrite, scorer=scorer) @@ -53,7 +47,6 @@ def senter_score(examples, **kwargs): return results -@registry.scorers("spacy.senter_scorer.v1") def make_senter_scorer(): return senter_score diff --git a/spacy/pipeline/span_finder.py b/spacy/pipeline/span_finder.py index a12d52911..709a67b7f 100644 --- a/spacy/pipeline/span_finder.py +++ b/spacy/pipeline/span_finder.py @@ -41,23 +41,6 @@ depth = 4 DEFAULT_SPAN_FINDER_MODEL = Config().from_str(span_finder_default_config)["model"] -@Language.factory( - "span_finder", - assigns=["doc.spans"], - default_config={ - "threshold": 0.5, - "model": DEFAULT_SPAN_FINDER_MODEL, - "spans_key": DEFAULT_SPANS_KEY, - "max_length": 25, - "min_length": None, - "scorer": {"@scorers": "spacy.span_finder_scorer.v1"}, - }, - default_score_weights={ - f"spans_{DEFAULT_SPANS_KEY}_f": 1.0, - f"spans_{DEFAULT_SPANS_KEY}_p": 0.0, - f"spans_{DEFAULT_SPANS_KEY}_r": 0.0, - }, -) def make_span_finder( nlp: Language, name: str, @@ -97,7 +80,6 @@ def make_span_finder( ) -@registry.scorers("spacy.span_finder_scorer.v1") def make_span_finder_scorer(): return span_finder_score diff --git a/spacy/pipeline/span_ruler.py b/spacy/pipeline/span_ruler.py index 2a5e2179a..1f9ab2622 100644 --- a/spacy/pipeline/span_ruler.py +++ b/spacy/pipeline/span_ruler.py @@ -32,24 +32,6 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]] DEFAULT_SPANS_KEY = "ruler" -@Language.factory( - "future_entity_ruler", - assigns=["doc.ents"], - default_config={ - "phrase_matcher_attr": None, - "validate": False, - "overwrite_ents": False, - "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"}, - "ent_id_sep": "__unused__", - "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"}, - }, - default_score_weights={ - "ents_f": 1.0, - "ents_p": 0.0, - "ents_r": 0.0, - "ents_per_type": None, - }, -) def make_entity_ruler( nlp: Language, name: str, @@ -79,30 +61,6 @@ def make_entity_ruler( ) -@Language.factory( - "span_ruler", - 
assigns=["doc.spans"], - default_config={ - "spans_key": DEFAULT_SPANS_KEY, - "spans_filter": None, - "annotate_ents": False, - "ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"}, - "phrase_matcher_attr": None, - "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"}, - "validate": False, - "overwrite": True, - "scorer": { - "@scorers": "spacy.overlapping_labeled_spans_scorer.v1", - "spans_key": DEFAULT_SPANS_KEY, - }, - }, - default_score_weights={ - f"spans_{DEFAULT_SPANS_KEY}_f": 1.0, - f"spans_{DEFAULT_SPANS_KEY}_p": 0.0, - f"spans_{DEFAULT_SPANS_KEY}_r": 0.0, - f"spans_{DEFAULT_SPANS_KEY}_per_type": None, - }, -) def make_span_ruler( nlp: Language, name: str, diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 08a5478a9..3d49bef2a 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -134,7 +134,6 @@ def preset_spans_suggester( return output -@registry.misc("spacy.ngram_suggester.v1") def build_ngram_suggester(sizes: List[int]) -> Suggester: """Suggest all spans of the given lengths. Spans are returned as a ragged array of integers. The array has two columns, indicating the start and end @@ -143,7 +142,6 @@ def build_ngram_suggester(sizes: List[int]) -> Suggester: return partial(ngram_suggester, sizes=sizes) -@registry.misc("spacy.ngram_range_suggester.v1") def build_ngram_range_suggester(min_size: int, max_size: int) -> Suggester: """Suggest all spans of the given lengths between a given min and max value - both inclusive. Spans are returned as a ragged array of integers. The array has two columns, @@ -152,7 +150,6 @@ def build_ngram_range_suggester(min_size: int, max_size: int) -> Suggester: return build_ngram_suggester(sizes) -@registry.misc("spacy.preset_spans_suggester.v1") def build_preset_spans_suggester(spans_key: str) -> Suggester: """Suggest all spans that are already stored in doc.spans[spans_key]. 
This is useful when an upstream component is used to set the spans @@ -160,19 +157,6 @@ def build_preset_spans_suggester(spans_key: str) -> Suggester: return partial(preset_spans_suggester, spans_key=spans_key) -@Language.factory( - "spancat", - assigns=["doc.spans"], - default_config={ - "threshold": 0.5, - "spans_key": DEFAULT_SPANS_KEY, - "max_positive": None, - "model": DEFAULT_SPANCAT_MODEL, - "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, - "scorer": {"@scorers": "spacy.spancat_scorer.v1"}, - }, - default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, -) def make_spancat( nlp: Language, name: str, @@ -225,19 +209,6 @@ def make_spancat( ) -@Language.factory( - "spancat_singlelabel", - assigns=["doc.spans"], - default_config={ - "spans_key": DEFAULT_SPANS_KEY, - "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL, - "negative_weight": 1.0, - "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, - "scorer": {"@scorers": "spacy.spancat_scorer.v1"}, - "allow_overlap": True, - }, - default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, -) def make_spancat_singlelabel( nlp: Language, name: str, @@ -303,7 +274,6 @@ def spancat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: return Scorer.score_spans(examples, **kwargs) -@registry.scorers("spacy.spancat_scorer.v1") def make_spancat_scorer(): return spancat_score diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 34e85d49c..f0085d3ff 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -35,12 +35,6 @@ subword_features = true DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"] -@Language.factory( - "tagger", - assigns=["token.tag"], - default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!", "label_smoothing": 0.0}, - default_score_weights={"tag_acc": 1.0}, -) def make_tagger( nlp: 
Language, name: str, @@ -64,7 +58,6 @@ def tagger_score(examples, **kwargs): return Scorer.score_token_attr(examples, "tag", **kwargs) -@registry.scorers("spacy.tagger_scorer.v1") def make_tagger_scorer(): return tagger_score diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index ae227017a..98393355f 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -74,27 +74,6 @@ subword_features = true """ -@Language.factory( - "textcat", - assigns=["doc.cats"], - default_config={ - "threshold": 0.0, - "model": DEFAULT_SINGLE_TEXTCAT_MODEL, - "scorer": {"@scorers": "spacy.textcat_scorer.v2"}, - }, - default_score_weights={ - "cats_score": 1.0, - "cats_score_desc": None, - "cats_micro_p": None, - "cats_micro_r": None, - "cats_micro_f": None, - "cats_macro_p": None, - "cats_macro_r": None, - "cats_macro_f": None, - "cats_macro_auc": None, - "cats_f_per_type": None, - }, -) def make_textcat( nlp: Language, name: str, @@ -123,7 +102,6 @@ def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: ) -@registry.scorers("spacy.textcat_scorer.v2") def make_textcat_scorer(): return textcat_score diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py index 2f8d5e604..f1306f92c 100644 --- a/spacy/pipeline/textcat_multilabel.py +++ b/spacy/pipeline/textcat_multilabel.py @@ -72,27 +72,6 @@ subword_features = true """ -@Language.factory( - "textcat_multilabel", - assigns=["doc.cats"], - default_config={ - "threshold": 0.5, - "model": DEFAULT_MULTI_TEXTCAT_MODEL, - "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"}, - }, - default_score_weights={ - "cats_score": 1.0, - "cats_score_desc": None, - "cats_micro_p": None, - "cats_micro_r": None, - "cats_micro_f": None, - "cats_macro_p": None, - "cats_macro_r": None, - "cats_macro_f": None, - "cats_macro_auc": None, - "cats_f_per_type": None, - }, -) def make_multilabel_textcat( nlp: Language, name: str, @@ -124,7 +103,6 @@ def 
textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str, ) -@registry.scorers("spacy.textcat_multilabel_scorer.v2") def make_textcat_multilabel_scorer(): return textcat_multilabel_score diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 677f5eec1..22c30d548 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -24,9 +24,6 @@ subword_features = true DEFAULT_TOK2VEC_MODEL = Config().from_str(default_model_config)["model"] -@Language.factory( - "tok2vec", assigns=["doc.tensor"], default_config={"model": DEFAULT_TOK2VEC_MODEL} -) def make_tok2vec(nlp: Language, name: str, model: Model) -> "Tok2Vec": return Tok2Vec(nlp.vocab, model, name) diff --git a/spacy/registrations.py b/spacy/registrations.py new file mode 100644 index 000000000..4a5f18293 --- /dev/null +++ b/spacy/registrations.py @@ -0,0 +1,506 @@ +"""Centralized registry population for spaCy components. + +This module centralizes registry decorations to prevent circular import issues +with Cython annotation changes from __future__ import annotations. Functions +remain in their original locations, but decoration is moved here. +""" +from typing import Dict, Any, Callable, Iterable, List, Optional, Union + +# Global flag to track if registry has been populated +REGISTRY_POPULATED = False + +# Global flag to track if factories have been registered +FACTORIES_REGISTERED = False + +def populate_registry() -> None: + """Populate the registry with all necessary components. + + This function should be called before accessing the registry, to ensure + it's populated. The function uses a global flag to prevent repopulation. 
+ """ + global REGISTRY_POPULATED + if REGISTRY_POPULATED: + return + + # Import all necessary modules + from .util import registry, make_first_longest_spans_filter + + # Import all pipeline components that were using registry decorators + from .pipeline.tagger import make_tagger_scorer + from .pipeline.ner import make_ner_scorer + from .pipeline.lemmatizer import make_lemmatizer_scorer + from .pipeline.span_finder import make_span_finder_scorer + from .pipeline.spancat import make_spancat_scorer, build_ngram_suggester, build_ngram_range_suggester, build_preset_spans_suggester + from .pipeline.entityruler import make_entity_ruler_scorer as make_entityruler_scorer + from .pipeline.sentencizer import senter_score as make_sentencizer_scorer + from .pipeline.senter import make_senter_scorer + from .pipeline.textcat import make_textcat_scorer + from .pipeline.textcat_multilabel import make_textcat_multilabel_scorer + + # Register miscellaneous components + registry.misc("spacy.first_longest_spans_filter.v1")(make_first_longest_spans_filter) + registry.misc("spacy.ngram_suggester.v1")(build_ngram_suggester) + registry.misc("spacy.ngram_range_suggester.v1")(build_ngram_range_suggester) + registry.misc("spacy.preset_spans_suggester.v1")(build_preset_spans_suggester) + + # Need to get references to the existing functions in registry by importing the function that is there + # For the registry that was previously decorated + + # Import ML components that use registry + from .ml.models.tok2vec import tok2vec_listener_v1, build_hash_embed_cnn_tok2vec, build_Tok2Vec_model, MultiHashEmbed, CharacterEmbed, MaxoutWindowEncoder, MishWindowEncoder, BiLSTMEncoder + + # Register scorers + registry.scorers("spacy.tagger_scorer.v1")(make_tagger_scorer) + registry.scorers("spacy.ner_scorer.v1")(make_ner_scorer) + # span_ruler_scorer removed as it's not in span_ruler.py + registry.scorers("spacy.entity_ruler_scorer.v1")(make_entityruler_scorer) + 
registry.scorers("spacy.sentencizer_scorer.v1")(make_senter_scorer)  # register the factory (returns senter_score), consistent with the other scorers + registry.scorers("spacy.senter_scorer.v1")(make_senter_scorer) + registry.scorers("spacy.textcat_scorer.v1")(make_textcat_scorer) + registry.scorers("spacy.textcat_scorer.v2")(make_textcat_scorer) + registry.scorers("spacy.textcat_multilabel_scorer.v1")(make_textcat_multilabel_scorer) + registry.scorers("spacy.textcat_multilabel_scorer.v2")(make_textcat_multilabel_scorer) + registry.scorers("spacy.lemmatizer_scorer.v1")(make_lemmatizer_scorer) + registry.scorers("spacy.span_finder_scorer.v1")(make_span_finder_scorer) + registry.scorers("spacy.spancat_scorer.v1")(make_spancat_scorer) + + # Register tok2vec architectures we've modified + registry.architectures("spacy.Tok2VecListener.v1")(tok2vec_listener_v1) + registry.architectures("spacy.HashEmbedCNN.v2")(build_hash_embed_cnn_tok2vec) + registry.architectures("spacy.Tok2Vec.v2")(build_Tok2Vec_model) + registry.architectures("spacy.MultiHashEmbed.v2")(MultiHashEmbed) + registry.architectures("spacy.CharacterEmbed.v2")(CharacterEmbed) + registry.architectures("spacy.MaxoutWindowEncoder.v2")(MaxoutWindowEncoder) + registry.architectures("spacy.MishWindowEncoder.v2")(MishWindowEncoder) + registry.architectures("spacy.TorchBiLSTMEncoder.v1")(BiLSTMEncoder) + + # Register factory components + register_factories() + + # Set the flag to indicate that the registry has been populated + REGISTRY_POPULATED = True + + +def register_factories() -> None: + """Register all factories with the registry. + + This function registers all pipeline component factories, centralizing + the registrations that were previously done with @Language.factory decorators.
+ """ + global FACTORIES_REGISTERED + if FACTORIES_REGISTERED: + return + + from .language import Language + + # Import factory default configurations + from .pipeline.entity_linker import DEFAULT_NEL_MODEL + from .pipeline.entityruler import DEFAULT_ENT_ID_SEP + from .pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL + from .pipeline.senter import DEFAULT_SENTER_MODEL + from .pipeline.morphologizer import DEFAULT_MORPH_MODEL + from .pipeline.spancat import DEFAULT_SPANCAT_MODEL, DEFAULT_SPANCAT_SINGLELABEL_MODEL, DEFAULT_SPANS_KEY + from .pipeline.span_ruler import DEFAULT_SPANS_KEY as SPAN_RULER_DEFAULT_SPANS_KEY + from .pipeline.edit_tree_lemmatizer import DEFAULT_EDIT_TREE_LEMMATIZER_MODEL + from .pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL + from .pipeline.span_finder import DEFAULT_SPAN_FINDER_MODEL + from .pipeline.ner import DEFAULT_NER_MODEL + from .pipeline.dep_parser import DEFAULT_PARSER_MODEL + from .pipeline.tagger import DEFAULT_TAGGER_MODEL + from .pipeline.multitask import DEFAULT_MT_MODEL + + # Import all factory functions + from .pipeline.attributeruler import make_attribute_ruler + from .pipeline.entity_linker import make_entity_linker + from .pipeline.entityruler import make_entity_ruler + from .pipeline.lemmatizer import make_lemmatizer + from .pipeline.textcat import make_textcat, DEFAULT_SINGLE_TEXTCAT_MODEL + from .pipeline.functions import make_token_splitter, make_doc_cleaner + from .pipeline.tok2vec import make_tok2vec + from .pipeline.senter import make_senter + from .pipeline.morphologizer import make_morphologizer + from .pipeline.spancat import make_spancat, make_spancat_singlelabel + from .pipeline.span_ruler import make_entity_ruler as make_span_entity_ruler, make_span_ruler + from .pipeline.edit_tree_lemmatizer import make_edit_tree_lemmatizer + from .pipeline.textcat_multilabel import make_multilabel_textcat + from .pipeline.span_finder import make_span_finder + from .pipeline.ner import make_ner, make_beam_ner + 
from .pipeline.dep_parser import make_parser, make_beam_parser + from .pipeline.tagger import make_tagger + from .pipeline.multitask import make_nn_labeller + from .pipeline.sentencizer import make_sentencizer + + # Register factories using the same pattern as Language.factory decorator + # We use Language.factory()() pattern which exactly mimics the decorator + + # attributeruler + Language.factory( + "attribute_ruler", + default_config={ + "validate": False, + "scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"}, + }, + )(make_attribute_ruler) + + # entity_linker + Language.factory( + "entity_linker", + requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"], + assigns=["token.ent_kb_id"], + default_config={ + "model": DEFAULT_NEL_MODEL, + "labels_discard": [], + "n_sents": 0, + "incl_prior": True, + "incl_context": True, + "entity_vector_length": 64, + "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, + "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"}, + "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"}, + "overwrite": True, + "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, + "use_gold_ents": True, + "candidates_batch_size": 1, + "threshold": None, + }, + default_score_weights={ + "nel_micro_f": 1.0, + "nel_micro_r": None, + "nel_micro_p": None, + }, + )(make_entity_linker) + + # entity_ruler + Language.factory( + "entity_ruler", + assigns=["doc.ents", "token.ent_type", "token.ent_iob"], + default_config={ + "phrase_matcher_attr": None, + "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"}, + "validate": False, + "overwrite_ents": False, + "ent_id_sep": DEFAULT_ENT_ID_SEP, + "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"}, + }, + default_score_weights={ + "ents_f": 1.0, + "ents_p": 0.0, + "ents_r": 0.0, + "ents_per_type": None, + }, + )(make_entity_ruler) + + # lemmatizer + Language.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={ + "model": None, + "mode": 
"lookup", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, + default_score_weights={"lemma_acc": 1.0}, + )(make_lemmatizer) + + # textcat + Language.factory( + "textcat", + assigns=["doc.cats"], + default_config={ + "threshold": 0.0, + "model": DEFAULT_SINGLE_TEXTCAT_MODEL, + "scorer": {"@scorers": "spacy.textcat_scorer.v2"}, + }, + default_score_weights={ + "cats_score": 1.0, + "cats_score_desc": None, + "cats_micro_p": None, + "cats_micro_r": None, + "cats_micro_f": None, + "cats_macro_p": None, + "cats_macro_r": None, + "cats_macro_f": None, + "cats_macro_auc": None, + "cats_f_per_type": None, + }, + )(make_textcat) + + # token_splitter + Language.factory( + "token_splitter", + default_config={"min_length": 25, "split_length": 10}, + retokenizes=True, + )(make_token_splitter) + + # doc_cleaner + Language.factory( + "doc_cleaner", + default_config={"attrs": {"tensor": None, "_.trf_data": None}, "silent": True}, + )(make_doc_cleaner) + + # tok2vec + Language.factory( + "tok2vec", + assigns=["doc.tensor"], + default_config={"model": DEFAULT_TOK2VEC_MODEL} + )(make_tok2vec) + + # senter + Language.factory( + "senter", + assigns=["token.is_sent_start"], + default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}}, + default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, + )(make_senter) + + # morphologizer + Language.factory( + "morphologizer", + assigns=["token.morph", "token.pos"], + default_config={ + "model": DEFAULT_MORPH_MODEL, + "overwrite": True, + "extend": False, + "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, + "label_smoothing": 0.0 + }, + default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None}, + )(make_morphologizer) + + # spancat + Language.factory( + "spancat", + assigns=["doc.spans"], + default_config={ + "threshold": 0.5, + "spans_key": DEFAULT_SPANS_KEY, + "max_positive": None, + "model": 
DEFAULT_SPANCAT_MODEL, + "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, + "scorer": {"@scorers": "spacy.spancat_scorer.v1"}, + }, + default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, + )(make_spancat) + + # spancat_singlelabel + Language.factory( + "spancat_singlelabel", + assigns=["doc.spans"], + default_config={ + "spans_key": DEFAULT_SPANS_KEY, + "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL, + "negative_weight": 1.0, + "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, + "scorer": {"@scorers": "spacy.spancat_scorer.v1"}, + "allow_overlap": True, + }, + default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, + )(make_spancat_singlelabel) + + # future_entity_ruler + Language.factory( + "future_entity_ruler", + assigns=["doc.ents"], + default_config={ + "phrase_matcher_attr": None, + "validate": False, + "overwrite_ents": False, + "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"}, + "ent_id_sep": "__unused__", + "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"}, + }, + default_score_weights={ + "ents_f": 1.0, + "ents_p": 0.0, + "ents_r": 0.0, + "ents_per_type": None, + }, + )(make_span_entity_ruler) + + # span_ruler + Language.factory( + "span_ruler", + assigns=["doc.spans"], + default_config={ + "spans_key": SPAN_RULER_DEFAULT_SPANS_KEY, + "spans_filter": None, + "annotate_ents": False, + "ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"}, + "phrase_matcher_attr": None, + "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"}, + "validate": False, + "overwrite": True, + "scorer": { + "@scorers": "spacy.overlapping_labeled_spans_scorer.v1", + "spans_key": SPAN_RULER_DEFAULT_SPANS_KEY, + }, + }, + default_score_weights={ + f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_f": 1.0, + f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_p": 0.0, + f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_r": 0.0, + f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_per_type": 
None, + }, + )(make_span_ruler) + + # trainable_lemmatizer + Language.factory( + "trainable_lemmatizer", + assigns=["token.lemma"], + requires=[], + default_config={ + "model": DEFAULT_EDIT_TREE_LEMMATIZER_MODEL, + "backoff": "orth", + "min_tree_freq": 3, + "overwrite": False, + "top_k": 1, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, + default_score_weights={"lemma_acc": 1.0}, + )(make_edit_tree_lemmatizer) + + # textcat_multilabel + Language.factory( + "textcat_multilabel", + assigns=["doc.cats"], + default_config={ + "threshold": 0.5, + "model": DEFAULT_MULTI_TEXTCAT_MODEL, + "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"}, + }, + default_score_weights={ + "cats_score": 1.0, + "cats_score_desc": None, + "cats_micro_p": None, + "cats_micro_r": None, + "cats_micro_f": None, + "cats_macro_p": None, + "cats_macro_r": None, + "cats_macro_f": None, + "cats_macro_auc": None, + "cats_f_per_type": None, + }, + )(make_multilabel_textcat) + + # span_finder + Language.factory( + "span_finder", + assigns=["doc.spans"], + default_config={ + "threshold": 0.5, + "model": DEFAULT_SPAN_FINDER_MODEL, + "spans_key": DEFAULT_SPANS_KEY, + "max_length": 25, + "min_length": None, + "scorer": {"@scorers": "spacy.span_finder_scorer.v1"}, + }, + default_score_weights={ + f"spans_{DEFAULT_SPANS_KEY}_f": 1.0, + f"spans_{DEFAULT_SPANS_KEY}_p": 0.0, + f"spans_{DEFAULT_SPANS_KEY}_r": 0.0, + }, + )(make_span_finder) + + # ner + Language.factory( + "ner", + assigns=["doc.ents", "token.ent_iob", "token.ent_type"], + default_config={ + "moves": None, + "update_with_oracle_cut_size": 100, + "model": DEFAULT_NER_MODEL, + "incorrect_spans_key": None, + "scorer": {"@scorers": "spacy.ner_scorer.v1"}, + }, + default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, + )(make_ner) + + # beam_ner + Language.factory( + "beam_ner", + assigns=["doc.ents", "token.ent_iob", "token.ent_type"], + default_config={ + "moves": None, + 
"update_with_oracle_cut_size": 100, + "model": DEFAULT_NER_MODEL, + "beam_density": 0.01, + "beam_update_prob": 0.5, + "beam_width": 32, + "incorrect_spans_key": None, + "scorer": {"@scorers": "spacy.ner_scorer.v1"}, + }, + default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, + )(make_beam_ner) + + # parser + Language.factory( + "parser", + assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"], + default_config={ + "moves": None, + "update_with_oracle_cut_size": 100, + "learn_tokens": False, + "min_action_freq": 30, + "model": DEFAULT_PARSER_MODEL, + "scorer": {"@scorers": "spacy.parser_scorer.v1"}, + }, + default_score_weights={ + "dep_uas": 0.5, + "dep_las": 0.5, + "dep_las_per_type": None, + "sents_p": None, + "sents_r": None, + "sents_f": 0.0, + }, + )(make_parser) + + # beam_parser + Language.factory( + "beam_parser", + assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"], + default_config={ + "moves": None, + "update_with_oracle_cut_size": 100, + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 8, + "beam_density": 0.0001, + "beam_update_prob": 0.5, + "model": DEFAULT_PARSER_MODEL, + "scorer": {"@scorers": "spacy.parser_scorer.v1"}, + }, + default_score_weights={ + "dep_uas": 0.5, + "dep_las": 0.5, + "dep_las_per_type": None, + "sents_p": None, + "sents_r": None, + "sents_f": 0.0, + }, + )(make_beam_parser) + + # tagger + Language.factory( + "tagger", + assigns=["token.tag"], + default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!", "label_smoothing": 0.0}, + default_score_weights={"tag_acc": 1.0, "pos_acc": 0.0, "tag_micro_p": None, "tag_micro_r": None, "tag_micro_f": None}, + )(make_tagger) + + # nn_labeller + Language.factory( + "nn_labeller", + default_config={"labels": None, "target": "dep_tag_offset", "model": DEFAULT_MT_MODEL} + )(make_nn_labeller) + + # sentencizer + 
Language.factory( + "sentencizer", + assigns=["token.is_sent_start", "doc.sents"], + default_config={"punct_chars": None, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}}, + default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, + )(make_sentencizer) + + # Set the flag to indicate that all factories have been registered + FACTORIES_REGISTERED = True diff --git a/spacy/tests/factory_registrations.json b/spacy/tests/factory_registrations.json new file mode 100644 index 000000000..475e48020 --- /dev/null +++ b/spacy/tests/factory_registrations.json @@ -0,0 +1,132 @@ +{ + "attribute_ruler": { + "name": "attribute_ruler", + "module": "spacy.pipeline.attributeruler", + "function": "make_attribute_ruler" + }, + "beam_ner": { + "name": "beam_ner", + "module": "spacy.pipeline.ner", + "function": "make_beam_ner" + }, + "beam_parser": { + "name": "beam_parser", + "module": "spacy.pipeline.dep_parser", + "function": "make_beam_parser" + }, + "doc_cleaner": { + "name": "doc_cleaner", + "module": "spacy.pipeline.functions", + "function": "make_doc_cleaner" + }, + "entity_linker": { + "name": "entity_linker", + "module": "spacy.pipeline.entity_linker", + "function": "make_entity_linker" + }, + "entity_ruler": { + "name": "entity_ruler", + "module": "spacy.pipeline.entityruler", + "function": "make_entity_ruler" + }, + "future_entity_ruler": { + "name": "future_entity_ruler", + "module": "spacy.pipeline.span_ruler", + "function": "make_entity_ruler" + }, + "lemmatizer": { + "name": "lemmatizer", + "module": "spacy.pipeline.lemmatizer", + "function": "make_lemmatizer" + }, + "merge_entities": { + "name": "merge_entities", + "module": "spacy.language", + "function": "Language.component..add_component..factory_func" + }, + "merge_noun_chunks": { + "name": "merge_noun_chunks", + "module": "spacy.language", + "function": "Language.component..add_component..factory_func" + }, + "merge_subtokens": { + "name": "merge_subtokens", + "module": 
"spacy.language", + "function": "Language.component..add_component..factory_func" + }, + "morphologizer": { + "name": "morphologizer", + "module": "spacy.pipeline.morphologizer", + "function": "make_morphologizer" + }, + "ner": { + "name": "ner", + "module": "spacy.pipeline.ner", + "function": "make_ner" + }, + "parser": { + "name": "parser", + "module": "spacy.pipeline.dep_parser", + "function": "make_parser" + }, + "sentencizer": { + "name": "sentencizer", + "module": "spacy.pipeline.sentencizer", + "function": "make_sentencizer" + }, + "senter": { + "name": "senter", + "module": "spacy.pipeline.senter", + "function": "make_senter" + }, + "span_finder": { + "name": "span_finder", + "module": "spacy.pipeline.span_finder", + "function": "make_span_finder" + }, + "span_ruler": { + "name": "span_ruler", + "module": "spacy.pipeline.span_ruler", + "function": "make_span_ruler" + }, + "spancat": { + "name": "spancat", + "module": "spacy.pipeline.spancat", + "function": "make_spancat" + }, + "spancat_singlelabel": { + "name": "spancat_singlelabel", + "module": "spacy.pipeline.spancat", + "function": "make_spancat_singlelabel" + }, + "tagger": { + "name": "tagger", + "module": "spacy.pipeline.tagger", + "function": "make_tagger" + }, + "textcat": { + "name": "textcat", + "module": "spacy.pipeline.textcat", + "function": "make_textcat" + }, + "textcat_multilabel": { + "name": "textcat_multilabel", + "module": "spacy.pipeline.textcat_multilabel", + "function": "make_multilabel_textcat" + }, + "tok2vec": { + "name": "tok2vec", + "module": "spacy.pipeline.tok2vec", + "function": "make_tok2vec" + }, + "token_splitter": { + "name": "token_splitter", + "module": "spacy.pipeline.functions", + "function": "make_token_splitter" + }, + "trainable_lemmatizer": { + "name": "trainable_lemmatizer", + "module": "spacy.pipeline.edit_tree_lemmatizer", + "function": "make_edit_tree_lemmatizer" + } +} \ No newline at end of file diff --git a/spacy/tests/registry_contents.json 
b/spacy/tests/registry_contents.json new file mode 100644 index 000000000..1836d0328 --- /dev/null +++ b/spacy/tests/registry_contents.json @@ -0,0 +1,284 @@ +{ + "architectures": [ + "spacy-legacy.CharacterEmbed.v1", + "spacy-legacy.EntityLinker.v1", + "spacy-legacy.HashEmbedCNN.v1", + "spacy-legacy.MaxoutWindowEncoder.v1", + "spacy-legacy.MishWindowEncoder.v1", + "spacy-legacy.MultiHashEmbed.v1", + "spacy-legacy.Tagger.v1", + "spacy-legacy.TextCatBOW.v1", + "spacy-legacy.TextCatCNN.v1", + "spacy-legacy.TextCatEnsemble.v1", + "spacy-legacy.Tok2Vec.v1", + "spacy-legacy.TransitionBasedParser.v1", + "spacy.CharacterEmbed.v2", + "spacy.EntityLinker.v2", + "spacy.HashEmbedCNN.v2", + "spacy.MaxoutWindowEncoder.v2", + "spacy.MishWindowEncoder.v2", + "spacy.MultiHashEmbed.v2", + "spacy.PretrainCharacters.v1", + "spacy.PretrainVectors.v1", + "spacy.SpanCategorizer.v1", + "spacy.SpanFinder.v1", + "spacy.Tagger.v2", + "spacy.TextCatBOW.v2", + "spacy.TextCatBOW.v3", + "spacy.TextCatCNN.v2", + "spacy.TextCatEnsemble.v2", + "spacy.TextCatLowData.v1", + "spacy.TextCatParametricAttention.v1", + "spacy.TextCatReduce.v1", + "spacy.Tok2Vec.v2", + "spacy.Tok2VecListener.v1", + "spacy.TorchBiLSTMEncoder.v1", + "spacy.TransitionBasedParser.v2" + ], + "augmenters": [ + "spacy.combined_augmenter.v1", + "spacy.lower_case.v1", + "spacy.orth_variants.v1" + ], + "batchers": [ + "spacy.batch_by_padded.v1", + "spacy.batch_by_sequence.v1", + "spacy.batch_by_words.v1" + ], + "callbacks": [ + "spacy.copy_from_base_model.v1", + "spacy.models_and_pipes_with_nvtx_range.v1", + "spacy.models_with_nvtx_range.v1" + ], + "cli": [], + "datasets": [], + "displacy_colors": [], + "factories": [ + "attribute_ruler", + "beam_ner", + "beam_parser", + "doc_cleaner", + "entity_linker", + "entity_ruler", + "future_entity_ruler", + "lemmatizer", + "merge_entities", + "merge_noun_chunks", + "merge_subtokens", + "morphologizer", + "ner", + "parser", + "sentencizer", + "senter", + "span_finder", + "span_ruler", + 
"spancat", + "spancat_singlelabel", + "tagger", + "textcat", + "textcat_multilabel", + "tok2vec", + "token_splitter", + "trainable_lemmatizer" + ], + "initializers": [ + "glorot_normal_init.v1", + "glorot_uniform_init.v1", + "he_normal_init.v1", + "he_uniform_init.v1", + "lecun_normal_init.v1", + "lecun_uniform_init.v1", + "normal_init.v1", + "uniform_init.v1", + "zero_init.v1" + ], + "languages": [], + "layers": [ + "CauchySimilarity.v1", + "ClippedLinear.v1", + "Dish.v1", + "Dropout.v1", + "Embed.v1", + "Gelu.v1", + "HardSigmoid.v1", + "HardSwish.v1", + "HardSwishMobilenet.v1", + "HardTanh.v1", + "HashEmbed.v1", + "LSTM.v1", + "LayerNorm.v1", + "Linear.v1", + "Logistic.v1", + "MXNetWrapper.v1", + "Maxout.v1", + "Mish.v1", + "MultiSoftmax.v1", + "ParametricAttention.v1", + "ParametricAttention.v2", + "PyTorchLSTM.v1", + "PyTorchRNNWrapper.v1", + "PyTorchWrapper.v1", + "PyTorchWrapper.v2", + "PyTorchWrapper.v3", + "Relu.v1", + "ReluK.v1", + "Sigmoid.v1", + "Softmax.v1", + "Softmax.v2", + "SparseLinear.v1", + "SparseLinear.v2", + "Swish.v1", + "add.v1", + "bidirectional.v1", + "chain.v1", + "clone.v1", + "concatenate.v1", + "expand_window.v1", + "list2array.v1", + "list2padded.v1", + "list2ragged.v1", + "noop.v1", + "padded2list.v1", + "premap_ids.v1", + "ragged2list.v1", + "reduce_first.v1", + "reduce_last.v1", + "reduce_max.v1", + "reduce_mean.v1", + "reduce_sum.v1", + "remap_ids.v1", + "remap_ids.v2", + "residual.v1", + "resizable.v1", + "siamese.v1", + "sigmoid_activation.v1", + "softmax_activation.v1", + "spacy-legacy.StaticVectors.v1", + "spacy.CharEmbed.v1", + "spacy.FeatureExtractor.v1", + "spacy.LinearLogistic.v1", + "spacy.PrecomputableAffine.v1", + "spacy.StaticVectors.v2", + "spacy.TransitionModel.v1", + "spacy.extract_ngrams.v1", + "spacy.extract_spans.v1", + "spacy.mean_max_reducer.v1", + "strings2arrays.v1", + "tuplify.v1", + "uniqued.v1", + "with_array.v1", + "with_array2d.v1", + "with_cpu.v1", + "with_flatten.v1", + "with_flatten.v2", + 
"with_getitem.v1", + "with_list.v1", + "with_padded.v1", + "with_ragged.v1", + "with_reshape.v1" + ], + "lemmatizers": [], + "loggers": [ + "spacy-legacy.ConsoleLogger.v1", + "spacy-legacy.ConsoleLogger.v2", + "spacy-legacy.WandbLogger.v1", + "spacy.ChainLogger.v1", + "spacy.ClearMLLogger.v1", + "spacy.ClearMLLogger.v2", + "spacy.ConsoleLogger.v2", + "spacy.ConsoleLogger.v3", + "spacy.CupyLogger.v1", + "spacy.LookupLogger.v1", + "spacy.MLflowLogger.v1", + "spacy.MLflowLogger.v2", + "spacy.PyTorchLogger.v1", + "spacy.WandbLogger.v1", + "spacy.WandbLogger.v2", + "spacy.WandbLogger.v3", + "spacy.WandbLogger.v4", + "spacy.WandbLogger.v5" + ], + "lookups": [], + "losses": [ + "CategoricalCrossentropy.v1", + "CategoricalCrossentropy.v2", + "CategoricalCrossentropy.v3", + "CosineDistance.v1", + "L2Distance.v1", + "SequenceCategoricalCrossentropy.v1", + "SequenceCategoricalCrossentropy.v2", + "SequenceCategoricalCrossentropy.v3" + ], + "misc": [ + "spacy.CandidateBatchGenerator.v1", + "spacy.CandidateGenerator.v1", + "spacy.EmptyKB.v1", + "spacy.EmptyKB.v2", + "spacy.KBFromFile.v1", + "spacy.LookupsDataLoader.v1", + "spacy.first_longest_spans_filter.v1", + "spacy.levenshtein_compare.v1", + "spacy.ngram_range_suggester.v1", + "spacy.ngram_suggester.v1", + "spacy.preset_spans_suggester.v1", + "spacy.prioritize_existing_ents_filter.v1", + "spacy.prioritize_new_ents_filter.v1" + ], + "models": [], + "ops": [ + "CupyOps", + "MPSOps", + "NumpyOps" + ], + "optimizers": [ + "Adam.v1", + "RAdam.v1", + "SGD.v1" + ], + "readers": [ + "ml_datasets.cmu_movies.v1", + "ml_datasets.dbpedia.v1", + "ml_datasets.imdb_sentiment.v1", + "spacy.Corpus.v1", + "spacy.JsonlCorpus.v1", + "spacy.PlainTextCorpus.v1", + "spacy.read_labels.v1", + "srsly.read_json.v1", + "srsly.read_jsonl.v1", + "srsly.read_msgpack.v1", + "srsly.read_yaml.v1" + ], + "schedules": [ + "compounding.v1", + "constant.v1", + "constant_then.v1", + "cyclic_triangular.v1", + "decaying.v1", + "slanted_triangular.v1", + 
"warmup_linear.v1" + ], + "scorers": [ + "spacy-legacy.textcat_multilabel_scorer.v1", + "spacy-legacy.textcat_scorer.v1", + "spacy.attribute_ruler_scorer.v1", + "spacy.entity_linker_scorer.v1", + "spacy.entity_ruler_scorer.v1", + "spacy.lemmatizer_scorer.v1", + "spacy.morphologizer_scorer.v1", + "spacy.ner_scorer.v1", + "spacy.overlapping_labeled_spans_scorer.v1", + "spacy.parser_scorer.v1", + "spacy.senter_scorer.v1", + "spacy.span_finder_scorer.v1", + "spacy.spancat_scorer.v1", + "spacy.tagger_scorer.v1", + "spacy.textcat_multilabel_scorer.v2", + "spacy.textcat_scorer.v2" + ], + "tokenizers": [ + "spacy.Tokenizer.v1" + ], + "vectors": [ + "spacy.Vectors.v1" + ] +} diff --git a/spacy/tests/test_factory_registrations.py b/spacy/tests/test_factory_registrations.py new file mode 100644 index 000000000..7dbcc81a5 --- /dev/null +++ b/spacy/tests/test_factory_registrations.py @@ -0,0 +1,76 @@ +import json +import inspect +import pytest +from pathlib import Path +from spacy.language import Language +from spacy.util import registry + +# Path to the reference factory registrations, relative to this file +REFERENCE_FILE = Path(__file__).parent / "factory_registrations.json" + +# Monkey patch the util.is_same_func to handle Cython functions +import inspect +from spacy import util + +original_is_same_func = util.is_same_func + +def patched_is_same_func(func1, func2): + # Handle Cython functions + try: + return original_is_same_func(func1, func2) + except TypeError: + # For Cython functions, just compare the string representation + return str(func1) == str(func2) + +util.is_same_func = patched_is_same_func + +@pytest.fixture +def reference_factory_registrations(): + """Load reference factory registrations from JSON file""" + if not REFERENCE_FILE.exists(): + pytest.fail(f"Reference file {REFERENCE_FILE} not found. 
Run export_factory_registrations.py first.") + + with REFERENCE_FILE.open("r") as f: + return json.load(f) + +def test_factory_registrations_preserved(reference_factory_registrations): + """Test that all factory registrations from the reference file are still present.""" + # Ensure the registry is populated + registry.ensure_populated() + + # Get all factory registrations + all_factories = registry.factories.get_all() + + # Initialize our data structure to store current factory registrations + current_registrations = {} + + # Process factory registrations + for name, func in all_factories.items(): + # Store information about each factory + try: + module_name = func.__module__ + except (AttributeError, TypeError): + # For Cython functions, just use a placeholder + module_name = str(func).split()[1].split('.')[0] + + try: + func_name = func.__qualname__ + except (AttributeError, TypeError): + # For Cython functions, use the function's name + func_name = func.__name__ if hasattr(func, "__name__") else str(func).split()[1].split('.')[-1] + + current_registrations[name] = { + "name": name, + "module": module_name, + "function": func_name, + } + + # Check for missing registrations + missing_registrations = set(reference_factory_registrations.keys()) - set(current_registrations.keys()) + assert not missing_registrations, f"Missing factory registrations: {', '.join(sorted(missing_registrations))}" + + # Check for new registrations (not an error, but informative) + new_registrations = set(current_registrations.keys()) - set(reference_factory_registrations.keys()) + if new_registrations: + # This is not an error, just informative + print(f"New factory registrations found: {', '.join(sorted(new_registrations))}") \ No newline at end of file diff --git a/spacy/tests/test_registry_population.py b/spacy/tests/test_registry_population.py new file mode 100644 index 000000000..732e57a0d --- /dev/null +++ b/spacy/tests/test_registry_population.py @@ -0,0 +1,48 @@ +import json 
+import os +import pytest +from pathlib import Path +from spacy.util import registry + +# Path to the reference registry contents, relative to this file +REFERENCE_FILE = Path(__file__).parent / "registry_contents.json" + +@pytest.fixture +def reference_registry(): + """Load reference registry contents from JSON file""" + if not REFERENCE_FILE.exists(): + pytest.fail(f"Reference file {REFERENCE_FILE} not found.") + + with REFERENCE_FILE.open("r") as f: + return json.load(f) + +def test_registry_types(reference_registry): + """Test that all registry types match the reference""" + # Get current registry types + current_registry_types = set(registry.get_registry_names()) + expected_registry_types = set(reference_registry.keys()) + + # Check for missing registry types + missing_types = expected_registry_types - current_registry_types + assert not missing_types, f"Missing registry types: {', '.join(missing_types)}" + +def test_registry_entries(reference_registry): + """Test that all registry entries are present""" + # Check each registry's entries + for registry_name, expected_entries in reference_registry.items(): + # Skip if this registry type doesn't exist + if not hasattr(registry, registry_name): + pytest.fail(f"Registry '{registry_name}' does not exist.") + + # Get current entries + reg = getattr(registry, registry_name) + current_entries = sorted(list(reg.get_all().keys())) + + # Compare entries + expected_set = set(expected_entries) + current_set = set(current_entries) + + # Check for missing entries - these would indicate our new registry population + # mechanism is missing something + missing_entries = expected_set - current_set + assert not missing_entries, f"Registry '{registry_name}' missing entries: {', '.join(missing_entries)}" \ No newline at end of file diff --git a/spacy/util.py b/spacy/util.py index c127be03c..96b52e21d 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -132,9 +132,17 @@ class registry(thinc.registry): models = 
catalogue.create("spacy", "models", entry_points=True) cli = catalogue.create("spacy", "cli", entry_points=True) + @classmethod + def ensure_populated(cls) -> None: + """Ensure the registry is populated with all necessary components.""" + from .registrations import populate_registry, REGISTRY_POPULATED + if not REGISTRY_POPULATED: + populate_registry() + @classmethod def get_registry_names(cls) -> List[str]: """List all available registries.""" + cls.ensure_populated() names = [] for name, value in inspect.getmembers(cls): if not name.startswith("_") and isinstance(value, Registry): @@ -144,6 +152,7 @@ class registry(thinc.registry): @classmethod def get(cls, registry_name: str, func_name: str) -> Callable: """Get a registered function from the registry.""" + cls.ensure_populated() # We're overwriting this classmethod so we're able to provide more # specific error messages and implement a fallback to spacy-legacy. if not hasattr(cls, registry_name): @@ -179,6 +188,7 @@ class registry(thinc.registry): func_name (str): Name of the registered function. RETURNS (Dict[str, Optional[Union[str, int]]]): The function info. """ + cls.ensure_populated() # We're overwriting this classmethod so we're able to provide more # specific error messages and implement a fallback to spacy-legacy. if not hasattr(cls, registry_name): @@ -205,6 +215,7 @@ class registry(thinc.registry): @classmethod def has(cls, registry_name: str, func_name: str) -> bool: """Check whether a function is available in a registry.""" + cls.ensure_populated() if not hasattr(cls, registry_name): return False reg = getattr(cls, registry_name) @@ -1323,7 +1334,6 @@ def filter_chain_spans(*spans: Iterable["Span"]) -> List["Span"]: return filter_spans(itertools.chain(*spans)) -@registry.misc("spacy.first_longest_spans_filter.v1") def make_first_longest_spans_filter(): return filter_chain_spans