From 9d7b22c52e58114832479749c270483e15175b15 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 19 May 2025 12:33:24 +0200
Subject: [PATCH] Start centralising registry calls

---
 spacy/registrations.py                  |  73 ++++++
 spacy/tests/registry_contents.json      | 284 ++++++++++++++++++++++++
 spacy/tests/test_registry_population.py |  48 ++++
 3 files changed, 405 insertions(+)
 create mode 100644 spacy/registrations.py
 create mode 100644 spacy/tests/registry_contents.json
 create mode 100644 spacy/tests/test_registry_population.py

diff --git a/spacy/registrations.py b/spacy/registrations.py
new file mode 100644
index 000000000..483b16a54
--- /dev/null
+++ b/spacy/registrations.py
@@ -0,0 +1,73 @@
+"""Centralized registry population for spaCy components.
+
+This module centralizes registry decorations to prevent circular import issues
+with Cython annotation changes from __future__ import annotations. Functions
+remain in their original locations, but decoration is moved here.
+"""
+from typing import Dict, Any
+
+# Global flag to track if registry has been populated
+REGISTRY_POPULATED = False
+
+
+def populate_registry() -> None:
+    """Populate the registry with all necessary components.
+
+    This function should be called before accessing the registry, to ensure
+    it's populated. A module-level flag turns every call after the first
+    completed one into a no-op; the flag is only set after the last
+    registration, so a call that raises part-way is retried on the next call.
+
+    Imports happen inside the function body rather than at module level, so
+    importing this module does not import the pipeline or model modules.
+    """
+    global REGISTRY_POPULATED
+    if REGISTRY_POPULATED:
+        return
+
+    # Import all necessary modules
+    from .util import registry, make_first_longest_spans_filter
+
+    # Register miscellaneous components
+    registry.misc("spacy.first_longest_spans_filter.v1")(
+        make_first_longest_spans_filter
+    )
+
+    # Import every scorer factory registered below. The factories stay defined
+    # in their pipeline modules; only the registration is centralized here.
+    from .pipeline.tagger import make_tagger_scorer
+    from .pipeline.ner import make_ner_scorer
+    from .pipeline.entityruler import make_entity_ruler_scorer
+    from .pipeline.senter import make_senter_scorer
+    from .pipeline.textcat import make_textcat_scorer
+    from .pipeline.textcat_multilabel import make_textcat_multilabel_scorer
+    from .pipeline.span_finder import make_span_finder_scorer
+    from .pipeline.spancat import make_spancat_scorer
+
+    # Import ML components that use registry
+    from .ml.models.tok2vec import tok2vec_listener_v1, build_hash_embed_cnn_tok2vec
+
+    # Register scorers
+    registry.scorers("spacy.tagger_scorer.v1")(make_tagger_scorer)
+    registry.scorers("spacy.ner_scorer.v1")(make_ner_scorer)
+    # span_ruler_scorer removed as it's not in span_ruler.py
+    registry.scorers("spacy.entity_ruler_scorer.v1")(make_entity_ruler_scorer)
+    # NOTE(review): the "spacy.sentencizer_scorer.v1" registration was dropped.
+    # It referenced make_sentencizer_scorer, which was never imported or defined
+    # (NameError), and tests/registry_contents.json lists no such scorer.
+    registry.scorers("spacy.senter_scorer.v1")(make_senter_scorer)
+    # NOTE(review): tests/registry_contents.json lists the textcat scorers under
+    # ".v2" names; confirm that the ".v1" names used here are intended.
+    registry.scorers("spacy.textcat_scorer.v1")(make_textcat_scorer)
+    registry.scorers("spacy.textcat_multilabel_scorer.v1")(
+        make_textcat_multilabel_scorer
+    )
+    registry.scorers("spacy.span_finder_scorer.v1")(make_span_finder_scorer)
+    registry.scorers("spacy.spancat_scorer.v1")(make_spancat_scorer)
+
+    # Register tok2vec architectures we've modified
+    registry.architectures("spacy.Tok2VecListener.v1")(tok2vec_listener_v1)
+    registry.architectures("spacy.HashEmbedCNN.v2")(build_hash_embed_cnn_tok2vec)
+
+    # Set the flag to indicate that the registry has been populated
+    REGISTRY_POPULATED = True
diff --git a/spacy/tests/registry_contents.json b/spacy/tests/registry_contents.json
new file
mode 100644 index 000000000..1836d0328 --- /dev/null +++ b/spacy/tests/registry_contents.json @@ -0,0 +1,284 @@ +{ + "architectures": [ + "spacy-legacy.CharacterEmbed.v1", + "spacy-legacy.EntityLinker.v1", + "spacy-legacy.HashEmbedCNN.v1", + "spacy-legacy.MaxoutWindowEncoder.v1", + "spacy-legacy.MishWindowEncoder.v1", + "spacy-legacy.MultiHashEmbed.v1", + "spacy-legacy.Tagger.v1", + "spacy-legacy.TextCatBOW.v1", + "spacy-legacy.TextCatCNN.v1", + "spacy-legacy.TextCatEnsemble.v1", + "spacy-legacy.Tok2Vec.v1", + "spacy-legacy.TransitionBasedParser.v1", + "spacy.CharacterEmbed.v2", + "spacy.EntityLinker.v2", + "spacy.HashEmbedCNN.v2", + "spacy.MaxoutWindowEncoder.v2", + "spacy.MishWindowEncoder.v2", + "spacy.MultiHashEmbed.v2", + "spacy.PretrainCharacters.v1", + "spacy.PretrainVectors.v1", + "spacy.SpanCategorizer.v1", + "spacy.SpanFinder.v1", + "spacy.Tagger.v2", + "spacy.TextCatBOW.v2", + "spacy.TextCatBOW.v3", + "spacy.TextCatCNN.v2", + "spacy.TextCatEnsemble.v2", + "spacy.TextCatLowData.v1", + "spacy.TextCatParametricAttention.v1", + "spacy.TextCatReduce.v1", + "spacy.Tok2Vec.v2", + "spacy.Tok2VecListener.v1", + "spacy.TorchBiLSTMEncoder.v1", + "spacy.TransitionBasedParser.v2" + ], + "augmenters": [ + "spacy.combined_augmenter.v1", + "spacy.lower_case.v1", + "spacy.orth_variants.v1" + ], + "batchers": [ + "spacy.batch_by_padded.v1", + "spacy.batch_by_sequence.v1", + "spacy.batch_by_words.v1" + ], + "callbacks": [ + "spacy.copy_from_base_model.v1", + "spacy.models_and_pipes_with_nvtx_range.v1", + "spacy.models_with_nvtx_range.v1" + ], + "cli": [], + "datasets": [], + "displacy_colors": [], + "factories": [ + "attribute_ruler", + "beam_ner", + "beam_parser", + "doc_cleaner", + "entity_linker", + "entity_ruler", + "future_entity_ruler", + "lemmatizer", + "merge_entities", + "merge_noun_chunks", + "merge_subtokens", + "morphologizer", + "ner", + "parser", + "sentencizer", + "senter", + "span_finder", + "span_ruler", + "spancat", + "spancat_singlelabel", + "tagger", + 
"textcat", + "textcat_multilabel", + "tok2vec", + "token_splitter", + "trainable_lemmatizer" + ], + "initializers": [ + "glorot_normal_init.v1", + "glorot_uniform_init.v1", + "he_normal_init.v1", + "he_uniform_init.v1", + "lecun_normal_init.v1", + "lecun_uniform_init.v1", + "normal_init.v1", + "uniform_init.v1", + "zero_init.v1" + ], + "languages": [], + "layers": [ + "CauchySimilarity.v1", + "ClippedLinear.v1", + "Dish.v1", + "Dropout.v1", + "Embed.v1", + "Gelu.v1", + "HardSigmoid.v1", + "HardSwish.v1", + "HardSwishMobilenet.v1", + "HardTanh.v1", + "HashEmbed.v1", + "LSTM.v1", + "LayerNorm.v1", + "Linear.v1", + "Logistic.v1", + "MXNetWrapper.v1", + "Maxout.v1", + "Mish.v1", + "MultiSoftmax.v1", + "ParametricAttention.v1", + "ParametricAttention.v2", + "PyTorchLSTM.v1", + "PyTorchRNNWrapper.v1", + "PyTorchWrapper.v1", + "PyTorchWrapper.v2", + "PyTorchWrapper.v3", + "Relu.v1", + "ReluK.v1", + "Sigmoid.v1", + "Softmax.v1", + "Softmax.v2", + "SparseLinear.v1", + "SparseLinear.v2", + "Swish.v1", + "add.v1", + "bidirectional.v1", + "chain.v1", + "clone.v1", + "concatenate.v1", + "expand_window.v1", + "list2array.v1", + "list2padded.v1", + "list2ragged.v1", + "noop.v1", + "padded2list.v1", + "premap_ids.v1", + "ragged2list.v1", + "reduce_first.v1", + "reduce_last.v1", + "reduce_max.v1", + "reduce_mean.v1", + "reduce_sum.v1", + "remap_ids.v1", + "remap_ids.v2", + "residual.v1", + "resizable.v1", + "siamese.v1", + "sigmoid_activation.v1", + "softmax_activation.v1", + "spacy-legacy.StaticVectors.v1", + "spacy.CharEmbed.v1", + "spacy.FeatureExtractor.v1", + "spacy.LinearLogistic.v1", + "spacy.PrecomputableAffine.v1", + "spacy.StaticVectors.v2", + "spacy.TransitionModel.v1", + "spacy.extract_ngrams.v1", + "spacy.extract_spans.v1", + "spacy.mean_max_reducer.v1", + "strings2arrays.v1", + "tuplify.v1", + "uniqued.v1", + "with_array.v1", + "with_array2d.v1", + "with_cpu.v1", + "with_flatten.v1", + "with_flatten.v2", + "with_getitem.v1", + "with_list.v1", + "with_padded.v1", + 
"with_ragged.v1", + "with_reshape.v1" + ], + "lemmatizers": [], + "loggers": [ + "spacy-legacy.ConsoleLogger.v1", + "spacy-legacy.ConsoleLogger.v2", + "spacy-legacy.WandbLogger.v1", + "spacy.ChainLogger.v1", + "spacy.ClearMLLogger.v1", + "spacy.ClearMLLogger.v2", + "spacy.ConsoleLogger.v2", + "spacy.ConsoleLogger.v3", + "spacy.CupyLogger.v1", + "spacy.LookupLogger.v1", + "spacy.MLflowLogger.v1", + "spacy.MLflowLogger.v2", + "spacy.PyTorchLogger.v1", + "spacy.WandbLogger.v1", + "spacy.WandbLogger.v2", + "spacy.WandbLogger.v3", + "spacy.WandbLogger.v4", + "spacy.WandbLogger.v5" + ], + "lookups": [], + "losses": [ + "CategoricalCrossentropy.v1", + "CategoricalCrossentropy.v2", + "CategoricalCrossentropy.v3", + "CosineDistance.v1", + "L2Distance.v1", + "SequenceCategoricalCrossentropy.v1", + "SequenceCategoricalCrossentropy.v2", + "SequenceCategoricalCrossentropy.v3" + ], + "misc": [ + "spacy.CandidateBatchGenerator.v1", + "spacy.CandidateGenerator.v1", + "spacy.EmptyKB.v1", + "spacy.EmptyKB.v2", + "spacy.KBFromFile.v1", + "spacy.LookupsDataLoader.v1", + "spacy.first_longest_spans_filter.v1", + "spacy.levenshtein_compare.v1", + "spacy.ngram_range_suggester.v1", + "spacy.ngram_suggester.v1", + "spacy.preset_spans_suggester.v1", + "spacy.prioritize_existing_ents_filter.v1", + "spacy.prioritize_new_ents_filter.v1" + ], + "models": [], + "ops": [ + "CupyOps", + "MPSOps", + "NumpyOps" + ], + "optimizers": [ + "Adam.v1", + "RAdam.v1", + "SGD.v1" + ], + "readers": [ + "ml_datasets.cmu_movies.v1", + "ml_datasets.dbpedia.v1", + "ml_datasets.imdb_sentiment.v1", + "spacy.Corpus.v1", + "spacy.JsonlCorpus.v1", + "spacy.PlainTextCorpus.v1", + "spacy.read_labels.v1", + "srsly.read_json.v1", + "srsly.read_jsonl.v1", + "srsly.read_msgpack.v1", + "srsly.read_yaml.v1" + ], + "schedules": [ + "compounding.v1", + "constant.v1", + "constant_then.v1", + "cyclic_triangular.v1", + "decaying.v1", + "slanted_triangular.v1", + "warmup_linear.v1" + ], + "scorers": [ + 
"spacy-legacy.textcat_multilabel_scorer.v1", + "spacy-legacy.textcat_scorer.v1", + "spacy.attribute_ruler_scorer.v1", + "spacy.entity_linker_scorer.v1", + "spacy.entity_ruler_scorer.v1", + "spacy.lemmatizer_scorer.v1", + "spacy.morphologizer_scorer.v1", + "spacy.ner_scorer.v1", + "spacy.overlapping_labeled_spans_scorer.v1", + "spacy.parser_scorer.v1", + "spacy.senter_scorer.v1", + "spacy.span_finder_scorer.v1", + "spacy.spancat_scorer.v1", + "spacy.tagger_scorer.v1", + "spacy.textcat_multilabel_scorer.v2", + "spacy.textcat_scorer.v2" + ], + "tokenizers": [ + "spacy.Tokenizer.v1" + ], + "vectors": [ + "spacy.Vectors.v1" + ] +}
diff --git a/spacy/tests/test_registry_population.py b/spacy/tests/test_registry_population.py
new file mode 100644
index 000000000..732e57a0d
--- /dev/null
+++ b/spacy/tests/test_registry_population.py
@@ -0,0 +1,48 @@
+import json
+import os
+import pytest
+from pathlib import Path
+from spacy.util import registry
+
+# Path to the reference registry contents, relative to this file
+REFERENCE_FILE = Path(__file__).parent / "registry_contents.json"
+
+
+@pytest.fixture
+def reference_registry():
+    """Load the reference snapshot: registry name -> list of entry names."""
+    if not REFERENCE_FILE.exists():
+        pytest.fail(f"Reference file {REFERENCE_FILE} not found.")
+    # Read as UTF-8 explicitly rather than the platform default encoding.
+    with REFERENCE_FILE.open("r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def test_registry_types(reference_registry):
+    """Test that every registry type named in the reference exists.
+
+    Extra registry types that the reference does not name are allowed.
+    """
+    missing_types = set(reference_registry) - set(registry.get_registry_names())
+    # Sorted so that the failure message is stable between runs.
+    message = f"Missing registry types: {', '.join(sorted(missing_types))}"
+    assert not missing_types, message
+
+
+def test_registry_entries(reference_registry):
+    """Test that every entry named in the reference is registered.
+
+    Extra entries that the reference does not name are allowed.
+    """
+    for registry_name, expected_entries in reference_registry.items():
+        # A registry named in the reference must exist: fail rather than skip.
+        if not hasattr(registry, registry_name):
+            pytest.fail(f"Registry '{registry_name}' does not exist.")
+
+        # Missing entries would indicate that our new registry population
+        # mechanism is missing something.
+        current_entries = set(getattr(registry, registry_name).get_all())
+        missing_entries = set(expected_entries) - current_entries
+        listing = ", ".join(sorted(missing_entries))
+        message = f"Registry '{registry_name}' missing entries: {listing}"
+        assert not missing_entries, message