mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-25 23:49:46 +03:00
Start centralising registry calls
This commit is contained in:
parent
911539e9a4
commit
9d7b22c52e
58
spacy/registrations.py
Normal file
58
spacy/registrations.py
Normal file
|
@ -0,0 +1,58 @@
|
||||||
|
"""Centralized registry population for spaCy components.
|
||||||
|
|
||||||
|
This module centralizes registry decorations to prevent circular import issues
|
||||||
|
with Cython annotation changes from __future__ import annotations. Functions
|
||||||
|
remain in their original locations, but decoration is moved here.
|
||||||
|
"""
|
||||||
|
from typing import Dict, Any
|
||||||
|
|
||||||
|
# Global flag to track if registry has been populated
|
||||||
|
REGISTRY_POPULATED = False
|
||||||
|
|
||||||
|
def populate_registry() -> None:
    """Populate the registry with all necessary components.

    This function should be called before accessing the registry, to ensure
    it's populated. The function uses a global flag to prevent repopulation.

    Every import is performed inside the function body, so importing this
    module does not by itself import the pipeline or model modules; they are
    only loaded on the first call.
    """
    global REGISTRY_POPULATED
    if REGISTRY_POPULATED:
        return

    from .util import registry, make_first_longest_spans_filter

    # Register miscellaneous components
    registry.misc("spacy.first_longest_spans_filter.v1")(make_first_longest_spans_filter)

    # Scorer factories. The functions stay in their original modules; only the
    # registry decoration is applied here.
    from .pipeline.tagger import make_tagger_scorer
    from .pipeline.ner import make_ner_scorer

    # NOTE(review): the factories below were referenced without being imported,
    # so the first call raised NameError before the flag was set. The module
    # paths follow the one-module-per-component layout used for tagger/ner
    # above -- confirm each name against its defining module.
    from .pipeline.entityruler import (
        make_entity_ruler_scorer as make_entityruler_scorer,
    )
    from .pipeline.senter import make_senter_scorer
    from .pipeline.textcat import make_textcat_scorer
    from .pipeline.textcat_multilabel import make_textcat_multilabel_scorer
    from .pipeline.span_finder import make_span_finder_scorer
    from .pipeline.spancat import make_spancat_scorer

    # ML components that use the registry
    from .ml.models.tok2vec import tok2vec_listener_v1, build_hash_embed_cnn_tok2vec

    # Register scorers
    registry.scorers("spacy.tagger_scorer.v1")(make_tagger_scorer)
    registry.scorers("spacy.ner_scorer.v1")(make_ner_scorer)
    # span_ruler_scorer removed as it's not in span_ruler.py
    registry.scorers("spacy.entity_ruler_scorer.v1")(make_entityruler_scorer)
    # NOTE(review): "spacy.sentencizer_scorer.v1" was previously registered
    # here with make_sentencizer_scorer, a name that was never imported and
    # that tests/registry_contents.json does not list. It is omitted until a
    # factory for it exists -- confirm.
    registry.scorers("spacy.senter_scorer.v1")(make_senter_scorer)
    # NOTE(review): tests/registry_contents.json lists the textcat scorers as
    # "spacy.*.v2" (v1 only under "spacy-legacy") -- confirm these versions.
    registry.scorers("spacy.textcat_scorer.v1")(make_textcat_scorer)
    registry.scorers("spacy.textcat_multilabel_scorer.v1")(make_textcat_multilabel_scorer)
    registry.scorers("spacy.span_finder_scorer.v1")(make_span_finder_scorer)
    registry.scorers("spacy.spancat_scorer.v1")(make_spancat_scorer)

    # Register tok2vec architectures we've modified
    registry.architectures("spacy.Tok2VecListener.v1")(tok2vec_listener_v1)
    registry.architectures("spacy.HashEmbedCNN.v2")(build_hash_embed_cnn_tok2vec)

    # Only mark the registry as populated once every registration succeeded.
    REGISTRY_POPULATED = True
|
284
spacy/tests/registry_contents.json
Normal file
284
spacy/tests/registry_contents.json
Normal file
|
@ -0,0 +1,284 @@
|
||||||
|
{
|
||||||
|
"architectures": [
|
||||||
|
"spacy-legacy.CharacterEmbed.v1",
|
||||||
|
"spacy-legacy.EntityLinker.v1",
|
||||||
|
"spacy-legacy.HashEmbedCNN.v1",
|
||||||
|
"spacy-legacy.MaxoutWindowEncoder.v1",
|
||||||
|
"spacy-legacy.MishWindowEncoder.v1",
|
||||||
|
"spacy-legacy.MultiHashEmbed.v1",
|
||||||
|
"spacy-legacy.Tagger.v1",
|
||||||
|
"spacy-legacy.TextCatBOW.v1",
|
||||||
|
"spacy-legacy.TextCatCNN.v1",
|
||||||
|
"spacy-legacy.TextCatEnsemble.v1",
|
||||||
|
"spacy-legacy.Tok2Vec.v1",
|
||||||
|
"spacy-legacy.TransitionBasedParser.v1",
|
||||||
|
"spacy.CharacterEmbed.v2",
|
||||||
|
"spacy.EntityLinker.v2",
|
||||||
|
"spacy.HashEmbedCNN.v2",
|
||||||
|
"spacy.MaxoutWindowEncoder.v2",
|
||||||
|
"spacy.MishWindowEncoder.v2",
|
||||||
|
"spacy.MultiHashEmbed.v2",
|
||||||
|
"spacy.PretrainCharacters.v1",
|
||||||
|
"spacy.PretrainVectors.v1",
|
||||||
|
"spacy.SpanCategorizer.v1",
|
||||||
|
"spacy.SpanFinder.v1",
|
||||||
|
"spacy.Tagger.v2",
|
||||||
|
"spacy.TextCatBOW.v2",
|
||||||
|
"spacy.TextCatBOW.v3",
|
||||||
|
"spacy.TextCatCNN.v2",
|
||||||
|
"spacy.TextCatEnsemble.v2",
|
||||||
|
"spacy.TextCatLowData.v1",
|
||||||
|
"spacy.TextCatParametricAttention.v1",
|
||||||
|
"spacy.TextCatReduce.v1",
|
||||||
|
"spacy.Tok2Vec.v2",
|
||||||
|
"spacy.Tok2VecListener.v1",
|
||||||
|
"spacy.TorchBiLSTMEncoder.v1",
|
||||||
|
"spacy.TransitionBasedParser.v2"
|
||||||
|
],
|
||||||
|
"augmenters": [
|
||||||
|
"spacy.combined_augmenter.v1",
|
||||||
|
"spacy.lower_case.v1",
|
||||||
|
"spacy.orth_variants.v1"
|
||||||
|
],
|
||||||
|
"batchers": [
|
||||||
|
"spacy.batch_by_padded.v1",
|
||||||
|
"spacy.batch_by_sequence.v1",
|
||||||
|
"spacy.batch_by_words.v1"
|
||||||
|
],
|
||||||
|
"callbacks": [
|
||||||
|
"spacy.copy_from_base_model.v1",
|
||||||
|
"spacy.models_and_pipes_with_nvtx_range.v1",
|
||||||
|
"spacy.models_with_nvtx_range.v1"
|
||||||
|
],
|
||||||
|
"cli": [],
|
||||||
|
"datasets": [],
|
||||||
|
"displacy_colors": [],
|
||||||
|
"factories": [
|
||||||
|
"attribute_ruler",
|
||||||
|
"beam_ner",
|
||||||
|
"beam_parser",
|
||||||
|
"doc_cleaner",
|
||||||
|
"entity_linker",
|
||||||
|
"entity_ruler",
|
||||||
|
"future_entity_ruler",
|
||||||
|
"lemmatizer",
|
||||||
|
"merge_entities",
|
||||||
|
"merge_noun_chunks",
|
||||||
|
"merge_subtokens",
|
||||||
|
"morphologizer",
|
||||||
|
"ner",
|
||||||
|
"parser",
|
||||||
|
"sentencizer",
|
||||||
|
"senter",
|
||||||
|
"span_finder",
|
||||||
|
"span_ruler",
|
||||||
|
"spancat",
|
||||||
|
"spancat_singlelabel",
|
||||||
|
"tagger",
|
||||||
|
"textcat",
|
||||||
|
"textcat_multilabel",
|
||||||
|
"tok2vec",
|
||||||
|
"token_splitter",
|
||||||
|
"trainable_lemmatizer"
|
||||||
|
],
|
||||||
|
"initializers": [
|
||||||
|
"glorot_normal_init.v1",
|
||||||
|
"glorot_uniform_init.v1",
|
||||||
|
"he_normal_init.v1",
|
||||||
|
"he_uniform_init.v1",
|
||||||
|
"lecun_normal_init.v1",
|
||||||
|
"lecun_uniform_init.v1",
|
||||||
|
"normal_init.v1",
|
||||||
|
"uniform_init.v1",
|
||||||
|
"zero_init.v1"
|
||||||
|
],
|
||||||
|
"languages": [],
|
||||||
|
"layers": [
|
||||||
|
"CauchySimilarity.v1",
|
||||||
|
"ClippedLinear.v1",
|
||||||
|
"Dish.v1",
|
||||||
|
"Dropout.v1",
|
||||||
|
"Embed.v1",
|
||||||
|
"Gelu.v1",
|
||||||
|
"HardSigmoid.v1",
|
||||||
|
"HardSwish.v1",
|
||||||
|
"HardSwishMobilenet.v1",
|
||||||
|
"HardTanh.v1",
|
||||||
|
"HashEmbed.v1",
|
||||||
|
"LSTM.v1",
|
||||||
|
"LayerNorm.v1",
|
||||||
|
"Linear.v1",
|
||||||
|
"Logistic.v1",
|
||||||
|
"MXNetWrapper.v1",
|
||||||
|
"Maxout.v1",
|
||||||
|
"Mish.v1",
|
||||||
|
"MultiSoftmax.v1",
|
||||||
|
"ParametricAttention.v1",
|
||||||
|
"ParametricAttention.v2",
|
||||||
|
"PyTorchLSTM.v1",
|
||||||
|
"PyTorchRNNWrapper.v1",
|
||||||
|
"PyTorchWrapper.v1",
|
||||||
|
"PyTorchWrapper.v2",
|
||||||
|
"PyTorchWrapper.v3",
|
||||||
|
"Relu.v1",
|
||||||
|
"ReluK.v1",
|
||||||
|
"Sigmoid.v1",
|
||||||
|
"Softmax.v1",
|
||||||
|
"Softmax.v2",
|
||||||
|
"SparseLinear.v1",
|
||||||
|
"SparseLinear.v2",
|
||||||
|
"Swish.v1",
|
||||||
|
"add.v1",
|
||||||
|
"bidirectional.v1",
|
||||||
|
"chain.v1",
|
||||||
|
"clone.v1",
|
||||||
|
"concatenate.v1",
|
||||||
|
"expand_window.v1",
|
||||||
|
"list2array.v1",
|
||||||
|
"list2padded.v1",
|
||||||
|
"list2ragged.v1",
|
||||||
|
"noop.v1",
|
||||||
|
"padded2list.v1",
|
||||||
|
"premap_ids.v1",
|
||||||
|
"ragged2list.v1",
|
||||||
|
"reduce_first.v1",
|
||||||
|
"reduce_last.v1",
|
||||||
|
"reduce_max.v1",
|
||||||
|
"reduce_mean.v1",
|
||||||
|
"reduce_sum.v1",
|
||||||
|
"remap_ids.v1",
|
||||||
|
"remap_ids.v2",
|
||||||
|
"residual.v1",
|
||||||
|
"resizable.v1",
|
||||||
|
"siamese.v1",
|
||||||
|
"sigmoid_activation.v1",
|
||||||
|
"softmax_activation.v1",
|
||||||
|
"spacy-legacy.StaticVectors.v1",
|
||||||
|
"spacy.CharEmbed.v1",
|
||||||
|
"spacy.FeatureExtractor.v1",
|
||||||
|
"spacy.LinearLogistic.v1",
|
||||||
|
"spacy.PrecomputableAffine.v1",
|
||||||
|
"spacy.StaticVectors.v2",
|
||||||
|
"spacy.TransitionModel.v1",
|
||||||
|
"spacy.extract_ngrams.v1",
|
||||||
|
"spacy.extract_spans.v1",
|
||||||
|
"spacy.mean_max_reducer.v1",
|
||||||
|
"strings2arrays.v1",
|
||||||
|
"tuplify.v1",
|
||||||
|
"uniqued.v1",
|
||||||
|
"with_array.v1",
|
||||||
|
"with_array2d.v1",
|
||||||
|
"with_cpu.v1",
|
||||||
|
"with_flatten.v1",
|
||||||
|
"with_flatten.v2",
|
||||||
|
"with_getitem.v1",
|
||||||
|
"with_list.v1",
|
||||||
|
"with_padded.v1",
|
||||||
|
"with_ragged.v1",
|
||||||
|
"with_reshape.v1"
|
||||||
|
],
|
||||||
|
"lemmatizers": [],
|
||||||
|
"loggers": [
|
||||||
|
"spacy-legacy.ConsoleLogger.v1",
|
||||||
|
"spacy-legacy.ConsoleLogger.v2",
|
||||||
|
"spacy-legacy.WandbLogger.v1",
|
||||||
|
"spacy.ChainLogger.v1",
|
||||||
|
"spacy.ClearMLLogger.v1",
|
||||||
|
"spacy.ClearMLLogger.v2",
|
||||||
|
"spacy.ConsoleLogger.v2",
|
||||||
|
"spacy.ConsoleLogger.v3",
|
||||||
|
"spacy.CupyLogger.v1",
|
||||||
|
"spacy.LookupLogger.v1",
|
||||||
|
"spacy.MLflowLogger.v1",
|
||||||
|
"spacy.MLflowLogger.v2",
|
||||||
|
"spacy.PyTorchLogger.v1",
|
||||||
|
"spacy.WandbLogger.v1",
|
||||||
|
"spacy.WandbLogger.v2",
|
||||||
|
"spacy.WandbLogger.v3",
|
||||||
|
"spacy.WandbLogger.v4",
|
||||||
|
"spacy.WandbLogger.v5"
|
||||||
|
],
|
||||||
|
"lookups": [],
|
||||||
|
"losses": [
|
||||||
|
"CategoricalCrossentropy.v1",
|
||||||
|
"CategoricalCrossentropy.v2",
|
||||||
|
"CategoricalCrossentropy.v3",
|
||||||
|
"CosineDistance.v1",
|
||||||
|
"L2Distance.v1",
|
||||||
|
"SequenceCategoricalCrossentropy.v1",
|
||||||
|
"SequenceCategoricalCrossentropy.v2",
|
||||||
|
"SequenceCategoricalCrossentropy.v3"
|
||||||
|
],
|
||||||
|
"misc": [
|
||||||
|
"spacy.CandidateBatchGenerator.v1",
|
||||||
|
"spacy.CandidateGenerator.v1",
|
||||||
|
"spacy.EmptyKB.v1",
|
||||||
|
"spacy.EmptyKB.v2",
|
||||||
|
"spacy.KBFromFile.v1",
|
||||||
|
"spacy.LookupsDataLoader.v1",
|
||||||
|
"spacy.first_longest_spans_filter.v1",
|
||||||
|
"spacy.levenshtein_compare.v1",
|
||||||
|
"spacy.ngram_range_suggester.v1",
|
||||||
|
"spacy.ngram_suggester.v1",
|
||||||
|
"spacy.preset_spans_suggester.v1",
|
||||||
|
"spacy.prioritize_existing_ents_filter.v1",
|
||||||
|
"spacy.prioritize_new_ents_filter.v1"
|
||||||
|
],
|
||||||
|
"models": [],
|
||||||
|
"ops": [
|
||||||
|
"CupyOps",
|
||||||
|
"MPSOps",
|
||||||
|
"NumpyOps"
|
||||||
|
],
|
||||||
|
"optimizers": [
|
||||||
|
"Adam.v1",
|
||||||
|
"RAdam.v1",
|
||||||
|
"SGD.v1"
|
||||||
|
],
|
||||||
|
"readers": [
|
||||||
|
"ml_datasets.cmu_movies.v1",
|
||||||
|
"ml_datasets.dbpedia.v1",
|
||||||
|
"ml_datasets.imdb_sentiment.v1",
|
||||||
|
"spacy.Corpus.v1",
|
||||||
|
"spacy.JsonlCorpus.v1",
|
||||||
|
"spacy.PlainTextCorpus.v1",
|
||||||
|
"spacy.read_labels.v1",
|
||||||
|
"srsly.read_json.v1",
|
||||||
|
"srsly.read_jsonl.v1",
|
||||||
|
"srsly.read_msgpack.v1",
|
||||||
|
"srsly.read_yaml.v1"
|
||||||
|
],
|
||||||
|
"schedules": [
|
||||||
|
"compounding.v1",
|
||||||
|
"constant.v1",
|
||||||
|
"constant_then.v1",
|
||||||
|
"cyclic_triangular.v1",
|
||||||
|
"decaying.v1",
|
||||||
|
"slanted_triangular.v1",
|
||||||
|
"warmup_linear.v1"
|
||||||
|
],
|
||||||
|
"scorers": [
|
||||||
|
"spacy-legacy.textcat_multilabel_scorer.v1",
|
||||||
|
"spacy-legacy.textcat_scorer.v1",
|
||||||
|
"spacy.attribute_ruler_scorer.v1",
|
||||||
|
"spacy.entity_linker_scorer.v1",
|
||||||
|
"spacy.entity_ruler_scorer.v1",
|
||||||
|
"spacy.lemmatizer_scorer.v1",
|
||||||
|
"spacy.morphologizer_scorer.v1",
|
||||||
|
"spacy.ner_scorer.v1",
|
||||||
|
"spacy.overlapping_labeled_spans_scorer.v1",
|
||||||
|
"spacy.parser_scorer.v1",
|
||||||
|
"spacy.senter_scorer.v1",
|
||||||
|
"spacy.span_finder_scorer.v1",
|
||||||
|
"spacy.spancat_scorer.v1",
|
||||||
|
"spacy.tagger_scorer.v1",
|
||||||
|
"spacy.textcat_multilabel_scorer.v2",
|
||||||
|
"spacy.textcat_scorer.v2"
|
||||||
|
],
|
||||||
|
"tokenizers": [
|
||||||
|
"spacy.Tokenizer.v1"
|
||||||
|
],
|
||||||
|
"vectors": [
|
||||||
|
"spacy.Vectors.v1"
|
||||||
|
]
|
||||||
|
}
|
48
spacy/tests/test_registry_population.py
Normal file
48
spacy/tests/test_registry_population.py
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import pytest
|
||||||
|
from pathlib import Path
|
||||||
|
from spacy.util import registry
|
||||||
|
|
||||||
|
# Path to the reference registry contents, relative to this file
|
||||||
|
REFERENCE_FILE = Path(__file__).parent / "registry_contents.json"
|
||||||
|
|
||||||
|
@pytest.fixture
def reference_registry():
    """Load the reference registry contents from the JSON file.

    Returns the parsed JSON document: a mapping from registry name to the list
    of entry names expected in that registry. The requesting test is failed
    if the reference file does not exist.
    """
    if not REFERENCE_FILE.exists():
        pytest.fail(f"Reference file {REFERENCE_FILE} not found.")

    # Explicit encoding so decoding does not depend on the platform locale.
    with REFERENCE_FILE.open("r", encoding="utf-8") as f:
        return json.load(f)
|
||||||
|
|
||||||
|
def test_registry_types(reference_registry):
    """Test that every registry type in the reference exists in the registry.

    Only missing types are reported; registry types that exist but are not in
    the reference do not fail the test.
    """
    current_registry_types = set(registry.get_registry_names())
    expected_registry_types = set(reference_registry.keys())

    missing_types = expected_registry_types - current_registry_types
    # Sort so the failure message is stable from run to run.
    assert not missing_types, f"Missing registry types: {', '.join(sorted(missing_types))}"
|
||||||
|
|
||||||
|
def test_registry_entries(reference_registry):
    """Test that every reference entry is present in its registry.

    Only missing entries are reported; entries that are registered but absent
    from the reference do not fail the test.
    """
    for registry_name, expected_entries in reference_registry.items():
        # A registry type named in the reference must exist at all.
        if not hasattr(registry, registry_name):
            pytest.fail(f"Registry '{registry_name}' does not exist.")

        reg = getattr(registry, registry_name)
        current_set = set(reg.get_all())
        expected_set = set(expected_entries)

        # Missing entries would indicate that the new registry population
        # mechanism is missing something.
        missing_entries = expected_set - current_set
        # Sort so the failure message is stable from run to run.
        assert not missing_entries, f"Registry '{registry_name}' missing entries: {', '.join(sorted(missing_entries))}"
|
Loading…
Reference in New Issue
Block a user