Add test for factory registrations

This commit is contained in:
Matthew Honnibal 2025-05-19 16:26:09 +02:00
parent c62b9dac0b
commit d20445ef6f
2 changed files with 208 additions and 0 deletions

View File

@ -0,0 +1,132 @@
{
"attribute_ruler": {
"name": "attribute_ruler",
"module": "spacy.pipeline.attributeruler",
"function": "make_attribute_ruler"
},
"beam_ner": {
"name": "beam_ner",
"module": "spacy.pipeline.ner",
"function": "make_beam_ner"
},
"beam_parser": {
"name": "beam_parser",
"module": "spacy.pipeline.dep_parser",
"function": "make_beam_parser"
},
"doc_cleaner": {
"name": "doc_cleaner",
"module": "spacy.pipeline.functions",
"function": "make_doc_cleaner"
},
"entity_linker": {
"name": "entity_linker",
"module": "spacy.pipeline.entity_linker",
"function": "make_entity_linker"
},
"entity_ruler": {
"name": "entity_ruler",
"module": "spacy.pipeline.entityruler",
"function": "make_entity_ruler"
},
"future_entity_ruler": {
"name": "future_entity_ruler",
"module": "spacy.pipeline.span_ruler",
"function": "make_entity_ruler"
},
"lemmatizer": {
"name": "lemmatizer",
"module": "spacy.pipeline.lemmatizer",
"function": "make_lemmatizer"
},
"merge_entities": {
"name": "merge_entities",
"module": "spacy.language",
"function": "Language.component.<locals>.add_component.<locals>.factory_func"
},
"merge_noun_chunks": {
"name": "merge_noun_chunks",
"module": "spacy.language",
"function": "Language.component.<locals>.add_component.<locals>.factory_func"
},
"merge_subtokens": {
"name": "merge_subtokens",
"module": "spacy.language",
"function": "Language.component.<locals>.add_component.<locals>.factory_func"
},
"morphologizer": {
"name": "morphologizer",
"module": "spacy.pipeline.morphologizer",
"function": "make_morphologizer"
},
"ner": {
"name": "ner",
"module": "spacy.pipeline.ner",
"function": "make_ner"
},
"parser": {
"name": "parser",
"module": "spacy.pipeline.dep_parser",
"function": "make_parser"
},
"sentencizer": {
"name": "sentencizer",
"module": "spacy.pipeline.sentencizer",
"function": "make_sentencizer"
},
"senter": {
"name": "senter",
"module": "spacy.pipeline.senter",
"function": "make_senter"
},
"span_finder": {
"name": "span_finder",
"module": "spacy.pipeline.span_finder",
"function": "make_span_finder"
},
"span_ruler": {
"name": "span_ruler",
"module": "spacy.pipeline.span_ruler",
"function": "make_span_ruler"
},
"spancat": {
"name": "spancat",
"module": "spacy.pipeline.spancat",
"function": "make_spancat"
},
"spancat_singlelabel": {
"name": "spancat_singlelabel",
"module": "spacy.pipeline.spancat",
"function": "make_spancat_singlelabel"
},
"tagger": {
"name": "tagger",
"module": "spacy.pipeline.tagger",
"function": "make_tagger"
},
"textcat": {
"name": "textcat",
"module": "spacy.pipeline.textcat",
"function": "make_textcat"
},
"textcat_multilabel": {
"name": "textcat_multilabel",
"module": "spacy.pipeline.textcat_multilabel",
"function": "make_multilabel_textcat"
},
"tok2vec": {
"name": "tok2vec",
"module": "spacy.pipeline.tok2vec",
"function": "make_tok2vec"
},
"token_splitter": {
"name": "token_splitter",
"module": "spacy.pipeline.functions",
"function": "make_token_splitter"
},
"trainable_lemmatizer": {
"name": "trainable_lemmatizer",
"module": "spacy.pipeline.edit_tree_lemmatizer",
"function": "make_edit_tree_lemmatizer"
}
}

View File

@ -0,0 +1,76 @@
import json
import inspect
import pytest
from pathlib import Path
from spacy.language import Language
from spacy.util import registry
# Path to the reference factory registrations (JSON). It is resolved against
# the directory containing this file, so the lookup does not depend on the
# current working directory.
REFERENCE_FILE = Path(__file__).parent / "factory_registrations.json"
# Monkey patch the util.is_same_func to handle Cython functions
import inspect
from spacy import util
# Keep a handle on the unpatched implementation so the wrapper defined below
# can delegate to it.
original_is_same_func = util.is_same_func
def patched_is_same_func(func1, func2):
    """Compare two callables, tolerating a ``TypeError`` from the original check.

    Delegates to ``original_is_same_func``. If that call raises ``TypeError``
    (the case this wrapper exists for is Cython functions), the comparison
    falls back to comparing the string representations of the two callables.

    Returns the result of the original check, or the boolean result of the
    string comparison when the fallback is taken.
    """
    try:
        same = original_is_same_func(func1, func2)
    except TypeError:
        # Fallback for Cython functions: compare their string representations.
        same = str(func1) == str(func2)
    return same
# Install the wrapper on the util module. This runs at import time of this
# file and rebinds the module attribute, so later lookups of
# ``util.is_same_func`` resolve to the patched version.
# NOTE(review): nothing in the visible code restores the original afterwards.
util.is_same_func = patched_is_same_func
@pytest.fixture
def reference_factory_registrations():
    """Load the reference factory registrations from ``REFERENCE_FILE``.

    Returns the parsed JSON content of the reference file. The consuming test
    calls ``.keys()`` on it, so it is expected to be a mapping keyed by
    factory name.

    If the reference file does not exist, the requesting test is failed via
    ``pytest.fail`` with a message that names the missing path.
    """
    if not REFERENCE_FILE.exists():
        pytest.fail(f"Reference file {REFERENCE_FILE} not found. Run export_factory_registrations.py first.")
    # Pass the encoding explicitly: without it, text decoding falls back to the
    # platform's locale default, which is not guaranteed to be UTF-8.
    with REFERENCE_FILE.open("r", encoding="utf8") as f:
        return json.load(f)
def test_factory_registrations_preserved(reference_factory_registrations):
    """Check that every factory named in the reference file is still registered.

    Only factory *names* are asserted. The module and function name of each
    current factory are collected alongside the name but are not compared
    against the reference. Factories that exist now but are absent from the
    reference do not fail the test; they are reported with ``print``.
    """
    # The registry must be populated before it is queried.
    registry.ensure_populated()

    current_registrations = {}
    for factory_name, factory in registry.factories.get_all().items():
        try:
            module = factory.__module__
        except (AttributeError, TypeError):
            # Cython functions: derive a placeholder from the string form.
            module = str(factory).split()[1].split(".")[0]

        try:
            function = factory.__qualname__
        except (AttributeError, TypeError):
            # Cython functions: prefer __name__, else parse the string form.
            if hasattr(factory, "__name__"):
                function = factory.__name__
            else:
                function = str(factory).split()[1].split(".")[-1]

        current_registrations[factory_name] = {
            "name": factory_name,
            "module": module,
            "function": function,
        }

    expected_names = set(reference_factory_registrations.keys())
    current_names = set(current_registrations.keys())

    # A reference factory that is no longer registered is a failure.
    missing = expected_names - current_names
    assert not missing, f"Missing factory registrations: {', '.join(sorted(missing))}"

    # Newly added factories are informational only.
    added = current_names - expected_names
    if added:
        print(f"New factory registrations found: {', '.join(sorted(added))}")