From d20445ef6f831a6377a61400843a95a7b0c8eedd Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 19 May 2025 16:26:09 +0200
Subject: [PATCH] Add test for factory registrations

---
 spacy/tests/factory_registrations.json    | 132 ++++++++++++++++++++++
 spacy/tests/test_factory_registrations.py |  97 ++++++++++++++++
 2 files changed, 229 insertions(+)
 create mode 100644 spacy/tests/factory_registrations.json
 create mode 100644 spacy/tests/test_factory_registrations.py

diff --git a/spacy/tests/factory_registrations.json b/spacy/tests/factory_registrations.json
new file mode 100644
index 000000000..475e48020
--- /dev/null
+++ b/spacy/tests/factory_registrations.json
@@ -0,0 +1,132 @@
+{
+  "attribute_ruler": {
+    "name": "attribute_ruler",
+    "module": "spacy.pipeline.attributeruler",
+    "function": "make_attribute_ruler"
+  },
+  "beam_ner": {
+    "name": "beam_ner",
+    "module": "spacy.pipeline.ner",
+    "function": "make_beam_ner"
+  },
+  "beam_parser": {
+    "name": "beam_parser",
+    "module": "spacy.pipeline.dep_parser",
+    "function": "make_beam_parser"
+  },
+  "doc_cleaner": {
+    "name": "doc_cleaner",
+    "module": "spacy.pipeline.functions",
+    "function": "make_doc_cleaner"
+  },
+  "entity_linker": {
+    "name": "entity_linker",
+    "module": "spacy.pipeline.entity_linker",
+    "function": "make_entity_linker"
+  },
+  "entity_ruler": {
+    "name": "entity_ruler",
+    "module": "spacy.pipeline.entityruler",
+    "function": "make_entity_ruler"
+  },
+  "future_entity_ruler": {
+    "name": "future_entity_ruler",
+    "module": "spacy.pipeline.span_ruler",
+    "function": "make_entity_ruler"
+  },
+  "lemmatizer": {
+    "name": "lemmatizer",
+    "module": "spacy.pipeline.lemmatizer",
+    "function": "make_lemmatizer"
+  },
+  "merge_entities": {
+    "name": "merge_entities",
+    "module": "spacy.language",
+    "function": "Language.component.<locals>.add_component.<locals>.factory_func"
+  },
+  "merge_noun_chunks": {
+    "name": "merge_noun_chunks",
+    "module": "spacy.language",
+    "function": "Language.component.<locals>.add_component.<locals>.factory_func"
+  },
+  "merge_subtokens": {
+    "name": "merge_subtokens",
+    "module": "spacy.language",
+    "function": "Language.component.<locals>.add_component.<locals>.factory_func"
+  },
+  "morphologizer": {
+    "name": "morphologizer",
+    "module": "spacy.pipeline.morphologizer",
+    "function": "make_morphologizer"
+  },
+  "ner": {
+    "name": "ner",
+    "module": "spacy.pipeline.ner",
+    "function": "make_ner"
+  },
+  "parser": {
+    "name": "parser",
+    "module": "spacy.pipeline.dep_parser",
+    "function": "make_parser"
+  },
+  "sentencizer": {
+    "name": "sentencizer",
+    "module": "spacy.pipeline.sentencizer",
+    "function": "make_sentencizer"
+  },
+  "senter": {
+    "name": "senter",
+    "module": "spacy.pipeline.senter",
+    "function": "make_senter"
+  },
+  "span_finder": {
+    "name": "span_finder",
+    "module": "spacy.pipeline.span_finder",
+    "function": "make_span_finder"
+  },
+  "span_ruler": {
+    "name": "span_ruler",
+    "module": "spacy.pipeline.span_ruler",
+    "function": "make_span_ruler"
+  },
+  "spancat": {
+    "name": "spancat",
+    "module": "spacy.pipeline.spancat",
+    "function": "make_spancat"
+  },
+  "spancat_singlelabel": {
+    "name": "spancat_singlelabel",
+    "module": "spacy.pipeline.spancat",
+    "function": "make_spancat_singlelabel"
+  },
+  "tagger": {
+    "name": "tagger",
+    "module": "spacy.pipeline.tagger",
+    "function": "make_tagger"
+  },
+  "textcat": {
+    "name": "textcat",
+    "module": "spacy.pipeline.textcat",
+    "function": "make_textcat"
+  },
+  "textcat_multilabel": {
+    "name": "textcat_multilabel",
+    "module": "spacy.pipeline.textcat_multilabel",
+    "function": "make_multilabel_textcat"
+  },
+  "tok2vec": {
+    "name": "tok2vec",
+    "module": "spacy.pipeline.tok2vec",
+    "function": "make_tok2vec"
+  },
+  "token_splitter": {
+    "name": "token_splitter",
+    "module": "spacy.pipeline.functions",
+    "function": "make_token_splitter"
+  },
+  "trainable_lemmatizer": {
+    "name": "trainable_lemmatizer",
+    "module": "spacy.pipeline.edit_tree_lemmatizer",
+    "function": "make_edit_tree_lemmatizer"
+  }
+}
\ No newline at end of file
diff --git a/spacy/tests/test_factory_registrations.py b/spacy/tests/test_factory_registrations.py
new file mode 100644
index 000000000..7dbcc81a5
--- /dev/null
+++ b/spacy/tests/test_factory_registrations.py
@@ -0,0 +1,97 @@
+import inspect
+import json
+from pathlib import Path
+
+import pytest
+
+from spacy import util
+from spacy.language import Language
+from spacy.util import registry
+
+# Path to the reference factory registrations, relative to this file
+REFERENCE_FILE = Path(__file__).parent / "factory_registrations.json"
+
+# Monkey patch util.is_same_func to handle Cython functions.
+# NOTE: applied at import time, so it stays in effect for the rest of the
+# test session once this module has been collected.
+original_is_same_func = util.is_same_func
+
+
+def patched_is_same_func(func1, func2):
+    """Compare two functions, tolerating ones the original check rejects.
+
+    Delegates to the original util.is_same_func; if that raises TypeError
+    (seen with Cython functions), falls back to comparing str() of both.
+    """
+    try:
+        return original_is_same_func(func1, func2)
+    except TypeError:
+        return str(func1) == str(func2)
+
+
+util.is_same_func = patched_is_same_func
+
+
+def _repr_name(func):
+    """Return the second whitespace-separated token of str(func), else an empty string."""
+    tokens = str(func).split()
+    return tokens[1] if len(tokens) > 1 else ""
+
+
+@pytest.fixture
+def reference_factory_registrations():
+    """Load reference factory registrations from JSON file"""
+    if not REFERENCE_FILE.exists():
+        pytest.fail(
+            f"Reference file {REFERENCE_FILE} not found. Run export_factory_registrations.py first."
+        )
+
+    # Explicit encoding: the platform default is not UTF-8 everywhere.
+    with REFERENCE_FILE.open("r", encoding="utf8") as f:
+        return json.load(f)
+
+
+def test_factory_registrations_preserved(reference_factory_registrations):
+    """Test that all factory registrations from the reference file are still present."""
+    # Ensure the registry is populated
+    registry.ensure_populated()
+
+    # Snapshot the current registrations. Only the names are asserted on
+    # below; module and function are recorded alongside for reference.
+    current_registrations = {}
+    for name, func in registry.factories.get_all().items():
+        try:
+            module_name = func.__module__
+        except (AttributeError, TypeError):
+            # For Cython functions, fall back to parsing str(func)
+            module_name = _repr_name(func).split(".")[0]
+
+        try:
+            func_name = func.__qualname__
+        except (AttributeError, TypeError):
+            # For Cython functions, use the function's name
+            if hasattr(func, "__name__"):
+                func_name = func.__name__
+            else:
+                func_name = _repr_name(func).split(".")[-1]
+
+        current_registrations[name] = {
+            "name": name,
+            "module": module_name,
+            "function": func_name,
+        }
+
+    # Check for missing registrations
+    missing_registrations = sorted(
+        reference_factory_registrations.keys() - current_registrations.keys()
+    )
+    assert not missing_registrations, (
+        f"Missing factory registrations: {', '.join(missing_registrations)}"
+    )
+
+    # Check for new registrations (not an error, but informative)
+    new_registrations = sorted(
+        current_registrations.keys() - reference_factory_registrations.keys()
+    )
+    if new_registrations:
+        print(f"New factory registrations found: {', '.join(new_registrations)}")