registry.assets -> registry.misc

Ines Montani 2020-09-03 17:31:14 +02:00
parent c063e55eb7
commit 5afe6447cd
13 changed files with 60 additions and 54 deletions
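For user code, the change is mechanical: the same functions are registered and referenced, only under the `misc` registry instead of `assets`. A minimal sketch of the pattern (the function name is hypothetical):

```python
from spacy.util import registry

# Before this commit: @registry.assets.register("my_data.v1")
@registry.misc.register("my_data.v1")
def my_data():
    return {"some": "data"}
```

Config references change accordingly: `{"@assets": "my_data.v1"}` becomes `{"@misc": "my_data.v1"}`.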

View File

@@ -24,7 +24,7 @@ def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:
    return model


-@registry.assets.register("spacy.KBFromFile.v1")
+@registry.misc.register("spacy.KBFromFile.v1")
def load_kb(kb_path: str) -> Callable[[Vocab], KnowledgeBase]:
    def kb_from_file(vocab):
        kb = KnowledgeBase(vocab, entity_vector_length=1)
@@ -34,7 +34,7 @@ def load_kb(kb_path: str) -> Callable[[Vocab], KnowledgeBase]:
    return kb_from_file


-@registry.assets.register("spacy.EmptyKB.v1")
+@registry.misc.register("spacy.EmptyKB.v1")
def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
    def empty_kb_factory(vocab):
        return KnowledgeBase(vocab=vocab, entity_vector_length=entity_vector_length)
@@ -42,6 +42,6 @@ def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
    return empty_kb_factory


-@registry.assets.register("spacy.CandidateGenerator.v1")
+@registry.misc.register("spacy.CandidateGenerator.v1")
def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
    return get_candidates
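For context, a sketch of how these registered factories resolve, assuming the API at this commit (`registry.misc.get` is catalogue's standard lookup, and `KnowledgeBase` lived in `spacy.kb` at the time):

```python
from spacy.util import registry
from spacy.vocab import Vocab

# Look up the factory registered above and build an empty KB from it.
make_empty_kb = registry.misc.get("spacy.EmptyKB.v1")
kb_factory = make_empty_kb(entity_vector_length=64)  # Callable[[Vocab], KnowledgeBase]
kb = kb_factory(Vocab())
assert len(kb) == 0
```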

View File

@@ -39,12 +39,12 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
    requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
    assigns=["token.ent_kb_id"],
    default_config={
-        "kb_loader": {"@assets": "spacy.EmptyKB.v1", "entity_vector_length": 64},
+        "kb_loader": {"@misc": "spacy.EmptyKB.v1", "entity_vector_length": 64},
        "model": DEFAULT_NEL_MODEL,
        "labels_discard": [],
        "incl_prior": True,
        "incl_context": True,
-        "get_candidates": {"@assets": "spacy.CandidateGenerator.v1"},
+        "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
    },
)
def make_entity_linker(
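The renamed default config resolves exactly as before. A minimal usage sketch under this commit's `entity_linker` config surface (later releases changed the `kb_loader` setup):

```python
import spacy

nlp = spacy.blank("en")
# Both values are created by functions from the misc registry.
config = {
    "kb_loader": {"@misc": "spacy.EmptyKB.v1", "entity_vector_length": 64},
    "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
}
entity_linker = nlp.add_pipe("entity_linker", config=config)
```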

View File

@@ -14,7 +14,7 @@ LANGUAGES = ["el", "en", "fr", "nl"]

@pytest.mark.parametrize("lang", LANGUAGES)
def test_lemmatizer_initialize(lang, capfd):
-    @registry.assets("lemmatizer_init_lookups")
+    @registry.misc("lemmatizer_init_lookups")
    def lemmatizer_init_lookups():
        lookups = Lookups()
        lookups.add_table("lemma_lookup", {"cope": "cope"})
@@ -25,9 +25,7 @@ def test_lemmatizer_initialize(lang, capfd):

    """Test that languages can be initialized."""
    nlp = get_lang_class(lang)()
-    nlp.add_pipe(
-        "lemmatizer", config={"lookups": {"@assets": "lemmatizer_init_lookups"}}
-    )
+    nlp.add_pipe("lemmatizer", config={"lookups": {"@misc": "lemmatizer_init_lookups"}})
    # Check for stray print statements (see #3342)
    doc = nlp("test")  # noqa: F841
    captured = capfd.readouterr()

View File

@@ -31,7 +31,7 @@ def pattern_dicts():
    ]


-@registry.assets("attribute_ruler_patterns")
+@registry.misc("attribute_ruler_patterns")
def attribute_ruler_patterns():
    return [
        {
@@ -86,7 +86,7 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
    # initialize with patterns from asset
    nlp.add_pipe(
        "attribute_ruler",
-        config={"pattern_dicts": {"@assets": "attribute_ruler_patterns"}},
+        config={"pattern_dicts": {"@misc": "attribute_ruler_patterns"}},
    )
    doc = nlp("This is a test.")
    assert doc[2].lemma_ == "the"

View File

@@ -137,7 +137,7 @@ def test_kb_undefined(nlp):

def test_kb_empty(nlp):
    """Test that the EL can't train with an empty KB"""
-    config = {"kb_loader": {"@assets": "spacy.EmptyKB.v1", "entity_vector_length": 342}}
+    config = {"kb_loader": {"@misc": "spacy.EmptyKB.v1", "entity_vector_length": 342}}
    entity_linker = nlp.add_pipe("entity_linker", config=config)
    assert len(entity_linker.kb) == 0
    with pytest.raises(ValueError):
@@ -183,7 +183,7 @@ def test_el_pipe_configuration(nlp):
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns([pattern])

-    @registry.assets.register("myAdamKB.v1")
+    @registry.misc.register("myAdamKB.v1")
    def mykb() -> Callable[["Vocab"], KnowledgeBase]:
        def create_kb(vocab):
            kb = KnowledgeBase(vocab, entity_vector_length=1)
@@ -199,7 +199,7 @@ def test_el_pipe_configuration(nlp):
    # run an EL pipe without a trained context encoder, to check the candidate generation step only
    nlp.add_pipe(
        "entity_linker",
-        config={"kb_loader": {"@assets": "myAdamKB.v1"}, "incl_context": False},
+        config={"kb_loader": {"@misc": "myAdamKB.v1"}, "incl_context": False},
    )
    # With the default get_candidates function, matching is case-sensitive
    text = "Douglas and douglas are not the same."
@@ -211,7 +211,7 @@ def test_el_pipe_configuration(nlp):
    def get_lowercased_candidates(kb, span):
        return kb.get_alias_candidates(span.text.lower())

-    @registry.assets.register("spacy.LowercaseCandidateGenerator.v1")
+    @registry.misc.register("spacy.LowercaseCandidateGenerator.v1")
    def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
        return get_lowercased_candidates
@@ -220,9 +220,9 @@ def test_el_pipe_configuration(nlp):
        "entity_linker",
        "entity_linker",
        config={
-            "kb_loader": {"@assets": "myAdamKB.v1"},
+            "kb_loader": {"@misc": "myAdamKB.v1"},
            "incl_context": False,
-            "get_candidates": {"@assets": "spacy.LowercaseCandidateGenerator.v1"},
+            "get_candidates": {"@misc": "spacy.LowercaseCandidateGenerator.v1"},
        },
    )
    doc = nlp(text)
@@ -282,7 +282,7 @@ def test_append_invalid_alias(nlp):

def test_preserving_links_asdoc(nlp):
    """Test that Span.as_doc preserves the existing entity links"""

-    @registry.assets.register("myLocationsKB.v1")
+    @registry.misc.register("myLocationsKB.v1")
    def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
        def create_kb(vocab):
            mykb = KnowledgeBase(vocab, entity_vector_length=1)
@@ -304,7 +304,7 @@ def test_preserving_links_asdoc(nlp):
    ]
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(patterns)
-    el_config = {"kb_loader": {"@assets": "myLocationsKB.v1"}, "incl_prior": False}
+    el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False}
    el_pipe = nlp.add_pipe("entity_linker", config=el_config, last=True)
    el_pipe.begin_training(lambda: [])
    el_pipe.incl_context = False
@@ -387,7 +387,7 @@ def test_overfitting_IO():
        doc = nlp(text)
        train_examples.append(Example.from_dict(doc, annotation))

-    @registry.assets.register("myOverfittingKB.v1")
+    @registry.misc.register("myOverfittingKB.v1")
    def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
        def create_kb(vocab):
            # create artificial KB - assign same prior weight to the two russ cochran's
@@ -408,7 +408,7 @@ def test_overfitting_IO():
    # Create the Entity Linker component and add it to the pipeline
    nlp.add_pipe(
        "entity_linker",
-        config={"kb_loader": {"@assets": "myOverfittingKB.v1"}},
+        config={"kb_loader": {"@misc": "myOverfittingKB.v1"}},
        last=True,
    )
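The lowercase generator in these tests illustrates the general contract: any function returning a `(kb, span) -> candidates` callable can be registered under `misc` and swapped in via `@misc`. A sketch with a hypothetical name, assuming this commit's `spacy.kb` API:

```python
from typing import Callable, Iterable
from spacy.kb import Candidate, KnowledgeBase
from spacy.tokens import Span
from spacy.util import registry

@registry.misc.register("myStrippedCandidateGenerator.v1")  # hypothetical name
def create_stripped_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
    def get_stripped_candidates(kb: KnowledgeBase, span: Span) -> Iterable[Candidate]:
        # Normalize surrounding whitespace before the alias lookup.
        return kb.get_alias_candidates(span.text.strip())
    return get_stripped_candidates
```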

View File

@@ -13,7 +13,7 @@ def nlp():

@pytest.fixture
def lemmatizer(nlp):
-    @registry.assets("cope_lookups")
+    @registry.misc("cope_lookups")
    def cope_lookups():
        lookups = Lookups()
        lookups.add_table("lemma_lookup", {"cope": "cope"})
@@ -23,13 +23,13 @@ def lemmatizer(nlp):
        return lookups

    lemmatizer = nlp.add_pipe(
-        "lemmatizer", config={"mode": "rule", "lookups": {"@assets": "cope_lookups"}}
+        "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
    )
    return lemmatizer


def test_lemmatizer_init(nlp):
-    @registry.assets("cope_lookups")
+    @registry.misc("cope_lookups")
    def cope_lookups():
        lookups = Lookups()
        lookups.add_table("lemma_lookup", {"cope": "cope"})
@@ -39,7 +39,7 @@ def test_lemmatizer_init(nlp):
        return lookups

    lemmatizer = nlp.add_pipe(
-        "lemmatizer", config={"mode": "lookup", "lookups": {"@assets": "cope_lookups"}}
+        "lemmatizer", config={"mode": "lookup", "lookups": {"@misc": "cope_lookups"}}
    )
    assert isinstance(lemmatizer.lookups, Lookups)
    assert lemmatizer.mode == "lookup"
@@ -51,14 +51,14 @@ def test_lemmatizer_init(nlp):
    nlp.remove_pipe("lemmatizer")

-    @registry.assets("empty_lookups")
+    @registry.misc("empty_lookups")
    def empty_lookups():
        return Lookups()

    with pytest.raises(ValueError):
        nlp.add_pipe(
            "lemmatizer",
-            config={"mode": "lookup", "lookups": {"@assets": "empty_lookups"}},
+            config={"mode": "lookup", "lookups": {"@misc": "empty_lookups"}},
        )
@@ -79,7 +79,7 @@ def test_lemmatizer_config(nlp, lemmatizer):

def test_lemmatizer_serialize(nlp, lemmatizer):
-    @registry.assets("cope_lookups")
+    @registry.misc("cope_lookups")
    def cope_lookups():
        lookups = Lookups()
        lookups.add_table("lemma_lookup", {"cope": "cope"})
@@ -90,7 +90,7 @@ def test_lemmatizer_serialize(nlp, lemmatizer):

    nlp2 = English()
    lemmatizer2 = nlp2.add_pipe(
-        "lemmatizer", config={"mode": "rule", "lookups": {"@assets": "cope_lookups"}}
+        "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
    )
    lemmatizer2.from_bytes(lemmatizer.to_bytes())
    assert lemmatizer.to_bytes() == lemmatizer2.to_bytes()
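Outside the test suite, the same wiring works for any lookups provider. A sketch with hypothetical names, assuming this commit's lemmatizer config surface:

```python
from spacy.lang.en import English
from spacy.lookups import Lookups
from spacy.util import registry

@registry.misc("my_lemma_lookups")  # hypothetical name
def my_lemma_lookups() -> Lookups:
    lookups = Lookups()
    lookups.add_table("lemma_lookup", {"cats": "cat"})
    return lookups

nlp = English()
nlp.add_pipe("lemmatizer", config={"mode": "lookup", "lookups": {"@misc": "my_lemma_lookups"}})
doc = nlp("cats")  # doc[0].lemma_ is filled from the registered lookup table
```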

View File

@@ -71,7 +71,7 @@ def tagger():
def entity_linker():
    nlp = Language()

-    @registry.assets.register("TestIssue5230KB.v1")
+    @registry.misc.register("TestIssue5230KB.v1")
    def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
        def create_kb(vocab):
            kb = KnowledgeBase(vocab, entity_vector_length=1)
@@ -80,7 +80,7 @@ def entity_linker():
        return create_kb

-    config = {"kb_loader": {"@assets": "TestIssue5230KB.v1"}}
+    config = {"kb_loader": {"@misc": "TestIssue5230KB.v1"}}
    entity_linker = nlp.add_pipe("entity_linker", config=config)
    # need to add model for two reasons:
    # 1. no model leads to error in serialization,

View File

@@ -85,7 +85,7 @@ def test_serialize_subclassed_kb():
            super().__init__(vocab, entity_vector_length)
            self.custom_field = custom_field

-    @registry.assets.register("spacy.CustomKB.v1")
+    @registry.misc.register("spacy.CustomKB.v1")
    def custom_kb(
        entity_vector_length: int, custom_field: int
    ) -> Callable[["Vocab"], KnowledgeBase]:
@@ -101,7 +101,7 @@ def test_serialize_subclassed_kb():
    nlp = English()
    config = {
        "kb_loader": {
-            "@assets": "spacy.CustomKB.v1",
+            "@misc": "spacy.CustomKB.v1",
            "entity_vector_length": 342,
            "custom_field": 666,
        }

View File

@@ -76,7 +76,7 @@ class registry(thinc.registry):
    lemmatizers = catalogue.create("spacy", "lemmatizers", entry_points=True)
    lookups = catalogue.create("spacy", "lookups", entry_points=True)
    displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
-    assets = catalogue.create("spacy", "assets", entry_points=True)
+    misc = catalogue.create("spacy", "misc", entry_points=True)
    # Callback functions used to manipulate nlp object etc.
    callbacks = catalogue.create("spacy", "callbacks")
    batchers = catalogue.create("spacy", "batchers", entry_points=True)
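Like the other registries created with `entry_points=True`, `misc` can be filled either with the decorator or, presumably via catalogue's matching `spacy_misc` entry-point group, by installed packages. A registration and lookup sketch with a hypothetical name:

```python
from spacy.util import registry

@registry.misc.register("my_pkg.reversed_dict.v1")  # hypothetical name
def reversed_dict(d: dict) -> dict:
    return {value: key for key, value in d.items()}

# The config system resolves "@misc" blocks through the same lookup:
func = registry.misc.get("my_pkg.reversed_dict.v1")
assert func({"lol": "laughing out loud"}) == {"laughing out loud": "lol"}
```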

View File

@@ -673,11 +673,11 @@ into the "real world". This requires 3 main components:
> subword_features = true
>
> [kb_loader]
-> @assets = "spacy.EmptyKB.v1"
+> @misc = "spacy.EmptyKB.v1"
> entity_vector_length = 64
>
> [get_candidates]
-> @assets = "spacy.CandidateGenerator.v1"
+> @misc = "spacy.CandidateGenerator.v1"
> ```

The `EntityLinker` model architecture is a Thinc `Model` with a

View File

@@ -34,8 +34,8 @@ architectures and their arguments and hyperparameters.
>     "incl_prior": True,
>     "incl_context": True,
>     "model": DEFAULT_NEL_MODEL,
->     "kb_loader": {'@assets': 'spacy.EmptyKB.v1', 'entity_vector_length': 64},
->     "get_candidates": {'@assets': 'spacy.CandidateGenerator.v1'},
+>     "kb_loader": {'@misc': 'spacy.EmptyKB.v1', 'entity_vector_length': 64},
+>     "get_candidates": {'@misc': 'spacy.CandidateGenerator.v1'},
> }
> nlp.add_pipe("entity_linker", config=config)
> ```
@@ -66,7 +66,7 @@ https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py
> entity_linker = nlp.add_pipe("entity_linker", config=config)
>
> # Construction via add_pipe with custom KB and candidate generation
-> config = {"kb": {"@assets": "my_kb.v1"}}
+> config = {"kb": {"@misc": "my_kb.v1"}}
> entity_linker = nlp.add_pipe("entity_linker", config=config)
>
> # Construction from class
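A sketch of what the `"my_kb.v1"` placeholder above could look like, assuming this commit's `KnowledgeBase` API:

```python
from typing import Callable
from spacy.kb import KnowledgeBase
from spacy.util import registry
from spacy.vocab import Vocab

@registry.misc.register("my_kb.v1")
def load_my_kb() -> Callable[[Vocab], KnowledgeBase]:
    def create_kb(vocab: Vocab) -> KnowledgeBase:
        kb = KnowledgeBase(vocab, entity_vector_length=64)
        # Hypothetical entity, with a vector matching entity_vector_length.
        kb.add_entity(entity="Q42", freq=12, entity_vector=[0.0] * 64)
        return kb
    return create_kb
```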

View File

@@ -307,7 +307,6 @@ factories.
| Registry name     | Description |
| ----------------- | ----------- |
| `architectures`   | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. |
-| `assets`          | Registry for data assets, knowledge bases etc. |
| `batchers`        | Registry for training and evaluation [data batchers](#batchers). |
| `callbacks`       | Registry for custom callbacks to [modify the `nlp` object](/usage/training#custom-code-nlp-callbacks) before training. |
| `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). |
@@ -322,6 +321,7 @@
| `readers`    | Registry for training and evaluation data readers like [`Corpus`](/api/corpus). |
| `schedules`  | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
| `tokenizers` | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. |
+| `misc`       | Registry for miscellaneous functions that return data assets, knowledge bases or anything else you may need. |

### spacy-transformers registry {#registry-transformers}

View File

@@ -842,12 +842,20 @@ load and train custom pipelines with custom components. A simple solution is to
**register a function** that returns your resources. The
[registry](/api/top-level#registry) lets you **map string names to functions**
that create objects, so given a name and optional arguments, spaCy will know how
-to recreate the object. To register a function that returns a custom asset, you
-can use the `@spacy.registry.assets` decorator with a single argument, the name:
+to recreate the object. To register a function that returns your custom
+dictionary, you can use the `@spacy.registry.misc` decorator with a single
+argument, the name:
+
+> #### What's the misc registry?
+>
+> The [`registry`](/api/top-level#registry) provides different categories for
+> different types of functions, for example model architectures, tokenizers or
+> batchers. `misc` is intended for miscellaneous functions that don't fit
+> anywhere else.

```python
### Registered function for assets {highlight="1"}
-@spacy.registry.assets("acronyms.slang_dict.v1")
+@spacy.registry.misc("acronyms.slang_dict.v1")
def create_acronyms_slang_dict():
    dictionary = {"lol": "laughing out loud", "brb": "be right back"}
    dictionary.update({value: key for key, value in dictionary.items()})
@@ -856,9 +864,9 @@ def create_acronyms_slang_dict():
In your `default_config` (and later in your
[training config](/usage/training#config)), you can now refer to the function
-registered under the name `"acronyms.slang_dict.v1"` using the `@assets` key.
-This tells spaCy how to create the value, and when your component is created,
-the result of the registered function is passed in as the key `"dictionary"`.
+registered under the name `"acronyms.slang_dict.v1"` using the `@misc` key. This
+tells spaCy how to create the value, and when your component is created, the
+result of the registered function is passed in as the key `"dictionary"`.

> #### config.cfg
>
@@ -867,22 +875,22 @@ the result of the registered function is passed in as the key `"dictionary"`.
> factory = "acronyms"
>
> [components.acronyms.dictionary]
-> @assets = "acronyms.slang_dict.v1"
+> @misc = "acronyms.slang_dict.v1"
> ```

```diff
- default_config = {"dictionary": DICTIONARY}
-+ default_config = {"dictionary": {"@assets": "acronyms.slang_dict.v1"}}
++ default_config = {"dictionary": {"@misc": "acronyms.slang_dict.v1"}}
```

Using a registered function also means that you can easily include your custom
components in pipelines that you [train](/usage/training). To make sure spaCy
-knows where to find your custom `@assets` function, you can pass in a Python
-file via the argument `--code`. If someone else is using your component, all
-they have to do to customize the data is to register their own function and swap
-out the name. Registered functions can also take **arguments** by the way that
-can be defined in the config as well; you can read more about this in the docs
-on [training with custom code](/usage/training#custom-code).
+knows where to find your custom `@misc` function, you can pass in a Python file
+via the argument `--code`. If someone else is using your component, all they
+have to do to customize the data is to register their own function and swap out
+the name. Registered functions can also take **arguments** by the way that can
+be defined in the config as well; you can read more about this in the docs on
+[training with custom code](/usage/training#custom-code).
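To round out the docs' running example, a sketch of a component factory that consumes the registered dictionary (the body of the `acronyms` component is elided in this excerpt, so the inner function below is a stand-in):

```python
from typing import Dict
from spacy.language import Language
from spacy.tokens import Doc

@Language.factory(
    "acronyms",
    default_config={"dictionary": {"@misc": "acronyms.slang_dict.v1"}},
)
def create_acronym_component(nlp: Language, name: str, dictionary: Dict[str, str]):
    # spaCy resolves the "@misc" block and passes its result in as `dictionary`.
    def acronym_component(doc: Doc) -> Doc:
        return doc  # placeholder; the full docs example replaces acronyms
    return acronym_component
```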
### Python type hints and pydantic validation {#type-hints new="3"}