From 5afe6447cd835bb6ce4e21adb37340e2e2c34019 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 3 Sep 2020 17:31:14 +0200 Subject: [PATCH] registry.assets -> registry.misc --- spacy/ml/models/entity_linker.py | 6 ++-- spacy/pipeline/entity_linker.py | 4 +-- spacy/tests/lang/test_lemmatizers.py | 6 ++-- spacy/tests/pipeline/test_attributeruler.py | 4 +-- spacy/tests/pipeline/test_entity_linker.py | 20 ++++++------ spacy/tests/pipeline/test_lemmatizer.py | 16 ++++----- spacy/tests/regression/test_issue5230.py | 4 +-- spacy/tests/serialize/test_serialize_kb.py | 4 +-- spacy/util.py | 2 +- website/docs/api/architectures.md | 4 +-- website/docs/api/entitylinker.md | 6 ++-- website/docs/api/top-level.md | 2 +- website/docs/usage/processing-pipelines.md | 36 +++++++++++++-------- 13 files changed, 60 insertions(+), 54 deletions(-) diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 6792f3e59..d945e5fba 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -24,7 +24,7 @@ def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model: return model -@registry.assets.register("spacy.KBFromFile.v1") +@registry.misc.register("spacy.KBFromFile.v1") def load_kb(kb_path: str) -> Callable[[Vocab], KnowledgeBase]: def kb_from_file(vocab): kb = KnowledgeBase(vocab, entity_vector_length=1) @@ -34,7 +34,7 @@ def load_kb(kb_path: str) -> Callable[[Vocab], KnowledgeBase]: return kb_from_file -@registry.assets.register("spacy.EmptyKB.v1") +@registry.misc.register("spacy.EmptyKB.v1") def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]: def empty_kb_factory(vocab): return KnowledgeBase(vocab=vocab, entity_vector_length=entity_vector_length) @@ -42,6 +42,6 @@ def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]: return empty_kb_factory -@registry.assets.register("spacy.CandidateGenerator.v1") +@registry.misc.register("spacy.CandidateGenerator.v1") def 
create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]: return get_candidates diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index c45cdce75..78cf274ab 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -39,12 +39,12 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"], assigns=["token.ent_kb_id"], default_config={ - "kb_loader": {"@assets": "spacy.EmptyKB.v1", "entity_vector_length": 64}, + "kb_loader": {"@misc": "spacy.EmptyKB.v1", "entity_vector_length": 64}, "model": DEFAULT_NEL_MODEL, "labels_discard": [], "incl_prior": True, "incl_context": True, - "get_candidates": {"@assets": "spacy.CandidateGenerator.v1"}, + "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, }, ) def make_entity_linker( diff --git a/spacy/tests/lang/test_lemmatizers.py b/spacy/tests/lang/test_lemmatizers.py index 8c235c86e..14c59659a 100644 --- a/spacy/tests/lang/test_lemmatizers.py +++ b/spacy/tests/lang/test_lemmatizers.py @@ -14,7 +14,7 @@ LANGUAGES = ["el", "en", "fr", "nl"] @pytest.mark.parametrize("lang", LANGUAGES) def test_lemmatizer_initialize(lang, capfd): - @registry.assets("lemmatizer_init_lookups") + @registry.misc("lemmatizer_init_lookups") def lemmatizer_init_lookups(): lookups = Lookups() lookups.add_table("lemma_lookup", {"cope": "cope"}) @@ -25,9 +25,7 @@ def test_lemmatizer_initialize(lang, capfd): """Test that languages can be initialized.""" nlp = get_lang_class(lang)() - nlp.add_pipe( - "lemmatizer", config={"lookups": {"@assets": "lemmatizer_init_lookups"}} - ) + nlp.add_pipe("lemmatizer", config={"lookups": {"@misc": "lemmatizer_init_lookups"}}) # Check for stray print statements (see #3342) doc = nlp("test") # noqa: F841 captured = capfd.readouterr() diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py index 96361a693..c12a2b650 
100644 --- a/spacy/tests/pipeline/test_attributeruler.py +++ b/spacy/tests/pipeline/test_attributeruler.py @@ -31,7 +31,7 @@ def pattern_dicts(): ] -@registry.assets("attribute_ruler_patterns") +@registry.misc("attribute_ruler_patterns") def attribute_ruler_patterns(): return [ { @@ -86,7 +86,7 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts): # initialize with patterns from asset nlp.add_pipe( "attribute_ruler", - config={"pattern_dicts": {"@assets": "attribute_ruler_patterns"}}, + config={"pattern_dicts": {"@misc": "attribute_ruler_patterns"}}, ) doc = nlp("This is a test.") assert doc[2].lemma_ == "the" diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 4385d2bf9..4eaa71272 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -137,7 +137,7 @@ def test_kb_undefined(nlp): def test_kb_empty(nlp): """Test that the EL can't train with an empty KB""" - config = {"kb_loader": {"@assets": "spacy.EmptyKB.v1", "entity_vector_length": 342}} + config = {"kb_loader": {"@misc": "spacy.EmptyKB.v1", "entity_vector_length": 342}} entity_linker = nlp.add_pipe("entity_linker", config=config) assert len(entity_linker.kb) == 0 with pytest.raises(ValueError): @@ -183,7 +183,7 @@ def test_el_pipe_configuration(nlp): ruler = nlp.add_pipe("entity_ruler") ruler.add_patterns([pattern]) - @registry.assets.register("myAdamKB.v1") + @registry.misc.register("myAdamKB.v1") def mykb() -> Callable[["Vocab"], KnowledgeBase]: def create_kb(vocab): kb = KnowledgeBase(vocab, entity_vector_length=1) @@ -199,7 +199,7 @@ def test_el_pipe_configuration(nlp): # run an EL pipe without a trained context encoder, to check the candidate generation step only nlp.add_pipe( "entity_linker", - config={"kb_loader": {"@assets": "myAdamKB.v1"}, "incl_context": False}, + config={"kb_loader": {"@misc": "myAdamKB.v1"}, "incl_context": False}, ) # With the default get_candidates function, matching 
is case-sensitive text = "Douglas and douglas are not the same." @@ -211,7 +211,7 @@ def test_el_pipe_configuration(nlp): def get_lowercased_candidates(kb, span): return kb.get_alias_candidates(span.text.lower()) - @registry.assets.register("spacy.LowercaseCandidateGenerator.v1") + @registry.misc.register("spacy.LowercaseCandidateGenerator.v1") def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]: return get_lowercased_candidates @@ -220,9 +220,9 @@ def test_el_pipe_configuration(nlp): "entity_linker", "entity_linker", config={ - "kb_loader": {"@assets": "myAdamKB.v1"}, + "kb_loader": {"@misc": "myAdamKB.v1"}, "incl_context": False, - "get_candidates": {"@assets": "spacy.LowercaseCandidateGenerator.v1"}, + "get_candidates": {"@misc": "spacy.LowercaseCandidateGenerator.v1"}, }, ) doc = nlp(text) @@ -282,7 +282,7 @@ def test_append_invalid_alias(nlp): def test_preserving_links_asdoc(nlp): """Test that Span.as_doc preserves the existing entity links""" - @registry.assets.register("myLocationsKB.v1") + @registry.misc.register("myLocationsKB.v1") def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]: def create_kb(vocab): mykb = KnowledgeBase(vocab, entity_vector_length=1) @@ -304,7 +304,7 @@ def test_preserving_links_asdoc(nlp): ] ruler = nlp.add_pipe("entity_ruler") ruler.add_patterns(patterns) - el_config = {"kb_loader": {"@assets": "myLocationsKB.v1"}, "incl_prior": False} + el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False} el_pipe = nlp.add_pipe("entity_linker", config=el_config, last=True) el_pipe.begin_training(lambda: []) el_pipe.incl_context = False @@ -387,7 +387,7 @@ def test_overfitting_IO(): doc = nlp(text) train_examples.append(Example.from_dict(doc, annotation)) - @registry.assets.register("myOverfittingKB.v1") + @registry.misc.register("myOverfittingKB.v1") def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]: def create_kb(vocab): # create artificial KB - assign same prior weight to the two russ 
cochran's @@ -408,7 +408,7 @@ def test_overfitting_IO(): # Create the Entity Linker component and add it to the pipeline nlp.add_pipe( "entity_linker", - config={"kb_loader": {"@assets": "myOverfittingKB.v1"}}, + config={"kb_loader": {"@misc": "myOverfittingKB.v1"}}, last=True, ) diff --git a/spacy/tests/pipeline/test_lemmatizer.py b/spacy/tests/pipeline/test_lemmatizer.py index 8a70fdeeb..05e15bc16 100644 --- a/spacy/tests/pipeline/test_lemmatizer.py +++ b/spacy/tests/pipeline/test_lemmatizer.py @@ -13,7 +13,7 @@ def nlp(): @pytest.fixture def lemmatizer(nlp): - @registry.assets("cope_lookups") + @registry.misc("cope_lookups") def cope_lookups(): lookups = Lookups() lookups.add_table("lemma_lookup", {"cope": "cope"}) @@ -23,13 +23,13 @@ def lemmatizer(nlp): return lookups lemmatizer = nlp.add_pipe( - "lemmatizer", config={"mode": "rule", "lookups": {"@assets": "cope_lookups"}} + "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}} ) return lemmatizer def test_lemmatizer_init(nlp): - @registry.assets("cope_lookups") + @registry.misc("cope_lookups") def cope_lookups(): lookups = Lookups() lookups.add_table("lemma_lookup", {"cope": "cope"}) @@ -39,7 +39,7 @@ def test_lemmatizer_init(nlp): return lookups lemmatizer = nlp.add_pipe( - "lemmatizer", config={"mode": "lookup", "lookups": {"@assets": "cope_lookups"}} + "lemmatizer", config={"mode": "lookup", "lookups": {"@misc": "cope_lookups"}} ) assert isinstance(lemmatizer.lookups, Lookups) assert lemmatizer.mode == "lookup" @@ -51,14 +51,14 @@ def test_lemmatizer_init(nlp): nlp.remove_pipe("lemmatizer") - @registry.assets("empty_lookups") + @registry.misc("empty_lookups") def empty_lookups(): return Lookups() with pytest.raises(ValueError): nlp.add_pipe( "lemmatizer", - config={"mode": "lookup", "lookups": {"@assets": "empty_lookups"}}, + config={"mode": "lookup", "lookups": {"@misc": "empty_lookups"}}, ) @@ -79,7 +79,7 @@ def test_lemmatizer_config(nlp, lemmatizer): def 
test_lemmatizer_serialize(nlp, lemmatizer): - @registry.assets("cope_lookups") + @registry.misc("cope_lookups") def cope_lookups(): lookups = Lookups() lookups.add_table("lemma_lookup", {"cope": "cope"}) @@ -90,7 +90,7 @@ def test_lemmatizer_serialize(nlp, lemmatizer): nlp2 = English() lemmatizer2 = nlp2.add_pipe( - "lemmatizer", config={"mode": "rule", "lookups": {"@assets": "cope_lookups"}} + "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}} ) lemmatizer2.from_bytes(lemmatizer.to_bytes()) assert lemmatizer.to_bytes() == lemmatizer2.to_bytes() diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 78ae04bbb..af643aadc 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -71,7 +71,7 @@ def tagger(): def entity_linker(): nlp = Language() - @registry.assets.register("TestIssue5230KB.v1") + @registry.misc.register("TestIssue5230KB.v1") def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]: def create_kb(vocab): kb = KnowledgeBase(vocab, entity_vector_length=1) @@ -80,7 +80,7 @@ def entity_linker(): return create_kb - config = {"kb_loader": {"@assets": "TestIssue5230KB.v1"}} + config = {"kb_loader": {"@misc": "TestIssue5230KB.v1"}} entity_linker = nlp.add_pipe("entity_linker", config=config) # need to add model for two reasons: # 1. 
no model leads to error in serialization, diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index 3cf5485d7..63736418b 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -85,7 +85,7 @@ def test_serialize_subclassed_kb(): super().__init__(vocab, entity_vector_length) self.custom_field = custom_field - @registry.assets.register("spacy.CustomKB.v1") + @registry.misc.register("spacy.CustomKB.v1") def custom_kb( entity_vector_length: int, custom_field: int ) -> Callable[["Vocab"], KnowledgeBase]: @@ -101,7 +101,7 @@ def test_serialize_subclassed_kb(): nlp = English() config = { "kb_loader": { - "@assets": "spacy.CustomKB.v1", + "@misc": "spacy.CustomKB.v1", "entity_vector_length": 342, "custom_field": 666, } diff --git a/spacy/util.py b/spacy/util.py index 0eb76c3d1..fa4815df8 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -76,7 +76,7 @@ class registry(thinc.registry): lemmatizers = catalogue.create("spacy", "lemmatizers", entry_points=True) lookups = catalogue.create("spacy", "lookups", entry_points=True) displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True) - assets = catalogue.create("spacy", "assets", entry_points=True) + misc = catalogue.create("spacy", "misc", entry_points=True) # Callback functions used to manipulate nlp object etc. callbacks = catalogue.create("spacy", "callbacks") batchers = catalogue.create("spacy", "batchers", entry_points=True) diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 93e50bfb3..35816a9a2 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -673,11 +673,11 @@ into the "real world". 
This requires 3 main components: > subword_features = true > > [kb_loader] -> @assets = "spacy.EmptyKB.v1" +> @misc = "spacy.EmptyKB.v1" > entity_vector_length = 64 > > [get_candidates] -> @assets = "spacy.CandidateGenerator.v1" +> @misc = "spacy.CandidateGenerator.v1" > ``` The `EntityLinker` model architecture is a Thinc `Model` with a diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 637bd3c68..8cde6c490 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -34,8 +34,8 @@ architectures and their arguments and hyperparameters. > "incl_prior": True, > "incl_context": True, > "model": DEFAULT_NEL_MODEL, -> "kb_loader": {'@assets': 'spacy.EmptyKB.v1', 'entity_vector_length': 64}, -> "get_candidates": {'@assets': 'spacy.CandidateGenerator.v1'}, +> "kb_loader": {'@misc': 'spacy.EmptyKB.v1', 'entity_vector_length': 64}, +> "get_candidates": {'@misc': 'spacy.CandidateGenerator.v1'}, > } > nlp.add_pipe("entity_linker", config=config) > ``` @@ -66,7 +66,7 @@ https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py > entity_linker = nlp.add_pipe("entity_linker", config=config) > > # Construction via add_pipe with custom KB and candidate generation -> config = {"kb": {"@assets": "my_kb.v1"}} +> config = {"kb": {"@misc": "my_kb.v1"}} > entity_linker = nlp.add_pipe("entity_linker", config=config) > > # Construction from class diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 0fe48e736..b9201ca39 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -307,7 +307,6 @@ factories. 
| Registry name | Description | | ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. | -| `assets` | Registry for data assets, knowledge bases etc. | | `batchers` | Registry for training and evaluation [data batchers](#batchers). | | `callbacks` | Registry for custom callbacks to [modify the `nlp` object](/usage/training#custom-code-nlp-callbacks) before training. | | `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). | @@ -322,6 +321,7 @@ factories. | `readers` | Registry for training and evaluation data readers like [`Corpus`](/api/corpus). | | `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). | | `tokenizers` | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. | +| `misc` | Registry for miscellaneous functions that return data assets, knowledge bases or anything else you may need. | ### spacy-transformers registry {#registry-transformers} diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index c8702a147..2885d9f50 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -842,12 +842,20 @@ load and train custom pipelines with custom components. A simple solution is to **register a function** that returns your resources. 
The [registry](/api/top-level#registry) lets you **map string names to functions** that create objects, so given a name and optional arguments, spaCy will know how -to recreate the object. To register a function that returns a custom asset, you -can use the `@spacy.registry.assets` decorator with a single argument, the name: +to recreate the object. To register a function that returns your custom +dictionary, you can use the `@spacy.registry.misc` decorator with a single +argument, the name: + +> #### What's the misc registry? +> +> The [`registry`](/api/top-level#registry) provides different categories for +> different types of functions – for example, model architectures, tokenizers or +> batchers. `misc` is intended for miscellaneous functions that don't fit +> anywhere else. ```python ### Registered function for assets {highlight="1"} -@spacy.registry.assets("acronyms.slang_dict.v1") +@spacy.registry.misc("acronyms.slang_dict.v1") def create_acronyms_slang_dict(): dictionary = {"lol": "laughing out loud", "brb": "be right back"} dictionary.update({value: key for key, value in dictionary.items()}) @@ -856,9 +864,9 @@ def create_acronyms_slang_dict(): In your `default_config` (and later in your [training config](/usage/training#config)), you can now refer to the function -registered under the name `"acronyms.slang_dict.v1"` using the `@assets` key. -This tells spaCy how to create the value, and when your component is created, -the result of the registered function is passed in as the key `"dictionary"`. +registered under the name `"acronyms.slang_dict.v1"` using the `@misc` key. This +tells spaCy how to create the value, and when your component is created, the +result of the registered function is passed in as the key `"dictionary"`. > #### config.cfg > @@ -867,22 +875,22 @@ the result of the registered function is passed in as the key `"dictionary"`. 
> factory = "acronyms" > > [components.acronyms.dictionary] -> @assets = "acronyms.slang_dict.v1" +> @misc = "acronyms.slang_dict.v1" > ``` ```diff - default_config = {"dictionary:" DICTIONARY} -+ default_config = {"dictionary": {"@assets": "acronyms.slang_dict.v1"}} ++ default_config = {"dictionary": {"@misc": "acronyms.slang_dict.v1"}} ``` Using a registered function also means that you can easily include your custom components in pipelines that you [train](/usage/training). To make sure spaCy -knows where to find your custom `@assets` function, you can pass in a Python -file via the argument `--code`. If someone else is using your component, all -they have to do to customize the data is to register their own function and swap -out the name. Registered functions can also take **arguments** by the way that -can be defined in the config as well – you can read more about this in the docs -on [training with custom code](/usage/training#custom-code). +knows where to find your custom `@misc` function, you can pass in a Python file +via the argument `--code`. If someone else is using your component, all they +have to do to customize the data is to register their own function and swap out +the name. Registered functions can also take **arguments** that can +be defined in the config as well – you can read more about this in the docs on +[training with custom code](/usage/training#custom-code). ### Python type hints and pydantic validation {#type-hints new="3"}