registry.assets -> registry.misc

Ines Montani 2020-09-03 17:31:14 +02:00
parent c063e55eb7
commit 5afe6447cd
13 changed files with 60 additions and 54 deletions
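For user code, the change is mechanical: the same functions are registered and referenced, only under the `misc` registry instead of `assets`. A minimal sketch of the pattern (the function name is hypothetical):

```python
from spacy.util import registry

# Before this commit: @registry.assets.register("my_data.v1")
@registry.misc.register("my_data.v1")
def my_data():
    return {"some": "data"}
```

Config references change accordingly: `{"@assets": "my_data.v1"}` becomes `{"@misc": "my_data.v1"}`.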

View File

@@ -24,7 +24,7 @@ def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:
    return model


-@registry.assets.register("spacy.KBFromFile.v1")
+@registry.misc.register("spacy.KBFromFile.v1")
def load_kb(kb_path: str) -> Callable[[Vocab], KnowledgeBase]:
    def kb_from_file(vocab):
        kb = KnowledgeBase(vocab, entity_vector_length=1)
@@ -34,7 +34,7 @@ def load_kb(kb_path: str) -> Callable[[Vocab], KnowledgeBase]:
    return kb_from_file


-@registry.assets.register("spacy.EmptyKB.v1")
+@registry.misc.register("spacy.EmptyKB.v1")
def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
    def empty_kb_factory(vocab):
        return KnowledgeBase(vocab=vocab, entity_vector_length=entity_vector_length)
@@ -42,6 +42,6 @@ def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
    return empty_kb_factory


-@registry.assets.register("spacy.CandidateGenerator.v1")
+@registry.misc.register("spacy.CandidateGenerator.v1")
def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
    return get_candidates
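For context, a sketch of how these registered factories resolve, assuming the API at this commit (`registry.misc.get` is catalogue's standard lookup, and `KnowledgeBase` lived in `spacy.kb` at the time):

```python
from spacy.util import registry
from spacy.vocab import Vocab

# Look up the factory registered above and build an empty KB from it.
make_empty_kb = registry.misc.get("spacy.EmptyKB.v1")
kb_factory = make_empty_kb(entity_vector_length=64)  # Callable[[Vocab], KnowledgeBase]
kb = kb_factory(Vocab())
assert len(kb) == 0
```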

View File

@@ -39,12 +39,12 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
    requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
    assigns=["token.ent_kb_id"],
    default_config={
-        "kb_loader": {"@assets": "spacy.EmptyKB.v1", "entity_vector_length": 64},
+        "kb_loader": {"@misc": "spacy.EmptyKB.v1", "entity_vector_length": 64},
        "model": DEFAULT_NEL_MODEL,
        "labels_discard": [],
        "incl_prior": True,
        "incl_context": True,
-        "get_candidates": {"@assets": "spacy.CandidateGenerator.v1"},
+        "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
    },
)
def make_entity_linker(
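The renamed default config resolves exactly as before. A minimal usage sketch under this commit's `entity_linker` config surface (later releases changed the `kb_loader` setup):

```python
import spacy

nlp = spacy.blank("en")
# Both values are created by functions from the misc registry.
config = {
    "kb_loader": {"@misc": "spacy.EmptyKB.v1", "entity_vector_length": 64},
    "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
}
entity_linker = nlp.add_pipe("entity_linker", config=config)
```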

View File

@@ -14,7 +14,7 @@ LANGUAGES = ["el", "en", "fr", "nl"]

@pytest.mark.parametrize("lang", LANGUAGES)
def test_lemmatizer_initialize(lang, capfd):
-    @registry.assets("lemmatizer_init_lookups")
+    @registry.misc("lemmatizer_init_lookups")
    def lemmatizer_init_lookups():
        lookups = Lookups()
        lookups.add_table("lemma_lookup", {"cope": "cope"})
@@ -25,9 +25,7 @@ def test_lemmatizer_initialize(lang, capfd):

    """Test that languages can be initialized."""
    nlp = get_lang_class(lang)()
-    nlp.add_pipe(
-        "lemmatizer", config={"lookups": {"@assets": "lemmatizer_init_lookups"}}
-    )
+    nlp.add_pipe("lemmatizer", config={"lookups": {"@misc": "lemmatizer_init_lookups"}})
    # Check for stray print statements (see #3342)
    doc = nlp("test")  # noqa: F841
    captured = capfd.readouterr()

View File

@@ -31,7 +31,7 @@ def pattern_dicts():
    ]


-@registry.assets("attribute_ruler_patterns")
+@registry.misc("attribute_ruler_patterns")
def attribute_ruler_patterns():
    return [
        {
@@ -86,7 +86,7 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
    # initialize with patterns from asset
    nlp.add_pipe(
        "attribute_ruler",
-        config={"pattern_dicts": {"@assets": "attribute_ruler_patterns"}},
+        config={"pattern_dicts": {"@misc": "attribute_ruler_patterns"}},
    )
    doc = nlp("This is a test.")
    assert doc[2].lemma_ == "the"

View File

@@ -137,7 +137,7 @@ def test_kb_undefined(nlp):

def test_kb_empty(nlp):
    """Test that the EL can't train with an empty KB"""
-    config = {"kb_loader": {"@assets": "spacy.EmptyKB.v1", "entity_vector_length": 342}}
+    config = {"kb_loader": {"@misc": "spacy.EmptyKB.v1", "entity_vector_length": 342}}
    entity_linker = nlp.add_pipe("entity_linker", config=config)
    assert len(entity_linker.kb) == 0
    with pytest.raises(ValueError):
@@ -183,7 +183,7 @@ def test_el_pipe_configuration(nlp):
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns([pattern])

-    @registry.assets.register("myAdamKB.v1")
+    @registry.misc.register("myAdamKB.v1")
    def mykb() -> Callable[["Vocab"], KnowledgeBase]:
        def create_kb(vocab):
            kb = KnowledgeBase(vocab, entity_vector_length=1)
@@ -199,7 +199,7 @@ def test_el_pipe_configuration(nlp):
    # run an EL pipe without a trained context encoder, to check the candidate generation step only
    nlp.add_pipe(
        "entity_linker",
-        config={"kb_loader": {"@assets": "myAdamKB.v1"}, "incl_context": False},
+        config={"kb_loader": {"@misc": "myAdamKB.v1"}, "incl_context": False},
    )
    # With the default get_candidates function, matching is case-sensitive
    text = "Douglas and douglas are not the same."
@@ -211,7 +211,7 @@ def test_el_pipe_configuration(nlp):
    def get_lowercased_candidates(kb, span):
        return kb.get_alias_candidates(span.text.lower())

-    @registry.assets.register("spacy.LowercaseCandidateGenerator.v1")
+    @registry.misc.register("spacy.LowercaseCandidateGenerator.v1")
    def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
        return get_lowercased_candidates
@@ -220,9 +220,9 @@ def test_el_pipe_configuration(nlp):
        "entity_linker",
        "entity_linker",
        config={
-            "kb_loader": {"@assets": "myAdamKB.v1"},
+            "kb_loader": {"@misc": "myAdamKB.v1"},
            "incl_context": False,
-            "get_candidates": {"@assets": "spacy.LowercaseCandidateGenerator.v1"},
+            "get_candidates": {"@misc": "spacy.LowercaseCandidateGenerator.v1"},
        },
    )
    doc = nlp(text)
@@ -282,7 +282,7 @@ def test_append_invalid_alias(nlp):

def test_preserving_links_asdoc(nlp):
    """Test that Span.as_doc preserves the existing entity links"""

-    @registry.assets.register("myLocationsKB.v1")
+    @registry.misc.register("myLocationsKB.v1")
    def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
        def create_kb(vocab):
            mykb = KnowledgeBase(vocab, entity_vector_length=1)
@@ -304,7 +304,7 @@ def test_preserving_links_asdoc(nlp):
    ]
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(patterns)
-    el_config = {"kb_loader": {"@assets": "myLocationsKB.v1"}, "incl_prior": False}
+    el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False}
    el_pipe = nlp.add_pipe("entity_linker", config=el_config, last=True)
    el_pipe.begin_training(lambda: [])
    el_pipe.incl_context = False
@@ -387,7 +387,7 @@ def test_overfitting_IO():
        doc = nlp(text)
        train_examples.append(Example.from_dict(doc, annotation))

-    @registry.assets.register("myOverfittingKB.v1")
+    @registry.misc.register("myOverfittingKB.v1")
    def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
        def create_kb(vocab):
            # create artificial KB - assign same prior weight to the two russ cochran's
@@ -408,7 +408,7 @@ def test_overfitting_IO():
    # Create the Entity Linker component and add it to the pipeline
    nlp.add_pipe(
        "entity_linker",
-        config={"kb_loader": {"@assets": "myOverfittingKB.v1"}},
+        config={"kb_loader": {"@misc": "myOverfittingKB.v1"}},
        last=True,
    )
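The lowercase generator in these tests illustrates the general contract: any function returning a `(kb, span) -> candidates` callable can be registered under `misc` and swapped in via `@misc`. A sketch with a hypothetical name, assuming this commit's `spacy.kb` API:

```python
from typing import Callable, Iterable
from spacy.kb import Candidate, KnowledgeBase
from spacy.tokens import Span
from spacy.util import registry

@registry.misc.register("myStrippedCandidateGenerator.v1")  # hypothetical name
def create_stripped_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
    def get_stripped_candidates(kb: KnowledgeBase, span: Span) -> Iterable[Candidate]:
        # Normalize surrounding whitespace before the alias lookup.
        return kb.get_alias_candidates(span.text.strip())
    return get_stripped_candidates
```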

View File

@@ -13,7 +13,7 @@ def nlp():

@pytest.fixture
def lemmatizer(nlp):
-    @registry.assets("cope_lookups")
+    @registry.misc("cope_lookups")
    def cope_lookups():
        lookups = Lookups()
        lookups.add_table("lemma_lookup", {"cope": "cope"})
@@ -23,13 +23,13 @@ def lemmatizer(nlp):
        return lookups

    lemmatizer = nlp.add_pipe(
-        "lemmatizer", config={"mode": "rule", "lookups": {"@assets": "cope_lookups"}}
+        "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
    )
    return lemmatizer


def test_lemmatizer_init(nlp):
-    @registry.assets("cope_lookups")
+    @registry.misc("cope_lookups")
    def cope_lookups():
        lookups = Lookups()
        lookups.add_table("lemma_lookup", {"cope": "cope"})
@@ -39,7 +39,7 @@ def test_lemmatizer_init(nlp):
        return lookups

    lemmatizer = nlp.add_pipe(
-        "lemmatizer", config={"mode": "lookup", "lookups": {"@assets": "cope_lookups"}}
+        "lemmatizer", config={"mode": "lookup", "lookups": {"@misc": "cope_lookups"}}
    )
    assert isinstance(lemmatizer.lookups, Lookups)
    assert lemmatizer.mode == "lookup"
@@ -51,14 +51,14 @@ def test_lemmatizer_init(nlp):
    nlp.remove_pipe("lemmatizer")

-    @registry.assets("empty_lookups")
+    @registry.misc("empty_lookups")
    def empty_lookups():
        return Lookups()

    with pytest.raises(ValueError):
        nlp.add_pipe(
            "lemmatizer",
-            config={"mode": "lookup", "lookups": {"@assets": "empty_lookups"}},
+            config={"mode": "lookup", "lookups": {"@misc": "empty_lookups"}},
        )
@@ -79,7 +79,7 @@ def test_lemmatizer_config(nlp, lemmatizer):

def test_lemmatizer_serialize(nlp, lemmatizer):
-    @registry.assets("cope_lookups")
+    @registry.misc("cope_lookups")
    def cope_lookups():
        lookups = Lookups()
        lookups.add_table("lemma_lookup", {"cope": "cope"})
@@ -90,7 +90,7 @@ def test_lemmatizer_serialize(nlp, lemmatizer):

    nlp2 = English()
    lemmatizer2 = nlp2.add_pipe(
-        "lemmatizer", config={"mode": "rule", "lookups": {"@assets": "cope_lookups"}}
+        "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
    )
    lemmatizer2.from_bytes(lemmatizer.to_bytes())
    assert lemmatizer.to_bytes() == lemmatizer2.to_bytes()
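Outside the test suite, the same wiring works for any lookups provider. A sketch with hypothetical names, assuming this commit's lemmatizer config surface:

```python
from spacy.lang.en import English
from spacy.lookups import Lookups
from spacy.util import registry

@registry.misc("my_lemma_lookups")  # hypothetical name
def my_lemma_lookups() -> Lookups:
    lookups = Lookups()
    lookups.add_table("lemma_lookup", {"cats": "cat"})
    return lookups

nlp = English()
nlp.add_pipe("lemmatizer", config={"mode": "lookup", "lookups": {"@misc": "my_lemma_lookups"}})
doc = nlp("cats")  # doc[0].lemma_ is filled from the registered lookup table
```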

View File

@@ -71,7 +71,7 @@ def tagger():
def entity_linker():
    nlp = Language()

-    @registry.assets.register("TestIssue5230KB.v1")
+    @registry.misc.register("TestIssue5230KB.v1")
    def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
        def create_kb(vocab):
            kb = KnowledgeBase(vocab, entity_vector_length=1)
@@ -80,7 +80,7 @@ def entity_linker():
        return create_kb

-    config = {"kb_loader": {"@assets": "TestIssue5230KB.v1"}}
+    config = {"kb_loader": {"@misc": "TestIssue5230KB.v1"}}
    entity_linker = nlp.add_pipe("entity_linker", config=config)
    # need to add model for two reasons:
    # 1. no model leads to error in serialization,

View File

@@ -85,7 +85,7 @@ def test_serialize_subclassed_kb():
            super().__init__(vocab, entity_vector_length)
            self.custom_field = custom_field

-    @registry.assets.register("spacy.CustomKB.v1")
+    @registry.misc.register("spacy.CustomKB.v1")
    def custom_kb(
        entity_vector_length: int, custom_field: int
    ) -> Callable[["Vocab"], KnowledgeBase]:
@@ -101,7 +101,7 @@ def test_serialize_subclassed_kb():
    nlp = English()
    config = {
        "kb_loader": {
-            "@assets": "spacy.CustomKB.v1",
+            "@misc": "spacy.CustomKB.v1",
            "entity_vector_length": 342,
            "custom_field": 666,
        }

View File

@@ -76,7 +76,7 @@ class registry(thinc.registry):
    lemmatizers = catalogue.create("spacy", "lemmatizers", entry_points=True)
    lookups = catalogue.create("spacy", "lookups", entry_points=True)
    displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
-    assets = catalogue.create("spacy", "assets", entry_points=True)
+    misc = catalogue.create("spacy", "misc", entry_points=True)
    # Callback functions used to manipulate nlp object etc.
    callbacks = catalogue.create("spacy", "callbacks")
    batchers = catalogue.create("spacy", "batchers", entry_points=True)
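Like the other registries created with `entry_points=True`, `misc` can be filled either with the decorator or, presumably via catalogue's matching `spacy_misc` entry-point group, by installed packages. A registration and lookup sketch with a hypothetical name:

```python
from spacy.util import registry

@registry.misc.register("my_pkg.reversed_dict.v1")  # hypothetical name
def reversed_dict(d: dict) -> dict:
    return {value: key for key, value in d.items()}

# The config system resolves "@misc" blocks through the same lookup:
func = registry.misc.get("my_pkg.reversed_dict.v1")
assert func({"lol": "laughing out loud"}) == {"laughing out loud": "lol"}
```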

View File

@@ -673,11 +673,11 @@ into the "real world". This requires 3 main components:
> subword_features = true
>
> [kb_loader]
-> @assets = "spacy.EmptyKB.v1"
+> @misc = "spacy.EmptyKB.v1"
> entity_vector_length = 64
>
> [get_candidates]
-> @assets = "spacy.CandidateGenerator.v1"
+> @misc = "spacy.CandidateGenerator.v1"
> ```

The `EntityLinker` model architecture is a Thinc `Model` with a

View File

@@ -34,8 +34,8 @@ architectures and their arguments and hyperparameters.
>     "incl_prior": True,
>     "incl_context": True,
>     "model": DEFAULT_NEL_MODEL,
->     "kb_loader": {'@assets': 'spacy.EmptyKB.v1', 'entity_vector_length': 64},
->     "get_candidates": {'@assets': 'spacy.CandidateGenerator.v1'},
+>     "kb_loader": {'@misc': 'spacy.EmptyKB.v1', 'entity_vector_length': 64},
+>     "get_candidates": {'@misc': 'spacy.CandidateGenerator.v1'},
> }
> nlp.add_pipe("entity_linker", config=config)
> ```
@@ -66,7 +66,7 @@ https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py
> entity_linker = nlp.add_pipe("entity_linker", config=config)
>
> # Construction via add_pipe with custom KB and candidate generation
-> config = {"kb": {"@assets": "my_kb.v1"}}
+> config = {"kb": {"@misc": "my_kb.v1"}}
> entity_linker = nlp.add_pipe("entity_linker", config=config)
>
> # Construction from class
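A sketch of what the `"my_kb.v1"` placeholder above could look like, assuming this commit's `KnowledgeBase` API:

```python
from typing import Callable
from spacy.kb import KnowledgeBase
from spacy.util import registry
from spacy.vocab import Vocab

@registry.misc.register("my_kb.v1")
def load_my_kb() -> Callable[[Vocab], KnowledgeBase]:
    def create_kb(vocab: Vocab) -> KnowledgeBase:
        kb = KnowledgeBase(vocab, entity_vector_length=64)
        # Hypothetical entity, with a vector matching entity_vector_length.
        kb.add_entity(entity="Q42", freq=12, entity_vector=[0.0] * 64)
        return kb
    return create_kb
```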

View File

@@ -307,7 +307,6 @@ factories.
| Registry name     | Description |
| ----------------- | ----------- |
| `architectures`   | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. |
-| `assets`          | Registry for data assets, knowledge bases etc. |
| `batchers`        | Registry for training and evaluation [data batchers](#batchers). |
| `callbacks`       | Registry for custom callbacks to [modify the `nlp` object](/usage/training#custom-code-nlp-callbacks) before training. |
| `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). |
@@ -322,6 +321,7 @@
| `readers`    | Registry for training and evaluation data readers like [`Corpus`](/api/corpus). |
| `schedules`  | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
| `tokenizers` | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. |
+| `misc`       | Registry for miscellaneous functions that return data assets, knowledge bases or anything else you may need. |

### spacy-transformers registry {#registry-transformers}

View File

@@ -842,12 +842,20 @@ load and train custom pipelines with custom components. A simple solution is to
**register a function** that returns your resources. The
[registry](/api/top-level#registry) lets you **map string names to functions**
that create objects, so given a name and optional arguments, spaCy will know how
-to recreate the object. To register a function that returns a custom asset, you
-can use the `@spacy.registry.assets` decorator with a single argument, the name:
+to recreate the object. To register a function that returns your custom
+dictionary, you can use the `@spacy.registry.misc` decorator with a single
+argument, the name:
+
+> #### What's the misc registry?
+>
+> The [`registry`](/api/top-level#registry) provides different categories for
+> different types of functions, for example model architectures, tokenizers or
+> batchers. `misc` is intended for miscellaneous functions that don't fit
+> anywhere else.

```python
### Registered function for assets {highlight="1"}
-@spacy.registry.assets("acronyms.slang_dict.v1")
+@spacy.registry.misc("acronyms.slang_dict.v1")
def create_acronyms_slang_dict():
    dictionary = {"lol": "laughing out loud", "brb": "be right back"}
    dictionary.update({value: key for key, value in dictionary.items()})
@@ -856,9 +864,9 @@ def create_acronyms_slang_dict():
In your `default_config` (and later in your
[training config](/usage/training#config)), you can now refer to the function
-registered under the name `"acronyms.slang_dict.v1"` using the `@assets` key.
-This tells spaCy how to create the value, and when your component is created,
-the result of the registered function is passed in as the key `"dictionary"`.
+registered under the name `"acronyms.slang_dict.v1"` using the `@misc` key. This
+tells spaCy how to create the value, and when your component is created, the
+result of the registered function is passed in as the key `"dictionary"`.

> #### config.cfg
>
@@ -867,22 +875,22 @@ the result of the registered function is passed in as the key `"dictionary"`.
> factory = "acronyms"
>
> [components.acronyms.dictionary]
-> @assets = "acronyms.slang_dict.v1"
+> @misc = "acronyms.slang_dict.v1"
> ```

```diff
- default_config = {"dictionary": DICTIONARY}
-+ default_config = {"dictionary": {"@assets": "acronyms.slang_dict.v1"}}
++ default_config = {"dictionary": {"@misc": "acronyms.slang_dict.v1"}}
```

Using a registered function also means that you can easily include your custom
components in pipelines that you [train](/usage/training). To make sure spaCy
-knows where to find your custom `@assets` function, you can pass in a Python
-file via the argument `--code`. If someone else is using your component, all
-they have to do to customize the data is to register their own function and swap
-out the name. Registered functions can also take **arguments** by the way that
-can be defined in the config as well; you can read more about this in the docs
-on [training with custom code](/usage/training#custom-code).
+knows where to find your custom `@misc` function, you can pass in a Python file
+via the argument `--code`. If someone else is using your component, all they
+have to do to customize the data is to register their own function and swap out
+the name. Registered functions can also take **arguments** by the way that can
+be defined in the config as well; you can read more about this in the docs on
+[training with custom code](/usage/training#custom-code).
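To round out the docs' running example, a sketch of a component factory that consumes the registered dictionary (the body of the `acronyms` component is elided in this excerpt, so the inner function below is a stand-in):

```python
from typing import Dict
from spacy.language import Language
from spacy.tokens import Doc

@Language.factory(
    "acronyms",
    default_config={"dictionary": {"@misc": "acronyms.slang_dict.v1"}},
)
def create_acronym_component(nlp: Language, name: str, dictionary: Dict[str, str]):
    # spaCy resolves the "@misc" block and passes its result in as `dictionary`.
    def acronym_component(doc: Doc) -> Doc:
        return doc  # placeholder; the full docs example replaces acronyms
    return acronym_component
```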
### Python type hints and pydantic validation {#type-hints new="3"}