From 40276fd3be231be6969f8c51889c13e77a726fa8 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 12 Oct 2020 11:41:27 +0200 Subject: [PATCH] update NEL docs after latest refactor --- spacy/ml/models/entity_linker.py | 3 +- website/docs/api/architectures.md | 19 +++---- website/docs/api/entitylinker.md | 84 ++++++++++++++++++++----------- 3 files changed, 68 insertions(+), 38 deletions(-) diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index d945e5fba..f37203b1b 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -1,3 +1,4 @@ +from pathlib import Path from typing import Optional, Callable, Iterable from thinc.api import chain, clone, list2ragged, reduce_mean, residual from thinc.api import Model, Maxout, Linear @@ -25,7 +26,7 @@ def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model: @registry.misc.register("spacy.KBFromFile.v1") -def load_kb(kb_path: str) -> Callable[[Vocab], KnowledgeBase]: +def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]: def kb_from_file(vocab): kb = KnowledgeBase(vocab, entity_vector_length=1) kb.from_disk(kb_path) diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 5246a3ed6..3157c261a 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -637,13 +637,6 @@ into the "real world". This requires 3 main components: > window_size = 1 > maxout_pieces = 3 > subword_features = true -> -> [kb_loader] -> @misc = "spacy.EmptyKB.v1" -> entity_vector_length = 64 -> -> [get_candidates] -> @misc = "spacy.CandidateGenerator.v1" > ``` The `EntityLinker` model architecture is a Thinc `Model` with a @@ -657,13 +650,21 @@ The `EntityLinker` model architecture is a Thinc `Model` with a ### spacy.EmptyKB.v1 {#EmptyKB} -A function that creates a default, empty `KnowledgeBase` from a -[`Vocab`](/api/vocab) instance. 
+A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab) +instance. This is the default when a new entity linker component is created. | Name | Description | | ---------------------- | ----------------------------------------------------------------------------------- | | `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ | +### spacy.KBFromFile.v1 {#KBFromFile} + +A function that reads an existing `KnowledgeBase` from file. + +| Name | Description | +| --------- | -------------------------------------------------------- | +| `kb_path` | The location of the KB that was stored to file. ~~Path~~ | + ### spacy.CandidateGenerator.v1 {#CandidateGenerator} A function that takes as input a [`KnowledgeBase`](/api/kb) and a diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 169a175e2..0904bbf72 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -34,20 +34,20 @@ architectures and their arguments and hyperparameters. > "incl_prior": True, > "incl_context": True, > "model": DEFAULT_NEL_MODEL, -> "kb_loader": {'@misc': 'spacy.EmptyKB.v1', 'entity_vector_length': 64}, +> "entity_vector_length": 64, > "get_candidates": {'@misc': 'spacy.CandidateGenerator.v1'}, > } > nlp.add_pipe("entity_linker", config=config) > ``` -| Setting | Description | -| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | -| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. 
~~bool~~ | -| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | -| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. Defaults to [EmptyKB](/api/architectures#EmptyKB), a function returning an empty `KnowledgeBase` with an `entity_vector_length` of `64`. ~~Callable[[Vocab], KnowledgeBase]~~ | -| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | +| Setting | Description | +| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | +| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | +| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | +| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to 64. ~~int~~ | +| `get_candidates` | Function that generates plausible candidates for a given `Span` object. 
Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/entity_linker.py @@ -65,10 +65,6 @@ architectures and their arguments and hyperparameters. > config = {"model": {"@architectures": "my_el.v1"}} > entity_linker = nlp.add_pipe("entity_linker", config=config) > -> # Construction via add_pipe with custom KB and candidate generation -> config = {"kb": {"@misc": "my_kb.v1"}} -> entity_linker = nlp.add_pipe("entity_linker", config=config) -> > # Construction from class > from spacy.pipeline import EntityLinker > entity_linker = EntityLinker(nlp.vocab, model) @@ -76,21 +72,25 @@ architectures and their arguments and hyperparameters. Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and -[`nlp.add_pipe`](/api/language#add_pipe). Note that both the internal -`KnowledgeBase` as well as the Candidate generator can be customized by -providing custom registered functions. +[`nlp.add_pipe`](/api/language#add_pipe). -| Name | Description | -| ---------------- | -------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| _keyword-only_ | | -| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. ~~Callable[[Vocab], KnowledgeBase]~~ | -| `get_candidates` | Function that generates plausible candidates for a given `Span` object. 
~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | -| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ | -| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ | -| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ | +Upon construction of the entity linker component, an empty knowledge base is +constructed with the provided `entity_vector_length`. If you want to use a +custom knowledge base, you should either call +[`set_kb`](/api/entitylinker#set_kb) or provide a `kb_loader` in the +[`initialize`](/api/entitylinker#initialize) call. + +| Name | Description | +| ---------------------- | -------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~ | +| `get_candidates` | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | +| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ | +| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ | +| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ | ## EntityLinker.\_\_call\_\_ {#call tag="method"} @@ -139,6 +139,28 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. 
~~Doc~~ | +## EntityLinker.set_kb {#set_kb tag="method" new="3"} + +The `kb_loader` should be a function that takes a `Vocab` instance and creates +the `KnowledgeBase`, ensuring that the strings of the knowledge base are synced +with the current vocab. + +> #### Example +> +> ```python +> def create_kb(vocab): +> kb = KnowledgeBase(vocab, entity_vector_length=128) +> kb.add_entity(...) +> kb.add_alias(...) +> return kb +> entity_linker = nlp.add_pipe("entity_linker") +> entity_linker.set_kb(create_kb) +> ``` + +| Name | Description | +| ----------- | ---------------------------------------------------------------------------------------------------------------- | +| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. ~~Callable[[Vocab], KnowledgeBase]~~ | + ## EntityLinker.initialize {#initialize tag="method" new="3"} Initialize the component for training. `get_examples` should be a function that @@ -150,6 +172,11 @@ network, setting up the label scheme based on the data. This method is typically called by [`Language.initialize`](/api/language#initialize). +Optionally, a `kb_loader` argument may be specified to change the internal +knowledge base. This argument should be a function that takes a `Vocab` instance +and creates the `KnowledgeBase`, ensuring that the strings of the knowledge base +are synced with the current vocab. + This method was previously called `begin_training`. @@ -160,7 +187,7 @@ This method was previously called `begin_training`. > > ```python > entity_linker = nlp.add_pipe("entity_linker") -> entity_linker.initialize(lambda: [], nlp=nlp) +> entity_linker.initialize(lambda: [], nlp=nlp, kb_loader=my_kb) > ``` | Name | Description | @@ -168,6 +195,7 @@ This method was previously called `begin_training`. | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. 
~~Callable[[], Iterable[Example]]~~ | | _keyword-only_ | | | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. ~~Callable[[Vocab], KnowledgeBase]~~ | ## EntityLinker.predict {#predict tag="method"}