From 881e3f8fd04cb3a0bcfc1a4ed167863690fa97ed Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 6 Aug 2020 15:29:44 +0200 Subject: [PATCH 1/7] add docbin explanation and example --- spacy/gold/corpus.py | 2 +- website/docs/api/data-formats.md | 32 ++++++++++++++++++++++++++------ 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index 4a65d8885..47f9a3b53 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -20,7 +20,7 @@ def create_docbin_reader( class Corpus: """Iterate Example objects from a file or directory of DocBin (.spacy) - formated data files. + formatted data files. path (Path): The directory or filename to read from. gold_preproc (bool): Whether to set up the Example object with gold-standard diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index ae398cbf5..1813bff6a 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -17,12 +17,32 @@ label schemes used in its components, depending on the data it was trained on. ### Binary training format {#binary-training new="3"} -The built-in [`convert`](/api/cli#convert) command helps you convert the -`.conllu` format used by the -[Universal Dependencies corpora](https://github.com/UniversalDependencies) as -well as spaCy's previous [JSON format](#json-input). +> #### Example +> +> ```python +> from pathlib import Path +> from spacy.tokens import DocBin +> from spacy.gold import Corpus +> output_file = Path(dir) / "output.spacy" +> data = DocBin(docs=docs).to_bytes() +> with output_file.open("wb") as file_: +> file_.write(data) +> reader = Corpus(output_file) +> ``` - +The main data format used in spaCy v3 is a binary format created by serializing +a [`DocBin`](/api/docbin) object, which represents a collection of `Doc` +objects. Typically, the extension for these binary files is `.spacy`, and they +are used as input format for specifying a [training corpus](/api/corpus) and for +spaCy's CLI [`train`](/api/cli#train) command. + +This binary format is extremely efficient in storage, especially when packing +multiple documents together. + +The built-in [`convert`](/api/cli#convert) command helps you convert spaCy's +previous [JSON format](#json-input) to this new `DocBin` format. It also +supports conversion of the `.conllu` format used by the +[Universal Dependencies corpora](https://github.com/UniversalDependencies). ### JSON training format {#json-input tag="deprecated"} @@ -30,7 +50,7 @@ well as spaCy's previous [JSON format](#json-input). As of v3.0, the JSON input format is deprecated and is replaced by the [binary format](#binary-training). Instead of converting [`Doc`](/api/doc) -objects to JSON, you can now now serialize them directly using the +objects to JSON, you can now serialize them directly using the [`DocBin`](/api/docbin) container and then use them as input data. 
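For example, a handful of annotated `Doc` objects can be packed into a `DocBin` and
written out as a `.spacy` file. The sketch below is purely illustrative: the text,
the entity label and the output path are made up.

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc = nlp.make_doc("Apple is looking at buying U.K. startup")
# Attach a gold-standard entity annotation ("Apple" as ORG)
doc.ents = [doc.char_span(0, 5, label="ORG")]

# Pack the annotated docs into a DocBin and write the binary data to disk
doc_bin = DocBin(docs=[doc])
with open("./train.spacy", "wb") as file_:
    file_.write(doc_bin.to_bytes())
```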
[`spacy convert`](/api/cli) lets you convert your JSON data to the new `.spacy` From 0b4d1e1bc4fb0993b874b8da0cd73ab973e8eabd Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 6 Aug 2020 15:47:31 +0200 Subject: [PATCH 2/7] 'debug data' instead of 'debug-data' --- spacy/errors.py | 4 ++-- website/docs/api/cli.md | 4 ++-- website/docs/usage/training.md | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 5c443ccad..7f47dd332 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -169,9 +169,9 @@ class Errors: "training a named entity recognizer, also make sure that none of " "your annotated entity spans have leading or trailing whitespace " "or punctuation. " - "You can also use the experimental `debug-data` command to " + "You can also use the experimental `debug data` command to " "validate your JSON-formatted training data. For details, run:\n" - "python -m spacy debug-data --help") + "python -m spacy debug data --help") E025 = ("String is too long: {length} characters. Max is 2**30.") E026 = ("Error accessing token at position {i}: out of bounds in Doc of " "length {length}.") diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 00c3bac57..8c40be904 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -132,7 +132,7 @@ $ python -m spacy init config [output] [--base] [--lang] [--model] [--pipeline] | `--base`, `-b` | option | Optional base config file to auto-fill with defaults. | | `--lang`, `-l` | option | Optional language code to use for blank config. If a `--pipeline` is specified, the components will be added in order. | | `--model`, `-m` | option | Optional base model to copy config from. If a `--pipeline` is specified, only those components will be kept, and all other components not in the model will be added. | -| `--pipeline`, `-p` | option | Optional comma-separate pipeline of components to add to blank language or model. | +| `--pipeline`, `-p` | option | Optional comma-separated pipeline of components to add to blank language or model. | | **CREATES** | config | Complete and auto-filled config file for training. | ### init model {#init-model new="2"} @@ -271,7 +271,7 @@ low data labels and more. -The `debug-data` command is now available as a subcommand of `spacy debug`. It +The `debug data` command is now available as a subcommand of `spacy debug`. It takes the same arguments as `train` and reads settings off the [`config.cfg` file](/usage/training#config) and optional [overrides](/usage/training#config-overrides) on the CLI. diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 5b9e76c02..37f838fda 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -83,12 +83,12 @@ $ python -m spacy init config config.cfg --base base_config.cfg > #### Tip: Debug your data > -> The [`debug-data` command](/api/cli#debug-data) lets you analyze and validate +> The [`debug data` command](/api/cli#debug-data) lets you analyze and validate > your training and development data, get useful stats, and find problems like > invalid entity annotations, cyclic dependencies, low data labels and more. 
> > ```bash -> $ python -m spacy debug-data en train.spacy dev.spacy --verbose +> $ python -m spacy debug data en train.spacy dev.spacy --verbose > ``` You can now run [`train`](/api/cli#train) with your training and development From 81d0b1c39078e5087a5d6f7e3faeeb3f78b0fb12 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 6 Aug 2020 16:22:50 +0200 Subject: [PATCH 3/7] update EL pipe arguments --- website/docs/api/entitylinker.md | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 18d9c5edd..2708b9928 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -23,22 +23,24 @@ architectures and their arguments and hyperparameters. > ```python > from spacy.pipeline.entity_linker import DEFAULT_NEL_MODEL > config = { -> "kb": None, > "labels_discard": [], > "incl_prior": True, > "incl_context": True, > "model": DEFAULT_NEL_MODEL, +> "kb_loader": {'@assets': 'spacy.EmptyKB.v1', 'entity_vector_length': 64}, +> "get_candidates": {'@assets': 'spacy.CandidateGenerator.v1'}, > } > nlp.add_pipe("entity_linker", config=config) > ``` -| Setting | Type | Description | Default | -| ---------------- | ------------------------------------------ | ----------------------------------------------------------------------- | ----------------------------------------------- | -| `kb` | `KnowledgeBase` | The [`KnowledgeBase`](/api/kb) holding all entities and their aliases. | `None` | -| `labels_discard` | `Iterable[str]` | NER labels that will automatically get a "NIL" prediction. | `[]` | -| `incl_prior` | bool | Whether or not to include prior probabilities from the KB in the model. | `True` | -| `incl_context` | bool | Whether or not to include the local context in the model. | `True` | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [EntityLinker](/api/architectures#EntityLinker) | +| Setting | Type | Description | Default | +| ---------------- | -------------------------------------------------------- | ------------------------------------------------------------------------- | ------------------------------------------------------ | +| `labels_discard` | `Iterable[str]` | NER labels that will automatically get a "NIL" prediction. | `[]` | +| `incl_prior` | bool | Whether or not to include prior probabilities from the KB in the model. | `True` | +| `incl_context` | bool | Whether or not to include the local context in the model. | `True` | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [EntityLinker](/api/architectures#EntityLinker) | +| `kb_loader` | `Callable[[Vocab], KnowledgeBase]` | Function that creates a [`KnowledgeBase`](/api/kb) from a Vocab instance. | An empty KnowledgeBase with `entity_vector_length` 64. | +| `get_candidates` | `Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]` | Function that generates plausible candidates for a given `Span` object. | Built-in dictionary-lookup function. 
| ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py From f396f091dc256827031392ece21a165048870b21 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 6 Aug 2020 16:40:48 +0200 Subject: [PATCH 4/7] update EL API --- website/docs/api/entitylinker.md | 46 ++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 2708b9928..652574d15 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -33,14 +33,14 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("entity_linker", config=config) > ``` -| Setting | Type | Description | Default | -| ---------------- | -------------------------------------------------------- | ------------------------------------------------------------------------- | ------------------------------------------------------ | -| `labels_discard` | `Iterable[str]` | NER labels that will automatically get a "NIL" prediction. | `[]` | -| `incl_prior` | bool | Whether or not to include prior probabilities from the KB in the model. | `True` | -| `incl_context` | bool | Whether or not to include the local context in the model. | `True` | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [EntityLinker](/api/architectures#EntityLinker) | -| `kb_loader` | `Callable[[Vocab], KnowledgeBase]` | Function that creates a [`KnowledgeBase`](/api/kb) from a Vocab instance. | An empty KnowledgeBase with `entity_vector_length` 64. | -| `get_candidates` | `Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]` | Function that generates plausible candidates for a given `Span` object. | Built-in dictionary-lookup function. | +| Setting | Type | Description | Default | +| ---------------- | -------------------------------------------------------- | --------------------------------------------------------------------------- | ------------------------------------------------------ | +| `labels_discard` | `Iterable[str]` | NER labels that will automatically get a "NIL" prediction. | `[]` | +| `incl_prior` | bool | Whether or not to include prior probabilities from the KB in the model. | `True` | +| `incl_context` | bool | Whether or not to include the local context in the model. | `True` | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [EntityLinker](/api/architectures#EntityLinker) | +| `kb_loader` | `Callable[[Vocab], KnowledgeBase]` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. | An empty KnowledgeBase with `entity_vector_length` 64. | +| `get_candidates` | `Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]` | Function that generates plausible candidates for a given `Span` object. | Built-in dictionary-lookup function. 
| ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py @@ -55,7 +55,11 @@ https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py > entity_linker = nlp.add_pipe("entity_linker") > > # Construction via add_pipe with custom model -> config = {"model": {"@architectures": "my_el"}} +> config = {"model": {"@architectures": "my_el.v1"}} +> entity_linker = nlp.add_pipe("entity_linker", config=config) +> +> # Construction via add_pipe with custom KB and candidate generation +> config = {"kb_loader": {"@assets": "my_kb.v1"}, "get_candidates": {"@assets": "my_candidates.v1"},} > entity_linker = nlp.add_pipe("entity_linker", config=config) > > # Construction from class @@ -67,18 +71,20 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). - +Note that both the internal KB as well as the Candidate generator can be +customized by providing custom registered functions. -| Name | Type | Description | -| ---------------- | --------------- | ------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared vocabulary. | -| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | -| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | -| _keyword-only_ | | | -| `kb` | `KnowlegeBase` | The [`KnowledgeBase`](/api/kb) holding all entities and their aliases. | -| `labels_discard` | `Iterable[str]` | NER labels that will automatically get a "NIL" prediction. | -| `incl_prior` | bool | Whether or not to include prior probabilities from the KB in the model. | -| `incl_context` | bool | Whether or not to include the local context in the model. | +| Name | Type | Description | +| ---------------- | -------------------------------------------------------- | ------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The shared vocabulary. | +| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | +| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | +| _keyword-only_ | | | +| `kb_loader` | `Callable[[Vocab], KnowledgeBase]` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. | +| `get_candidates` | `Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]` | Function that generates plausible candidates for a given `Span` object. | +| `labels_discard` | `Iterable[str]` | NER labels that will automatically get a "NIL" prediction. | +| `incl_prior` | bool | Whether or not to include prior probabilities from the KB in the model. | +| `incl_context` | bool | Whether or not to include the local context in the model. 
| ## EntityLinker.\_\_call\_\_ {#call tag="method"} From e8fd0c1f1e9deaf35771e9c875cf9e33c9a3bf16 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 6 Aug 2020 17:41:26 +0200 Subject: [PATCH 5/7] EL architectures documentation --- website/docs/api/architectures.md | 60 +++++++++++++++++++++++++++---- website/docs/api/entitylinker.md | 6 ++++ 2 files changed, 60 insertions(+), 6 deletions(-) diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index a22ee5be8..a9849cc81 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -158,9 +158,21 @@ architectures into your training config. ## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"} +An Entity Linker component disambiguates textual mentions (tagged as named +entities) to unique identifiers, grounding the named entities into the "real +world". This requires 3 main components: + +- A [`KnowledgeBase`](/api/kb) (KB) holding the unique identifiers, potential + synonyms and prior probabilities. +- A candidate generation step to produce a set of likely identifiers, given a + certain textual mention. +- A Machine learning [`Model`](https://thinc.ai/docs/api-model) that picks the + most plausible ID from the set of candidates. + ### spacy.EntityLinker.v1 {#EntityLinker} - +The `EntityLinker` model architecture is a `Thinc` `Model` with a Linear output +layer. > #### Example Config > @@ -170,10 +182,46 @@ architectures into your training config. > nO = null > > [model.tok2vec] -> # ... +> @architectures = "spacy.HashEmbedCNN.v1" +> pretrained_vectors = null +> width = 96 +> depth = 2 +> embed_size = 300 +> window_size = 1 +> maxout_pieces = 3 +> subword_features = true +> dropout = null +> +> [kb_loader] +> @assets = "spacy.EmptyKB.v1" +> entity_vector_length = 64 +> +> [get_candidates] +> @assets = "spacy.CandidateGenerator.v1" > ``` -| Name | Type | Description | -| --------- | ------------------------------------------ | ----------- | -| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | | -| `nO` | int | | +| Name | Type | Description | +| --------- | ------------------------------------------ | ---------------------------------------------------------------------------------------- | +| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | The [`tok2vec`](#tok2vec) layer of the model. | +| `nO` | int | Output dimension, determined by the length of the vectors encoding each entity in the KB | + +If the `nO` dimension is not set, the Entity Linking component will set it when +`begin_training` is called. + +### spacy.EmptyKB.v1 {#EmptyKB} + +A function that creates a default, empty Knowledge Base from a [`Vocab`](/api/vocab) instance. + +| Name | Type | Description | +| ---------------------- | ---- | -------------------------------------------------------- | +| `entity_vector_length` | int | The length of the vectors encoding each entity in the KB - 64 by default. | + +### spacy.CandidateGenerator.v1 {#CandidateGenerator} + +A function that takes as input a [`KnowledgeBase`](/api/kb) and a [`Span`](/api/span) object denoting a +named entity, and returns a list of plausible +[`Candidate` objects](/api/kb/#candidate_init). + +The default `CandidateGenerator` simply uses the text of a mention to find its +potential aliases in the Knowledgebase. Note that this function is +case-dependent. 
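Because candidate generation is just a function with the signature
`(KnowledgeBase, Span) -> Iterable[Candidate]`, it can be swapped out for a custom
one. A rough sketch is shown below; the case-insensitive fallback and the function
name are invented for illustration and are not part of spaCy.

```python
from typing import Iterable

from spacy.kb import Candidate, KnowledgeBase
from spacy.tokens import Span

def lowercase_fallback_candidates(kb: KnowledgeBase, span: Span) -> Iterable[Candidate]:
    # Mirror the built-in behaviour first: look up the mention text as-is
    candidates = kb.get_candidates(span.text)
    if not candidates:
        # Invented extra step: retry with the lowercased mention text
        candidates = kb.get_candidates(span.text.lower())
    return candidates
```

To reference such a function from the `get_candidates` setting in the config, it
would still need to be exposed as a registered function, analogous to the built-in
`spacy.CandidateGenerator.v1` used via `@assets` above; the registration step
itself is not shown here.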
diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 652574d15..50ffe5c09 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -9,6 +9,12 @@ api_string_name: entity_linker api_trainable: true --- +An Entity Linker component disambiguates textual mentions (tagged as named +entities) to unique identifiers, grounding the named entities into the "real +world". It requires a Knowledge base, a function to generate plausible +candidates from that Knowledge base given a certain textual mention, and a ML +model to pick the right candidate, given the local context of the mention. + ## Config and implementation {#config} The default config is defined by the pipeline component factory and describes From 49ddeb99eabeb701b3387ecdd0e570e6d2b9ca61 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 6 Aug 2020 19:44:47 +0200 Subject: [PATCH 6/7] add textcat architectures documentation --- website/docs/api/architectures.md | 121 +++++++++++++++++++++++++++--- 1 file changed, 112 insertions(+), 9 deletions(-) diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index a9849cc81..cb1f7095e 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -148,11 +148,113 @@ architectures into your training config. ## Text classification architectures {#textcat source="spacy/ml/models/textcat.py"} +A text classification architecture needs to take a `Doc` as input, and produce a +score for each potential label class. Textcat challenges can be binary (e.g. +sentiment analysis) or involve multiple possible labels. Multi-label challenges +can either have mutually exclusive labels (each example has exactly one label), +or multiple labels may be applicable at the same time. + +As the properties of text classification problems can vary widely, we provide +several different built-in architectures. It is recommended to experiment with +different architectures and settings to determine what works best on your +specific data and challenge. + ### spacy.TextCatEnsemble.v1 {#TextCatEnsemble} +Stacked ensemble of a bag-of-words model and a neural network model. The neural +network has an internal CNN Tok2Vec layer and uses attention. + +> #### Example Config +> +> ```ini +> [model] +> @architectures = "spacy.TextCatEnsemble.v1" +> exclusive_classes = false +> pretrained_vectors = null +> width = 64 +> embed_size = 2000 +> conv_depth = 2 +> window_size = 1 +> ngram_size = 1 +> dropout = null +> nO = null +> ``` + +| Name | Type | Description | +| -------------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------- | +| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. | +| `pretrained_vectors` | bool | Whether or not pretrained vectors will be used in addition to the feature vectors. | +| `width` | int | Output dimension of the feature encoding step. | +| `embed_size` | int | Input dimension of the feature encoding step. | +| `conv_depth` | int | Depth of the Tok2Vec layer. | +| `window_size` | int | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. | +| `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. | +| `dropout` | float | The dropout rate. 
| +| `nO` | int | Output dimension, determined by the number of different labels. | + +If the `nO` dimension is not set, the TextCategorizer component will set it when +`begin_training` is called. + +### spacy.TextCatCNN.v1 {#TextCatCNN} + +> #### Example Config +> +> ```ini +> [model] +> @architectures = "spacy.TextCatCNN.v1" +> exclusive_classes = false +> nO = null +> +> [model.tok2vec] +> @architectures = "spacy.HashEmbedCNN.v1" +> pretrained_vectors = null +> width = 96 +> depth = 4 +> embed_size = 2000 +> window_size = 1 +> maxout_pieces = 3 +> subword_features = true +> dropout = null +> ``` + +A neural network model where token vectors are calculated using a CNN. The +vectors are mean pooled and used as features in a feed-forward network. This +architecture is usually less accurate than the ensemble, but runs faster. + +| Name | Type | Description | +| ------------------- | ------------------------------------------ | --------------------------------------------------------------- | +| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. | +| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | The [`tok2vec`](#tok2vec) layer of the model. | +| `nO` | int | Output dimension, determined by the number of different labels. | + +If the `nO` dimension is not set, the TextCategorizer component will set it when +`begin_training` is called. + ### spacy.TextCatBOW.v1 {#TextCatBOW} -### spacy.TextCatCNN.v1 {#TextCatCNN} +An ngram "bag-of-words" model. This architecture should run much faster than the +others, but may not be as accurate, especially if texts are short. + +> #### Example Config +> +> ```ini +> [model] +> @architectures = "spacy.TextCatBOW.v1" +> exclusive_classes = false +> ngram_size: 1 +> no_output_layer: false +> nO = null +> ``` + +| Name | Type | Description | +| ------------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------- | +| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. | +| `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. | +| `no_output_layer` | float | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes=True`, else `Logistic`. | +| `nO` | int | Output dimension, determined by the number of different labels. | + +If the `nO` dimension is not set, the TextCategorizer component will set it when +`begin_training` is called. ### spacy.TextCatLowData.v1 {#TextCatLowData} @@ -191,11 +293,11 @@ layer. > maxout_pieces = 3 > subword_features = true > dropout = null -> +> > [kb_loader] > @assets = "spacy.EmptyKB.v1" > entity_vector_length = 64 -> +> > [get_candidates] > @assets = "spacy.CandidateGenerator.v1" > ``` @@ -210,17 +312,18 @@ If the `nO` dimension is not set, the Entity Linking component will set it when ### spacy.EmptyKB.v1 {#EmptyKB} -A function that creates a default, empty Knowledge Base from a [`Vocab`](/api/vocab) instance. +A function that creates a default, empty Knowledge Base from a +[`Vocab`](/api/vocab) instance. 
-| Name | Type | Description | -| ---------------------- | ---- | -------------------------------------------------------- | +| Name | Type | Description | +| ---------------------- | ---- | ------------------------------------------------------------------------- | | `entity_vector_length` | int | The length of the vectors encoding each entity in the KB - 64 by default. | ### spacy.CandidateGenerator.v1 {#CandidateGenerator} -A function that takes as input a [`KnowledgeBase`](/api/kb) and a [`Span`](/api/span) object denoting a -named entity, and returns a list of plausible -[`Candidate` objects](/api/kb/#candidate_init). +A function that takes as input a [`KnowledgeBase`](/api/kb) and a +[`Span`](/api/span) object denoting a named entity, and returns a list of +plausible [`Candidate` objects](/api/kb/#candidate_init). The default `CandidateGenerator` simply uses the text of a mention to find its potential aliases in the Knowledgebase. Note that this function is From 824f4b2107007886e5aa37962b76aa87fba7ef8c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 6 Aug 2020 23:20:13 +0200 Subject: [PATCH 7/7] casing consistent --- website/docs/api/architectures.md | 4 ++-- website/docs/api/entitylinker.md | 9 +++++---- website/docs/usage/processing-pipelines.md | 5 +++-- website/docs/usage/spacy-101.md | 6 +++--- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index cb1f7095e..09c5a5b1c 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -260,7 +260,7 @@ If the `nO` dimension is not set, the TextCategorizer component will set it when ## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"} -An Entity Linker component disambiguates textual mentions (tagged as named +An `EntityLinker` component disambiguates textual mentions (tagged as named entities) to unique identifiers, grounding the named entities into the "real world". This requires 3 main components: @@ -312,7 +312,7 @@ If the `nO` dimension is not set, the Entity Linking component will set it when ### spacy.EmptyKB.v1 {#EmptyKB} -A function that creates a default, empty Knowledge Base from a +A function that creates a default, empty `KnowledgeBase` from a [`Vocab`](/api/vocab) instance. | Name | Type | Description | diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 50ffe5c09..cb5145909 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -9,11 +9,12 @@ api_string_name: entity_linker api_trainable: true --- -An Entity Linker component disambiguates textual mentions (tagged as named +An `EntityLinker` component disambiguates textual mentions (tagged as named entities) to unique identifiers, grounding the named entities into the "real -world". It requires a Knowledge base, a function to generate plausible -candidates from that Knowledge base given a certain textual mention, and a ML -model to pick the right candidate, given the local context of the mention. +world". It requires a `KnowledgeBase`, as well as a function to generate +plausible candidates from that `KnowledgeBase` given a certain textual mention, +and a ML model to pick the right candidate, given the local context of the +mention. 
## Config and implementation {#config} diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 7c47c0c73..ae1616f8b 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -380,8 +380,9 @@ table instead of only returning the structured data. > #### ✏️ Things to try > -> 1. Add the components `"ner"` and `"sentencizer"` _before_ the entity linker. -> The analysis should now show no problems, because requirements are met. +> 1. Add the components `"ner"` and `"sentencizer"` _before_ the +> `"entity_linker"`. The analysis should now show no problems, because +> requirements are met. ```python ### {executable="true"} diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md index 19580dc0f..db471b1f0 100644 --- a/website/docs/usage/spacy-101.md +++ b/website/docs/usage/spacy-101.md @@ -122,7 +122,7 @@ related to more general machine learning functionality. | **Lemmatization** | Assigning the base forms of words. For example, the lemma of "was" is "be", and the lemma of "rats" is "rat". | | **Sentence Boundary Detection** (SBD) | Finding and segmenting individual sentences. | | **Named Entity Recognition** (NER) | Labelling named "real-world" objects, like persons, companies or locations. | -| **Entity Linking** (EL) | Disambiguating textual entities to unique identifiers in a Knowledge Base. | +| **Entity Linking** (EL) | Disambiguating textual entities to unique identifiers in a knowledge base. | | **Similarity** | Comparing words, text spans and documents and how similar they are to each other. | | **Text Classification** | Assigning categories or labels to a whole document, or parts of a document. | | **Rule-based Matching** | Finding sequences of tokens based on their texts and linguistic annotations, similar to regular expressions. | @@ -379,7 +379,7 @@ spaCy will also export the `Vocab` when you save a `Doc` or `nlp` object. This will give you the object and its encoded annotations, plus the "key" to decode it. -## Knowledge Base {#kb} +## Knowledge base {#kb} To support the entity linking task, spaCy stores external knowledge in a [`KnowledgeBase`](/api/kb). The knowledge base (KB) uses the `Vocab` to store @@ -426,7 +426,7 @@ print("Number of aliases in KB:", kb.get_size_aliases()) # 2 ### Candidate generation -Given a textual entity, the Knowledge Base can provide a list of plausible +Given a textual entity, the knowledge base can provide a list of plausible candidates or entity identifiers. The [`EntityLinker`](/api/entitylinker) will take this list of candidates as input, and disambiguate the mention to the most probable identifier, given the document context.
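For example, a toy knowledge base can be built and queried for candidates directly.
All identifiers, frequencies, vectors and prior probabilities in this sketch are
invented for illustration.

```python
import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.blank("en")
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)

# Two invented entity identifiers with made-up frequencies and vectors
kb.add_entity(entity="Q1", freq=20, entity_vector=[1.0, 0.0, 0.0])
kb.add_entity(entity="Q2", freq=5, entity_vector=[0.0, 1.0, 0.0])

# One alias that could refer to either entity, with prior probabilities
kb.add_alias(alias="Douglas", entities=["Q1", "Q2"], probabilities=[0.7, 0.2])

# Plausible candidates for the textual mention "Douglas"
for candidate in kb.get_candidates("Douglas"):
    print(candidate.entity_, candidate.prior_prob)
```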