diff --git a/setup.py b/setup.py
index ec1bd35fa..899d940ed 100755
--- a/setup.py
+++ b/setup.py
@@ -30,7 +30,9 @@ MOD_NAMES = [
"spacy.lexeme",
"spacy.vocab",
"spacy.attrs",
- "spacy.kb",
+ "spacy.kb.candidate",
+ "spacy.kb.kb",
+ "spacy.kb.kb_in_memory",
"spacy.ml.parser_model",
"spacy.morphology",
"spacy.pipeline.dep_parser",
diff --git a/spacy/errors.py b/spacy/errors.py
index 5ee1476c2..e2201284f 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -709,9 +709,9 @@ class Errors(metaclass=ErrorsWithCodes):
"`nlp.enable_pipe` instead.")
E927 = ("Can't write to frozen list Maybe you're trying to modify a computed "
"property or default function argument?")
- E928 = ("A KnowledgeBase can only be serialized to/from from a directory, "
+ E928 = ("An InMemoryLookupKB can only be serialized to/from from a directory, "
"but the provided argument {loc} points to a file.")
- E929 = ("Couldn't read KnowledgeBase from {loc}. The path does not seem to exist.")
+ E929 = ("Couldn't read InMemoryLookupKB from {loc}. The path does not seem to exist.")
E930 = ("Received invalid get_examples callback in `{method}`. "
"Expected function that returns an iterable of Example objects but "
"got: {obj}")
@@ -941,6 +941,12 @@ class Errors(metaclass=ErrorsWithCodes):
"`{arg2}`={arg2_values} but these arguments are conflicting.")
E1043 = ("Expected None or a value in range [{range_start}, {range_end}] for entity linker threshold, but got "
"{value}.")
+ E1044 = ("Expected `candidates_batch_size` to be >= 1, but got: {value}")
+ E1045 = ("Encountered {parent} subclass without `{parent}.{method}` "
+ "method in '{name}'. If you want to use this method, make "
+ "sure it's overwritten on the subclass.")
+ E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
+ "knowledge base, use `InMemoryLookupKB`.")
# Deprecated model shortcuts, only used in errors and warnings
diff --git a/spacy/kb/__init__.py b/spacy/kb/__init__.py
new file mode 100644
index 000000000..1d70a9b34
--- /dev/null
+++ b/spacy/kb/__init__.py
@@ -0,0 +1,3 @@
+from .kb import KnowledgeBase
+from .kb_in_memory import InMemoryLookupKB
+from .candidate import Candidate, get_candidates, get_candidates_batch
diff --git a/spacy/kb/candidate.pxd b/spacy/kb/candidate.pxd
new file mode 100644
index 000000000..942ce9dd0
--- /dev/null
+++ b/spacy/kb/candidate.pxd
@@ -0,0 +1,12 @@
+from .kb cimport KnowledgeBase
+from libcpp.vector cimport vector
+from ..typedefs cimport hash_t
+
+# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
+cdef class Candidate:
+ cdef readonly KnowledgeBase kb
+ cdef hash_t entity_hash
+ cdef float entity_freq
+ cdef vector[float] entity_vector
+ cdef hash_t alias_hash
+ cdef float prior_prob
diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx
new file mode 100644
index 000000000..c89efeb03
--- /dev/null
+++ b/spacy/kb/candidate.pyx
@@ -0,0 +1,74 @@
+# cython: infer_types=True, profile=True
+
+from typing import Iterable
+from .kb cimport KnowledgeBase
+from ..tokens import Span
+
+cdef class Candidate:
+ """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
+ to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
+ algorithm which will disambiguate the various candidates to the correct one.
+ Each candidate (alias, entity) pair is assigned a certain prior probability.
+
+ DOCS: https://spacy.io/api/kb/#candidate-init
+ """
+
+ def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
+ self.kb = kb
+ self.entity_hash = entity_hash
+ self.entity_freq = entity_freq
+ self.entity_vector = entity_vector
+ self.alias_hash = alias_hash
+ self.prior_prob = prior_prob
+
+ @property
+ def entity(self) -> int:
+ """RETURNS (uint64): hash of the entity's KB ID/name"""
+ return self.entity_hash
+
+ @property
+ def entity_(self) -> str:
+ """RETURNS (str): ID/name of this entity in the KB"""
+ return self.kb.vocab.strings[self.entity_hash]
+
+ @property
+ def alias(self) -> int:
+ """RETURNS (uint64): hash of the alias"""
+ return self.alias_hash
+
+ @property
+ def alias_(self) -> str:
+ """RETURNS (str): ID of the original alias"""
+ return self.kb.vocab.strings[self.alias_hash]
+
+ @property
+ def entity_freq(self) -> float:
+ return self.entity_freq
+
+ @property
+ def entity_vector(self) -> Iterable[float]:
+ return self.entity_vector
+
+ @property
+ def prior_prob(self) -> float:
+ return self.prior_prob
+
+
+def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
+ """
+ Return candidate entities for a given mention and fetching appropriate entries from the index.
+ kb (KnowledgeBase): Knowledge base to query.
+ mention (Span): Entity mention for which to identify candidates.
+ RETURNS (Iterable[Candidate]): Identified candidates.
+ """
+ return kb.get_candidates(mention)
+
+
+def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
+ """
+ Return candidate entities for the given mentions and fetching appropriate entries from the index.
+ kb (KnowledgeBase): Knowledge base to query.
+ mention (Iterable[Span]): Entity mentions for which to identify candidates.
+ RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
+ """
+ return kb.get_candidates_batch(mentions)
diff --git a/spacy/kb/kb.pxd b/spacy/kb/kb.pxd
new file mode 100644
index 000000000..1adeef8ae
--- /dev/null
+++ b/spacy/kb/kb.pxd
@@ -0,0 +1,10 @@
+"""Knowledge-base for entity or concept linking."""
+
+from cymem.cymem cimport Pool
+from libc.stdint cimport int64_t
+from ..vocab cimport Vocab
+
+cdef class KnowledgeBase:
+ cdef Pool mem
+ cdef readonly Vocab vocab
+ cdef readonly int64_t entity_vector_length
diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx
new file mode 100644
index 000000000..ce4bc0138
--- /dev/null
+++ b/spacy/kb/kb.pyx
@@ -0,0 +1,108 @@
+# cython: infer_types=True, profile=True
+
+from pathlib import Path
+from typing import Iterable, Tuple, Union
+from cymem.cymem cimport Pool
+
+from .candidate import Candidate
+from ..tokens import Span
+from ..util import SimpleFrozenList
+from ..errors import Errors
+
+
+cdef class KnowledgeBase:
+ """A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
+ to support entity linking of named entities to real-world concepts.
+ This is an abstract class and requires its operations to be implemented.
+
+ DOCS: https://spacy.io/api/kb
+ """
+
+ def __init__(self, vocab: Vocab, entity_vector_length: int):
+ """Create a KnowledgeBase."""
+ # Make sure abstract KB is not instantiated.
+ if self.__class__ == KnowledgeBase:
+ raise TypeError(
+ Errors.E1046.format(cls_name=self.__class__.__name__)
+ )
+
+ self.vocab = vocab
+ self.entity_vector_length = entity_vector_length
+ self.mem = Pool()
+
+ def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
+ """
+ Return candidate entities for specified texts. Each candidate defines the entity, the original alias,
+ and the prior probability of that alias resolving to that entity.
+ If no candidate is found for a given text, an empty list is returned.
+ mentions (Iterable[Span]): Mentions for which to get candidates.
+ RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
+ """
+ return [self.get_candidates(span) for span in mentions]
+
+ def get_candidates(self, mention: Span) -> Iterable[Candidate]:
+ """
+ Return candidate entities for specified text. Each candidate defines the entity, the original alias,
+ and the prior probability of that alias resolving to that entity.
+ If the no candidate is found for a given text, an empty list is returned.
+ mention (Span): Mention for which to get candidates.
+ RETURNS (Iterable[Candidate]): Identified candidates.
+ """
+ raise NotImplementedError(
+ Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
+ )
+
+ def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
+ """
+ Return vectors for entities.
+ entity (str): Entity name/ID.
+ RETURNS (Iterable[Iterable[float]]): Vectors for specified entities.
+ """
+ return [self.get_vector(entity) for entity in entities]
+
+ def get_vector(self, str entity) -> Iterable[float]:
+ """
+ Return vector for entity.
+ entity (str): Entity name/ID.
+ RETURNS (Iterable[float]): Vector for specified entity.
+ """
+ raise NotImplementedError(
+ Errors.E1045.format(parent="KnowledgeBase", method="get_vector", name=self.__name__)
+ )
+
+ def to_bytes(self, **kwargs) -> bytes:
+ """Serialize the current state to a binary string.
+ RETURNS (bytes): Current state as binary string.
+ """
+ raise NotImplementedError(
+ Errors.E1045.format(parent="KnowledgeBase", method="to_bytes", name=self.__name__)
+ )
+
+ def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()):
+ """Load state from a binary string.
+ bytes_data (bytes): KB state.
+ exclude (Tuple[str]): Properties to exclude when restoring KB.
+ """
+ raise NotImplementedError(
+ Errors.E1045.format(parent="KnowledgeBase", method="from_bytes", name=self.__name__)
+ )
+
+ def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
+ """
+ Write KnowledgeBase content to disk.
+ path (Union[str, Path]): Target file path.
+ exclude (Iterable[str]): List of components to exclude.
+ """
+ raise NotImplementedError(
+ Errors.E1045.format(parent="KnowledgeBase", method="to_disk", name=self.__name__)
+ )
+
+ def from_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
+ """
+ Load KnowledgeBase content from disk.
+ path (Union[str, Path]): Target file path.
+ exclude (Iterable[str]): List of components to exclude.
+ """
+ raise NotImplementedError(
+ Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
+ )
diff --git a/spacy/kb.pxd b/spacy/kb/kb_in_memory.pxd
similarity index 92%
rename from spacy/kb.pxd
rename to spacy/kb/kb_in_memory.pxd
index a823dbe1e..825a6bde9 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb/kb_in_memory.pxd
@@ -1,14 +1,12 @@
"""Knowledge-base for entity or concept linking."""
-from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int64_t
from libc.stdio cimport FILE
-from .vocab cimport Vocab
-from .typedefs cimport hash_t
-from .structs cimport KBEntryC, AliasC
-
+from ..typedefs cimport hash_t
+from ..structs cimport KBEntryC, AliasC
+from .kb cimport KnowledgeBase
ctypedef vector[KBEntryC] entry_vec
ctypedef vector[AliasC] alias_vec
@@ -16,21 +14,7 @@ ctypedef vector[float] float_vec
ctypedef vector[float_vec] float_matrix
-# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
-cdef class Candidate:
- cdef readonly KnowledgeBase kb
- cdef hash_t entity_hash
- cdef float entity_freq
- cdef vector[float] entity_vector
- cdef hash_t alias_hash
- cdef float prior_prob
-
-
-cdef class KnowledgeBase:
- cdef Pool mem
- cdef readonly Vocab vocab
- cdef int64_t entity_vector_length
-
+cdef class InMemoryLookupKB(KnowledgeBase):
# This maps 64bit keys (hash of unique entity string)
# to 64bit values (position of the _KBEntryC struct in the _entries vector).
# The PreshMap is pretty space efficient, as it uses open addressing. So
diff --git a/spacy/kb.pyx b/spacy/kb/kb_in_memory.pyx
similarity index 90%
rename from spacy/kb.pyx
rename to spacy/kb/kb_in_memory.pyx
index ae1983a8d..485e52c2f 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb/kb_in_memory.pyx
@@ -1,8 +1,7 @@
# cython: infer_types=True, profile=True
-from typing import Iterator, Iterable, Callable, Dict, Any
+from typing import Iterable, Callable, Dict, Any, Union
import srsly
-from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from cpython.exc cimport PyErr_SetFromErrno
from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
@@ -12,85 +11,28 @@ from libcpp.vector cimport vector
from pathlib import Path
import warnings
-from .typedefs cimport hash_t
-from .errors import Errors, Warnings
-from . import util
-from .util import SimpleFrozenList, ensure_path
-
-cdef class Candidate:
- """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
- to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
- algorithm which will disambiguate the various candidates to the correct one.
- Each candidate (alias, entity) pair is assigned to a certain prior probability.
-
- DOCS: https://spacy.io/api/kb/#candidate_init
- """
-
- def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
- self.kb = kb
- self.entity_hash = entity_hash
- self.entity_freq = entity_freq
- self.entity_vector = entity_vector
- self.alias_hash = alias_hash
- self.prior_prob = prior_prob
-
- @property
- def entity(self):
- """RETURNS (uint64): hash of the entity's KB ID/name"""
- return self.entity_hash
-
- @property
- def entity_(self):
- """RETURNS (str): ID/name of this entity in the KB"""
- return self.kb.vocab.strings[self.entity_hash]
-
- @property
- def alias(self):
- """RETURNS (uint64): hash of the alias"""
- return self.alias_hash
-
- @property
- def alias_(self):
- """RETURNS (str): ID of the original alias"""
- return self.kb.vocab.strings[self.alias_hash]
-
- @property
- def entity_freq(self):
- return self.entity_freq
-
- @property
- def entity_vector(self):
- return self.entity_vector
-
- @property
- def prior_prob(self):
- return self.prior_prob
+from ..tokens import Span
+from ..typedefs cimport hash_t
+from ..errors import Errors, Warnings
+from .. import util
+from ..util import SimpleFrozenList, ensure_path
+from ..vocab cimport Vocab
+from .kb cimport KnowledgeBase
+from .candidate import Candidate as Candidate
-def get_candidates(KnowledgeBase kb, span) -> Iterator[Candidate]:
- """
- Return candidate entities for a given span by using the text of the span as the alias
- and fetching appropriate entries from the index.
- This particular function is optimized to work with the built-in KB functionality,
- but any other custom candidate generation method can be used in combination with the KB as well.
- """
- return kb.get_alias_candidates(span.text)
-
-
-cdef class KnowledgeBase:
- """A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
+cdef class InMemoryLookupKB(KnowledgeBase):
+ """An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
to support entity linking of named entities to real-world concepts.
- DOCS: https://spacy.io/api/kb
+ DOCS: https://spacy.io/api/kb_in_memory
"""
def __init__(self, Vocab vocab, entity_vector_length):
- """Create a KnowledgeBase."""
- self.mem = Pool()
- self.entity_vector_length = entity_vector_length
+ """Create an InMemoryLookupKB."""
+ super().__init__(vocab, entity_vector_length)
self._entry_index = PreshMap()
self._alias_index = PreshMap()
- self.vocab = vocab
self._create_empty_vectors(dummy_hash=self.vocab.strings[""])
def _initialize_entities(self, int64_t nr_entities):
@@ -104,11 +46,6 @@ cdef class KnowledgeBase:
self._alias_index = PreshMap(nr_aliases + 1)
self._aliases_table = alias_vec(nr_aliases + 1)
- @property
- def entity_vector_length(self):
- """RETURNS (uint64): length of the entity vectors"""
- return self.entity_vector_length
-
def __len__(self):
return self.get_size_entities()
@@ -286,7 +223,10 @@ cdef class KnowledgeBase:
alias_entry.probs = probs
self._aliases_table[alias_index] = alias_entry
- def get_alias_candidates(self, str alias) -> Iterator[Candidate]:
+ def get_candidates(self, mention: Span) -> Iterable[Candidate]:
+ return self.get_alias_candidates(mention.text) # type: ignore
+
+ def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
"""
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
and the prior probability of that alias resolving to that entity.
diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index d847342a3..4d18d216a 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -1,11 +1,12 @@
from pathlib import Path
from typing import Optional, Callable, Iterable, List, Tuple
from thinc.types import Floats2d
-from thinc.api import chain, clone, list2ragged, reduce_mean, residual
-from thinc.api import Model, Maxout, Linear, noop, tuplify, Ragged
+from thinc.api import chain, list2ragged, reduce_mean, residual
+from thinc.api import Model, Maxout, Linear, tuplify, Ragged
from ...util import registry
-from ...kb import KnowledgeBase, Candidate, get_candidates
+from ...kb import KnowledgeBase, InMemoryLookupKB
+from ...kb import Candidate, get_candidates, get_candidates_batch
from ...vocab import Vocab
from ...tokens import Span, Doc
from ..extract_spans import extract_spans
@@ -78,9 +79,11 @@ def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callab
@registry.misc("spacy.KBFromFile.v1")
-def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]:
- def kb_from_file(vocab):
- kb = KnowledgeBase(vocab, entity_vector_length=1)
+def load_kb(
+ kb_path: Path,
+) -> Callable[[Vocab], KnowledgeBase]:
+ def kb_from_file(vocab: Vocab):
+ kb = InMemoryLookupKB(vocab, entity_vector_length=1)
kb.from_disk(kb_path)
return kb
@@ -88,9 +91,11 @@ def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]:
@registry.misc("spacy.EmptyKB.v1")
-def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
- def empty_kb_factory(vocab):
- return KnowledgeBase(vocab=vocab, entity_vector_length=entity_vector_length)
+def empty_kb(
+ entity_vector_length: int,
+) -> Callable[[Vocab], KnowledgeBase]:
+ def empty_kb_factory(vocab: Vocab):
+ return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)
return empty_kb_factory
@@ -98,3 +103,10 @@ def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
@registry.misc("spacy.CandidateGenerator.v1")
def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
return get_candidates
+
+
+@registry.misc("spacy.CandidateBatchGenerator.v1")
+def create_candidates_batch() -> Callable[
+ [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
+]:
+ return get_candidates_batch
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 73a90b268..62845287b 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -53,9 +53,11 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
"incl_context": True,
"entity_vector_length": 64,
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
+ "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
"overwrite": True,
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
"use_gold_ents": True,
+ "candidates_batch_size": 1,
"threshold": None,
},
default_score_weights={
@@ -75,9 +77,13 @@ def make_entity_linker(
incl_context: bool,
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
+ get_candidates_batch: Callable[
+ [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
+ ],
overwrite: bool,
scorer: Optional[Callable],
use_gold_ents: bool,
+ candidates_batch_size: int,
threshold: Optional[float] = None,
):
"""Construct an EntityLinker component.
@@ -90,17 +96,21 @@ def make_entity_linker(
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
incl_context (bool): Whether or not to include the local context in the model.
entity_vector_length (int): Size of encoding vectors in the KB.
- get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
+ get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
+ get_candidates_batch (
+ Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
+ ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
scorer (Optional[Callable]): The scoring method.
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
component must provide entity annotations.
+ candidates_batch_size (int): Size of batches for entity candidate generation.
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
prediction is discarded. If None, predictions are not filtered by any threshold.
"""
if not model.attrs.get("include_span_maker", False):
- # The only difference in arguments here is that use_gold_ents is not available
+ # The only difference in arguments here is that use_gold_ents and threshold aren't available.
return EntityLinker_v1(
nlp.vocab,
model,
@@ -124,9 +134,11 @@ def make_entity_linker(
incl_context=incl_context,
entity_vector_length=entity_vector_length,
get_candidates=get_candidates,
+ get_candidates_batch=get_candidates_batch,
overwrite=overwrite,
scorer=scorer,
use_gold_ents=use_gold_ents,
+ candidates_batch_size=candidates_batch_size,
threshold=threshold,
)
@@ -160,9 +172,13 @@ class EntityLinker(TrainablePipe):
incl_context: bool,
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
+ get_candidates_batch: Callable[
+ [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
+ ],
overwrite: bool = BACKWARD_OVERWRITE,
scorer: Optional[Callable] = entity_linker_score,
use_gold_ents: bool,
+ candidates_batch_size: int,
threshold: Optional[float] = None,
) -> None:
"""Initialize an entity linker.
@@ -178,10 +194,14 @@ class EntityLinker(TrainablePipe):
entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
- scorer (Optional[Callable]): The scoring method. Defaults to
- Scorer.score_links.
+ get_candidates_batch (
+ Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
+ Iterable[Candidate]]
+ ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
+ scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
component must provide entity annotations.
+ candidates_batch_size (int): Size of batches for entity candidate generation.
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the
threshold, prediction is discarded. If None, predictions are not filtered by any threshold.
DOCS: https://spacy.io/api/entitylinker#init
@@ -204,22 +224,27 @@ class EntityLinker(TrainablePipe):
self.incl_prior = incl_prior
self.incl_context = incl_context
self.get_candidates = get_candidates
+ self.get_candidates_batch = get_candidates_batch
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
self.distance = CosineDistance(normalize=False)
# how many neighbour sentences to take into account
- # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
+ # create an empty KB by default
self.kb = empty_kb(entity_vector_length)(self.vocab)
self.scorer = scorer
self.use_gold_ents = use_gold_ents
+ self.candidates_batch_size = candidates_batch_size
self.threshold = threshold
+ if candidates_batch_size < 1:
+ raise ValueError(Errors.E1044)
+
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
"""Define the KB of this pipe by providing a function that will
create it using this object's vocab."""
if not callable(kb_loader):
raise ValueError(Errors.E885.format(arg_type=type(kb_loader)))
- self.kb = kb_loader(self.vocab)
+ self.kb = kb_loader(self.vocab) # type: ignore
def validate_kb(self) -> None:
# Raise an error if the knowledge base is not initialized.
@@ -241,8 +266,8 @@ class EntityLinker(TrainablePipe):
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
nlp (Language): The current nlp object the component is part of.
- kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab instance.
- Note that providing this argument, will overwrite all data accumulated in the current KB.
+ kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab
+ instance. Note that providing this argument will overwrite all data accumulated in the current KB.
Use this only when loading a KB as-such from file.
DOCS: https://spacy.io/api/entitylinker#initialize
@@ -419,66 +444,93 @@ class EntityLinker(TrainablePipe):
if len(doc) == 0:
continue
sentences = [s for s in doc.sents]
- # Looping through each entity (TODO: rewrite)
- for ent in doc.ents:
- sent_index = sentences.index(ent.sent)
- assert sent_index >= 0
- if self.incl_context:
- # get n_neighbour sentences, clipped to the length of the document
- start_sentence = max(0, sent_index - self.n_sents)
- end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
- start_token = sentences[start_sentence].start
- end_token = sentences[end_sentence].end
- sent_doc = doc[start_token:end_token].as_doc()
- # currently, the context is the same for each entity in a sentence (should be refined)
- sentence_encoding = self.model.predict([sent_doc])[0]
- sentence_encoding_t = sentence_encoding.T
- sentence_norm = xp.linalg.norm(sentence_encoding_t)
- entity_count += 1
- if ent.label_ in self.labels_discard:
- # ignoring this entity - setting to NIL
- final_kb_ids.append(self.NIL)
- else:
- candidates = list(self.get_candidates(self.kb, ent))
- if not candidates:
- # no prediction possible for this entity - setting to NIL
- final_kb_ids.append(self.NIL)
- elif len(candidates) == 1 and self.threshold is None:
- # shortcut for efficiency reasons: take the 1 candidate
- final_kb_ids.append(candidates[0].entity_)
- else:
- random.shuffle(candidates)
- # set all prior probabilities to 0 if incl_prior=False
- prior_probs = xp.asarray([c.prior_prob for c in candidates])
- if not self.incl_prior:
- prior_probs = xp.asarray([0.0 for _ in candidates])
- scores = prior_probs
- # add in similarity from the context
- if self.incl_context:
- entity_encodings = xp.asarray(
- [c.entity_vector for c in candidates]
- )
- entity_norm = xp.linalg.norm(entity_encodings, axis=1)
- if len(entity_encodings) != len(prior_probs):
- raise RuntimeError(
- Errors.E147.format(
- method="predict",
- msg="vectors not of equal length",
- )
- )
- # cosine similarity
- sims = xp.dot(entity_encodings, sentence_encoding_t) / (
- sentence_norm * entity_norm
- )
- if sims.shape != prior_probs.shape:
- raise ValueError(Errors.E161)
- scores = prior_probs + sims - (prior_probs * sims)
- final_kb_ids.append(
- candidates[scores.argmax().item()].entity_
- if self.threshold is None or scores.max() >= self.threshold
- else EntityLinker.NIL
+ # Loop over entities in batches.
+ for ent_idx in range(0, len(doc.ents), self.candidates_batch_size):
+ ent_batch = doc.ents[ent_idx : ent_idx + self.candidates_batch_size]
+
+ # Look up candidate entities.
+ valid_ent_idx = [
+ idx
+ for idx in range(len(ent_batch))
+ if ent_batch[idx].label_ not in self.labels_discard
+ ]
+
+ batch_candidates = list(
+ self.get_candidates_batch(
+ self.kb, [ent_batch[idx] for idx in valid_ent_idx]
+ )
+ if self.candidates_batch_size > 1
+ else [
+ self.get_candidates(self.kb, ent_batch[idx])
+ for idx in valid_ent_idx
+ ]
+ )
+
+ # Looping through each entity in batch (TODO: rewrite)
+ for j, ent in enumerate(ent_batch):
+ sent_index = sentences.index(ent.sent)
+ assert sent_index >= 0
+
+ if self.incl_context:
+ # get n_neighbour sentences, clipped to the length of the document
+ start_sentence = max(0, sent_index - self.n_sents)
+ end_sentence = min(
+ len(sentences) - 1, sent_index + self.n_sents
)
+ start_token = sentences[start_sentence].start
+ end_token = sentences[end_sentence].end
+ sent_doc = doc[start_token:end_token].as_doc()
+ # currently, the context is the same for each entity in a sentence (should be refined)
+ sentence_encoding = self.model.predict([sent_doc])[0]
+ sentence_encoding_t = sentence_encoding.T
+ sentence_norm = xp.linalg.norm(sentence_encoding_t)
+ entity_count += 1
+ if ent.label_ in self.labels_discard:
+ # ignoring this entity - setting to NIL
+ final_kb_ids.append(self.NIL)
+ else:
+ candidates = list(batch_candidates[j])
+ if not candidates:
+ # no prediction possible for this entity - setting to NIL
+ final_kb_ids.append(self.NIL)
+ elif len(candidates) == 1 and self.threshold is None:
+ # shortcut for efficiency reasons: take the 1 candidate
+ final_kb_ids.append(candidates[0].entity_)
+ else:
+ random.shuffle(candidates)
+ # set all prior probabilities to 0 if incl_prior=False
+ prior_probs = xp.asarray([c.prior_prob for c in candidates])
+ if not self.incl_prior:
+ prior_probs = xp.asarray([0.0 for _ in candidates])
+ scores = prior_probs
+ # add in similarity from the context
+ if self.incl_context:
+ entity_encodings = xp.asarray(
+ [c.entity_vector for c in candidates]
+ )
+ entity_norm = xp.linalg.norm(entity_encodings, axis=1)
+ if len(entity_encodings) != len(prior_probs):
+ raise RuntimeError(
+ Errors.E147.format(
+ method="predict",
+ msg="vectors not of equal length",
+ )
+ )
+ # cosine similarity
+ sims = xp.dot(entity_encodings, sentence_encoding_t) / (
+ sentence_norm * entity_norm
+ )
+ if sims.shape != prior_probs.shape:
+ raise ValueError(Errors.E161)
+ scores = prior_probs + sims - (prior_probs * sims)
+ final_kb_ids.append(
+ candidates[scores.argmax().item()].entity_
+ if self.threshold is None
+ or scores.max() >= self.threshold
+ else EntityLinker.NIL
+ )
+
if not (len(final_kb_ids) == entity_count):
err = Errors.E147.format(
method="predict", msg="result variables not of equal length"
diff --git a/spacy/pipeline/legacy/entity_linker.py b/spacy/pipeline/legacy/entity_linker.py
index 2f8a1f8ea..c14dfa1db 100644
--- a/spacy/pipeline/legacy/entity_linker.py
+++ b/spacy/pipeline/legacy/entity_linker.py
@@ -68,8 +68,7 @@ class EntityLinker_v1(TrainablePipe):
entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
- scorer (Optional[Callable]): The scoring method. Defaults to
- Scorer.score_links.
+ scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
DOCS: https://spacy.io/api/entitylinker#init
"""
self.vocab = vocab
@@ -115,7 +114,7 @@ class EntityLinker_v1(TrainablePipe):
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
nlp (Language): The current nlp object the component is part of.
- kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab instance.
+ kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates an InMemoryLookupKB from a Vocab instance.
Note that providing this argument, will overwrite all data accumulated in the current KB.
Use this only when loading a KB as-such from file.
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 82bc976bb..4d683acc5 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -6,7 +6,7 @@ from numpy.testing import assert_equal
from spacy import registry, util
from spacy.attrs import ENT_KB_ID
from spacy.compat import pickle
-from spacy.kb import Candidate, KnowledgeBase, get_candidates
+from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
from spacy.lang.en import English
from spacy.ml import load_kb
from spacy.pipeline import EntityLinker
@@ -34,7 +34,7 @@ def assert_almost_equal(a, b):
def test_issue4674():
"""Test that setting entities with overlapping identifiers does not mess up IO"""
nlp = English()
- kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+ kb = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
vector1 = [0.9, 1.1, 1.01]
vector2 = [1.8, 2.25, 2.01]
with pytest.warns(UserWarning):
@@ -51,7 +51,7 @@ def test_issue4674():
dir_path.mkdir()
file_path = dir_path / "kb"
kb.to_disk(str(file_path))
- kb2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+ kb2 = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
kb2.from_disk(str(file_path))
assert kb2.get_size_entities() == 1
@@ -59,9 +59,9 @@ def test_issue4674():
@pytest.mark.issue(6730)
def test_issue6730(en_vocab):
"""Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
- from spacy.kb import KnowledgeBase
+ from spacy.kb.kb_in_memory import InMemoryLookupKB
- kb = KnowledgeBase(en_vocab, entity_vector_length=3)
+ kb = InMemoryLookupKB(en_vocab, entity_vector_length=3)
kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])
with pytest.raises(ValueError):
@@ -127,7 +127,7 @@ def test_issue7065_b():
def create_kb(vocab):
# create artificial KB
- mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+ mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7])
mykb.add_alias(
alias="No. 8",
@@ -190,7 +190,7 @@ def test_no_entities():
def create_kb(vocab):
# create artificial KB
- mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+ mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9])
return mykb
@@ -231,7 +231,7 @@ def test_partial_links():
def create_kb(vocab):
# create artificial KB
- mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+ mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9])
return mykb
@@ -263,7 +263,7 @@ def test_partial_links():
def test_kb_valid_entities(nlp):
"""Test the valid construction of a KB with 3 entities and two aliases"""
- mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+ mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
# adding entities
mykb.add_entity(entity="Q1", freq=19, entity_vector=[8, 4, 3])
@@ -292,7 +292,7 @@ def test_kb_valid_entities(nlp):
def test_kb_invalid_entities(nlp):
"""Test the invalid construction of a KB with an alias linked to a non-existing entity"""
- mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+ mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
# adding entities
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
@@ -308,7 +308,7 @@ def test_kb_invalid_entities(nlp):
def test_kb_invalid_probabilities(nlp):
"""Test the invalid construction of a KB with wrong prior probabilities"""
- mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+ mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
# adding entities
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
@@ -322,7 +322,7 @@ def test_kb_invalid_probabilities(nlp):
def test_kb_invalid_combination(nlp):
"""Test the invalid construction of a KB with non-matching entity and probability lists"""
- mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+ mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
# adding entities
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
@@ -338,7 +338,7 @@ def test_kb_invalid_combination(nlp):
def test_kb_invalid_entity_vector(nlp):
"""Test the invalid construction of a KB with non-matching entity vector lengths"""
- mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+ mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
# adding entities
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1, 2, 3])
@@ -376,7 +376,7 @@ def test_kb_initialize_empty(nlp):
def test_kb_serialize(nlp):
"""Test serialization of the KB"""
- mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+ mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
with make_tempdir() as d:
# normal read-write behaviour
mykb.to_disk(d / "kb")
@@ -393,12 +393,12 @@ def test_kb_serialize(nlp):
@pytest.mark.issue(9137)
def test_kb_serialize_2(nlp):
v = [5, 6, 7, 8]
- kb1 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
+ kb1 = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=4)
kb1.set_entities(["E1"], [1], [v])
assert kb1.get_vector("E1") == v
with make_tempdir() as d:
kb1.to_disk(d / "kb")
- kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
+ kb2 = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=4)
kb2.from_disk(d / "kb")
assert kb2.get_vector("E1") == v
@@ -408,7 +408,7 @@ def test_kb_set_entities(nlp):
v = [5, 6, 7, 8]
v1 = [1, 1, 1, 0]
v2 = [2, 2, 2, 3]
- kb1 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
+ kb1 = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=4)
kb1.set_entities(["E0"], [1], [v])
assert kb1.get_entity_strings() == ["E0"]
kb1.set_entities(["E1", "E2"], [1, 9], [v1, v2])
@@ -417,7 +417,7 @@ def test_kb_set_entities(nlp):
assert kb1.get_vector("E2") == v2
with make_tempdir() as d:
kb1.to_disk(d / "kb")
- kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
+ kb2 = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=4)
kb2.from_disk(d / "kb")
assert set(kb2.get_entity_strings()) == {"E1", "E2"}
assert kb2.get_vector("E1") == v1
@@ -428,7 +428,7 @@ def test_kb_serialize_vocab(nlp):
"""Test serialization of the KB and custom strings"""
entity = "MyFunnyID"
assert entity not in nlp.vocab.strings
- mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+ mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
assert not mykb.contains_entity(entity)
mykb.add_entity(entity, freq=342, entity_vector=[3])
assert mykb.contains_entity(entity)
@@ -436,14 +436,14 @@ def test_kb_serialize_vocab(nlp):
with make_tempdir() as d:
# normal read-write behaviour
mykb.to_disk(d / "kb")
- mykb_new = KnowledgeBase(Vocab(), entity_vector_length=1)
+ mykb_new = InMemoryLookupKB(Vocab(), entity_vector_length=1)
mykb_new.from_disk(d / "kb")
assert entity in mykb_new.vocab.strings
def test_candidate_generation(nlp):
"""Test correct candidate generation"""
- mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+ mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
doc = nlp("douglas adam Adam shrubbery")
douglas_ent = doc[0:1]
@@ -481,7 +481,7 @@ def test_el_pipe_configuration(nlp):
ruler.add_patterns([pattern])
def create_kb(vocab):
- kb = KnowledgeBase(vocab, entity_vector_length=1)
+ kb = InMemoryLookupKB(vocab, entity_vector_length=1)
kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
kb.add_entity(entity="Q3", freq=5, entity_vector=[3])
kb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1])
@@ -500,10 +500,21 @@ def test_el_pipe_configuration(nlp):
def get_lowercased_candidates(kb, span):
return kb.get_alias_candidates(span.text.lower())
+ def get_lowercased_candidates_batch(kb, spans):
+ return [get_lowercased_candidates(kb, span) for span in spans]
+
@registry.misc("spacy.LowercaseCandidateGenerator.v1")
- def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
+ def create_candidates() -> Callable[
+ [InMemoryLookupKB, "Span"], Iterable[Candidate]
+ ]:
return get_lowercased_candidates
+ @registry.misc("spacy.LowercaseCandidateBatchGenerator.v1")
+ def create_candidates_batch() -> Callable[
+ [InMemoryLookupKB, Iterable["Span"]], Iterable[Iterable[Candidate]]
+ ]:
+ return get_lowercased_candidates_batch
+
# replace the pipe with a new one with with a different candidate generator
entity_linker = nlp.replace_pipe(
"entity_linker",
@@ -511,6 +522,9 @@ def test_el_pipe_configuration(nlp):
config={
"incl_context": False,
"get_candidates": {"@misc": "spacy.LowercaseCandidateGenerator.v1"},
+ "get_candidates_batch": {
+ "@misc": "spacy.LowercaseCandidateBatchGenerator.v1"
+ },
},
)
entity_linker.set_kb(create_kb)
@@ -532,7 +546,7 @@ def test_nel_nsents(nlp):
def test_vocab_serialization(nlp):
"""Test that string information is retained across storage"""
- mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+ mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
# adding entities
mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
@@ -552,7 +566,7 @@ def test_vocab_serialization(nlp):
with make_tempdir() as d:
mykb.to_disk(d / "kb")
- kb_new_vocab = KnowledgeBase(Vocab(), entity_vector_length=1)
+ kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1)
kb_new_vocab.from_disk(d / "kb")
candidates = kb_new_vocab.get_alias_candidates("adam")
@@ -568,7 +582,7 @@ def test_vocab_serialization(nlp):
def test_append_alias(nlp):
"""Test that we can append additional alias-entity pairs"""
- mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+ mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
# adding entities
mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
@@ -599,7 +613,7 @@ def test_append_alias(nlp):
@pytest.mark.filterwarnings("ignore:\\[W036")
def test_append_invalid_alias(nlp):
"""Test that append an alias will throw an error if prior probs are exceeding 1"""
- mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+ mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
# adding entities
mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
@@ -621,7 +635,7 @@ def test_preserving_links_asdoc(nlp):
vector_length = 1
def create_kb(vocab):
- mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+ mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
# adding entities
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
mykb.add_entity(entity="Q2", freq=8, entity_vector=[1])
@@ -723,7 +737,7 @@ def test_overfitting_IO():
# create artificial KB - assign same prior weight to the two russ cochran's
# Q2146908 (Russ Cochran): American golfer
# Q7381115 (Russ Cochran): publisher
- mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+ mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
mykb.add_alias(
@@ -805,7 +819,7 @@ def test_kb_serialization():
kb_dir = tmp_dir / "kb"
nlp1 = English()
assert "Q2146908" not in nlp1.vocab.strings
- mykb = KnowledgeBase(nlp1.vocab, entity_vector_length=vector_length)
+ mykb = InMemoryLookupKB(nlp1.vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
mykb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
assert "Q2146908" in nlp1.vocab.strings
@@ -828,7 +842,7 @@ def test_kb_serialization():
def test_kb_pickle():
# Test that the KB can be pickled
nlp = English()
- kb_1 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+ kb_1 = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
kb_1.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
assert not kb_1.contains_alias("Russ Cochran")
kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
@@ -842,7 +856,7 @@ def test_kb_pickle():
def test_nel_pickle():
# Test that a pipeline with an EL component can be pickled
def create_kb(vocab):
- kb = KnowledgeBase(vocab, entity_vector_length=3)
+ kb = InMemoryLookupKB(vocab, entity_vector_length=3)
kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
return kb
@@ -864,7 +878,7 @@ def test_nel_pickle():
def test_kb_to_bytes():
# Test that the KB's to_bytes method works correctly
nlp = English()
- kb_1 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+ kb_1 = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
kb_1.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
kb_1.add_entity(entity="Q66", freq=9, entity_vector=[1, 2, 3])
kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
@@ -874,7 +888,7 @@ def test_kb_to_bytes():
)
assert kb_1.contains_alias("Russ Cochran")
kb_bytes = kb_1.to_bytes()
- kb_2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+ kb_2 = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
assert not kb_2.contains_alias("Russ Cochran")
kb_2 = kb_2.from_bytes(kb_bytes)
# check that both KBs are exactly the same
@@ -897,7 +911,7 @@ def test_kb_to_bytes():
def test_nel_to_bytes():
# Test that a pipeline with an EL component can be converted to bytes
def create_kb(vocab):
- kb = KnowledgeBase(vocab, entity_vector_length=3)
+ kb = InMemoryLookupKB(vocab, entity_vector_length=3)
kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
return kb
@@ -987,7 +1001,7 @@ def test_legacy_architectures(name, config):
train_examples.append(Example.from_dict(doc, annotation))
def create_kb(vocab):
- mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+ mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
mykb.add_alias(
@@ -1054,7 +1068,7 @@ def test_no_gold_ents(patterns):
def create_kb(vocab):
# create artificial KB
- mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+ mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
mykb.add_alias("Kirby", ["Q613241"], [0.9])
# Placeholder
@@ -1104,7 +1118,7 @@ def test_tokenization_mismatch():
def create_kb(vocab):
# create placeholder KB
- mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+ mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
mykb.add_alias("Kirby", ["Q613241"], [0.9])
return mykb
@@ -1121,6 +1135,12 @@ def test_tokenization_mismatch():
nlp.evaluate(train_examples)
+def test_abstract_kb_instantiation():
+ """Test whether instantiation of abstract KB base class fails."""
+ with pytest.raises(TypeError):
+ KnowledgeBase(None, 3)
+
+
# fmt: off
@pytest.mark.parametrize(
"meet_threshold,config",
@@ -1151,7 +1171,7 @@ def test_threshold(meet_threshold: bool, config: Dict[str, Any]):
def create_kb(vocab):
# create artificial KB
- mykb = KnowledgeBase(vocab, entity_vector_length=3)
+ mykb = InMemoryLookupKB(vocab, entity_vector_length=3)
mykb.add_entity(entity=entity_id, freq=12, entity_vector=[6, -4, 3])
mykb.add_alias(
alias="Mahler",
diff --git a/spacy/tests/serialize/test_resource_warning.py b/spacy/tests/serialize/test_resource_warning.py
index a00b2a688..38701c6d9 100644
--- a/spacy/tests/serialize/test_resource_warning.py
+++ b/spacy/tests/serialize/test_resource_warning.py
@@ -3,7 +3,7 @@ from unittest import TestCase
import pytest
import srsly
from numpy import zeros
-from spacy.kb import KnowledgeBase, Writer
+from spacy.kb.kb_in_memory import InMemoryLookupKB, Writer
from spacy.vectors import Vectors
from spacy.language import Language
from spacy.pipeline import TrainablePipe
@@ -71,7 +71,7 @@ def entity_linker():
nlp = Language()
def create_kb(vocab):
- kb = KnowledgeBase(vocab, entity_vector_length=1)
+ kb = InMemoryLookupKB(vocab, entity_vector_length=1)
kb.add_entity("test", 0.0, zeros((1, 1), dtype="f"))
return kb
@@ -120,7 +120,7 @@ def test_writer_with_path_py35():
def test_save_and_load_knowledge_base():
nlp = Language()
- kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+ kb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
with make_tempdir() as d:
path = d / "kb"
try:
@@ -129,7 +129,7 @@ def test_save_and_load_knowledge_base():
pytest.fail(str(e))
try:
- kb_loaded = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+ kb_loaded = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
kb_loaded.from_disk(path)
except Exception as e:
pytest.fail(str(e))
diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py
index 1e0ae3c76..8d3653ab1 100644
--- a/spacy/tests/serialize/test_serialize_kb.py
+++ b/spacy/tests/serialize/test_serialize_kb.py
@@ -2,7 +2,7 @@ from typing import Callable
from spacy import util
from spacy.util import ensure_path, registry, load_model_from_config
-from spacy.kb import KnowledgeBase
+from spacy.kb.kb_in_memory import InMemoryLookupKB
from spacy.vocab import Vocab
from thinc.api import Config
@@ -22,7 +22,7 @@ def test_serialize_kb_disk(en_vocab):
dir_path.mkdir()
file_path = dir_path / "kb"
kb1.to_disk(str(file_path))
- kb2 = KnowledgeBase(vocab=en_vocab, entity_vector_length=3)
+ kb2 = InMemoryLookupKB(vocab=en_vocab, entity_vector_length=3)
kb2.from_disk(str(file_path))
# final assertions
@@ -30,7 +30,7 @@ def test_serialize_kb_disk(en_vocab):
def _get_dummy_kb(vocab):
- kb = KnowledgeBase(vocab, entity_vector_length=3)
+ kb = InMemoryLookupKB(vocab, entity_vector_length=3)
kb.add_entity(entity="Q53", freq=33, entity_vector=[0, 5, 3])
kb.add_entity(entity="Q17", freq=2, entity_vector=[7, 1, 0])
kb.add_entity(entity="Q007", freq=7, entity_vector=[0, 0, 7])
@@ -104,7 +104,7 @@ def test_serialize_subclassed_kb():
custom_field = 666
"""
- class SubKnowledgeBase(KnowledgeBase):
+ class SubInMemoryLookupKB(InMemoryLookupKB):
def __init__(self, vocab, entity_vector_length, custom_field):
super().__init__(vocab, entity_vector_length)
self.custom_field = custom_field
@@ -112,9 +112,9 @@ def test_serialize_subclassed_kb():
@registry.misc("spacy.CustomKB.v1")
def custom_kb(
entity_vector_length: int, custom_field: int
- ) -> Callable[[Vocab], KnowledgeBase]:
+ ) -> Callable[[Vocab], InMemoryLookupKB]:
def custom_kb_factory(vocab):
- kb = SubKnowledgeBase(
+ kb = SubInMemoryLookupKB(
vocab=vocab,
entity_vector_length=entity_vector_length,
custom_field=custom_field,
@@ -129,7 +129,7 @@ def test_serialize_subclassed_kb():
nlp.initialize()
entity_linker = nlp.get_pipe("entity_linker")
- assert type(entity_linker.kb) == SubKnowledgeBase
+ assert type(entity_linker.kb) == SubInMemoryLookupKB
assert entity_linker.kb.entity_vector_length == 342
assert entity_linker.kb.custom_field == 666
@@ -139,6 +139,6 @@ def test_serialize_subclassed_kb():
nlp2 = util.load_model_from_path(tmp_dir)
entity_linker2 = nlp2.get_pipe("entity_linker")
# After IO, the KB is the standard one
- assert type(entity_linker2.kb) == KnowledgeBase
+ assert type(entity_linker2.kb) == InMemoryLookupKB
assert entity_linker2.kb.entity_vector_length == 342
assert not hasattr(entity_linker2.kb, "custom_field")
diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index 2537faff6..a3cb07b44 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -587,8 +587,8 @@ consists of either two or three subnetworks:
run once for each batch.
- **lower**: Construct a feature-specific vector for each `(token, feature)`
pair. This is also run once for each batch. Constructing the state
- representation is then a matter of summing the component features and
- applying the non-linearity.
+ representation is then a matter of summing the component features and applying
+ the non-linearity.
- **upper** (optional): A feed-forward network that predicts scores from the
state representation. If not present, the output from the lower model is used
as action scores directly.
@@ -628,8 +628,8 @@ same signature, but the `use_upper` argument was `True` by default.
> ```
Build a tagger model, using a provided token-to-vector component. The tagger
-model adds a linear layer with softmax activation to predict scores given
-the token vectors.
+model adds a linear layer with softmax activation to predict scores given the
+token vectors.
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------ |
@@ -919,6 +919,6 @@ A function that reads an existing `KnowledgeBase` from file.
A function that takes as input a [`KnowledgeBase`](/api/kb) and a
[`Span`](/api/span) object denoting a named entity, and returns a list of
-plausible [`Candidate`](/api/kb/#candidate) objects. The default
-`CandidateGenerator` uses the text of a mention to find its potential
-aliases in the `KnowledgeBase`. Note that this function is case-dependent.
+plausible [`Candidate`](/api/kb#candidate) objects. The default
+`CandidateGenerator` uses the text of a mention to find its potential aliases in
+the `KnowledgeBase`. Note that this function is case-dependent.
diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md
index 43e08a39c..40ec8afb5 100644
--- a/website/docs/api/entitylinker.md
+++ b/website/docs/api/entitylinker.md
@@ -14,7 +14,8 @@ entities) to unique identifiers, grounding the named entities into the "real
world". It requires a `KnowledgeBase`, as well as a function to generate
plausible candidates from that `KnowledgeBase` given a certain textual mention,
and a machine learning model to pick the right candidate, given the local
-context of the mention.
+context of the mention. `EntityLinker` defaults to using the
+[`InMemoryLookupKB`](/api/kb_in_memory) implementation.
## Assigned Attributes {#assigned-attributes}
@@ -170,7 +171,7 @@ with the current vocab.
>
> ```python
> def create_kb(vocab):
-> kb = KnowledgeBase(vocab, entity_vector_length=128)
+> kb = InMemoryLookupKB(vocab, entity_vector_length=128)
> kb.add_entity(...)
> kb.add_alias(...)
> return kb
diff --git a/website/docs/api/kb.md b/website/docs/api/kb.md
index e7a8fcd6f..b217a1678 100644
--- a/website/docs/api/kb.md
+++ b/website/docs/api/kb.md
@@ -4,27 +4,45 @@ teaser:
A storage class for entities and aliases of a specific knowledge base
(ontology)
tag: class
-source: spacy/kb.pyx
+source: spacy/kb/kb.pyx
new: 2.2
---
-The `KnowledgeBase` object provides a method to generate
-[`Candidate`](/api/kb/#candidate) objects, which are plausible external
+The `KnowledgeBase` object is an abstract class providing a method to generate
+[`Candidate`](/api/kb#candidate) objects, which are plausible external
identifiers given a certain textual mention. Each such `Candidate` holds
information from the relevant KB entities, such as its frequency in text and
possible aliases. Each entity in the knowledge base also has a pretrained entity
vector of a fixed size.
+Beyond that, `KnowledgeBase` classes have to implement a number of utility
+functions called by the [`EntityLinker`](/api/entitylinker) component.
+
+
+
+This class was not abstract up to spaCy version 3.5. The `KnowledgeBase`
+implementation up to that point is available as `InMemoryLookupKB` from 3.5
+onwards.
+
+
+
## KnowledgeBase.\_\_init\_\_ {#init tag="method"}
-Create the knowledge base.
+`KnowledgeBase` is an abstract class and cannot be instantiated. Its child
+classes should call `__init__()` to set up some necessary attributes.
> #### Example
>
> ```python
> from spacy.kb import KnowledgeBase
+> from spacy.vocab import Vocab
+>
+> class FullyImplementedKB(KnowledgeBase):
+> def __init__(self, vocab: Vocab, entity_vector_length: int):
+> super().__init__(vocab, entity_vector_length)
+> ...
> vocab = nlp.vocab
-> kb = KnowledgeBase(vocab=vocab, entity_vector_length=64)
+> kb = FullyImplementedKB(vocab=vocab, entity_vector_length=64)
> ```
| Name | Description |
@@ -40,133 +58,66 @@ The length of the fixed-size entity vectors in the knowledge base.
| ----------- | ------------------------------------------------ |
| **RETURNS** | Length of the fixed-size entity vectors. ~~int~~ |
-## KnowledgeBase.add_entity {#add_entity tag="method"}
+## KnowledgeBase.get_candidates {#get_candidates tag="method"}
-Add an entity to the knowledge base, specifying its corpus frequency and entity
-vector, which should be of length
-[`entity_vector_length`](/api/kb#entity_vector_length).
+Given a certain textual mention as input, retrieve a list of candidate entities
+of type [`Candidate`](/api/kb#candidate).
> #### Example
>
> ```python
-> kb.add_entity(entity="Q42", freq=32, entity_vector=vector1)
-> kb.add_entity(entity="Q463035", freq=111, entity_vector=vector2)
+> from spacy.lang.en import English
+> nlp = English()
+> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
+> candidates = kb.get_candidates(doc[0:2])
> ```
-| Name | Description |
-| --------------- | ---------------------------------------------------------- |
-| `entity` | The unique entity identifier. ~~str~~ |
-| `freq` | The frequency of the entity in a typical corpus. ~~float~~ |
-| `entity_vector` | The pretrained vector of the entity. ~~numpy.ndarray~~ |
+| Name | Description |
+| ----------- | -------------------------------------------------------------------- |
+| `mention` | The textual mention or alias. ~~Span~~ |
+| **RETURNS** | An iterable of relevant `Candidate` objects. ~~Iterable[Candidate]~~ |
-## KnowledgeBase.set_entities {#set_entities tag="method"}
+## KnowledgeBase.get_candidates_batch {#get_candidates_batch tag="method"}
-Define the full list of entities in the knowledge base, specifying the corpus
-frequency and entity vector for each entity.
+Same as [`get_candidates()`](/api/kb#get_candidates), but for an arbitrary
+number of mentions. The [`EntityLinker`](/api/entitylinker) component will call
+`get_candidates_batch()` instead of `get_candidates()`, if the config parameter
+`candidates_batch_size` is greater or equal than 1.
+
+The default implementation of `get_candidates_batch()` executes
+`get_candidates()` in a loop. We recommend implementing a more efficient way to
+retrieve candidates for multiple mentions at once, if performance is of concern
+to you.
> #### Example
>
> ```python
-> kb.set_entities(entity_list=["Q42", "Q463035"], freq_list=[32, 111], vector_list=[vector1, vector2])
+> from spacy.lang.en import English
+> nlp = English()
+> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
+> candidates = kb.get_candidates((doc[0:2], doc[3:]))
> ```
-| Name | Description |
-| ------------- | ---------------------------------------------------------------- |
-| `entity_list` | List of unique entity identifiers. ~~Iterable[Union[str, int]]~~ |
-| `freq_list` | List of entity frequencies. ~~Iterable[int]~~ |
-| `vector_list` | List of entity vectors. ~~Iterable[numpy.ndarray]~~ |
-
-## KnowledgeBase.add_alias {#add_alias tag="method"}
-
-Add an alias or mention to the knowledge base, specifying its potential KB
-identifiers and their prior probabilities. The entity identifiers should refer
-to entities previously added with [`add_entity`](/api/kb#add_entity) or
-[`set_entities`](/api/kb#set_entities). The sum of the prior probabilities
-should not exceed 1. Note that an empty string can not be used as alias.
-
-> #### Example
->
-> ```python
-> kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3])
-> ```
-
-| Name | Description |
-| --------------- | --------------------------------------------------------------------------------- |
-| `alias` | The textual mention or alias. Can not be the empty string. ~~str~~ |
-| `entities` | The potential entities that the alias may refer to. ~~Iterable[Union[str, int]]~~ |
-| `probabilities` | The prior probabilities of each entity. ~~Iterable[float]~~ |
-
-## KnowledgeBase.\_\_len\_\_ {#len tag="method"}
-
-Get the total number of entities in the knowledge base.
-
-> #### Example
->
-> ```python
-> total_entities = len(kb)
-> ```
-
-| Name | Description |
-| ----------- | ----------------------------------------------------- |
-| **RETURNS** | The number of entities in the knowledge base. ~~int~~ |
-
-## KnowledgeBase.get_entity_strings {#get_entity_strings tag="method"}
-
-Get a list of all entity IDs in the knowledge base.
-
-> #### Example
->
-> ```python
-> all_entities = kb.get_entity_strings()
-> ```
-
-| Name | Description |
-| ----------- | --------------------------------------------------------- |
-| **RETURNS** | The list of entities in the knowledge base. ~~List[str]~~ |
-
-## KnowledgeBase.get_size_aliases {#get_size_aliases tag="method"}
-
-Get the total number of aliases in the knowledge base.
-
-> #### Example
->
-> ```python
-> total_aliases = kb.get_size_aliases()
-> ```
-
-| Name | Description |
-| ----------- | ---------------------------------------------------- |
-| **RETURNS** | The number of aliases in the knowledge base. ~~int~~ |
-
-## KnowledgeBase.get_alias_strings {#get_alias_strings tag="method"}
-
-Get a list of all aliases in the knowledge base.
-
-> #### Example
->
-> ```python
-> all_aliases = kb.get_alias_strings()
-> ```
-
-| Name | Description |
-| ----------- | -------------------------------------------------------- |
-| **RETURNS** | The list of aliases in the knowledge base. ~~List[str]~~ |
+| Name | Description |
+| ----------- | -------------------------------------------------------------------------------------------- |
+| `mentions` | The textual mention or alias. ~~Iterable[Span]~~ |
+| **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
## KnowledgeBase.get_alias_candidates {#get_alias_candidates tag="method"}
-Given a certain textual mention as input, retrieve a list of candidate entities
-of type [`Candidate`](/api/kb/#candidate).
+
+This method is _not_ available from spaCy 3.5 onwards.
+
-> #### Example
->
-> ```python
-> candidates = kb.get_alias_candidates("Douglas")
-> ```
-
-| Name | Description |
-| ----------- | ------------------------------------------------------------- |
-| `alias` | The textual mention or alias. ~~str~~ |
-| **RETURNS** | The list of relevant `Candidate` objects. ~~List[Candidate]~~ |
+From spaCy 3.5 on `KnowledgeBase` is an abstract class (with
+[`InMemoryLookupKB`](/api/kb_in_memory) being a drop-in replacement) to allow
+more flexibility in customizing knowledge bases. Some of its methods were moved
+to [`InMemoryLookupKB`](/api/kb_in_memory) during this refactoring, one of those
+being `get_alias_candidates()`. This method is now available as
+[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
+Note: [`InMemoryLookupKB.get_candidates()`](/api/kb_in_memory#get_candidates)
+defaults to
+[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
## KnowledgeBase.get_vector {#get_vector tag="method"}
@@ -178,27 +129,30 @@ Given a certain entity ID, retrieve its pretrained entity vector.
> vector = kb.get_vector("Q42")
> ```
-| Name | Description |
-| ----------- | ------------------------------------ |
-| `entity` | The entity ID. ~~str~~ |
-| **RETURNS** | The entity vector. ~~numpy.ndarray~~ |
+| Name | Description |
+| ----------- | -------------------------------------- |
+| `entity` | The entity ID. ~~str~~ |
+| **RETURNS** | The entity vector. ~~Iterable[float]~~ |
-## KnowledgeBase.get_prior_prob {#get_prior_prob tag="method"}
+## KnowledgeBase.get_vectors {#get_vectors tag="method"}
-Given a certain entity ID and a certain textual mention, retrieve the prior
-probability of the fact that the mention links to the entity ID.
+Same as [`get_vector()`](/api/kb#get_vector), but for an arbitrary number of
+entity IDs.
+
+The default implementation of `get_vectors()` executes `get_vector()` in a loop.
+We recommend implementing a more efficient way to retrieve vectors for multiple
+entities at once, if performance is of concern to you.
> #### Example
>
> ```python
-> probability = kb.get_prior_prob("Q42", "Douglas")
+> vectors = kb.get_vectors(("Q42", "Q3107329"))
> ```
-| Name | Description |
-| ----------- | ------------------------------------------------------------------------- |
-| `entity` | The entity ID. ~~str~~ |
-| `alias` | The textual mention or alias. ~~str~~ |
-| **RETURNS** | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
+| Name | Description |
+| ----------- | --------------------------------------------------------- |
+| `entities` | The entity IDs. ~~Iterable[str]~~ |
+| **RETURNS** | The entity vectors. ~~Iterable[Iterable[numpy.ndarray]]~~ |
## KnowledgeBase.to_disk {#to_disk tag="method"}
@@ -207,12 +161,13 @@ Save the current state of the knowledge base to a directory.
> #### Example
>
> ```python
-> kb.to_disk(loc)
+> kb.to_disk(path)
> ```
-| Name | Description |
-| ----- | ------------------------------------------------------------------------------------------------------------------------------------------ |
-| `loc` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+| Name | Description |
+| --------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
+| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+| `exclude` | List of components to exclude. ~~Iterable[str]~~ |
## KnowledgeBase.from_disk {#from_disk tag="method"}
@@ -222,16 +177,16 @@ Restore the state of the knowledge base from a given directory. Note that the
> #### Example
>
> ```python
-> from spacy.kb import KnowledgeBase
> from spacy.vocab import Vocab
> vocab = Vocab().from_disk("/path/to/vocab")
-> kb = KnowledgeBase(vocab=vocab, entity_vector_length=64)
+> kb = FullyImplementedKB(vocab=vocab, entity_vector_length=64)
> kb.from_disk("/path/to/kb")
> ```
| Name | Description |
| ----------- | ----------------------------------------------------------------------------------------------- |
| `loc` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+| `exclude` | List of components to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~ |
## Candidate {#candidate tag="class"}
diff --git a/website/docs/api/kb_in_memory.md b/website/docs/api/kb_in_memory.md
new file mode 100644
index 000000000..c9ce624f0
--- /dev/null
+++ b/website/docs/api/kb_in_memory.md
@@ -0,0 +1,302 @@
+---
+title: InMemoryLookupKB
+teaser:
+ The default implementation of the KnowledgeBase interface. Stores all
+ information in-memory.
+tag: class
+source: spacy/kb/kb_in_memory.pyx
+new: 3.5
+---
+
+The `InMemoryLookupKB` class inherits from [`KnowledgeBase`](/api/kb) and
+implements all of its methods. It stores all KB data in-memory and generates
+[`Candidate`](/api/kb#candidate) objects by exactly matching mentions with
+entity names. It's highly optimized for both a low memory footprint and speed of
+retrieval.
+
+## InMemoryLookupKB.\_\_init\_\_ {#init tag="method"}
+
+Create the knowledge base.
+
+> #### Example
+>
+> ```python
+> from spacy.kb import KnowledgeBase
+> vocab = nlp.vocab
+> kb = KnowledgeBase(vocab=vocab, entity_vector_length=64)
+> ```
+
+| Name | Description |
+| ---------------------- | ------------------------------------------------ |
+| `vocab` | The shared vocabulary. ~~Vocab~~ |
+| `entity_vector_length` | Length of the fixed-size entity vectors. ~~int~~ |
+
+## InMemoryLookupKB.entity_vector_length {#entity_vector_length tag="property"}
+
+The length of the fixed-size entity vectors in the knowledge base.
+
+| Name | Description |
+| ----------- | ------------------------------------------------ |
+| **RETURNS** | Length of the fixed-size entity vectors. ~~int~~ |
+
+## InMemoryLookupKB.add_entity {#add_entity tag="method"}
+
+Add an entity to the knowledge base, specifying its corpus frequency and entity
+vector, which should be of length
+[`entity_vector_length`](/api/kb_in_memory#entity_vector_length).
+
+> #### Example
+>
+> ```python
+> kb.add_entity(entity="Q42", freq=32, entity_vector=vector1)
+> kb.add_entity(entity="Q463035", freq=111, entity_vector=vector2)
+> ```
+
+| Name | Description |
+| --------------- | ---------------------------------------------------------- |
+| `entity` | The unique entity identifier. ~~str~~ |
+| `freq` | The frequency of the entity in a typical corpus. ~~float~~ |
+| `entity_vector` | The pretrained vector of the entity. ~~numpy.ndarray~~ |
+
+## InMemoryLookupKB.set_entities {#set_entities tag="method"}
+
+Define the full list of entities in the knowledge base, specifying the corpus
+frequency and entity vector for each entity.
+
+> #### Example
+>
+> ```python
+> kb.set_entities(entity_list=["Q42", "Q463035"], freq_list=[32, 111], vector_list=[vector1, vector2])
+> ```
+
+| Name | Description |
+| ------------- | ---------------------------------------------------------------- |
+| `entity_list` | List of unique entity identifiers. ~~Iterable[Union[str, int]]~~ |
+| `freq_list` | List of entity frequencies. ~~Iterable[int]~~ |
+| `vector_list` | List of entity vectors. ~~Iterable[numpy.ndarray]~~ |
+
+## InMemoryLookupKB.add_alias {#add_alias tag="method"}
+
+Add an alias or mention to the knowledge base, specifying its potential KB
+identifiers and their prior probabilities. The entity identifiers should refer
+to entities previously added with [`add_entity`](/api/kb_in_memory#add_entity)
+or [`set_entities`](/api/kb_in_memory#set_entities). The sum of the prior
+probabilities should not exceed 1. Note that an empty string can not be used as
+alias.
+
+> #### Example
+>
+> ```python
+> kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3])
+> ```
+
+| Name | Description |
+| --------------- | --------------------------------------------------------------------------------- |
+| `alias` | The textual mention or alias. Can not be the empty string. ~~str~~ |
+| `entities` | The potential entities that the alias may refer to. ~~Iterable[Union[str, int]]~~ |
+| `probabilities` | The prior probabilities of each entity. ~~Iterable[float]~~ |
+
+## InMemoryLookupKB.\_\_len\_\_ {#len tag="method"}
+
+Get the total number of entities in the knowledge base.
+
+> #### Example
+>
+> ```python
+> total_entities = len(kb)
+> ```
+
+| Name | Description |
+| ----------- | ----------------------------------------------------- |
+| **RETURNS** | The number of entities in the knowledge base. ~~int~~ |
+
+## InMemoryLookupKB.get_entity_strings {#get_entity_strings tag="method"}
+
+Get a list of all entity IDs in the knowledge base.
+
+> #### Example
+>
+> ```python
+> all_entities = kb.get_entity_strings()
+> ```
+
+| Name | Description |
+| ----------- | --------------------------------------------------------- |
+| **RETURNS** | The list of entities in the knowledge base. ~~List[str]~~ |
+
+## InMemoryLookupKB.get_size_aliases {#get_size_aliases tag="method"}
+
+Get the total number of aliases in the knowledge base.
+
+> #### Example
+>
+> ```python
+> total_aliases = kb.get_size_aliases()
+> ```
+
+| Name | Description |
+| ----------- | ---------------------------------------------------- |
+| **RETURNS** | The number of aliases in the knowledge base. ~~int~~ |
+
+## InMemoryLookupKB.get_alias_strings {#get_alias_strings tag="method"}
+
+Get a list of all aliases in the knowledge base.
+
+> #### Example
+>
+> ```python
+> all_aliases = kb.get_alias_strings()
+> ```
+
+| Name | Description |
+| ----------- | -------------------------------------------------------- |
+| **RETURNS** | The list of aliases in the knowledge base. ~~List[str]~~ |
+
+## InMemoryLookupKB.get_candidates {#get_candidates tag="method"}
+
+Given a certain textual mention as input, retrieve a list of candidate entities
+of type [`Candidate`](/api/kb#candidate). Wraps
+[`get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
+
+> #### Example
+>
+> ```python
+> from spacy.lang.en import English
+> nlp = English()
+> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
+> candidates = kb.get_candidates(doc[0:2])
+> ```
+
+| Name | Description |
+| ----------- | -------------------------------------------------------------------- |
+| `mention` | The textual mention or alias. ~~Span~~ |
+| **RETURNS** | An iterable of relevant `Candidate` objects. ~~Iterable[Candidate]~~ |
+
+## InMemoryLookupKB.get_candidates_batch {#get_candidates_batch tag="method"}
+
+Same as [`get_candidates()`](/api/kb_in_memory#get_candidates), but for an
+arbitrary number of mentions. The [`EntityLinker`](/api/entitylinker) component
+will call `get_candidates_batch()` instead of `get_candidates()`, if the config
+parameter `candidates_batch_size` is greater or equal than 1.
+
+The default implementation of `get_candidates_batch()` executes
+`get_candidates()` in a loop. We recommend implementing a more efficient way to
+retrieve candidates for multiple mentions at once, if performance is of concern
+to you.
+
+> #### Example
+>
+> ```python
+> from spacy.lang.en import English
+> nlp = English()
+> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
+> candidates = kb.get_candidates((doc[0:2], doc[3:]))
+> ```
+
+| Name | Description |
+| ----------- | -------------------------------------------------------------------------------------------- |
+| `mentions` | The textual mention or alias. ~~Iterable[Span]~~ |
+| **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
+
+## InMemoryLookupKB.get_alias_candidates {#get_alias_candidates tag="method"}
+
+Given a certain textual mention as input, retrieve a list of candidate entities
+of type [`Candidate`](/api/kb#candidate).
+
+> #### Example
+>
+> ```python
+> candidates = kb.get_alias_candidates("Douglas")
+> ```
+
+| Name | Description |
+| ----------- | ------------------------------------------------------------- |
+| `alias` | The textual mention or alias. ~~str~~ |
+| **RETURNS** | The list of relevant `Candidate` objects. ~~List[Candidate]~~ |
+
+## InMemoryLookupKB.get_vector {#get_vector tag="method"}
+
+Given a certain entity ID, retrieve its pretrained entity vector.
+
+> #### Example
+>
+> ```python
+> vector = kb.get_vector("Q42")
+> ```
+
+| Name | Description |
+| ----------- | ------------------------------------ |
+| `entity` | The entity ID. ~~str~~ |
+| **RETURNS** | The entity vector. ~~numpy.ndarray~~ |
+
+## InMemoryLookupKB.get_vectors {#get_vectors tag="method"}
+
+Same as [`get_vector()`](/api/kb_in_memory#get_vector), but for an arbitrary
+number of entity IDs.
+
+The default implementation of `get_vectors()` executes `get_vector()` in a loop.
+We recommend implementing a more efficient way to retrieve vectors for multiple
+entities at once, if performance is of concern to you.
+
+> #### Example
+>
+> ```python
+> vectors = kb.get_vectors(("Q42", "Q3107329"))
+> ```
+
+| Name | Description |
+| ----------- | --------------------------------------------------------- |
+| `entities` | The entity IDs. ~~Iterable[str]~~ |
+| **RETURNS** | The entity vectors. ~~Iterable[Iterable[numpy.ndarray]]~~ |
+
+## InMemoryLookupKB.get_prior_prob {#get_prior_prob tag="method"}
+
+Given a certain entity ID and a certain textual mention, retrieve the prior
+probability of the fact that the mention links to the entity ID.
+
+> #### Example
+>
+> ```python
+> probability = kb.get_prior_prob("Q42", "Douglas")
+> ```
+
+| Name | Description |
+| ----------- | ------------------------------------------------------------------------- |
+| `entity` | The entity ID. ~~str~~ |
+| `alias` | The textual mention or alias. ~~str~~ |
+| **RETURNS** | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
+
+## InMemoryLookupKB.to_disk {#to_disk tag="method"}
+
+Save the current state of the knowledge base to a directory.
+
+> #### Example
+>
+> ```python
+> kb.to_disk(path)
+> ```
+
+| Name | Description |
+| --------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
+| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+| `exclude` | List of components to exclude. ~~Iterable[str]~~ |
+
+## InMemoryLookupKB.from_disk {#from_disk tag="method"}
+
+Restore the state of the knowledge base from a given directory. Note that the
+[`Vocab`](/api/vocab) should also be the same as the one used to create the KB.
+
+> #### Example
+>
+> ```python
+> from spacy.vocab import Vocab
+> vocab = Vocab().from_disk("/path/to/vocab")
+> kb = FullyImplementedKB(vocab=vocab, entity_vector_length=64)
+> kb.from_disk("/path/to/kb")
+> ```
+
+| Name | Description |
+| ----------- | ----------------------------------------------------------------------------------------------- |
+| `loc` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+| `exclude` | List of components to exclude. ~~Iterable[str]~~ |
+| **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~ |
diff --git a/website/docs/usage/101/_architecture.md b/website/docs/usage/101/_architecture.md
index 22e2b961e..4ebca2756 100644
--- a/website/docs/usage/101/_architecture.md
+++ b/website/docs/usage/101/_architecture.md
@@ -78,7 +78,9 @@ operates on a `Doc` and gives you access to the matched tokens **in context**.
| Name | Description |
| ------------------------------------------------ | -------------------------------------------------------------------------------------------------- |
| [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. |
-| [`KnowledgeBase`](/api/kb) | Storage for entities and aliases of a knowledge base for entity linking. |
+| [`KnowledgeBase`](/api/kb) | Abstract base class for storage and retrieval of data for entity linking. |
+| [`InMemoryLookupKB`](/api/kb_in_memory) | Implementation of `KnowledgeBase` storing all data in memory. |
+| [`Candidate`](/api/kb#candidate) | Object associating a textual mention with a specific entity contained in a `KnowledgeBase`. |
| [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. |
| [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis. |
| [`Morphology`](/api/morphology) | Store morphological analyses and map them to and from hash values. |