mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Refactor KB for easier customization (#11268)
* Add implementation of batching + backwards compatibility fixes. Tests indicate issue with batch disambiguation for custom singular entity lookups. * Fix tests. Add distinction w.r.t. batch size. * Remove redundant and add new comments. * Adjust comments. Fix variable naming in EL prediction. * Fix mypy errors. * Remove KB entity type config option. Change return types of candidate retrieval functions to Iterable from Iterator. Fix various other issues. * Update spacy/pipeline/entity_linker.py Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Update spacy/pipeline/entity_linker.py Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Update spacy/kb_base.pyx Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Update spacy/kb_base.pyx Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Update spacy/pipeline/entity_linker.py Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Add error messages to NotImplementedErrors. Remove redundant comment. * Fix imports. * Remove redundant comments. * Rename KnowledgeBase to InMemoryLookupKB and BaseKnowledgeBase to KnowledgeBase. * Fix tests. * Update spacy/errors.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Move KB into subdirectory. * Adjust imports after KB move to dedicated subdirectory. * Fix config imports. * Move Candidate + retrieval functions to separate module. Fix other, small issues. * Fix docstrings and error message w.r.t. class names. Fix typing for candidate retrieval functions. * Update spacy/kb/kb_in_memory.pyx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/ml/models/entity_linker.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Fix typing. * Change typing of mentions to be Span instead of Union[Span, str]. * Update docs. * Update EntityLinker and _architecture docs. 
* Update website/docs/api/entitylinker.md Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Adjust message for E1046. * Re-add section for Candidate in kb.md, add reference to dedicated page. * Update docs and docstrings. * Re-add section + reference for KnowledgeBase.get_alias_candidates() in docs. * Update spacy/kb/candidate.pyx * Update spacy/kb/kb_in_memory.pyx * Update spacy/pipeline/legacy/entity_linker.py * Remove candidate.md. Remove mistakenly added config snippet in entity_linker.py. Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
parent
f292569b1a
commit
1f23c615d7
4
setup.py
4
setup.py
|
@ -30,7 +30,9 @@ MOD_NAMES = [
|
||||||
"spacy.lexeme",
|
"spacy.lexeme",
|
||||||
"spacy.vocab",
|
"spacy.vocab",
|
||||||
"spacy.attrs",
|
"spacy.attrs",
|
||||||
"spacy.kb",
|
"spacy.kb.candidate",
|
||||||
|
"spacy.kb.kb",
|
||||||
|
"spacy.kb.kb_in_memory",
|
||||||
"spacy.ml.parser_model",
|
"spacy.ml.parser_model",
|
||||||
"spacy.morphology",
|
"spacy.morphology",
|
||||||
"spacy.pipeline.dep_parser",
|
"spacy.pipeline.dep_parser",
|
||||||
|
|
|
@ -709,9 +709,9 @@ class Errors(metaclass=ErrorsWithCodes):
|
||||||
"`nlp.enable_pipe` instead.")
|
"`nlp.enable_pipe` instead.")
|
||||||
E927 = ("Can't write to frozen list Maybe you're trying to modify a computed "
|
E927 = ("Can't write to frozen list Maybe you're trying to modify a computed "
|
||||||
"property or default function argument?")
|
"property or default function argument?")
|
||||||
E928 = ("A KnowledgeBase can only be serialized to/from from a directory, "
|
E928 = ("An InMemoryLookupKB can only be serialized to/from from a directory, "
|
||||||
"but the provided argument {loc} points to a file.")
|
"but the provided argument {loc} points to a file.")
|
||||||
E929 = ("Couldn't read KnowledgeBase from {loc}. The path does not seem to exist.")
|
E929 = ("Couldn't read InMemoryLookupKB from {loc}. The path does not seem to exist.")
|
||||||
E930 = ("Received invalid get_examples callback in `{method}`. "
|
E930 = ("Received invalid get_examples callback in `{method}`. "
|
||||||
"Expected function that returns an iterable of Example objects but "
|
"Expected function that returns an iterable of Example objects but "
|
||||||
"got: {obj}")
|
"got: {obj}")
|
||||||
|
@ -941,6 +941,12 @@ class Errors(metaclass=ErrorsWithCodes):
|
||||||
"`{arg2}`={arg2_values} but these arguments are conflicting.")
|
"`{arg2}`={arg2_values} but these arguments are conflicting.")
|
||||||
E1043 = ("Expected None or a value in range [{range_start}, {range_end}] for entity linker threshold, but got "
|
E1043 = ("Expected None or a value in range [{range_start}, {range_end}] for entity linker threshold, but got "
|
||||||
"{value}.")
|
"{value}.")
|
||||||
|
E1044 = ("Expected `candidates_batch_size` to be >= 1, but got: {value}")
|
||||||
|
E1045 = ("Encountered {parent} subclass without `{parent}.{method}` "
|
||||||
|
"method in '{name}'. If you want to use this method, make "
|
||||||
|
"sure it's overwritten on the subclass.")
|
||||||
|
E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
|
||||||
|
"knowledge base, use `InMemoryLookupKB`.")
|
||||||
|
|
||||||
|
|
||||||
# Deprecated model shortcuts, only used in errors and warnings
|
# Deprecated model shortcuts, only used in errors and warnings
|
||||||
|
|
3
spacy/kb/__init__.py
Normal file
3
spacy/kb/__init__.py
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
from .kb import KnowledgeBase
|
||||||
|
from .kb_in_memory import InMemoryLookupKB
|
||||||
|
from .candidate import Candidate, get_candidates, get_candidates_batch
|
12
spacy/kb/candidate.pxd
Normal file
12
spacy/kb/candidate.pxd
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
from .kb cimport KnowledgeBase
|
||||||
|
from libcpp.vector cimport vector
|
||||||
|
from ..typedefs cimport hash_t
|
||||||
|
|
||||||
|
# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
|
||||||
|
cdef class Candidate:
|
||||||
|
cdef readonly KnowledgeBase kb
|
||||||
|
cdef hash_t entity_hash
|
||||||
|
cdef float entity_freq
|
||||||
|
cdef vector[float] entity_vector
|
||||||
|
cdef hash_t alias_hash
|
||||||
|
cdef float prior_prob
|
74
spacy/kb/candidate.pyx
Normal file
74
spacy/kb/candidate.pyx
Normal file
|
@ -0,0 +1,74 @@
|
||||||
|
# cython: infer_types=True, profile=True
|
||||||
|
|
||||||
|
from typing import Iterable
|
||||||
|
from .kb cimport KnowledgeBase
|
||||||
|
from ..tokens import Span
|
||||||
|
|
||||||
|
cdef class Candidate:
|
||||||
|
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
|
||||||
|
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
|
||||||
|
algorithm which will disambiguate the various candidates to the correct one.
|
||||||
|
Each candidate (alias, entity) pair is assigned a certain prior probability.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/kb/#candidate-init
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
|
||||||
|
self.kb = kb
|
||||||
|
self.entity_hash = entity_hash
|
||||||
|
self.entity_freq = entity_freq
|
||||||
|
self.entity_vector = entity_vector
|
||||||
|
self.alias_hash = alias_hash
|
||||||
|
self.prior_prob = prior_prob
|
||||||
|
|
||||||
|
@property
|
||||||
|
def entity(self) -> int:
|
||||||
|
"""RETURNS (uint64): hash of the entity's KB ID/name"""
|
||||||
|
return self.entity_hash
|
||||||
|
|
||||||
|
@property
|
||||||
|
def entity_(self) -> str:
|
||||||
|
"""RETURNS (str): ID/name of this entity in the KB"""
|
||||||
|
return self.kb.vocab.strings[self.entity_hash]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def alias(self) -> int:
|
||||||
|
"""RETURNS (uint64): hash of the alias"""
|
||||||
|
return self.alias_hash
|
||||||
|
|
||||||
|
@property
|
||||||
|
def alias_(self) -> str:
|
||||||
|
"""RETURNS (str): ID of the original alias"""
|
||||||
|
return self.kb.vocab.strings[self.alias_hash]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def entity_freq(self) -> float:
|
||||||
|
return self.entity_freq
|
||||||
|
|
||||||
|
@property
|
||||||
|
def entity_vector(self) -> Iterable[float]:
|
||||||
|
return self.entity_vector
|
||||||
|
|
||||||
|
@property
|
||||||
|
def prior_prob(self) -> float:
|
||||||
|
return self.prior_prob
|
||||||
|
|
||||||
|
|
||||||
|
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
|
||||||
|
"""
|
||||||
|
Return candidate entities for a given mention by fetching appropriate entries from the index.
|
||||||
|
kb (KnowledgeBase): Knowledge base to query.
|
||||||
|
mention (Span): Entity mention for which to identify candidates.
|
||||||
|
RETURNS (Iterable[Candidate]): Identified candidates.
|
||||||
|
"""
|
||||||
|
return kb.get_candidates(mention)
|
||||||
|
|
||||||
|
|
||||||
|
def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
|
||||||
|
"""
|
||||||
|
Return candidate entities for the given mentions by fetching appropriate entries from the index.
|
||||||
|
kb (KnowledgeBase): Knowledge base to query.
|
||||||
|
mentions (Iterable[Span]): Entity mentions for which to identify candidates.
|
||||||
|
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
||||||
|
"""
|
||||||
|
return kb.get_candidates_batch(mentions)
|
10
spacy/kb/kb.pxd
Normal file
10
spacy/kb/kb.pxd
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
"""Knowledge-base for entity or concept linking."""
|
||||||
|
|
||||||
|
from cymem.cymem cimport Pool
|
||||||
|
from libc.stdint cimport int64_t
|
||||||
|
from ..vocab cimport Vocab
|
||||||
|
|
||||||
|
cdef class KnowledgeBase:
|
||||||
|
cdef Pool mem
|
||||||
|
cdef readonly Vocab vocab
|
||||||
|
cdef readonly int64_t entity_vector_length
|
108
spacy/kb/kb.pyx
Normal file
108
spacy/kb/kb.pyx
Normal file
|
@ -0,0 +1,108 @@
|
||||||
|
# cython: infer_types=True, profile=True
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterable, Tuple, Union
|
||||||
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
|
from .candidate import Candidate
|
||||||
|
from ..tokens import Span
|
||||||
|
from ..util import SimpleFrozenList
|
||||||
|
from ..errors import Errors
|
||||||
|
|
||||||
|
|
||||||
|
cdef class KnowledgeBase:
|
||||||
|
"""A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
|
||||||
|
to support entity linking of named entities to real-world concepts.
|
||||||
|
This is an abstract class and requires its operations to be implemented.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/kb
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, vocab: Vocab, entity_vector_length: int):
|
||||||
|
"""Create a KnowledgeBase."""
|
||||||
|
# Make sure abstract KB is not instantiated.
|
||||||
|
if self.__class__ == KnowledgeBase:
|
||||||
|
raise TypeError(
|
||||||
|
Errors.E1046.format(cls_name=self.__class__.__name__)
|
||||||
|
)
|
||||||
|
|
||||||
|
self.vocab = vocab
|
||||||
|
self.entity_vector_length = entity_vector_length
|
||||||
|
self.mem = Pool()
|
||||||
|
|
||||||
|
def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
|
||||||
|
"""
|
||||||
|
Return candidate entities for specified texts. Each candidate defines the entity, the original alias,
|
||||||
|
and the prior probability of that alias resolving to that entity.
|
||||||
|
If no candidate is found for a given text, an empty list is returned.
|
||||||
|
mentions (Iterable[Span]): Mentions for which to get candidates.
|
||||||
|
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
||||||
|
"""
|
||||||
|
return [self.get_candidates(span) for span in mentions]
|
||||||
|
|
||||||
|
def get_candidates(self, mention: Span) -> Iterable[Candidate]:
|
||||||
|
"""
|
||||||
|
Return candidate entities for specified text. Each candidate defines the entity, the original alias,
|
||||||
|
and the prior probability of that alias resolving to that entity.
|
||||||
|
If no candidate is found for a given text, an empty list is returned.
|
||||||
|
mention (Span): Mention for which to get candidates.
|
||||||
|
RETURNS (Iterable[Candidate]): Identified candidates.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError(
|
||||||
|
Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
|
||||||
|
)
|
||||||
|
|
||||||
|
def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
|
||||||
|
"""
|
||||||
|
Return vectors for entities.
|
||||||
|
entities (Iterable[str]): Entity names/IDs.
|
||||||
|
RETURNS (Iterable[Iterable[float]]): Vectors for specified entities.
|
||||||
|
"""
|
||||||
|
return [self.get_vector(entity) for entity in entities]
|
||||||
|
|
||||||
|
def get_vector(self, str entity) -> Iterable[float]:
|
||||||
|
"""
|
||||||
|
Return vector for entity.
|
||||||
|
entity (str): Entity name/ID.
|
||||||
|
RETURNS (Iterable[float]): Vector for specified entity.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError(
|
||||||
|
Errors.E1045.format(parent="KnowledgeBase", method="get_vector", name=self.__name__)
|
||||||
|
)
|
||||||
|
|
||||||
|
def to_bytes(self, **kwargs) -> bytes:
|
||||||
|
"""Serialize the current state to a binary string.
|
||||||
|
RETURNS (bytes): Current state as binary string.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError(
|
||||||
|
Errors.E1045.format(parent="KnowledgeBase", method="to_bytes", name=self.__name__)
|
||||||
|
)
|
||||||
|
|
||||||
|
def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()):
|
||||||
|
"""Load state from a binary string.
|
||||||
|
bytes_data (bytes): KB state.
|
||||||
|
exclude (Tuple[str]): Properties to exclude when restoring KB.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError(
|
||||||
|
Errors.E1045.format(parent="KnowledgeBase", method="from_bytes", name=self.__name__)
|
||||||
|
)
|
||||||
|
|
||||||
|
def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
|
||||||
|
"""
|
||||||
|
Write KnowledgeBase content to disk.
|
||||||
|
path (Union[str, Path]): Target file path.
|
||||||
|
exclude (Iterable[str]): List of components to exclude.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError(
|
||||||
|
Errors.E1045.format(parent="KnowledgeBase", method="to_disk", name=self.__name__)
|
||||||
|
)
|
||||||
|
|
||||||
|
def from_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
|
||||||
|
"""
|
||||||
|
Load KnowledgeBase content from disk.
|
||||||
|
path (Union[str, Path]): Target file path.
|
||||||
|
exclude (Iterable[str]): List of components to exclude.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError(
|
||||||
|
Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
|
||||||
|
)
|
|
@ -1,14 +1,12 @@
|
||||||
"""Knowledge-base for entity or concept linking."""
|
"""Knowledge-base for entity or concept linking."""
|
||||||
from cymem.cymem cimport Pool
|
|
||||||
from preshed.maps cimport PreshMap
|
from preshed.maps cimport PreshMap
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
from libc.stdint cimport int32_t, int64_t
|
from libc.stdint cimport int32_t, int64_t
|
||||||
from libc.stdio cimport FILE
|
from libc.stdio cimport FILE
|
||||||
|
|
||||||
from .vocab cimport Vocab
|
from ..typedefs cimport hash_t
|
||||||
from .typedefs cimport hash_t
|
from ..structs cimport KBEntryC, AliasC
|
||||||
from .structs cimport KBEntryC, AliasC
|
from .kb cimport KnowledgeBase
|
||||||
|
|
||||||
|
|
||||||
ctypedef vector[KBEntryC] entry_vec
|
ctypedef vector[KBEntryC] entry_vec
|
||||||
ctypedef vector[AliasC] alias_vec
|
ctypedef vector[AliasC] alias_vec
|
||||||
|
@ -16,21 +14,7 @@ ctypedef vector[float] float_vec
|
||||||
ctypedef vector[float_vec] float_matrix
|
ctypedef vector[float_vec] float_matrix
|
||||||
|
|
||||||
|
|
||||||
# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
|
cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
cdef class Candidate:
|
|
||||||
cdef readonly KnowledgeBase kb
|
|
||||||
cdef hash_t entity_hash
|
|
||||||
cdef float entity_freq
|
|
||||||
cdef vector[float] entity_vector
|
|
||||||
cdef hash_t alias_hash
|
|
||||||
cdef float prior_prob
|
|
||||||
|
|
||||||
|
|
||||||
cdef class KnowledgeBase:
|
|
||||||
cdef Pool mem
|
|
||||||
cdef readonly Vocab vocab
|
|
||||||
cdef int64_t entity_vector_length
|
|
||||||
|
|
||||||
# This maps 64bit keys (hash of unique entity string)
|
# This maps 64bit keys (hash of unique entity string)
|
||||||
# to 64bit values (position of the _KBEntryC struct in the _entries vector).
|
# to 64bit values (position of the _KBEntryC struct in the _entries vector).
|
||||||
# The PreshMap is pretty space efficient, as it uses open addressing. So
|
# The PreshMap is pretty space efficient, as it uses open addressing. So
|
|
@ -1,8 +1,7 @@
|
||||||
# cython: infer_types=True, profile=True
|
# cython: infer_types=True, profile=True
|
||||||
from typing import Iterator, Iterable, Callable, Dict, Any
|
from typing import Iterable, Callable, Dict, Any, Union
|
||||||
|
|
||||||
import srsly
|
import srsly
|
||||||
from cymem.cymem cimport Pool
|
|
||||||
from preshed.maps cimport PreshMap
|
from preshed.maps cimport PreshMap
|
||||||
from cpython.exc cimport PyErr_SetFromErrno
|
from cpython.exc cimport PyErr_SetFromErrno
|
||||||
from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
|
from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
|
||||||
|
@ -12,85 +11,28 @@ from libcpp.vector cimport vector
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
from .typedefs cimport hash_t
|
from ..tokens import Span
|
||||||
from .errors import Errors, Warnings
|
from ..typedefs cimport hash_t
|
||||||
from . import util
|
from ..errors import Errors, Warnings
|
||||||
from .util import SimpleFrozenList, ensure_path
|
from .. import util
|
||||||
|
from ..util import SimpleFrozenList, ensure_path
|
||||||
cdef class Candidate:
|
from ..vocab cimport Vocab
|
||||||
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
|
from .kb cimport KnowledgeBase
|
||||||
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
|
from .candidate import Candidate as Candidate
|
||||||
algorithm which will disambiguate the various candidates to the correct one.
|
|
||||||
Each candidate (alias, entity) pair is assigned to a certain prior probability.
|
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/kb/#candidate_init
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
|
|
||||||
self.kb = kb
|
|
||||||
self.entity_hash = entity_hash
|
|
||||||
self.entity_freq = entity_freq
|
|
||||||
self.entity_vector = entity_vector
|
|
||||||
self.alias_hash = alias_hash
|
|
||||||
self.prior_prob = prior_prob
|
|
||||||
|
|
||||||
@property
|
|
||||||
def entity(self):
|
|
||||||
"""RETURNS (uint64): hash of the entity's KB ID/name"""
|
|
||||||
return self.entity_hash
|
|
||||||
|
|
||||||
@property
|
|
||||||
def entity_(self):
|
|
||||||
"""RETURNS (str): ID/name of this entity in the KB"""
|
|
||||||
return self.kb.vocab.strings[self.entity_hash]
|
|
||||||
|
|
||||||
@property
|
|
||||||
def alias(self):
|
|
||||||
"""RETURNS (uint64): hash of the alias"""
|
|
||||||
return self.alias_hash
|
|
||||||
|
|
||||||
@property
|
|
||||||
def alias_(self):
|
|
||||||
"""RETURNS (str): ID of the original alias"""
|
|
||||||
return self.kb.vocab.strings[self.alias_hash]
|
|
||||||
|
|
||||||
@property
|
|
||||||
def entity_freq(self):
|
|
||||||
return self.entity_freq
|
|
||||||
|
|
||||||
@property
|
|
||||||
def entity_vector(self):
|
|
||||||
return self.entity_vector
|
|
||||||
|
|
||||||
@property
|
|
||||||
def prior_prob(self):
|
|
||||||
return self.prior_prob
|
|
||||||
|
|
||||||
|
|
||||||
def get_candidates(KnowledgeBase kb, span) -> Iterator[Candidate]:
|
cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
"""
|
"""An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
|
||||||
Return candidate entities for a given span by using the text of the span as the alias
|
|
||||||
and fetching appropriate entries from the index.
|
|
||||||
This particular function is optimized to work with the built-in KB functionality,
|
|
||||||
but any other custom candidate generation method can be used in combination with the KB as well.
|
|
||||||
"""
|
|
||||||
return kb.get_alias_candidates(span.text)
|
|
||||||
|
|
||||||
|
|
||||||
cdef class KnowledgeBase:
|
|
||||||
"""A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
|
|
||||||
to support entity linking of named entities to real-world concepts.
|
to support entity linking of named entities to real-world concepts.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/kb
|
DOCS: https://spacy.io/api/kb_in_memory
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, Vocab vocab, entity_vector_length):
|
def __init__(self, Vocab vocab, entity_vector_length):
|
||||||
"""Create a KnowledgeBase."""
|
"""Create an InMemoryLookupKB."""
|
||||||
self.mem = Pool()
|
super().__init__(vocab, entity_vector_length)
|
||||||
self.entity_vector_length = entity_vector_length
|
|
||||||
self._entry_index = PreshMap()
|
self._entry_index = PreshMap()
|
||||||
self._alias_index = PreshMap()
|
self._alias_index = PreshMap()
|
||||||
self.vocab = vocab
|
|
||||||
self._create_empty_vectors(dummy_hash=self.vocab.strings[""])
|
self._create_empty_vectors(dummy_hash=self.vocab.strings[""])
|
||||||
|
|
||||||
def _initialize_entities(self, int64_t nr_entities):
|
def _initialize_entities(self, int64_t nr_entities):
|
||||||
|
@ -104,11 +46,6 @@ cdef class KnowledgeBase:
|
||||||
self._alias_index = PreshMap(nr_aliases + 1)
|
self._alias_index = PreshMap(nr_aliases + 1)
|
||||||
self._aliases_table = alias_vec(nr_aliases + 1)
|
self._aliases_table = alias_vec(nr_aliases + 1)
|
||||||
|
|
||||||
@property
|
|
||||||
def entity_vector_length(self):
|
|
||||||
"""RETURNS (uint64): length of the entity vectors"""
|
|
||||||
return self.entity_vector_length
|
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return self.get_size_entities()
|
return self.get_size_entities()
|
||||||
|
|
||||||
|
@ -286,7 +223,10 @@ cdef class KnowledgeBase:
|
||||||
alias_entry.probs = probs
|
alias_entry.probs = probs
|
||||||
self._aliases_table[alias_index] = alias_entry
|
self._aliases_table[alias_index] = alias_entry
|
||||||
|
|
||||||
def get_alias_candidates(self, str alias) -> Iterator[Candidate]:
|
def get_candidates(self, mention: Span) -> Iterable[Candidate]:
|
||||||
|
return self.get_alias_candidates(mention.text) # type: ignore
|
||||||
|
|
||||||
|
def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
|
||||||
"""
|
"""
|
||||||
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
|
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
|
||||||
and the prior probability of that alias resolving to that entity.
|
and the prior probability of that alias resolving to that entity.
|
|
@ -1,11 +1,12 @@
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, Callable, Iterable, List, Tuple
|
from typing import Optional, Callable, Iterable, List, Tuple
|
||||||
from thinc.types import Floats2d
|
from thinc.types import Floats2d
|
||||||
from thinc.api import chain, clone, list2ragged, reduce_mean, residual
|
from thinc.api import chain, list2ragged, reduce_mean, residual
|
||||||
from thinc.api import Model, Maxout, Linear, noop, tuplify, Ragged
|
from thinc.api import Model, Maxout, Linear, tuplify, Ragged
|
||||||
|
|
||||||
from ...util import registry
|
from ...util import registry
|
||||||
from ...kb import KnowledgeBase, Candidate, get_candidates
|
from ...kb import KnowledgeBase, InMemoryLookupKB
|
||||||
|
from ...kb import Candidate, get_candidates, get_candidates_batch
|
||||||
from ...vocab import Vocab
|
from ...vocab import Vocab
|
||||||
from ...tokens import Span, Doc
|
from ...tokens import Span, Doc
|
||||||
from ..extract_spans import extract_spans
|
from ..extract_spans import extract_spans
|
||||||
|
@ -78,9 +79,11 @@ def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callab
|
||||||
|
|
||||||
|
|
||||||
@registry.misc("spacy.KBFromFile.v1")
|
@registry.misc("spacy.KBFromFile.v1")
|
||||||
def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]:
|
def load_kb(
|
||||||
def kb_from_file(vocab):
|
kb_path: Path,
|
||||||
kb = KnowledgeBase(vocab, entity_vector_length=1)
|
) -> Callable[[Vocab], KnowledgeBase]:
|
||||||
|
def kb_from_file(vocab: Vocab):
|
||||||
|
kb = InMemoryLookupKB(vocab, entity_vector_length=1)
|
||||||
kb.from_disk(kb_path)
|
kb.from_disk(kb_path)
|
||||||
return kb
|
return kb
|
||||||
|
|
||||||
|
@ -88,9 +91,11 @@ def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]:
|
||||||
|
|
||||||
|
|
||||||
@registry.misc("spacy.EmptyKB.v1")
|
@registry.misc("spacy.EmptyKB.v1")
|
||||||
def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
|
def empty_kb(
|
||||||
def empty_kb_factory(vocab):
|
entity_vector_length: int,
|
||||||
return KnowledgeBase(vocab=vocab, entity_vector_length=entity_vector_length)
|
) -> Callable[[Vocab], KnowledgeBase]:
|
||||||
|
def empty_kb_factory(vocab: Vocab):
|
||||||
|
return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)
|
||||||
|
|
||||||
return empty_kb_factory
|
return empty_kb_factory
|
||||||
|
|
||||||
|
@ -98,3 +103,10 @@ def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
|
||||||
@registry.misc("spacy.CandidateGenerator.v1")
|
@registry.misc("spacy.CandidateGenerator.v1")
|
||||||
def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
|
def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
|
||||||
return get_candidates
|
return get_candidates
|
||||||
|
|
||||||
|
|
||||||
|
@registry.misc("spacy.CandidateBatchGenerator.v1")
|
||||||
|
def create_candidates_batch() -> Callable[
|
||||||
|
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
||||||
|
]:
|
||||||
|
return get_candidates_batch
|
||||||
|
|
|
@ -53,9 +53,11 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
|
||||||
"incl_context": True,
|
"incl_context": True,
|
||||||
"entity_vector_length": 64,
|
"entity_vector_length": 64,
|
||||||
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
|
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
|
||||||
|
"get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
|
||||||
"overwrite": True,
|
"overwrite": True,
|
||||||
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
|
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
|
||||||
"use_gold_ents": True,
|
"use_gold_ents": True,
|
||||||
|
"candidates_batch_size": 1,
|
||||||
"threshold": None,
|
"threshold": None,
|
||||||
},
|
},
|
||||||
default_score_weights={
|
default_score_weights={
|
||||||
|
@ -75,9 +77,13 @@ def make_entity_linker(
|
||||||
incl_context: bool,
|
incl_context: bool,
|
||||||
entity_vector_length: int,
|
entity_vector_length: int,
|
||||||
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
|
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
|
||||||
|
get_candidates_batch: Callable[
|
||||||
|
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
||||||
|
],
|
||||||
overwrite: bool,
|
overwrite: bool,
|
||||||
scorer: Optional[Callable],
|
scorer: Optional[Callable],
|
||||||
use_gold_ents: bool,
|
use_gold_ents: bool,
|
||||||
|
candidates_batch_size: int,
|
||||||
threshold: Optional[float] = None,
|
threshold: Optional[float] = None,
|
||||||
):
|
):
|
||||||
"""Construct an EntityLinker component.
|
"""Construct an EntityLinker component.
|
||||||
|
@ -90,17 +96,21 @@ def make_entity_linker(
|
||||||
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
|
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
|
||||||
incl_context (bool): Whether or not to include the local context in the model.
|
incl_context (bool): Whether or not to include the local context in the model.
|
||||||
entity_vector_length (int): Size of encoding vectors in the KB.
|
entity_vector_length (int): Size of encoding vectors in the KB.
|
||||||
get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
|
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
|
||||||
produces a list of candidates, given a certain knowledge base and a textual mention.
|
produces a list of candidates, given a certain knowledge base and a textual mention.
|
||||||
|
get_candidates_batch (
|
||||||
|
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]
|
||||||
|
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
||||||
scorer (Optional[Callable]): The scoring method.
|
scorer (Optional[Callable]): The scoring method.
|
||||||
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
||||||
component must provide entity annotations.
|
component must provide entity annotations.
|
||||||
|
candidates_batch_size (int): Size of batches for entity candidate generation.
|
||||||
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
|
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
|
||||||
prediction is discarded. If None, predictions are not filtered by any threshold.
|
prediction is discarded. If None, predictions are not filtered by any threshold.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if not model.attrs.get("include_span_maker", False):
|
if not model.attrs.get("include_span_maker", False):
|
||||||
# The only difference in arguments here is that use_gold_ents is not available
|
# The only difference in arguments here is that use_gold_ents and threshold aren't available.
|
||||||
return EntityLinker_v1(
|
return EntityLinker_v1(
|
||||||
nlp.vocab,
|
nlp.vocab,
|
||||||
model,
|
model,
|
||||||
|
@ -124,9 +134,11 @@ def make_entity_linker(
|
||||||
incl_context=incl_context,
|
incl_context=incl_context,
|
||||||
entity_vector_length=entity_vector_length,
|
entity_vector_length=entity_vector_length,
|
||||||
get_candidates=get_candidates,
|
get_candidates=get_candidates,
|
||||||
|
get_candidates_batch=get_candidates_batch,
|
||||||
overwrite=overwrite,
|
overwrite=overwrite,
|
||||||
scorer=scorer,
|
scorer=scorer,
|
||||||
use_gold_ents=use_gold_ents,
|
use_gold_ents=use_gold_ents,
|
||||||
|
candidates_batch_size=candidates_batch_size,
|
||||||
threshold=threshold,
|
threshold=threshold,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -160,9 +172,13 @@ class EntityLinker(TrainablePipe):
|
||||||
incl_context: bool,
|
incl_context: bool,
|
||||||
entity_vector_length: int,
|
entity_vector_length: int,
|
||||||
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
|
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
|
||||||
|
get_candidates_batch: Callable[
|
||||||
|
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
||||||
|
],
|
||||||
overwrite: bool = BACKWARD_OVERWRITE,
|
overwrite: bool = BACKWARD_OVERWRITE,
|
||||||
scorer: Optional[Callable] = entity_linker_score,
|
scorer: Optional[Callable] = entity_linker_score,
|
||||||
use_gold_ents: bool,
|
use_gold_ents: bool,
|
||||||
|
candidates_batch_size: int,
|
||||||
threshold: Optional[float] = None,
|
threshold: Optional[float] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize an entity linker.
|
"""Initialize an entity linker.
|
||||||
|
@ -178,10 +194,14 @@ class EntityLinker(TrainablePipe):
|
||||||
entity_vector_length (int): Size of encoding vectors in the KB.
|
entity_vector_length (int): Size of encoding vectors in the KB.
|
||||||
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
|
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
|
||||||
produces a list of candidates, given a certain knowledge base and a textual mention.
|
produces a list of candidates, given a certain knowledge base and a textual mention.
|
||||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
get_candidates_batch (
|
||||||
Scorer.score_links.
|
Callable[[KnowledgeBase, Iterable[Span]],
|
||||||
|
Iterable[Iterable[Candidate]]]
|
||||||
|
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
||||||
|
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
|
||||||
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
||||||
component must provide entity annotations.
|
component must provide entity annotations.
|
||||||
|
candidates_batch_size (int): Size of batches for entity candidate generation.
|
||||||
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the
|
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the
|
||||||
threshold, prediction is discarded. If None, predictions are not filtered by any threshold.
|
threshold, prediction is discarded. If None, predictions are not filtered by any threshold.
|
||||||
DOCS: https://spacy.io/api/entitylinker#init
|
DOCS: https://spacy.io/api/entitylinker#init
|
||||||
|
@ -204,22 +224,27 @@ class EntityLinker(TrainablePipe):
|
||||||
self.incl_prior = incl_prior
|
self.incl_prior = incl_prior
|
||||||
self.incl_context = incl_context
|
self.incl_context = incl_context
|
||||||
self.get_candidates = get_candidates
|
self.get_candidates = get_candidates
|
||||||
|
self.get_candidates_batch = get_candidates_batch
|
||||||
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
|
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
|
||||||
self.distance = CosineDistance(normalize=False)
|
self.distance = CosineDistance(normalize=False)
|
||||||
# how many neighbour sentences to take into account
|
# how many neighbour sentences to take into account
|
||||||
# create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
|
# create an empty KB by default
|
||||||
self.kb = empty_kb(entity_vector_length)(self.vocab)
|
self.kb = empty_kb(entity_vector_length)(self.vocab)
|
||||||
self.scorer = scorer
|
self.scorer = scorer
|
||||||
self.use_gold_ents = use_gold_ents
|
self.use_gold_ents = use_gold_ents
|
||||||
|
self.candidates_batch_size = candidates_batch_size
|
||||||
self.threshold = threshold
|
self.threshold = threshold
|
||||||
|
|
||||||
|
if candidates_batch_size < 1:
|
||||||
|
raise ValueError(Errors.E1044)
|
||||||
|
|
||||||
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
|
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
|
||||||
"""Define the KB of this pipe by providing a function that will
|
"""Define the KB of this pipe by providing a function that will
|
||||||
create it using this object's vocab."""
|
create it using this object's vocab."""
|
||||||
if not callable(kb_loader):
|
if not callable(kb_loader):
|
||||||
raise ValueError(Errors.E885.format(arg_type=type(kb_loader)))
|
raise ValueError(Errors.E885.format(arg_type=type(kb_loader)))
|
||||||
|
|
||||||
self.kb = kb_loader(self.vocab)
|
self.kb = kb_loader(self.vocab) # type: ignore
|
||||||
|
|
||||||
def validate_kb(self) -> None:
|
def validate_kb(self) -> None:
|
||||||
# Raise an error if the knowledge base is not initialized.
|
# Raise an error if the knowledge base is not initialized.
|
||||||
|
@ -241,8 +266,8 @@ class EntityLinker(TrainablePipe):
|
||||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||||
returns a representative sample of gold-standard Example objects.
|
returns a representative sample of gold-standard Example objects.
|
||||||
nlp (Language): The current nlp object the component is part of.
|
nlp (Language): The current nlp object the component is part of.
|
||||||
kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab instance.
|
kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab
|
||||||
Note that providing this argument, will overwrite all data accumulated in the current KB.
|
instance. Note that providing this argument will overwrite all data accumulated in the current KB.
|
||||||
Use this only when loading a KB as-such from file.
|
Use this only when loading a KB as-such from file.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entitylinker#initialize
|
DOCS: https://spacy.io/api/entitylinker#initialize
|
||||||
|
@ -419,66 +444,93 @@ class EntityLinker(TrainablePipe):
|
||||||
if len(doc) == 0:
|
if len(doc) == 0:
|
||||||
continue
|
continue
|
||||||
sentences = [s for s in doc.sents]
|
sentences = [s for s in doc.sents]
|
||||||
# Looping through each entity (TODO: rewrite)
|
|
||||||
for ent in doc.ents:
|
|
||||||
sent_index = sentences.index(ent.sent)
|
|
||||||
assert sent_index >= 0
|
|
||||||
|
|
||||||
if self.incl_context:
|
# Loop over entities in batches.
|
||||||
# get n_neighbour sentences, clipped to the length of the document
|
for ent_idx in range(0, len(doc.ents), self.candidates_batch_size):
|
||||||
start_sentence = max(0, sent_index - self.n_sents)
|
ent_batch = doc.ents[ent_idx : ent_idx + self.candidates_batch_size]
|
||||||
end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
|
|
||||||
start_token = sentences[start_sentence].start
|
# Look up candidate entities.
|
||||||
end_token = sentences[end_sentence].end
|
valid_ent_idx = [
|
||||||
sent_doc = doc[start_token:end_token].as_doc()
|
idx
|
||||||
# currently, the context is the same for each entity in a sentence (should be refined)
|
for idx in range(len(ent_batch))
|
||||||
sentence_encoding = self.model.predict([sent_doc])[0]
|
if ent_batch[idx].label_ not in self.labels_discard
|
||||||
sentence_encoding_t = sentence_encoding.T
|
]
|
||||||
sentence_norm = xp.linalg.norm(sentence_encoding_t)
|
|
||||||
entity_count += 1
|
batch_candidates = list(
|
||||||
if ent.label_ in self.labels_discard:
|
self.get_candidates_batch(
|
||||||
# ignoring this entity - setting to NIL
|
self.kb, [ent_batch[idx] for idx in valid_ent_idx]
|
||||||
final_kb_ids.append(self.NIL)
|
)
|
||||||
else:
|
if self.candidates_batch_size > 1
|
||||||
candidates = list(self.get_candidates(self.kb, ent))
|
else [
|
||||||
if not candidates:
|
self.get_candidates(self.kb, ent_batch[idx])
|
||||||
# no prediction possible for this entity - setting to NIL
|
for idx in valid_ent_idx
|
||||||
final_kb_ids.append(self.NIL)
|
]
|
||||||
elif len(candidates) == 1 and self.threshold is None:
|
)
|
||||||
# shortcut for efficiency reasons: take the 1 candidate
|
|
||||||
final_kb_ids.append(candidates[0].entity_)
|
# Looping through each entity in batch (TODO: rewrite)
|
||||||
else:
|
for j, ent in enumerate(ent_batch):
|
||||||
random.shuffle(candidates)
|
sent_index = sentences.index(ent.sent)
|
||||||
# set all prior probabilities to 0 if incl_prior=False
|
assert sent_index >= 0
|
||||||
prior_probs = xp.asarray([c.prior_prob for c in candidates])
|
|
||||||
if not self.incl_prior:
|
if self.incl_context:
|
||||||
prior_probs = xp.asarray([0.0 for _ in candidates])
|
# get n_neighbour sentences, clipped to the length of the document
|
||||||
scores = prior_probs
|
start_sentence = max(0, sent_index - self.n_sents)
|
||||||
# add in similarity from the context
|
end_sentence = min(
|
||||||
if self.incl_context:
|
len(sentences) - 1, sent_index + self.n_sents
|
||||||
entity_encodings = xp.asarray(
|
|
||||||
[c.entity_vector for c in candidates]
|
|
||||||
)
|
|
||||||
entity_norm = xp.linalg.norm(entity_encodings, axis=1)
|
|
||||||
if len(entity_encodings) != len(prior_probs):
|
|
||||||
raise RuntimeError(
|
|
||||||
Errors.E147.format(
|
|
||||||
method="predict",
|
|
||||||
msg="vectors not of equal length",
|
|
||||||
)
|
|
||||||
)
|
|
||||||
# cosine similarity
|
|
||||||
sims = xp.dot(entity_encodings, sentence_encoding_t) / (
|
|
||||||
sentence_norm * entity_norm
|
|
||||||
)
|
|
||||||
if sims.shape != prior_probs.shape:
|
|
||||||
raise ValueError(Errors.E161)
|
|
||||||
scores = prior_probs + sims - (prior_probs * sims)
|
|
||||||
final_kb_ids.append(
|
|
||||||
candidates[scores.argmax().item()].entity_
|
|
||||||
if self.threshold is None or scores.max() >= self.threshold
|
|
||||||
else EntityLinker.NIL
|
|
||||||
)
|
)
|
||||||
|
start_token = sentences[start_sentence].start
|
||||||
|
end_token = sentences[end_sentence].end
|
||||||
|
sent_doc = doc[start_token:end_token].as_doc()
|
||||||
|
# currently, the context is the same for each entity in a sentence (should be refined)
|
||||||
|
sentence_encoding = self.model.predict([sent_doc])[0]
|
||||||
|
sentence_encoding_t = sentence_encoding.T
|
||||||
|
sentence_norm = xp.linalg.norm(sentence_encoding_t)
|
||||||
|
entity_count += 1
|
||||||
|
if ent.label_ in self.labels_discard:
|
||||||
|
# ignoring this entity - setting to NIL
|
||||||
|
final_kb_ids.append(self.NIL)
|
||||||
|
else:
|
||||||
|
candidates = list(batch_candidates[j])
|
||||||
|
if not candidates:
|
||||||
|
# no prediction possible for this entity - setting to NIL
|
||||||
|
final_kb_ids.append(self.NIL)
|
||||||
|
elif len(candidates) == 1 and self.threshold is None:
|
||||||
|
# shortcut for efficiency reasons: take the 1 candidate
|
||||||
|
final_kb_ids.append(candidates[0].entity_)
|
||||||
|
else:
|
||||||
|
random.shuffle(candidates)
|
||||||
|
# set all prior probabilities to 0 if incl_prior=False
|
||||||
|
prior_probs = xp.asarray([c.prior_prob for c in candidates])
|
||||||
|
if not self.incl_prior:
|
||||||
|
prior_probs = xp.asarray([0.0 for _ in candidates])
|
||||||
|
scores = prior_probs
|
||||||
|
# add in similarity from the context
|
||||||
|
if self.incl_context:
|
||||||
|
entity_encodings = xp.asarray(
|
||||||
|
[c.entity_vector for c in candidates]
|
||||||
|
)
|
||||||
|
entity_norm = xp.linalg.norm(entity_encodings, axis=1)
|
||||||
|
if len(entity_encodings) != len(prior_probs):
|
||||||
|
raise RuntimeError(
|
||||||
|
Errors.E147.format(
|
||||||
|
method="predict",
|
||||||
|
msg="vectors not of equal length",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
# cosine similarity
|
||||||
|
sims = xp.dot(entity_encodings, sentence_encoding_t) / (
|
||||||
|
sentence_norm * entity_norm
|
||||||
|
)
|
||||||
|
if sims.shape != prior_probs.shape:
|
||||||
|
raise ValueError(Errors.E161)
|
||||||
|
scores = prior_probs + sims - (prior_probs * sims)
|
||||||
|
final_kb_ids.append(
|
||||||
|
candidates[scores.argmax().item()].entity_
|
||||||
|
if self.threshold is None
|
||||||
|
or scores.max() >= self.threshold
|
||||||
|
else EntityLinker.NIL
|
||||||
|
)
|
||||||
|
|
||||||
if not (len(final_kb_ids) == entity_count):
|
if not (len(final_kb_ids) == entity_count):
|
||||||
err = Errors.E147.format(
|
err = Errors.E147.format(
|
||||||
method="predict", msg="result variables not of equal length"
|
method="predict", msg="result variables not of equal length"
|
||||||
|
|
|
@ -68,8 +68,7 @@ class EntityLinker_v1(TrainablePipe):
|
||||||
entity_vector_length (int): Size of encoding vectors in the KB.
|
entity_vector_length (int): Size of encoding vectors in the KB.
|
||||||
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
|
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
|
||||||
produces a list of candidates, given a certain knowledge base and a textual mention.
|
produces a list of candidates, given a certain knowledge base and a textual mention.
|
||||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
|
||||||
Scorer.score_links.
|
|
||||||
DOCS: https://spacy.io/api/entitylinker#init
|
DOCS: https://spacy.io/api/entitylinker#init
|
||||||
"""
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
|
@ -115,7 +114,7 @@ class EntityLinker_v1(TrainablePipe):
|
||||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||||
returns a representative sample of gold-standard Example objects.
|
returns a representative sample of gold-standard Example objects.
|
||||||
nlp (Language): The current nlp object the component is part of.
|
nlp (Language): The current nlp object the component is part of.
|
||||||
kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab instance.
|
kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates an InMemoryLookupKB from a Vocab instance.
|
||||||
Note that providing this argument will overwrite all data accumulated in the current KB.
|
Note that providing this argument will overwrite all data accumulated in the current KB.
|
||||||
Use this only when loading a KB as-such from file.
|
Use this only when loading a KB as-such from file.
|
||||||
|
|
||||||
|
|
|
@ -6,7 +6,7 @@ from numpy.testing import assert_equal
|
||||||
from spacy import registry, util
|
from spacy import registry, util
|
||||||
from spacy.attrs import ENT_KB_ID
|
from spacy.attrs import ENT_KB_ID
|
||||||
from spacy.compat import pickle
|
from spacy.compat import pickle
|
||||||
from spacy.kb import Candidate, KnowledgeBase, get_candidates
|
from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.ml import load_kb
|
from spacy.ml import load_kb
|
||||||
from spacy.pipeline import EntityLinker
|
from spacy.pipeline import EntityLinker
|
||||||
|
@ -34,7 +34,7 @@ def assert_almost_equal(a, b):
|
||||||
def test_issue4674():
|
def test_issue4674():
|
||||||
"""Test that setting entities with overlapping identifiers does not mess up IO"""
|
"""Test that setting entities with overlapping identifiers does not mess up IO"""
|
||||||
nlp = English()
|
nlp = English()
|
||||||
kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
|
kb = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
|
||||||
vector1 = [0.9, 1.1, 1.01]
|
vector1 = [0.9, 1.1, 1.01]
|
||||||
vector2 = [1.8, 2.25, 2.01]
|
vector2 = [1.8, 2.25, 2.01]
|
||||||
with pytest.warns(UserWarning):
|
with pytest.warns(UserWarning):
|
||||||
|
@ -51,7 +51,7 @@ def test_issue4674():
|
||||||
dir_path.mkdir()
|
dir_path.mkdir()
|
||||||
file_path = dir_path / "kb"
|
file_path = dir_path / "kb"
|
||||||
kb.to_disk(str(file_path))
|
kb.to_disk(str(file_path))
|
||||||
kb2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
|
kb2 = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
|
||||||
kb2.from_disk(str(file_path))
|
kb2.from_disk(str(file_path))
|
||||||
assert kb2.get_size_entities() == 1
|
assert kb2.get_size_entities() == 1
|
||||||
|
|
||||||
|
@ -59,9 +59,9 @@ def test_issue4674():
|
||||||
@pytest.mark.issue(6730)
|
@pytest.mark.issue(6730)
|
||||||
def test_issue6730(en_vocab):
|
def test_issue6730(en_vocab):
|
||||||
"""Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
|
"""Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
|
||||||
from spacy.kb import KnowledgeBase
|
from spacy.kb.kb_in_memory import InMemoryLookupKB
|
||||||
|
|
||||||
kb = KnowledgeBase(en_vocab, entity_vector_length=3)
|
kb = InMemoryLookupKB(en_vocab, entity_vector_length=3)
|
||||||
kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])
|
kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])
|
||||||
|
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
|
@ -127,7 +127,7 @@ def test_issue7065_b():
|
||||||
|
|
||||||
def create_kb(vocab):
|
def create_kb(vocab):
|
||||||
# create artificial KB
|
# create artificial KB
|
||||||
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
|
mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
|
||||||
mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7])
|
mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7])
|
||||||
mykb.add_alias(
|
mykb.add_alias(
|
||||||
alias="No. 8",
|
alias="No. 8",
|
||||||
|
@ -190,7 +190,7 @@ def test_no_entities():
|
||||||
|
|
||||||
def create_kb(vocab):
|
def create_kb(vocab):
|
||||||
# create artificial KB
|
# create artificial KB
|
||||||
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
|
mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
|
||||||
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
|
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
|
||||||
mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9])
|
mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9])
|
||||||
return mykb
|
return mykb
|
||||||
|
@ -231,7 +231,7 @@ def test_partial_links():
|
||||||
|
|
||||||
def create_kb(vocab):
|
def create_kb(vocab):
|
||||||
# create artificial KB
|
# create artificial KB
|
||||||
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
|
mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
|
||||||
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
|
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
|
||||||
mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9])
|
mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9])
|
||||||
return mykb
|
return mykb
|
||||||
|
@ -263,7 +263,7 @@ def test_partial_links():
|
||||||
|
|
||||||
def test_kb_valid_entities(nlp):
|
def test_kb_valid_entities(nlp):
|
||||||
"""Test the valid construction of a KB with 3 entities and two aliases"""
|
"""Test the valid construction of a KB with 3 entities and two aliases"""
|
||||||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
|
mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
|
||||||
|
|
||||||
# adding entities
|
# adding entities
|
||||||
mykb.add_entity(entity="Q1", freq=19, entity_vector=[8, 4, 3])
|
mykb.add_entity(entity="Q1", freq=19, entity_vector=[8, 4, 3])
|
||||||
|
@ -292,7 +292,7 @@ def test_kb_valid_entities(nlp):
|
||||||
|
|
||||||
def test_kb_invalid_entities(nlp):
|
def test_kb_invalid_entities(nlp):
|
||||||
"""Test the invalid construction of a KB with an alias linked to a non-existing entity"""
|
"""Test the invalid construction of a KB with an alias linked to a non-existing entity"""
|
||||||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
|
||||||
|
|
||||||
# adding entities
|
# adding entities
|
||||||
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
|
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
|
||||||
|
@ -308,7 +308,7 @@ def test_kb_invalid_entities(nlp):
|
||||||
|
|
||||||
def test_kb_invalid_probabilities(nlp):
|
def test_kb_invalid_probabilities(nlp):
|
||||||
"""Test the invalid construction of a KB with wrong prior probabilities"""
|
"""Test the invalid construction of a KB with wrong prior probabilities"""
|
||||||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
|
||||||
|
|
||||||
# adding entities
|
# adding entities
|
||||||
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
|
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
|
||||||
|
@ -322,7 +322,7 @@ def test_kb_invalid_probabilities(nlp):
|
||||||
|
|
||||||
def test_kb_invalid_combination(nlp):
|
def test_kb_invalid_combination(nlp):
|
||||||
"""Test the invalid construction of a KB with non-matching entity and probability lists"""
|
"""Test the invalid construction of a KB with non-matching entity and probability lists"""
|
||||||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
|
||||||
|
|
||||||
# adding entities
|
# adding entities
|
||||||
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
|
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
|
||||||
|
@ -338,7 +338,7 @@ def test_kb_invalid_combination(nlp):
|
||||||
|
|
||||||
def test_kb_invalid_entity_vector(nlp):
|
def test_kb_invalid_entity_vector(nlp):
|
||||||
"""Test the invalid construction of a KB with non-matching entity vector lengths"""
|
"""Test the invalid construction of a KB with non-matching entity vector lengths"""
|
||||||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
|
mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
|
||||||
|
|
||||||
# adding entities
|
# adding entities
|
||||||
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1, 2, 3])
|
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1, 2, 3])
|
||||||
|
@ -376,7 +376,7 @@ def test_kb_initialize_empty(nlp):
|
||||||
|
|
||||||
def test_kb_serialize(nlp):
|
def test_kb_serialize(nlp):
|
||||||
"""Test serialization of the KB"""
|
"""Test serialization of the KB"""
|
||||||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
|
||||||
with make_tempdir() as d:
|
with make_tempdir() as d:
|
||||||
# normal read-write behaviour
|
# normal read-write behaviour
|
||||||
mykb.to_disk(d / "kb")
|
mykb.to_disk(d / "kb")
|
||||||
|
@ -393,12 +393,12 @@ def test_kb_serialize(nlp):
|
||||||
@pytest.mark.issue(9137)
|
@pytest.mark.issue(9137)
|
||||||
def test_kb_serialize_2(nlp):
|
def test_kb_serialize_2(nlp):
|
||||||
v = [5, 6, 7, 8]
|
v = [5, 6, 7, 8]
|
||||||
kb1 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
|
kb1 = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=4)
|
||||||
kb1.set_entities(["E1"], [1], [v])
|
kb1.set_entities(["E1"], [1], [v])
|
||||||
assert kb1.get_vector("E1") == v
|
assert kb1.get_vector("E1") == v
|
||||||
with make_tempdir() as d:
|
with make_tempdir() as d:
|
||||||
kb1.to_disk(d / "kb")
|
kb1.to_disk(d / "kb")
|
||||||
kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
|
kb2 = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=4)
|
||||||
kb2.from_disk(d / "kb")
|
kb2.from_disk(d / "kb")
|
||||||
assert kb2.get_vector("E1") == v
|
assert kb2.get_vector("E1") == v
|
||||||
|
|
||||||
|
@ -408,7 +408,7 @@ def test_kb_set_entities(nlp):
|
||||||
v = [5, 6, 7, 8]
|
v = [5, 6, 7, 8]
|
||||||
v1 = [1, 1, 1, 0]
|
v1 = [1, 1, 1, 0]
|
||||||
v2 = [2, 2, 2, 3]
|
v2 = [2, 2, 2, 3]
|
||||||
kb1 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
|
kb1 = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=4)
|
||||||
kb1.set_entities(["E0"], [1], [v])
|
kb1.set_entities(["E0"], [1], [v])
|
||||||
assert kb1.get_entity_strings() == ["E0"]
|
assert kb1.get_entity_strings() == ["E0"]
|
||||||
kb1.set_entities(["E1", "E2"], [1, 9], [v1, v2])
|
kb1.set_entities(["E1", "E2"], [1, 9], [v1, v2])
|
||||||
|
@ -417,7 +417,7 @@ def test_kb_set_entities(nlp):
|
||||||
assert kb1.get_vector("E2") == v2
|
assert kb1.get_vector("E2") == v2
|
||||||
with make_tempdir() as d:
|
with make_tempdir() as d:
|
||||||
kb1.to_disk(d / "kb")
|
kb1.to_disk(d / "kb")
|
||||||
kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
|
kb2 = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=4)
|
||||||
kb2.from_disk(d / "kb")
|
kb2.from_disk(d / "kb")
|
||||||
assert set(kb2.get_entity_strings()) == {"E1", "E2"}
|
assert set(kb2.get_entity_strings()) == {"E1", "E2"}
|
||||||
assert kb2.get_vector("E1") == v1
|
assert kb2.get_vector("E1") == v1
|
||||||
|
@ -428,7 +428,7 @@ def test_kb_serialize_vocab(nlp):
|
||||||
"""Test serialization of the KB and custom strings"""
|
"""Test serialization of the KB and custom strings"""
|
||||||
entity = "MyFunnyID"
|
entity = "MyFunnyID"
|
||||||
assert entity not in nlp.vocab.strings
|
assert entity not in nlp.vocab.strings
|
||||||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
|
||||||
assert not mykb.contains_entity(entity)
|
assert not mykb.contains_entity(entity)
|
||||||
mykb.add_entity(entity, freq=342, entity_vector=[3])
|
mykb.add_entity(entity, freq=342, entity_vector=[3])
|
||||||
assert mykb.contains_entity(entity)
|
assert mykb.contains_entity(entity)
|
||||||
|
@ -436,14 +436,14 @@ def test_kb_serialize_vocab(nlp):
|
||||||
with make_tempdir() as d:
|
with make_tempdir() as d:
|
||||||
# normal read-write behaviour
|
# normal read-write behaviour
|
||||||
mykb.to_disk(d / "kb")
|
mykb.to_disk(d / "kb")
|
||||||
mykb_new = KnowledgeBase(Vocab(), entity_vector_length=1)
|
mykb_new = InMemoryLookupKB(Vocab(), entity_vector_length=1)
|
||||||
mykb_new.from_disk(d / "kb")
|
mykb_new.from_disk(d / "kb")
|
||||||
assert entity in mykb_new.vocab.strings
|
assert entity in mykb_new.vocab.strings
|
||||||
|
|
||||||
|
|
||||||
def test_candidate_generation(nlp):
|
def test_candidate_generation(nlp):
|
||||||
"""Test correct candidate generation"""
|
"""Test correct candidate generation"""
|
||||||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
|
||||||
doc = nlp("douglas adam Adam shrubbery")
|
doc = nlp("douglas adam Adam shrubbery")
|
||||||
|
|
||||||
douglas_ent = doc[0:1]
|
douglas_ent = doc[0:1]
|
||||||
|
@ -481,7 +481,7 @@ def test_el_pipe_configuration(nlp):
|
||||||
ruler.add_patterns([pattern])
|
ruler.add_patterns([pattern])
|
||||||
|
|
||||||
def create_kb(vocab):
|
def create_kb(vocab):
|
||||||
kb = KnowledgeBase(vocab, entity_vector_length=1)
|
kb = InMemoryLookupKB(vocab, entity_vector_length=1)
|
||||||
kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
|
kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
|
||||||
kb.add_entity(entity="Q3", freq=5, entity_vector=[3])
|
kb.add_entity(entity="Q3", freq=5, entity_vector=[3])
|
||||||
kb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1])
|
kb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1])
|
||||||
|
@ -500,10 +500,21 @@ def test_el_pipe_configuration(nlp):
|
||||||
def get_lowercased_candidates(kb, span):
|
def get_lowercased_candidates(kb, span):
|
||||||
return kb.get_alias_candidates(span.text.lower())
|
return kb.get_alias_candidates(span.text.lower())
|
||||||
|
|
||||||
|
def get_lowercased_candidates_batch(kb, spans):
|
||||||
|
return [get_lowercased_candidates(kb, span) for span in spans]
|
||||||
|
|
||||||
@registry.misc("spacy.LowercaseCandidateGenerator.v1")
|
@registry.misc("spacy.LowercaseCandidateGenerator.v1")
|
||||||
def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
|
def create_candidates() -> Callable[
|
||||||
|
[InMemoryLookupKB, "Span"], Iterable[Candidate]
|
||||||
|
]:
|
||||||
return get_lowercased_candidates
|
return get_lowercased_candidates
|
||||||
|
|
||||||
|
@registry.misc("spacy.LowercaseCandidateBatchGenerator.v1")
|
||||||
|
def create_candidates_batch() -> Callable[
|
||||||
|
[InMemoryLookupKB, Iterable["Span"]], Iterable[Iterable[Candidate]]
|
||||||
|
]:
|
||||||
|
return get_lowercased_candidates_batch
|
||||||
|
|
||||||
# replace the pipe with a new one with a different candidate generator
|
# replace the pipe with a new one with a different candidate generator
|
||||||
entity_linker = nlp.replace_pipe(
|
entity_linker = nlp.replace_pipe(
|
||||||
"entity_linker",
|
"entity_linker",
|
||||||
|
@ -511,6 +522,9 @@ def test_el_pipe_configuration(nlp):
|
||||||
config={
|
config={
|
||||||
"incl_context": False,
|
"incl_context": False,
|
||||||
"get_candidates": {"@misc": "spacy.LowercaseCandidateGenerator.v1"},
|
"get_candidates": {"@misc": "spacy.LowercaseCandidateGenerator.v1"},
|
||||||
|
"get_candidates_batch": {
|
||||||
|
"@misc": "spacy.LowercaseCandidateBatchGenerator.v1"
|
||||||
|
},
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
entity_linker.set_kb(create_kb)
|
entity_linker.set_kb(create_kb)
|
||||||
|
@ -532,7 +546,7 @@ def test_nel_nsents(nlp):
|
||||||
|
|
||||||
def test_vocab_serialization(nlp):
|
def test_vocab_serialization(nlp):
|
||||||
"""Test that string information is retained across storage"""
|
"""Test that string information is retained across storage"""
|
||||||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
|
||||||
|
|
||||||
# adding entities
|
# adding entities
|
||||||
mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
|
mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
|
||||||
|
@ -552,7 +566,7 @@ def test_vocab_serialization(nlp):
|
||||||
|
|
||||||
with make_tempdir() as d:
|
with make_tempdir() as d:
|
||||||
mykb.to_disk(d / "kb")
|
mykb.to_disk(d / "kb")
|
||||||
kb_new_vocab = KnowledgeBase(Vocab(), entity_vector_length=1)
|
kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1)
|
||||||
kb_new_vocab.from_disk(d / "kb")
|
kb_new_vocab.from_disk(d / "kb")
|
||||||
|
|
||||||
candidates = kb_new_vocab.get_alias_candidates("adam")
|
candidates = kb_new_vocab.get_alias_candidates("adam")
|
||||||
|
@ -568,7 +582,7 @@ def test_vocab_serialization(nlp):
|
||||||
|
|
||||||
def test_append_alias(nlp):
|
def test_append_alias(nlp):
|
||||||
"""Test that we can append additional alias-entity pairs"""
|
"""Test that we can append additional alias-entity pairs"""
|
||||||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
|
||||||
|
|
||||||
# adding entities
|
# adding entities
|
||||||
mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
|
mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
|
||||||
|
@ -599,7 +613,7 @@ def test_append_alias(nlp):
|
||||||
@pytest.mark.filterwarnings("ignore:\\[W036")
|
@pytest.mark.filterwarnings("ignore:\\[W036")
|
||||||
def test_append_invalid_alias(nlp):
|
def test_append_invalid_alias(nlp):
|
||||||
"""Test that append an alias will throw an error if prior probs are exceeding 1"""
|
"""Test that append an alias will throw an error if prior probs are exceeding 1"""
|
||||||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
|
||||||
|
|
||||||
# adding entities
|
# adding entities
|
||||||
mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
|
mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
|
||||||
|
@ -621,7 +635,7 @@ def test_preserving_links_asdoc(nlp):
|
||||||
vector_length = 1
|
vector_length = 1
|
||||||
|
|
||||||
def create_kb(vocab):
|
def create_kb(vocab):
|
||||||
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
|
mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
|
||||||
# adding entities
|
# adding entities
|
||||||
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
|
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
|
||||||
mykb.add_entity(entity="Q2", freq=8, entity_vector=[1])
|
mykb.add_entity(entity="Q2", freq=8, entity_vector=[1])
|
||||||
|
@ -723,7 +737,7 @@ def test_overfitting_IO():
|
||||||
# create artificial KB - assign same prior weight to the two russ cochran's
|
# create artificial KB - assign same prior weight to the two russ cochran's
|
||||||
# Q2146908 (Russ Cochran): American golfer
|
# Q2146908 (Russ Cochran): American golfer
|
||||||
# Q7381115 (Russ Cochran): publisher
|
# Q7381115 (Russ Cochran): publisher
|
||||||
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
|
mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
|
||||||
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
|
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
|
||||||
mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
|
mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
|
||||||
mykb.add_alias(
|
mykb.add_alias(
|
||||||
|
@ -805,7 +819,7 @@ def test_kb_serialization():
|
||||||
kb_dir = tmp_dir / "kb"
|
kb_dir = tmp_dir / "kb"
|
||||||
nlp1 = English()
|
nlp1 = English()
|
||||||
assert "Q2146908" not in nlp1.vocab.strings
|
assert "Q2146908" not in nlp1.vocab.strings
|
||||||
mykb = KnowledgeBase(nlp1.vocab, entity_vector_length=vector_length)
|
mykb = InMemoryLookupKB(nlp1.vocab, entity_vector_length=vector_length)
|
||||||
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
|
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
|
||||||
mykb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
|
mykb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
|
||||||
assert "Q2146908" in nlp1.vocab.strings
|
assert "Q2146908" in nlp1.vocab.strings
|
||||||
|
@ -828,7 +842,7 @@ def test_kb_serialization():
|
||||||
def test_kb_pickle():
|
def test_kb_pickle():
|
||||||
# Test that the KB can be pickled
|
# Test that the KB can be pickled
|
||||||
nlp = English()
|
nlp = English()
|
||||||
kb_1 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
|
kb_1 = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
|
||||||
kb_1.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
|
kb_1.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
|
||||||
assert not kb_1.contains_alias("Russ Cochran")
|
assert not kb_1.contains_alias("Russ Cochran")
|
||||||
kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
|
kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
|
||||||
|
@ -842,7 +856,7 @@ def test_kb_pickle():
|
||||||
def test_nel_pickle():
|
def test_nel_pickle():
|
||||||
# Test that a pipeline with an EL component can be pickled
|
# Test that a pipeline with an EL component can be pickled
|
||||||
def create_kb(vocab):
|
def create_kb(vocab):
|
||||||
kb = KnowledgeBase(vocab, entity_vector_length=3)
|
kb = InMemoryLookupKB(vocab, entity_vector_length=3)
|
||||||
kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
|
kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
|
||||||
kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
|
kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
|
||||||
return kb
|
return kb
|
||||||
|
@ -864,7 +878,7 @@ def test_nel_pickle():
|
||||||
def test_kb_to_bytes():
|
def test_kb_to_bytes():
|
||||||
# Test that the KB's to_bytes method works correctly
|
# Test that the KB's to_bytes method works correctly
|
||||||
nlp = English()
|
nlp = English()
|
||||||
kb_1 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
|
kb_1 = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
|
||||||
kb_1.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
|
kb_1.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
|
||||||
kb_1.add_entity(entity="Q66", freq=9, entity_vector=[1, 2, 3])
|
kb_1.add_entity(entity="Q66", freq=9, entity_vector=[1, 2, 3])
|
||||||
kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
|
kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
|
||||||
|
@ -874,7 +888,7 @@ def test_kb_to_bytes():
|
||||||
)
|
)
|
||||||
assert kb_1.contains_alias("Russ Cochran")
|
assert kb_1.contains_alias("Russ Cochran")
|
||||||
kb_bytes = kb_1.to_bytes()
|
kb_bytes = kb_1.to_bytes()
|
||||||
kb_2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
|
kb_2 = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
|
||||||
assert not kb_2.contains_alias("Russ Cochran")
|
assert not kb_2.contains_alias("Russ Cochran")
|
||||||
kb_2 = kb_2.from_bytes(kb_bytes)
|
kb_2 = kb_2.from_bytes(kb_bytes)
|
||||||
# check that both KBs are exactly the same
|
# check that both KBs are exactly the same
|
||||||
|
@ -897,7 +911,7 @@ def test_kb_to_bytes():
|
||||||
def test_nel_to_bytes():
|
def test_nel_to_bytes():
|
||||||
# Test that a pipeline with an EL component can be converted to bytes
|
# Test that a pipeline with an EL component can be converted to bytes
|
||||||
def create_kb(vocab):
|
def create_kb(vocab):
|
||||||
kb = KnowledgeBase(vocab, entity_vector_length=3)
|
kb = InMemoryLookupKB(vocab, entity_vector_length=3)
|
||||||
kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
|
kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
|
||||||
kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
|
kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
|
||||||
return kb
|
return kb
|
||||||
|
@ -987,7 +1001,7 @@ def test_legacy_architectures(name, config):
|
||||||
train_examples.append(Example.from_dict(doc, annotation))
|
train_examples.append(Example.from_dict(doc, annotation))
|
||||||
|
|
||||||
def create_kb(vocab):
|
def create_kb(vocab):
|
||||||
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
|
mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
|
||||||
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
|
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
|
||||||
mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
|
mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
|
||||||
mykb.add_alias(
|
mykb.add_alias(
|
||||||
|
@ -1054,7 +1068,7 @@ def test_no_gold_ents(patterns):
|
||||||
|
|
||||||
def create_kb(vocab):
|
def create_kb(vocab):
|
||||||
# create artificial KB
|
# create artificial KB
|
||||||
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
|
mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
|
||||||
mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
|
mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
|
||||||
mykb.add_alias("Kirby", ["Q613241"], [0.9])
|
mykb.add_alias("Kirby", ["Q613241"], [0.9])
|
||||||
# Placeholder
|
# Placeholder
|
||||||
|
@ -1104,7 +1118,7 @@ def test_tokenization_mismatch():
|
||||||
|
|
||||||
def create_kb(vocab):
|
def create_kb(vocab):
|
||||||
# create placeholder KB
|
# create placeholder KB
|
||||||
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
|
mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
|
||||||
mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
|
mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
|
||||||
mykb.add_alias("Kirby", ["Q613241"], [0.9])
|
mykb.add_alias("Kirby", ["Q613241"], [0.9])
|
||||||
return mykb
|
return mykb
|
||||||
|
@ -1121,6 +1135,12 @@ def test_tokenization_mismatch():
|
||||||
nlp.evaluate(train_examples)
|
nlp.evaluate(train_examples)
|
||||||
|
|
||||||
|
|
||||||
|
def test_abstract_kb_instantiation():
|
||||||
|
"""Test whether instantiation of abstract KB base class fails."""
|
||||||
|
with pytest.raises(TypeError):
|
||||||
|
KnowledgeBase(None, 3)
|
||||||
|
|
||||||
|
|
||||||
# fmt: off
|
# fmt: off
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"meet_threshold,config",
|
"meet_threshold,config",
|
||||||
|
@ -1151,7 +1171,7 @@ def test_threshold(meet_threshold: bool, config: Dict[str, Any]):
|
||||||
|
|
||||||
def create_kb(vocab):
|
def create_kb(vocab):
|
||||||
# create artificial KB
|
# create artificial KB
|
||||||
mykb = KnowledgeBase(vocab, entity_vector_length=3)
|
mykb = InMemoryLookupKB(vocab, entity_vector_length=3)
|
||||||
mykb.add_entity(entity=entity_id, freq=12, entity_vector=[6, -4, 3])
|
mykb.add_entity(entity=entity_id, freq=12, entity_vector=[6, -4, 3])
|
||||||
mykb.add_alias(
|
mykb.add_alias(
|
||||||
alias="Mahler",
|
alias="Mahler",
|
||||||
|
|
|
@ -3,7 +3,7 @@ from unittest import TestCase
|
||||||
import pytest
|
import pytest
|
||||||
import srsly
|
import srsly
|
||||||
from numpy import zeros
|
from numpy import zeros
|
||||||
from spacy.kb import KnowledgeBase, Writer
|
from spacy.kb.kb_in_memory import InMemoryLookupKB, Writer
|
||||||
from spacy.vectors import Vectors
|
from spacy.vectors import Vectors
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
from spacy.pipeline import TrainablePipe
|
from spacy.pipeline import TrainablePipe
|
||||||
|
@ -71,7 +71,7 @@ def entity_linker():
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
|
|
||||||
def create_kb(vocab):
|
def create_kb(vocab):
|
||||||
kb = KnowledgeBase(vocab, entity_vector_length=1)
|
kb = InMemoryLookupKB(vocab, entity_vector_length=1)
|
||||||
kb.add_entity("test", 0.0, zeros((1, 1), dtype="f"))
|
kb.add_entity("test", 0.0, zeros((1, 1), dtype="f"))
|
||||||
return kb
|
return kb
|
||||||
|
|
||||||
|
@ -120,7 +120,7 @@ def test_writer_with_path_py35():
|
||||||
|
|
||||||
def test_save_and_load_knowledge_base():
|
def test_save_and_load_knowledge_base():
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
kb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
|
||||||
with make_tempdir() as d:
|
with make_tempdir() as d:
|
||||||
path = d / "kb"
|
path = d / "kb"
|
||||||
try:
|
try:
|
||||||
|
@ -129,7 +129,7 @@ def test_save_and_load_knowledge_base():
|
||||||
pytest.fail(str(e))
|
pytest.fail(str(e))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
kb_loaded = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
kb_loaded = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
|
||||||
kb_loaded.from_disk(path)
|
kb_loaded.from_disk(path)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
pytest.fail(str(e))
|
pytest.fail(str(e))
|
||||||
|
|
|
@ -2,7 +2,7 @@ from typing import Callable
|
||||||
|
|
||||||
from spacy import util
|
from spacy import util
|
||||||
from spacy.util import ensure_path, registry, load_model_from_config
|
from spacy.util import ensure_path, registry, load_model_from_config
|
||||||
from spacy.kb import KnowledgeBase
|
from spacy.kb.kb_in_memory import InMemoryLookupKB
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
from thinc.api import Config
|
from thinc.api import Config
|
||||||
|
|
||||||
|
@ -22,7 +22,7 @@ def test_serialize_kb_disk(en_vocab):
|
||||||
dir_path.mkdir()
|
dir_path.mkdir()
|
||||||
file_path = dir_path / "kb"
|
file_path = dir_path / "kb"
|
||||||
kb1.to_disk(str(file_path))
|
kb1.to_disk(str(file_path))
|
||||||
kb2 = KnowledgeBase(vocab=en_vocab, entity_vector_length=3)
|
kb2 = InMemoryLookupKB(vocab=en_vocab, entity_vector_length=3)
|
||||||
kb2.from_disk(str(file_path))
|
kb2.from_disk(str(file_path))
|
||||||
|
|
||||||
# final assertions
|
# final assertions
|
||||||
|
@ -30,7 +30,7 @@ def test_serialize_kb_disk(en_vocab):
|
||||||
|
|
||||||
|
|
||||||
def _get_dummy_kb(vocab):
|
def _get_dummy_kb(vocab):
|
||||||
kb = KnowledgeBase(vocab, entity_vector_length=3)
|
kb = InMemoryLookupKB(vocab, entity_vector_length=3)
|
||||||
kb.add_entity(entity="Q53", freq=33, entity_vector=[0, 5, 3])
|
kb.add_entity(entity="Q53", freq=33, entity_vector=[0, 5, 3])
|
||||||
kb.add_entity(entity="Q17", freq=2, entity_vector=[7, 1, 0])
|
kb.add_entity(entity="Q17", freq=2, entity_vector=[7, 1, 0])
|
||||||
kb.add_entity(entity="Q007", freq=7, entity_vector=[0, 0, 7])
|
kb.add_entity(entity="Q007", freq=7, entity_vector=[0, 0, 7])
|
||||||
|
@ -104,7 +104,7 @@ def test_serialize_subclassed_kb():
|
||||||
custom_field = 666
|
custom_field = 666
|
||||||
"""
|
"""
|
||||||
|
|
||||||
class SubKnowledgeBase(KnowledgeBase):
|
class SubInMemoryLookupKB(InMemoryLookupKB):
|
||||||
def __init__(self, vocab, entity_vector_length, custom_field):
|
def __init__(self, vocab, entity_vector_length, custom_field):
|
||||||
super().__init__(vocab, entity_vector_length)
|
super().__init__(vocab, entity_vector_length)
|
||||||
self.custom_field = custom_field
|
self.custom_field = custom_field
|
||||||
|
@ -112,9 +112,9 @@ def test_serialize_subclassed_kb():
|
||||||
@registry.misc("spacy.CustomKB.v1")
|
@registry.misc("spacy.CustomKB.v1")
|
||||||
def custom_kb(
|
def custom_kb(
|
||||||
entity_vector_length: int, custom_field: int
|
entity_vector_length: int, custom_field: int
|
||||||
) -> Callable[[Vocab], KnowledgeBase]:
|
) -> Callable[[Vocab], InMemoryLookupKB]:
|
||||||
def custom_kb_factory(vocab):
|
def custom_kb_factory(vocab):
|
||||||
kb = SubKnowledgeBase(
|
kb = SubInMemoryLookupKB(
|
||||||
vocab=vocab,
|
vocab=vocab,
|
||||||
entity_vector_length=entity_vector_length,
|
entity_vector_length=entity_vector_length,
|
||||||
custom_field=custom_field,
|
custom_field=custom_field,
|
||||||
|
@ -129,7 +129,7 @@ def test_serialize_subclassed_kb():
|
||||||
nlp.initialize()
|
nlp.initialize()
|
||||||
|
|
||||||
entity_linker = nlp.get_pipe("entity_linker")
|
entity_linker = nlp.get_pipe("entity_linker")
|
||||||
assert type(entity_linker.kb) == SubKnowledgeBase
|
assert type(entity_linker.kb) == SubInMemoryLookupKB
|
||||||
assert entity_linker.kb.entity_vector_length == 342
|
assert entity_linker.kb.entity_vector_length == 342
|
||||||
assert entity_linker.kb.custom_field == 666
|
assert entity_linker.kb.custom_field == 666
|
||||||
|
|
||||||
|
@ -139,6 +139,6 @@ def test_serialize_subclassed_kb():
|
||||||
nlp2 = util.load_model_from_path(tmp_dir)
|
nlp2 = util.load_model_from_path(tmp_dir)
|
||||||
entity_linker2 = nlp2.get_pipe("entity_linker")
|
entity_linker2 = nlp2.get_pipe("entity_linker")
|
||||||
# After IO, the KB is the standard one
|
# After IO, the KB is the standard one
|
||||||
assert type(entity_linker2.kb) == KnowledgeBase
|
assert type(entity_linker2.kb) == InMemoryLookupKB
|
||||||
assert entity_linker2.kb.entity_vector_length == 342
|
assert entity_linker2.kb.entity_vector_length == 342
|
||||||
assert not hasattr(entity_linker2.kb, "custom_field")
|
assert not hasattr(entity_linker2.kb, "custom_field")
|
||||||
|
|
|
@ -587,8 +587,8 @@ consists of either two or three subnetworks:
|
||||||
run once for each batch.
|
run once for each batch.
|
||||||
- **lower**: Construct a feature-specific vector for each `(token, feature)`
|
- **lower**: Construct a feature-specific vector for each `(token, feature)`
|
||||||
pair. This is also run once for each batch. Constructing the state
|
pair. This is also run once for each batch. Constructing the state
|
||||||
representation is then a matter of summing the component features and
|
representation is then a matter of summing the component features and applying
|
||||||
applying the non-linearity.
|
the non-linearity.
|
||||||
- **upper** (optional): A feed-forward network that predicts scores from the
|
- **upper** (optional): A feed-forward network that predicts scores from the
|
||||||
state representation. If not present, the output from the lower model is used
|
state representation. If not present, the output from the lower model is used
|
||||||
as action scores directly.
|
as action scores directly.
|
||||||
|
@ -628,8 +628,8 @@ same signature, but the `use_upper` argument was `True` by default.
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
Build a tagger model, using a provided token-to-vector component. The tagger
|
Build a tagger model, using a provided token-to-vector component. The tagger
|
||||||
model adds a linear layer with softmax activation to predict scores given
|
model adds a linear layer with softmax activation to predict scores given the
|
||||||
the token vectors.
|
token vectors.
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------- | ------------------------------------------------------------------------------------------ |
|
| ----------- | ------------------------------------------------------------------------------------------ |
|
||||||
|
@ -919,6 +919,6 @@ A function that reads an existing `KnowledgeBase` from file.
|
||||||
|
|
||||||
A function that takes as input a [`KnowledgeBase`](/api/kb) and a
|
A function that takes as input a [`KnowledgeBase`](/api/kb) and a
|
||||||
[`Span`](/api/span) object denoting a named entity, and returns a list of
|
[`Span`](/api/span) object denoting a named entity, and returns a list of
|
||||||
plausible [`Candidate`](/api/kb/#candidate) objects. The default
|
plausible [`Candidate`](/api/kb#candidate) objects. The default
|
||||||
`CandidateGenerator` uses the text of a mention to find its potential
|
`CandidateGenerator` uses the text of a mention to find its potential aliases in
|
||||||
aliases in the `KnowledgeBase`. Note that this function is case-dependent.
|
the `KnowledgeBase`. Note that this function is case-dependent.
|
||||||
|
|
|
@ -14,7 +14,8 @@ entities) to unique identifiers, grounding the named entities into the "real
|
||||||
world". It requires a `KnowledgeBase`, as well as a function to generate
|
world". It requires a `KnowledgeBase`, as well as a function to generate
|
||||||
plausible candidates from that `KnowledgeBase` given a certain textual mention,
|
plausible candidates from that `KnowledgeBase` given a certain textual mention,
|
||||||
and a machine learning model to pick the right candidate, given the local
|
and a machine learning model to pick the right candidate, given the local
|
||||||
context of the mention.
|
context of the mention. `EntityLinker` defaults to using the
|
||||||
|
[`InMemoryLookupKB`](/api/kb_in_memory) implementation.
|
||||||
|
|
||||||
## Assigned Attributes {#assigned-attributes}
|
## Assigned Attributes {#assigned-attributes}
|
||||||
|
|
||||||
|
@ -170,7 +171,7 @@ with the current vocab.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> def create_kb(vocab):
|
> def create_kb(vocab):
|
||||||
> kb = KnowledgeBase(vocab, entity_vector_length=128)
|
> kb = InMemoryLookupKB(vocab, entity_vector_length=128)
|
||||||
> kb.add_entity(...)
|
> kb.add_entity(...)
|
||||||
> kb.add_alias(...)
|
> kb.add_alias(...)
|
||||||
> return kb
|
> return kb
|
||||||
|
|
|
@ -4,27 +4,45 @@ teaser:
|
||||||
A storage class for entities and aliases of a specific knowledge base
|
A storage class for entities and aliases of a specific knowledge base
|
||||||
(ontology)
|
(ontology)
|
||||||
tag: class
|
tag: class
|
||||||
source: spacy/kb.pyx
|
source: spacy/kb/kb.pyx
|
||||||
new: 2.2
|
new: 2.2
|
||||||
---
|
---
|
||||||
|
|
||||||
The `KnowledgeBase` object provides a method to generate
|
The `KnowledgeBase` object is an abstract class providing a method to generate
|
||||||
[`Candidate`](/api/kb/#candidate) objects, which are plausible external
|
[`Candidate`](/api/kb#candidate) objects, which are plausible external
|
||||||
identifiers given a certain textual mention. Each such `Candidate` holds
|
identifiers given a certain textual mention. Each such `Candidate` holds
|
||||||
information from the relevant KB entities, such as its frequency in text and
|
information from the relevant KB entities, such as its frequency in text and
|
||||||
possible aliases. Each entity in the knowledge base also has a pretrained entity
|
possible aliases. Each entity in the knowledge base also has a pretrained entity
|
||||||
vector of a fixed size.
|
vector of a fixed size.
|
||||||
|
|
||||||
|
Beyond that, `KnowledgeBase` classes have to implement a number of utility
|
||||||
|
functions called by the [`EntityLinker`](/api/entitylinker) component.
|
||||||
|
|
||||||
|
<Infobox variant="warning">
|
||||||
|
|
||||||
|
This class was not abstract before spaCy version 3.5. The `KnowledgeBase`
|
||||||
|
implementation up to that point is available as `InMemoryLookupKB` from 3.5
|
||||||
|
onwards.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
## KnowledgeBase.\_\_init\_\_ {#init tag="method"}
|
## KnowledgeBase.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
||||||
Create the knowledge base.
|
`KnowledgeBase` is an abstract class and cannot be instantiated. Its child
|
||||||
|
classes should call `__init__()` to set up some necessary attributes.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> from spacy.kb import KnowledgeBase
|
> from spacy.kb import KnowledgeBase
|
||||||
|
> from spacy.vocab import Vocab
|
||||||
|
>
|
||||||
|
> class FullyImplementedKB(KnowledgeBase):
|
||||||
|
> def __init__(self, vocab: Vocab, entity_vector_length: int):
|
||||||
|
> super().__init__(vocab, entity_vector_length)
|
||||||
|
> ...
|
||||||
> vocab = nlp.vocab
|
> vocab = nlp.vocab
|
||||||
> kb = KnowledgeBase(vocab=vocab, entity_vector_length=64)
|
> kb = FullyImplementedKB(vocab=vocab, entity_vector_length=64)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
|
@ -40,133 +58,66 @@ The length of the fixed-size entity vectors in the knowledge base.
|
||||||
| ----------- | ------------------------------------------------ |
|
| ----------- | ------------------------------------------------ |
|
||||||
| **RETURNS** | Length of the fixed-size entity vectors. ~~int~~ |
|
| **RETURNS** | Length of the fixed-size entity vectors. ~~int~~ |
|
||||||
|
|
||||||
## KnowledgeBase.add_entity {#add_entity tag="method"}
|
## KnowledgeBase.get_candidates {#get_candidates tag="method"}
|
||||||
|
|
||||||
Add an entity to the knowledge base, specifying its corpus frequency and entity
|
Given a certain textual mention as input, retrieve a list of candidate entities
|
||||||
vector, which should be of length
|
of type [`Candidate`](/api/kb#candidate).
|
||||||
[`entity_vector_length`](/api/kb#entity_vector_length).
|
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> kb.add_entity(entity="Q42", freq=32, entity_vector=vector1)
|
> from spacy.lang.en import English
|
||||||
> kb.add_entity(entity="Q463035", freq=111, entity_vector=vector2)
|
> nlp = English()
|
||||||
|
> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
|
||||||
|
> candidates = kb.get_candidates(doc[0:2])
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| --------------- | ---------------------------------------------------------- |
|
| ----------- | -------------------------------------------------------------------- |
|
||||||
| `entity` | The unique entity identifier. ~~str~~ |
|
| `mention` | The textual mention or alias. ~~Span~~ |
|
||||||
| `freq` | The frequency of the entity in a typical corpus. ~~float~~ |
|
| **RETURNS** | An iterable of relevant `Candidate` objects. ~~Iterable[Candidate]~~ |
|
||||||
| `entity_vector` | The pretrained vector of the entity. ~~numpy.ndarray~~ |
|
|
||||||
|
|
||||||
## KnowledgeBase.set_entities {#set_entities tag="method"}
|
## KnowledgeBase.get_candidates_batch {#get_candidates_batch tag="method"}
|
||||||
|
|
||||||
Define the full list of entities in the knowledge base, specifying the corpus
|
Same as [`get_candidates()`](/api/kb#get_candidates), but for an arbitrary
|
||||||
frequency and entity vector for each entity.
|
number of mentions. The [`EntityLinker`](/api/entitylinker) component will call
|
||||||
|
`get_candidates_batch()` instead of `get_candidates()`, if the config parameter
|
||||||
|
`candidates_batch_size` is greater than or equal to 1.
|
||||||
|
|
||||||
|
The default implementation of `get_candidates_batch()` executes
|
||||||
|
`get_candidates()` in a loop. We recommend implementing a more efficient way to
|
||||||
|
retrieve candidates for multiple mentions at once, if performance is of concern
|
||||||
|
to you.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> kb.set_entities(entity_list=["Q42", "Q463035"], freq_list=[32, 111], vector_list=[vector1, vector2])
|
> from spacy.lang.en import English
|
||||||
|
> nlp = English()
|
||||||
|
> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
|
||||||
|
> candidates = kb.get_candidates_batch((doc[0:2], doc[3:]))
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------- | ---------------------------------------------------------------- |
|
| ----------- | -------------------------------------------------------------------------------------------- |
|
||||||
| `entity_list` | List of unique entity identifiers. ~~Iterable[Union[str, int]]~~ |
|
| `mentions` | The textual mentions or aliases. ~~Iterable[Span]~~ |
|
||||||
| `freq_list` | List of entity frequencies. ~~Iterable[int]~~ |
|
| **RETURNS** | An iterable of iterables with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
|
||||||
| `vector_list` | List of entity vectors. ~~Iterable[numpy.ndarray]~~ |
|
|
||||||
|
|
||||||
## KnowledgeBase.add_alias {#add_alias tag="method"}
|
|
||||||
|
|
||||||
Add an alias or mention to the knowledge base, specifying its potential KB
|
|
||||||
identifiers and their prior probabilities. The entity identifiers should refer
|
|
||||||
to entities previously added with [`add_entity`](/api/kb#add_entity) or
|
|
||||||
[`set_entities`](/api/kb#set_entities). The sum of the prior probabilities
|
|
||||||
should not exceed 1. Note that an empty string can not be used as alias.
|
|
||||||
|
|
||||||
> #### Example
|
|
||||||
>
|
|
||||||
> ```python
|
|
||||||
> kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3])
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Description |
|
|
||||||
| --------------- | --------------------------------------------------------------------------------- |
|
|
||||||
| `alias` | The textual mention or alias. Can not be the empty string. ~~str~~ |
|
|
||||||
| `entities` | The potential entities that the alias may refer to. ~~Iterable[Union[str, int]]~~ |
|
|
||||||
| `probabilities` | The prior probabilities of each entity. ~~Iterable[float]~~ |
|
|
||||||
|
|
||||||
## KnowledgeBase.\_\_len\_\_ {#len tag="method"}
|
|
||||||
|
|
||||||
Get the total number of entities in the knowledge base.
|
|
||||||
|
|
||||||
> #### Example
|
|
||||||
>
|
|
||||||
> ```python
|
|
||||||
> total_entities = len(kb)
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Description |
|
|
||||||
| ----------- | ----------------------------------------------------- |
|
|
||||||
| **RETURNS** | The number of entities in the knowledge base. ~~int~~ |
|
|
||||||
|
|
||||||
## KnowledgeBase.get_entity_strings {#get_entity_strings tag="method"}
|
|
||||||
|
|
||||||
Get a list of all entity IDs in the knowledge base.
|
|
||||||
|
|
||||||
> #### Example
|
|
||||||
>
|
|
||||||
> ```python
|
|
||||||
> all_entities = kb.get_entity_strings()
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Description |
|
|
||||||
| ----------- | --------------------------------------------------------- |
|
|
||||||
| **RETURNS** | The list of entities in the knowledge base. ~~List[str]~~ |
|
|
||||||
|
|
||||||
## KnowledgeBase.get_size_aliases {#get_size_aliases tag="method"}
|
|
||||||
|
|
||||||
Get the total number of aliases in the knowledge base.
|
|
||||||
|
|
||||||
> #### Example
|
|
||||||
>
|
|
||||||
> ```python
|
|
||||||
> total_aliases = kb.get_size_aliases()
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Description |
|
|
||||||
| ----------- | ---------------------------------------------------- |
|
|
||||||
| **RETURNS** | The number of aliases in the knowledge base. ~~int~~ |
|
|
||||||
|
|
||||||
## KnowledgeBase.get_alias_strings {#get_alias_strings tag="method"}
|
|
||||||
|
|
||||||
Get a list of all aliases in the knowledge base.
|
|
||||||
|
|
||||||
> #### Example
|
|
||||||
>
|
|
||||||
> ```python
|
|
||||||
> all_aliases = kb.get_alias_strings()
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Description |
|
|
||||||
| ----------- | -------------------------------------------------------- |
|
|
||||||
| **RETURNS** | The list of aliases in the knowledge base. ~~List[str]~~ |
|
|
||||||
|
|
||||||
## KnowledgeBase.get_alias_candidates {#get_alias_candidates tag="method"}
|
## KnowledgeBase.get_alias_candidates {#get_alias_candidates tag="method"}
|
||||||
|
|
||||||
Given a certain textual mention as input, retrieve a list of candidate entities
|
<Infobox variant="warning">
|
||||||
of type [`Candidate`](/api/kb/#candidate).
|
This method is _not_ available from spaCy 3.5 onwards.
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
> #### Example
|
From spaCy 3.5 onwards, `KnowledgeBase` is an abstract class (with
|
||||||
>
|
[`InMemoryLookupKB`](/api/kb_in_memory) being a drop-in replacement) to allow
|
||||||
> ```python
|
more flexibility in customizing knowledge bases. Some of its methods were moved
|
||||||
> candidates = kb.get_alias_candidates("Douglas")
|
to [`InMemoryLookupKB`](/api/kb_in_memory) during this refactoring, one of those
|
||||||
> ```
|
being `get_alias_candidates()`. This method is now available as
|
||||||
|
[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
|
||||||
| Name | Description |
|
Note: [`InMemoryLookupKB.get_candidates()`](/api/kb_in_memory#get_candidates)
|
||||||
| ----------- | ------------------------------------------------------------- |
|
defaults to
|
||||||
| `alias` | The textual mention or alias. ~~str~~ |
|
[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
|
||||||
| **RETURNS** | The list of relevant `Candidate` objects. ~~List[Candidate]~~ |
|
|
||||||
|
|
||||||
## KnowledgeBase.get_vector {#get_vector tag="method"}
|
## KnowledgeBase.get_vector {#get_vector tag="method"}
|
||||||
|
|
||||||
|
@ -178,27 +129,30 @@ Given a certain entity ID, retrieve its pretrained entity vector.
|
||||||
> vector = kb.get_vector("Q42")
|
> vector = kb.get_vector("Q42")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------- | ------------------------------------ |
|
| ----------- | -------------------------------------- |
|
||||||
| `entity` | The entity ID. ~~str~~ |
|
| `entity` | The entity ID. ~~str~~ |
|
||||||
| **RETURNS** | The entity vector. ~~numpy.ndarray~~ |
|
| **RETURNS** | The entity vector. ~~Iterable[float]~~ |
|
||||||
|
|
||||||
## KnowledgeBase.get_prior_prob {#get_prior_prob tag="method"}
|
## KnowledgeBase.get_vectors {#get_vectors tag="method"}
|
||||||
|
|
||||||
Given a certain entity ID and a certain textual mention, retrieve the prior
|
Same as [`get_vector()`](/api/kb#get_vector), but for an arbitrary number of
|
||||||
probability of the fact that the mention links to the entity ID.
|
entity IDs.
|
||||||
|
|
||||||
|
The default implementation of `get_vectors()` executes `get_vector()` in a loop.
|
||||||
|
We recommend implementing a more efficient way to retrieve vectors for multiple
|
||||||
|
entities at once, if performance is of concern to you.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> probability = kb.get_prior_prob("Q42", "Douglas")
|
> vectors = kb.get_vectors(("Q42", "Q3107329"))
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------- | ------------------------------------------------------------------------- |
|
| ----------- | --------------------------------------------------------- |
|
||||||
| `entity` | The entity ID. ~~str~~ |
|
| `entities` | The entity IDs. ~~Iterable[str]~~ |
|
||||||
| `alias` | The textual mention or alias. ~~str~~ |
|
| **RETURNS** | The entity vectors. ~~Iterable[Iterable[float]]~~ |
|
||||||
| **RETURNS** | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
|
|
||||||
|
|
||||||
## KnowledgeBase.to_disk {#to_disk tag="method"}
|
## KnowledgeBase.to_disk {#to_disk tag="method"}
|
||||||
|
|
||||||
|
@ -207,12 +161,13 @@ Save the current state of the knowledge base to a directory.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> kb.to_disk(loc)
|
> kb.to_disk(path)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
| --------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `loc` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||||
|
| `exclude` | List of components to exclude. ~~Iterable[str]~~ |
|
||||||
|
|
||||||
## KnowledgeBase.from_disk {#from_disk tag="method"}
|
## KnowledgeBase.from_disk {#from_disk tag="method"}
|
||||||
|
|
||||||
|
@ -222,16 +177,16 @@ Restore the state of the knowledge base from a given directory. Note that the
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> from spacy.kb import KnowledgeBase
|
|
||||||
> from spacy.vocab import Vocab
|
> from spacy.vocab import Vocab
|
||||||
> vocab = Vocab().from_disk("/path/to/vocab")
|
> vocab = Vocab().from_disk("/path/to/vocab")
|
||||||
> kb = KnowledgeBase(vocab=vocab, entity_vector_length=64)
|
> kb = FullyImplementedKB(vocab=vocab, entity_vector_length=64)
|
||||||
> kb.from_disk("/path/to/kb")
|
> kb.from_disk("/path/to/kb")
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------- | ----------------------------------------------------------------------------------------------- |
|
| ----------- | ----------------------------------------------------------------------------------------------- |
|
||||||
| `loc` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
| `loc` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||||
|
| `exclude` | List of components to exclude. ~~Iterable[str]~~ |
|
||||||
| **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~ |
|
| **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~ |
|
||||||
|
|
||||||
## Candidate {#candidate tag="class"}
|
## Candidate {#candidate tag="class"}
|
||||||
|
|
302
website/docs/api/kb_in_memory.md
Normal file
302
website/docs/api/kb_in_memory.md
Normal file
|
@ -0,0 +1,302 @@
|
||||||
|
---
|
||||||
|
title: InMemoryLookupKB
|
||||||
|
teaser:
|
||||||
|
The default implementation of the KnowledgeBase interface. Stores all
|
||||||
|
information in-memory.
|
||||||
|
tag: class
|
||||||
|
source: spacy/kb/kb_in_memory.pyx
|
||||||
|
new: 3.5
|
||||||
|
---
|
||||||
|
|
||||||
|
The `InMemoryLookupKB` class inherits from [`KnowledgeBase`](/api/kb) and
|
||||||
|
implements all of its methods. It stores all KB data in-memory and generates
|
||||||
|
[`Candidate`](/api/kb#candidate) objects by exactly matching mentions with
|
||||||
|
entity names. It's highly optimized for both a low memory footprint and speed of
|
||||||
|
retrieval.
|
||||||
|
|
||||||
|
## InMemoryLookupKB.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
||||||
|
Create the knowledge base.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> from spacy.kb import InMemoryLookupKB
|
||||||
|
> vocab = nlp.vocab
|
||||||
|
> kb = InMemoryLookupKB(vocab=vocab, entity_vector_length=64)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ---------------------- | ------------------------------------------------ |
|
||||||
|
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||||
|
| `entity_vector_length` | Length of the fixed-size entity vectors. ~~int~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.entity_vector_length {#entity_vector_length tag="property"}
|
||||||
|
|
||||||
|
The length of the fixed-size entity vectors in the knowledge base.
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ------------------------------------------------ |
|
||||||
|
| **RETURNS** | Length of the fixed-size entity vectors. ~~int~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.add_entity {#add_entity tag="method"}
|
||||||
|
|
||||||
|
Add an entity to the knowledge base, specifying its corpus frequency and entity
|
||||||
|
vector, which should be of length
|
||||||
|
[`entity_vector_length`](/api/kb_in_memory#entity_vector_length).
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> kb.add_entity(entity="Q42", freq=32, entity_vector=vector1)
|
||||||
|
> kb.add_entity(entity="Q463035", freq=111, entity_vector=vector2)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| --------------- | ---------------------------------------------------------- |
|
||||||
|
| `entity` | The unique entity identifier. ~~str~~ |
|
||||||
|
| `freq` | The frequency of the entity in a typical corpus. ~~float~~ |
|
||||||
|
| `entity_vector` | The pretrained vector of the entity. ~~numpy.ndarray~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.set_entities {#set_entities tag="method"}
|
||||||
|
|
||||||
|
Define the full list of entities in the knowledge base, specifying the corpus
|
||||||
|
frequency and entity vector for each entity.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> kb.set_entities(entity_list=["Q42", "Q463035"], freq_list=[32, 111], vector_list=[vector1, vector2])
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ------------- | ---------------------------------------------------------------- |
|
||||||
|
| `entity_list` | List of unique entity identifiers. ~~Iterable[Union[str, int]]~~ |
|
||||||
|
| `freq_list` | List of entity frequencies. ~~Iterable[int]~~ |
|
||||||
|
| `vector_list` | List of entity vectors. ~~Iterable[numpy.ndarray]~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.add_alias {#add_alias tag="method"}
|
||||||
|
|
||||||
|
Add an alias or mention to the knowledge base, specifying its potential KB
|
||||||
|
identifiers and their prior probabilities. The entity identifiers should refer
|
||||||
|
to entities previously added with [`add_entity`](/api/kb_in_memory#add_entity)
|
||||||
|
or [`set_entities`](/api/kb_in_memory#set_entities). The sum of the prior
|
||||||
|
probabilities should not exceed 1. Note that an empty string cannot be used as an
|
||||||
|
alias.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3])
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| --------------- | --------------------------------------------------------------------------------- |
|
||||||
|
| `alias` | The textual mention or alias. Can not be the empty string. ~~str~~ |
|
||||||
|
| `entities` | The potential entities that the alias may refer to. ~~Iterable[Union[str, int]]~~ |
|
||||||
|
| `probabilities` | The prior probabilities of each entity. ~~Iterable[float]~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.\_\_len\_\_ {#len tag="method"}
|
||||||
|
|
||||||
|
Get the total number of entities in the knowledge base.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> total_entities = len(kb)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ----------------------------------------------------- |
|
||||||
|
| **RETURNS** | The number of entities in the knowledge base. ~~int~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.get_entity_strings {#get_entity_strings tag="method"}
|
||||||
|
|
||||||
|
Get a list of all entity IDs in the knowledge base.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> all_entities = kb.get_entity_strings()
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | --------------------------------------------------------- |
|
||||||
|
| **RETURNS** | The list of entities in the knowledge base. ~~List[str]~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.get_size_aliases {#get_size_aliases tag="method"}
|
||||||
|
|
||||||
|
Get the total number of aliases in the knowledge base.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> total_aliases = kb.get_size_aliases()
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ---------------------------------------------------- |
|
||||||
|
| **RETURNS** | The number of aliases in the knowledge base. ~~int~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.get_alias_strings {#get_alias_strings tag="method"}
|
||||||
|
|
||||||
|
Get a list of all aliases in the knowledge base.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> all_aliases = kb.get_alias_strings()
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | -------------------------------------------------------- |
|
||||||
|
| **RETURNS** | The list of aliases in the knowledge base. ~~List[str]~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.get_candidates {#get_candidates tag="method"}
|
||||||
|
|
||||||
|
Given a certain textual mention as input, retrieve a list of candidate entities
|
||||||
|
of type [`Candidate`](/api/kb#candidate). Wraps
|
||||||
|
[`get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> from spacy.lang.en import English
|
||||||
|
> nlp = English()
|
||||||
|
> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
|
||||||
|
> candidates = kb.get_candidates(doc[0:2])
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | -------------------------------------------------------------------- |
|
||||||
|
| `mention` | The textual mention or alias. ~~Span~~ |
|
||||||
|
| **RETURNS** | An iterable of relevant `Candidate` objects. ~~Iterable[Candidate]~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.get_candidates_batch {#get_candidates_batch tag="method"}
|
||||||
|
|
||||||
|
Same as [`get_candidates()`](/api/kb_in_memory#get_candidates), but for an
|
||||||
|
arbitrary number of mentions. The [`EntityLinker`](/api/entitylinker) component
|
||||||
|
will call `get_candidates_batch()` instead of `get_candidates()`, if the config
|
||||||
|
parameter `candidates_batch_size` is greater than or equal to 1.
|
||||||
|
|
||||||
|
The default implementation of `get_candidates_batch()` executes
|
||||||
|
`get_candidates()` in a loop. We recommend implementing a more efficient way to
|
||||||
|
retrieve candidates for multiple mentions at once, if performance is of concern
|
||||||
|
to you.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> from spacy.lang.en import English
|
||||||
|
> nlp = English()
|
||||||
|
> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
|
||||||
|
> candidates = kb.get_candidates_batch((doc[0:2], doc[3:]))
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | -------------------------------------------------------------------------------------------- |
|
||||||
|
| `mentions` | The textual mentions or aliases. ~~Iterable[Span]~~ |
|
||||||
|
| **RETURNS** | An iterable of iterables with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.get_alias_candidates {#get_alias_candidates tag="method"}
|
||||||
|
|
||||||
|
Given a certain textual mention as input, retrieve a list of candidate entities
|
||||||
|
of type [`Candidate`](/api/kb#candidate).
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> candidates = kb.get_alias_candidates("Douglas")
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ------------------------------------------------------------- |
|
||||||
|
| `alias` | The textual mention or alias. ~~str~~ |
|
||||||
|
| **RETURNS** | The list of relevant `Candidate` objects. ~~List[Candidate]~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.get_vector {#get_vector tag="method"}
|
||||||
|
|
||||||
|
Given a certain entity ID, retrieve its pretrained entity vector.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> vector = kb.get_vector("Q42")
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ------------------------------------ |
|
||||||
|
| `entity` | The entity ID. ~~str~~ |
|
||||||
|
| **RETURNS** | The entity vector. ~~numpy.ndarray~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.get_vectors {#get_vectors tag="method"}
|
||||||
|
|
||||||
|
Same as [`get_vector()`](/api/kb_in_memory#get_vector), but for an arbitrary
|
||||||
|
number of entity IDs.
|
||||||
|
|
||||||
|
The default implementation of `get_vectors()` executes `get_vector()` in a loop.
|
||||||
|
We recommend implementing a more efficient way to retrieve vectors for multiple
|
||||||
|
entities at once, if performance is of concern to you.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> vectors = kb.get_vectors(("Q42", "Q3107329"))
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | --------------------------------------------------------- |
|
||||||
|
| `entities` | The entity IDs. ~~Iterable[str]~~ |
|
||||||
|
| **RETURNS** | The entity vectors. ~~Iterable[numpy.ndarray]~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.get_prior_prob {#get_prior_prob tag="method"}
|
||||||
|
|
||||||
|
Given a certain entity ID and a certain textual mention, retrieve the prior
|
||||||
|
probability of the fact that the mention links to the entity ID.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> probability = kb.get_prior_prob("Q42", "Douglas")
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ------------------------------------------------------------------------- |
|
||||||
|
| `entity` | The entity ID. ~~str~~ |
|
||||||
|
| `alias` | The textual mention or alias. ~~str~~ |
|
||||||
|
| **RETURNS** | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.to_disk {#to_disk tag="method"}
|
||||||
|
|
||||||
|
Save the current state of the knowledge base to a directory.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> kb.to_disk(path)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| --------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
|
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||||
|
| `exclude` | List of components to exclude. ~~Iterable[str]~~ |
|
||||||
|
|
||||||
|
## InMemoryLookupKB.from_disk {#from_disk tag="method"}
|
||||||
|
|
||||||
|
Restore the state of the knowledge base from a given directory. Note that the
|
||||||
|
[`Vocab`](/api/vocab) should also be the same as the one used to create the KB.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> from spacy.vocab import Vocab
|
||||||
|
> vocab = Vocab().from_disk("/path/to/vocab")
|
||||||
|
> kb = InMemoryLookupKB(vocab=vocab, entity_vector_length=64)
|
||||||
|
> kb.from_disk("/path/to/kb")
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ----------------------------------------------------------------------------------------------- |
|
||||||
|
| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||||
|
| `exclude` | List of components to exclude. ~~Iterable[str]~~ |
|
||||||
|
| **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~ |
|
|
@ -78,7 +78,9 @@ operates on a `Doc` and gives you access to the matched tokens **in context**.
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------------------------------------------ | -------------------------------------------------------------------------------------------------- |
|
| ------------------------------------------------ | -------------------------------------------------------------------------------------------------- |
|
||||||
| [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. |
|
| [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. |
|
||||||
| [`KnowledgeBase`](/api/kb) | Storage for entities and aliases of a knowledge base for entity linking. |
|
| [`KnowledgeBase`](/api/kb) | Abstract base class for storage and retrieval of data for entity linking. |
|
||||||
|
| [`InMemoryLookupKB`](/api/kb_in_memory) | Implementation of `KnowledgeBase` storing all data in memory. |
|
||||||
|
| [`Candidate`](/api/kb#candidate) | Object associating a textual mention with a specific entity contained in a `KnowledgeBase`. |
|
||||||
| [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. |
|
| [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. |
|
||||||
| [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis. |
|
| [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis. |
|
||||||
| [`Morphology`](/api/morphology) | Store morphological analyses and map them to and from hash values. |
|
| [`Morphology`](/api/morphology) | Store morphological analyses and map them to and from hash values. |
|
||||||
|
|
Loading…
Reference in New Issue
Block a user