mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 13:41:21 +03:00 
			
		
		
		
	* Add implementation of batching + backwards compatibility fixes. Tests indicate issue with batch disambiguation for custom singular entity lookups. * Fix tests. Add distinction w.r.t. batch size. * Remove redundant and add new comments. * Adjust comments. Fix variable naming in EL prediction. * Fix mypy errors. * Remove KB entity type config option. Change return types of candidate retrieval functions to Iterable from Iterator. Fix various other issues. * Update spacy/pipeline/entity_linker.py Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Update spacy/pipeline/entity_linker.py Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Update spacy/kb_base.pyx Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Update spacy/kb_base.pyx Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Update spacy/pipeline/entity_linker.py Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Add error messages to NotImplementedErrors. Remove redundant comment. * Fix imports. * Remove redundant comments. * Rename KnowledgeBase to InMemoryLookupKB and BaseKnowledgeBase to KnowledgeBase. * Fix tests. * Update spacy/errors.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Move KB into subdirectory. * Adjust imports after KB move to dedicated subdirectory. * Fix config imports. * Move Candidate + retrieval functions to separate module. Fix other, small issues. * Fix docstrings and error message w.r.t. class names. Fix typing for candidate retrieval functions. * Update spacy/kb/kb_in_memory.pyx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/ml/models/entity_linker.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Fix typing. * Change typing of mentions to be Span instead of Union[Span, str]. * Update docs. * Update EntityLinker and _architecture docs. 
* Update website/docs/api/entitylinker.md Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Adjust message for E1046. * Re-add section for Candidate in kb.md, add reference to dedicated page. * Update docs and docstrings. * Re-add section + reference for KnowledgeBase.get_alias_candidates() in docs. * Update spacy/kb/candidate.pyx * Update spacy/kb/kb_in_memory.pyx * Update spacy/pipeline/legacy/entity_linker.py * Remove canididate.md. Remove mistakenly added config snippet in entity_linker.py. Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
		
			
				
	
	
		
			109 lines
		
	
	
		
			4.4 KiB
		
	
	
	
		
			Cython
		
	
	
	
	
	
			
		
		
	
	
			109 lines
		
	
	
		
			4.4 KiB
		
	
	
	
		
			Cython
		
	
	
	
	
	
| # cython: infer_types=True, profile=True
 | |
| 
 | |
| from pathlib import Path
 | |
| from typing import Iterable, Tuple, Union
 | |
| from cymem.cymem cimport Pool
 | |
| 
 | |
| from .candidate import Candidate
 | |
| from ..tokens import Span
 | |
| from ..util import SimpleFrozenList
 | |
| from ..errors import Errors
 | |
| 
 | |
| 
 | |
cdef class KnowledgeBase:
    """A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
    to support entity linking of named entities to real-world concepts.
    This is an abstract class and requires its operations to be implemented.

    Subclasses are expected to override `get_candidates`, `get_vector`, `to_bytes`, `from_bytes`,
    `to_disk` and `from_disk`. The batch variants (`get_candidates_batch`, `get_vectors`) have
    default implementations that delegate to their single-item counterparts.

    DOCS: https://spacy.io/api/kb
    """

    def __init__(self, vocab: Vocab, entity_vector_length: int):
        """Create a KnowledgeBase.

        vocab (Vocab): Stored on the instance as `vocab`.
        entity_vector_length (int): Stored on the instance as `entity_vector_length`.
        RAISES (TypeError): If `KnowledgeBase` itself (rather than a subclass) is instantiated.
        """
        # Make sure abstract KB is not instantiated.
        if type(self) is KnowledgeBase:
            raise TypeError(
                Errors.E1046.format(cls_name=type(self).__name__)
            )

        self.vocab = vocab
        self.entity_vector_length = entity_vector_length
        self.mem = Pool()

    def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
        """
        Return candidate entities for specified texts. Each candidate defines the entity, the original alias,
        and the prior probability of that alias resolving to that entity.
        If no candidate is found for a given text, an empty list is returned.
        The default implementation calls `get_candidates` once per mention, in order.
        mentions (Iterable[Span]): Mentions for which to get candidates.
        RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
        """
        return [self.get_candidates(span) for span in mentions]

    def get_candidates(self, mention: Span) -> Iterable[Candidate]:
        """
        Return candidate entities for specified text. Each candidate defines the entity, the original alias,
        and the prior probability of that alias resolving to that entity.
        If no candidate is found for a given text, an empty list is returned.
        mention (Span): Mention for which to get candidates.
        RETURNS (Iterable[Candidate]): Identified candidates.
        RAISES (NotImplementedError): Always, unless overridden by a subclass.
        """
        # `__name__` lives on the type, not on instances, so it has to be read via type(self).
        raise NotImplementedError(
            Errors.E1045.format(
                parent="KnowledgeBase", method="get_candidates", name=type(self).__name__
            )
        )

    def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
        """
        Return vectors for entities.
        The default implementation calls `get_vector` once per entity, in order.
        entities (Iterable[str]): Entity names/IDs.
        RETURNS (Iterable[Iterable[float]]): Vectors for specified entities.
        """
        return [self.get_vector(entity) for entity in entities]

    def get_vector(self, str entity) -> Iterable[float]:
        """
        Return vector for entity.
        entity (str): Entity name/ID.
        RETURNS (Iterable[float]): Vector for specified entity.
        RAISES (NotImplementedError): Always, unless overridden by a subclass.
        """
        raise NotImplementedError(
            Errors.E1045.format(
                parent="KnowledgeBase", method="get_vector", name=type(self).__name__
            )
        )

    def to_bytes(self, **kwargs) -> bytes:
        """Serialize the current state to a binary string.
        RETURNS (bytes): Current state as binary string.
        RAISES (NotImplementedError): Always, unless overridden by a subclass.
        """
        raise NotImplementedError(
            Errors.E1045.format(
                parent="KnowledgeBase", method="to_bytes", name=type(self).__name__
            )
        )

    def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()):
        """Load state from a binary string.
        bytes_data (bytes): KB state.
        exclude (Tuple[str]): Properties to exclude when restoring KB.
        RAISES (NotImplementedError): Always, unless overridden by a subclass.
        """
        raise NotImplementedError(
            Errors.E1045.format(
                parent="KnowledgeBase", method="from_bytes", name=type(self).__name__
            )
        )

    def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
        """
        Write KnowledgeBase content to disk.
        path (Union[str, Path]): Target file path.
        exclude (Iterable[str]): List of components to exclude.
        RAISES (NotImplementedError): Always, unless overridden by a subclass.
        """
        raise NotImplementedError(
            Errors.E1045.format(
                parent="KnowledgeBase", method="to_disk", name=type(self).__name__
            )
        )

    def from_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
        """
        Load KnowledgeBase content from disk.
        path (Union[str, Path]): Target file path.
        exclude (Iterable[str]): List of components to exclude.
        RAISES (NotImplementedError): Always, unless overridden by a subclass.
        """
        raise NotImplementedError(
            Errors.E1045.format(
                parent="KnowledgeBase", method="from_disk", name=type(self).__name__
            )
        )
 |