mirror of https://github.com/explosion/spaCy.git, synced 2025-10-31 07:57:35 +03:00

* unit test for pickling KB
* add pickling test for NEL
* KB to_bytes and from_bytes
* NEL to_bytes and from_bytes
* xfail pickle tests for now
* fix docs
* cleanup
716 lines · 28 KiB · Cython
# cython: infer_types=True, profile=True
from typing import Iterator, Iterable

import srsly
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from cpython.exc cimport PyErr_SetFromErrno
from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
from libc.stdint cimport int32_t, int64_t
from libcpp.vector cimport vector

from pathlib import Path
import warnings

from .typedefs cimport hash_t
from .errors import Errors, Warnings
from . import util
from .util import SimpleFrozenList, ensure_path

cdef class Candidate:
    """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
    to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
    algorithm which will disambiguate the various candidates to the correct one.
    Each candidate (alias, entity) pair is assigned a prior probability.

    DOCS: https://spacy.io/api/kb/#candidate_init
    """

    def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
        self.kb = kb
        self.entity_hash = entity_hash
        self.entity_freq = entity_freq
        self.entity_vector = entity_vector
        self.alias_hash = alias_hash
        self.prior_prob = prior_prob

    @property
    def entity(self):
        """RETURNS (uint64): hash of the entity's KB ID/name"""
        return self.entity_hash

    @property
    def entity_(self):
        """RETURNS (str): ID/name of this entity in the KB"""
        return self.kb.vocab.strings[self.entity_hash]

    @property
    def alias(self):
        """RETURNS (uint64): hash of the alias"""
        return self.alias_hash

    @property
    def alias_(self):
        """RETURNS (str): ID of the original alias"""
        return self.kb.vocab.strings[self.alias_hash]

    @property
    def entity_freq(self):
        return self.entity_freq

    @property
    def entity_vector(self):
        return self.entity_vector

    @property
    def prior_prob(self):
        return self.prior_prob


def get_candidates(KnowledgeBase kb, span) -> Iterator[Candidate]:
    """
    Return candidate entities for a given span by using the text of the span as the alias
    and fetching appropriate entries from the index.
    This particular function is optimized to work with the built-in KB functionality,
    but any other custom candidate generation method can be used in combination with the KB as well.
    """
    return kb.get_alias_candidates(span.text)
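
# Illustrative usage sketch (not part of this module): populating a small KB and
# fetching candidates for a span. The entity ID "Q42", the alias "Douglas Adams",
# and the numbers are made-up example values:
#
#     import spacy
#     from spacy.kb import KnowledgeBase, get_candidates
#
#     nlp = spacy.blank("en")
#     kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
#     kb.add_entity("Q42", freq=12.0, entity_vector=[1.0, 2.0, 3.0])
#     kb.add_alias("Douglas Adams", entities=["Q42"], probabilities=[0.8])
#
#     doc = nlp("Douglas Adams wrote it.")
#     for candidate in get_candidates(kb, doc[0:2]):  # span text "Douglas Adams"
#         print(candidate.entity_, candidate.prior_prob)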


cdef class KnowledgeBase:
    """A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
    to support entity linking of named entities to real-world concepts.

    DOCS: https://spacy.io/api/kb
    """

    def __init__(self, Vocab vocab, entity_vector_length):
        """Create a KnowledgeBase."""
        self.mem = Pool()
        self.entity_vector_length = entity_vector_length
        self._entry_index = PreshMap()
        self._alias_index = PreshMap()
        self.vocab = vocab
        self._create_empty_vectors(dummy_hash=self.vocab.strings[""])

    def initialize_entities(self, int64_t nr_entities):
        self._entry_index = PreshMap(nr_entities + 1)
        self._entries = entry_vec(nr_entities + 1)
        self._vectors_table = float_matrix(nr_entities + 1)

    def initialize_aliases(self, int64_t nr_aliases):
        self._alias_index = PreshMap(nr_aliases + 1)
        self._aliases_table = alias_vec(nr_aliases + 1)

    @property
    def entity_vector_length(self):
        """RETURNS (uint64): length of the entity vectors"""
        return self.entity_vector_length

    def __len__(self):
        return self.get_size_entities()

    def get_size_entities(self):
        return len(self._entry_index)

    def get_entity_strings(self):
        return [self.vocab.strings[x] for x in self._entry_index]

    def get_size_aliases(self):
        return len(self._alias_index)

    def get_alias_strings(self):
        return [self.vocab.strings[x] for x in self._alias_index]

    def add_entity(self, unicode entity, float freq, vector[float] entity_vector):
        """
        Add an entity to the KB, optionally specifying its log probability based on corpus frequency.
        Return the hash of the entity ID/name at the end.
        """
        cdef hash_t entity_hash = self.vocab.strings.add(entity)

        # Return if this entity was added before
        if entity_hash in self._entry_index:
            warnings.warn(Warnings.W018.format(entity=entity))
            return

        # Raise an error if the provided entity vector is not of the correct length
        if len(entity_vector) != self.entity_vector_length:
            raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))

        vector_index = self.c_add_vector(entity_vector=entity_vector)

        new_index = self.c_add_entity(entity_hash=entity_hash,
                                      freq=freq,
                                      vector_index=vector_index,
                                      feats_row=-1)  # Features table currently not implemented
        self._entry_index[entity_hash] = new_index

        return entity_hash
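
    # Illustrative sketch (not part of this module): the entity vector must match
    # entity_vector_length, and freq is typically a corpus-derived count. "Q1" is
    # a made-up ID:
    #
    #     kb = KnowledgeBase(vocab, entity_vector_length=3)
    #     entity_hash = kb.add_entity("Q1", freq=5.0, entity_vector=[0.1, 0.2, 0.3])
    #     assert kb.contains_entity("Q1")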

    cpdef set_entities(self, entity_list, freq_list, vector_list):
        if len(entity_list) != len(freq_list) or len(entity_list) != len(vector_list):
            raise ValueError(Errors.E140)

        nr_entities = len(set(entity_list))
        self.initialize_entities(nr_entities)

        i = 0
        cdef KBEntryC entry
        cdef hash_t entity_hash
        while i < len(entity_list):
            # only process this entity if its unique ID hasn't been added before
            entity_hash = self.vocab.strings.add(entity_list[i])
            if entity_hash in self._entry_index:
                warnings.warn(Warnings.W018.format(entity=entity_list[i]))

            else:
                entity_vector = vector_list[i]
                if len(entity_vector) != self.entity_vector_length:
                    raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))

                entry.entity_hash = entity_hash
                entry.freq = freq_list[i]

                vector_index = self.c_add_vector(entity_vector=vector_list[i])
                entry.vector_index = vector_index

                entry.feats_row = -1   # Features table currently not implemented

                self._entries[i+1] = entry
                self._entry_index[entity_hash] = i+1

            i += 1

    def contains_entity(self, unicode entity):
        cdef hash_t entity_hash = self.vocab.strings.add(entity)
        return entity_hash in self._entry_index

    def contains_alias(self, unicode alias):
        cdef hash_t alias_hash = self.vocab.strings.add(alias)
        return alias_hash in self._alias_index

    def add_alias(self, unicode alias, entities, probabilities):
        """
        For a given alias, add its potential entities and prior probabilities to the KB.
        Return the alias_hash at the end.
        """
        if alias is None or len(alias) == 0:
            raise ValueError(Errors.E890.format(alias=alias))

        previous_alias_nr = self.get_size_aliases()
        # Throw an error if the length of entities and probabilities are not the same
        if not len(entities) == len(probabilities):
            raise ValueError(Errors.E132.format(alias=alias,
                                                entities_length=len(entities),
                                                probabilities_length=len(probabilities)))

        # Throw an error if the probabilities sum up to more than 1 (allow for some rounding errors)
        prob_sum = sum(probabilities)
        if prob_sum > 1.00001:
            raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum))

        cdef hash_t alias_hash = self.vocab.strings.add(alias)

        # Check whether this alias was added before
        if alias_hash in self._alias_index:
            warnings.warn(Warnings.W017.format(alias=alias))
            return

        cdef vector[int64_t] entry_indices
        cdef vector[float] probs

        for entity, prob in zip(entities, probabilities):
            entity_hash = self.vocab.strings[entity]
            if not entity_hash in self._entry_index:
                raise ValueError(Errors.E134.format(entity=entity))

            entry_index = <int64_t>self._entry_index.get(entity_hash)
            entry_indices.push_back(int(entry_index))
            probs.push_back(float(prob))

        new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
        self._alias_index[alias_hash] = new_index

        if previous_alias_nr + 1 != self.get_size_aliases():
            raise RuntimeError(Errors.E891.format(alias=alias))
        return alias_hash
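
    # Illustrative sketch (not part of this module): prior probabilities for one
    # alias must sum to at most 1.0, and every entity must already be in the KB.
    # "Q1" and "Q2" are made-up IDs:
    #
    #     kb.add_entity("Q2", freq=2.0, entity_vector=[0.4, 0.5, 0.6])
    #     kb.add_alias("apple", entities=["Q1", "Q2"], probabilities=[0.7, 0.2])
    #     assert kb.contains_alias("apple")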

    def append_alias(self, unicode alias, unicode entity, float prior_prob, ignore_warnings=False):
        """
        For an alias already existing in the KB, extend its potential entities with one more.
        Throw a warning if either the alias or the entity is unknown,
        or when the combination was already recorded previously.
        Throw an error if adding this entity's prior probability would push the sum over 1.
        For efficiency, it's best to use the method `add_alias` as much as possible instead of this one.
        """
        # Check if the alias exists in the KB
        cdef hash_t alias_hash = self.vocab.strings[alias]
        if not alias_hash in self._alias_index:
            raise ValueError(Errors.E176.format(alias=alias))

        # Check if the entity exists in the KB
        cdef hash_t entity_hash = self.vocab.strings[entity]
        if not entity_hash in self._entry_index:
            raise ValueError(Errors.E134.format(entity=entity))
        entry_index = <int64_t>self._entry_index.get(entity_hash)

        # Throw an error if the prior probabilities (including the new one) sum up to more than 1
        alias_index = <int64_t>self._alias_index.get(alias_hash)
        alias_entry = self._aliases_table[alias_index]
        current_sum = sum([p for p in alias_entry.probs])
        new_sum = current_sum + prior_prob

        if new_sum > 1.00001:
            raise ValueError(Errors.E133.format(alias=alias, sum=new_sum))

        entry_indices = alias_entry.entry_indices

        is_present = False
        for i in range(entry_indices.size()):
            if entry_indices[i] == int(entry_index):
                is_present = True

        if is_present:
            if not ignore_warnings:
                warnings.warn(Warnings.W024.format(entity=entity, alias=alias))
        else:
            entry_indices.push_back(int(entry_index))
            alias_entry.entry_indices = entry_indices

            probs = alias_entry.probs
            probs.push_back(float(prior_prob))
            alias_entry.probs = probs
            self._aliases_table[alias_index] = alias_entry
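
    # Illustrative sketch (not part of this module): extending an existing alias
    # with one more entity while keeping the probability sum at most 1.0. "Q3" is
    # a made-up ID:
    #
    #     kb.add_entity("Q3", freq=1.0, entity_vector=[0.0, 0.0, 1.0])
    #     kb.append_alias(alias="apple", entity="Q3", prior_prob=0.05)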

    def get_alias_candidates(self, unicode alias) -> Iterator[Candidate]:
        """
        Return candidate entities for an alias. Each candidate defines the entity, the original alias,
        and the prior probability of that alias resolving to that entity.
        If the alias is not known in the KB, an empty list is returned.
        """
        cdef hash_t alias_hash = self.vocab.strings[alias]
        if not alias_hash in self._alias_index:
            return []
        alias_index = <int64_t>self._alias_index.get(alias_hash)
        alias_entry = self._aliases_table[alias_index]

        return [Candidate(kb=self,
                          entity_hash=self._entries[entry_index].entity_hash,
                          entity_freq=self._entries[entry_index].freq,
                          entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
                          alias_hash=alias_hash,
                          prior_prob=prior_prob)
                for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
                if entry_index != 0]
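
    # Illustrative sketch (not part of this module): inspecting the candidates
    # returned for a known alias, continuing the made-up "apple" example:
    #
    #     for candidate in kb.get_alias_candidates("apple"):
    #         print(candidate.entity_, candidate.entity_freq, candidate.prior_prob)
    #     assert kb.get_alias_candidates("unknown alias") == []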

    def get_vector(self, unicode entity):
        cdef hash_t entity_hash = self.vocab.strings[entity]

        # Return a zero vector if this entity is unknown in this KB
        if entity_hash not in self._entry_index:
            return [0] * self.entity_vector_length
        entry_index = self._entry_index[entity_hash]

        return self._vectors_table[self._entries[entry_index].vector_index]

    def get_prior_prob(self, unicode entity, unicode alias):
        """ Return the prior probability of a given alias being linked to a given entity,
        or return 0.0 when this combination is not known in the knowledge base"""
        cdef hash_t alias_hash = self.vocab.strings[alias]
        cdef hash_t entity_hash = self.vocab.strings[entity]

        if entity_hash not in self._entry_index or alias_hash not in self._alias_index:
            return 0.0

        alias_index = <int64_t>self._alias_index.get(alias_hash)
        entry_index = self._entry_index[entity_hash]

        alias_entry = self._aliases_table[alias_index]
        for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs):
            if self._entries[entry_index].entity_hash == entity_hash:
                return prior_prob

        return 0.0
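
    # Illustrative sketch (not part of this module): looking up vectors and prior
    # probabilities, continuing the made-up "apple"/"Q1" example:
    #
    #     vec = kb.get_vector("Q1")             # zero vector if "Q1" is unknown
    #     p = kb.get_prior_prob("Q1", "apple")  # 0.0 if the pair is unknown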

    def to_bytes(self, **kwargs):
        """Serialize the current state to a binary string.
        """
        def serialize_header():
            header = (self.get_size_entities(), self.get_size_aliases(), self.entity_vector_length)
            return srsly.json_dumps(header)

        def serialize_entries():
            i = 1
            tuples = []
            for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
                entry = self._entries[entry_index]
                assert entry.entity_hash == entry_hash
                assert entry_index == i
                tuples.append((entry.entity_hash, entry.freq, entry.vector_index))
                i = i + 1
            return srsly.json_dumps(tuples)

        def serialize_aliases():
            i = 1
            headers = []
            indices_lists = []
            probs_lists = []
            for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
                alias = self._aliases_table[alias_index]
                assert alias_index == i
                candidate_length = len(alias.entry_indices)
                headers.append((alias_hash, candidate_length))
                indices_lists.append(alias.entry_indices)
                probs_lists.append(alias.probs)
                i = i + 1
            headers_dump = srsly.json_dumps(headers)
            indices_dump = srsly.json_dumps(indices_lists)
            probs_dump = srsly.json_dumps(probs_lists)
            return srsly.json_dumps((headers_dump, indices_dump, probs_dump))

        serializers = {
            "header": serialize_header,
            "entity_vectors": lambda: srsly.json_dumps(self._vectors_table),
            "entries": serialize_entries,
            "aliases": serialize_aliases,
        }
        return util.to_bytes(serializers, [])

    def from_bytes(self, bytes_data, *, exclude=tuple()):
        """Load state from a binary string.
        """
        def deserialize_header(b):
            header = srsly.json_loads(b)
            nr_entities = header[0]
            nr_aliases = header[1]
            entity_vector_length = header[2]
            self.initialize_entities(nr_entities)
            self.initialize_aliases(nr_aliases)
            self.entity_vector_length = entity_vector_length

        def deserialize_vectors(b):
            self._vectors_table = srsly.json_loads(b)

        def deserialize_entries(b):
            cdef KBEntryC entry
            tuples = srsly.json_loads(b)
            i = 1
            for (entity_hash, freq, vector_index) in tuples:
                entry.entity_hash = entity_hash
                entry.freq = freq
                entry.vector_index = vector_index
                entry.feats_row = -1  # Features table currently not implemented
                self._entries[i] = entry
                self._entry_index[entity_hash] = i
                i += 1

        def deserialize_aliases(b):
            cdef AliasC alias
            i = 1
            all_data = srsly.json_loads(b)
            headers = srsly.json_loads(all_data[0])
            indices_lists = srsly.json_loads(all_data[1])
            probs_lists = srsly.json_loads(all_data[2])
            for header, entry_indices, probs in zip(headers, indices_lists, probs_lists):
                alias_hash, candidate_length = header
                alias.entry_indices = entry_indices
                alias.probs = probs
                self._aliases_table[i] = alias
                self._alias_index[alias_hash] = i
                i += 1

        setters = {
            "header": deserialize_header,
            "entity_vectors": deserialize_vectors,
            "entries": deserialize_entries,
            "aliases": deserialize_aliases,
        }
        util.from_bytes(bytes_data, setters, exclude)
        return self
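
    # Illustrative sketch (not part of this module): a bytes round-trip. The new
    # KB should share a compatible vocab and the same entity_vector_length:
    #
    #     kb_bytes = kb.to_bytes()
    #     kb2 = KnowledgeBase(vocab, entity_vector_length=3)
    #     kb2.from_bytes(kb_bytes)
    #     assert kb2.get_size_entities() == kb.get_size_entities()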

    def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
        path = ensure_path(path)
        if not path.exists():
            path.mkdir(parents=True)
        if not path.is_dir():
            raise ValueError(Errors.E928.format(loc=path))
        serialize = {}
        serialize["contents"] = lambda p: self.write_contents(p)
        serialize["strings.json"] = lambda p: self.vocab.strings.to_disk(p)
        util.to_disk(path, serialize, exclude)

    def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
        path = ensure_path(path)
        if not path.exists():
            raise ValueError(Errors.E929.format(loc=path))
        if not path.is_dir():
            raise ValueError(Errors.E928.format(loc=path))
        deserialize = {}
        deserialize["contents"] = lambda p: self.read_contents(p)
        deserialize["strings.json"] = lambda p: self.vocab.strings.from_disk(p)
        util.from_disk(path, deserialize, exclude)
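
    # Illustrative sketch (not part of this module): a disk round-trip. The path
    # ("/tmp/kb" here is a made-up location) becomes a directory holding
    # "contents" and "strings.json":
    #
    #     kb.to_disk("/tmp/kb")
    #     kb2 = KnowledgeBase(vocab, entity_vector_length=3)
    #     kb2.from_disk("/tmp/kb")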

    def write_contents(self, file_path):
        cdef Writer writer = Writer(file_path)
        writer.write_header(self.get_size_entities(), self.entity_vector_length)

        # dumping the entity vectors in their original order
        i = 0
        for entity_vector in self._vectors_table:
            for element in entity_vector:
                writer.write_vector_element(element)
            i = i+1

        # dumping the entry records in the order in which they are in the _entries vector.
        # index 0 is a dummy object not stored in the _entry_index and can be ignored.
        i = 1
        for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
            entry = self._entries[entry_index]
            assert entry.entity_hash == entry_hash
            assert entry_index == i
            writer.write_entry(entry.entity_hash, entry.freq, entry.vector_index)
            i = i+1

        writer.write_alias_length(self.get_size_aliases())

        # dumping the aliases in the order in which they are in the _aliases_table vector.
        # index 0 is a dummy object not stored in the _alias_index and can be ignored.
        i = 1
        for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
            alias = self._aliases_table[alias_index]
            assert alias_index == i

            candidate_length = len(alias.entry_indices)
            writer.write_alias_header(alias_hash, candidate_length)

            for j in range(0, candidate_length):
                writer.write_alias(alias.entry_indices[j], alias.probs[j])

            i = i+1

        writer.close()
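
    # The binary layout produced by write_contents/Writer above (and consumed by
    # read_contents/Reader below), reconstructed from the write_* calls:
    #
    #     int64   nr_entities
    #     int64   entity_vector_length
    #     float32[nr_entities * entity_vector_length]   entity vectors
    #     nr_entities x (hash_t entity_hash, float32 freq, int32 vector_index)
    #     int64   nr_aliases
    #     nr_aliases x (hash_t alias_hash, int64 nr_candidates,
    #                   nr_candidates x (int64 entry_index, float32 prob))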

    def read_contents(self, file_path):
        cdef hash_t entity_hash
        cdef hash_t alias_hash
        cdef int64_t entry_index
        cdef float freq, prob
        cdef int32_t vector_index
        cdef KBEntryC entry
        cdef AliasC alias
        cdef float vector_element

        cdef Reader reader = Reader(file_path)

        # STEP 0: load header and initialize KB
        cdef int64_t nr_entities
        cdef int64_t entity_vector_length
        reader.read_header(&nr_entities, &entity_vector_length)

        self.initialize_entities(nr_entities)
        self.entity_vector_length = entity_vector_length

        # STEP 1: load entity vectors
        cdef int i = 0
        cdef int j = 0
        while i < nr_entities:
            entity_vector = float_vec(entity_vector_length)
            j = 0
            while j < entity_vector_length:
                reader.read_vector_element(&vector_element)
                entity_vector[j] = vector_element
                j = j+1
            self._vectors_table[i] = entity_vector
            i = i+1

        # STEP 2: load entities
        # we assume that the entity data was written in sequence
        # index 0 is a dummy object not stored in the _entry_index and can be ignored.
        i = 1
        while i <= nr_entities:
            reader.read_entry(&entity_hash, &freq, &vector_index)

            entry.entity_hash = entity_hash
            entry.freq = freq
            entry.vector_index = vector_index
            entry.feats_row = -1    # Features table currently not implemented

            self._entries[i] = entry
            self._entry_index[entity_hash] = i

            i += 1

        # check that all entities were read in properly
        assert nr_entities == self.get_size_entities()

        # STEP 3: load aliases
        cdef int64_t nr_aliases
        reader.read_alias_length(&nr_aliases)
        self.initialize_aliases(nr_aliases)

        cdef int64_t nr_candidates
        cdef vector[int64_t] entry_indices
        cdef vector[float] probs

        i = 1
        # we assume the alias data was written in sequence
        # index 0 is a dummy object not stored in the _alias_index and can be ignored.
        while i <= nr_aliases:
            reader.read_alias_header(&alias_hash, &nr_candidates)
            entry_indices = vector[int64_t](nr_candidates)
            probs = vector[float](nr_candidates)

            for j in range(0, nr_candidates):
                reader.read_alias(&entry_index, &prob)
                entry_indices[j] = entry_index
                probs[j] = prob

            alias.entry_indices = entry_indices
            alias.probs = probs

            self._aliases_table[i] = alias
            self._alias_index[alias_hash] = i

            i += 1

        # check that all aliases were read in properly
        assert nr_aliases == self.get_size_aliases()


cdef class Writer:
    def __init__(self, path):
        assert isinstance(path, Path)
        content = bytes(path)
        cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
        self._fp = fopen(<char*>bytes_loc, 'wb')
        if not self._fp:
            raise IOError(Errors.E146.format(path=path))
        fseek(self._fp, 0, 0)

    def close(self):
        cdef size_t status = fclose(self._fp)
        assert status == 0

    cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1:
        self._write(&nr_entries, sizeof(nr_entries))
        self._write(&entity_vector_length, sizeof(entity_vector_length))

    cdef int write_vector_element(self, float element) except -1:
        self._write(&element, sizeof(element))

    cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1:
        self._write(&entry_hash, sizeof(entry_hash))
        self._write(&entry_freq, sizeof(entry_freq))
        self._write(&vector_index, sizeof(vector_index))
        # Features table currently not implemented and not written to file

    cdef int write_alias_length(self, int64_t alias_length) except -1:
        self._write(&alias_length, sizeof(alias_length))

    cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1:
        self._write(&alias_hash, sizeof(alias_hash))
        self._write(&candidate_length, sizeof(candidate_length))

    cdef int write_alias(self, int64_t entry_index, float prob) except -1:
        self._write(&entry_index, sizeof(entry_index))
        self._write(&prob, sizeof(prob))

    cdef int _write(self, void* value, size_t size) except -1:
        status = fwrite(value, size, 1, self._fp)
        assert status == 1, status


cdef class Reader:
    def __init__(self, path):
        content = bytes(path)
        cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
        self._fp = fopen(<char*>bytes_loc, 'rb')
        if not self._fp:
            PyErr_SetFromErrno(IOError)
        status = fseek(self._fp, 0, 0)  # this can be 0 if there is no header

    def __dealloc__(self):
        fclose(self._fp)

    cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1:
        status = self._read(nr_entries, sizeof(int64_t))
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError(Errors.E145.format(param="header"))

        status = self._read(entity_vector_length, sizeof(int64_t))
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError(Errors.E145.format(param="vector length"))

    cdef int read_vector_element(self, float* element) except -1:
        status = self._read(element, sizeof(float))
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError(Errors.E145.format(param="vector element"))

    cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1:
        status = self._read(entity_hash, sizeof(hash_t))
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError(Errors.E145.format(param="entity hash"))

        status = self._read(freq, sizeof(float))
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError(Errors.E145.format(param="entity freq"))

        status = self._read(vector_index, sizeof(int32_t))
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError(Errors.E145.format(param="vector index"))

        if feof(self._fp):
            return 0
        else:
            return 1

    cdef int read_alias_length(self, int64_t* alias_length) except -1:
        status = self._read(alias_length, sizeof(int64_t))
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError(Errors.E145.format(param="alias length"))

    cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1:
        status = self._read(alias_hash, sizeof(hash_t))
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError(Errors.E145.format(param="alias hash"))

        status = self._read(candidate_length, sizeof(int64_t))
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError(Errors.E145.format(param="candidate length"))

    cdef int read_alias(self, int64_t* entry_index, float* prob) except -1:
        status = self._read(entry_index, sizeof(int64_t))
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError(Errors.E145.format(param="entry index"))

        status = self._read(prob, sizeof(float))
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError(Errors.E145.format(param="prior probability"))

    cdef int _read(self, void* value, size_t size) except -1:
        status = fread(value, size, 1, self._fp)
        return status