Store entity as one field instead of separate ID and name

This commit is contained in:
svlandeg 2019-03-25 18:10:41 +01:00
parent 46f4eb5db3
commit 8814b9010d
5 changed files with 49 additions and 66 deletions

View File

@ -12,27 +12,27 @@ def create_kb(vocab):
kb = KnowledgeBase(vocab=vocab) kb = KnowledgeBase(vocab=vocab)
# adding entities # adding entities
entity_0 = "Q1004791" entity_0 = "Q1004791_Douglas"
print("adding entity", entity_0) print("adding entity", entity_0)
kb.add_entity(entity_id=entity_0, entity_name="Douglas", prob=0.5) kb.add_entity(entity=entity_0, prob=0.5)
entity_1 = "Q42" entity_1 = "Q42_Douglas_Adams"
print("adding entity", entity_1) print("adding entity", entity_1)
kb.add_entity(entity_id=entity_1, entity_name="Douglas Adams", prob=0.5) kb.add_entity(entity=entity_1, prob=0.5)
entity_2 = "Q5301561" entity_2 = "Q5301561_Douglas_Haig"
print("adding entity", entity_2) print("adding entity", entity_2)
kb.add_entity(entity_id=entity_2, entity_name="Douglas Haig", prob=0.5) kb.add_entity(entity=entity_2, prob=0.5)
# adding aliases # adding aliases
print() print()
alias_0 = "Douglas" alias_0 = "Douglas"
print("adding alias", alias_0) print("adding alias", alias_0)
kb.add_alias(alias=alias_0, entities=["Q1004791", "Q42", "Q5301561"], probabilities=[0.1, 0.6, 0.2]) kb.add_alias(alias=alias_0, entities=[entity_0, entity_1, entity_2], probabilities=[0.1, 0.6, 0.2])
alias_1 = "Douglas Adams" alias_1 = "Douglas Adams"
print("adding alias", alias_1) print("adding alias", alias_1)
kb.add_alias(alias=alias_1, entities=["Q42"], probabilities=[0.9]) kb.add_alias(alias=alias_1, entities=[entity_1], probabilities=[0.9])
print() print()
print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases()) print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases())
@ -49,7 +49,7 @@ def add_el(kb, nlp):
print() print()
print(len(candidates), "candidate(s) for", alias, ":") print(len(candidates), "candidate(s) for", alias, ":")
for c in candidates: for c in candidates:
print(" ", c.entity_id_, c.entity_name_, c.prior_prob) print(" ", c.entity_, c.prior_prob)
text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
"Douglas reminds us to always bring our towel. " \ "Douglas reminds us to always bring our towel. " \

View File

@ -14,8 +14,7 @@ from .typedefs cimport hash_t
cdef struct _EntryC: cdef struct _EntryC:
# The hash of this entry's unique ID and name in the KB # The hash of this entry's unique ID and name in the KB
hash_t entity_id_hash hash_t entity_hash
hash_t entity_name_hash
# Allows retrieval of one or more vectors. # Allows retrieval of one or more vectors.
# Each element of vector_rows should be an index into a vectors table. # Each element of vector_rows should be an index into a vectors table.
@ -48,7 +47,7 @@ cdef struct _AliasC:
cdef class Candidate: cdef class Candidate:
cdef readonly KnowledgeBase kb cdef readonly KnowledgeBase kb
cdef hash_t entity_id_hash cdef hash_t entity_hash
cdef hash_t alias_hash cdef hash_t alias_hash
cdef float prior_prob cdef float prior_prob
@ -97,7 +96,7 @@ cdef class KnowledgeBase:
# optional data, we can let users configure a DB as the backend for this. # optional data, we can let users configure a DB as the backend for this.
cdef object _features_table cdef object _features_table
cdef inline int64_t c_add_entity(self, hash_t entity_id_hash, hash_t entity_name_hash, float prob, cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob,
int32_t* vector_rows, int feats_row): int32_t* vector_rows, int feats_row):
"""Add an entry to the knowledge base.""" """Add an entry to the knowledge base."""
# This is what we'll map the hash key to. It's where the entry will sit # This is what we'll map the hash key to. It's where the entry will sit
@ -105,13 +104,12 @@ cdef class KnowledgeBase:
cdef int64_t new_index = self._entries.size() cdef int64_t new_index = self._entries.size()
self._entries.push_back( self._entries.push_back(
_EntryC( _EntryC(
entity_id_hash=entity_id_hash, entity_hash=entity_hash,
entity_name_hash=entity_name_hash,
vector_rows=vector_rows, vector_rows=vector_rows,
feats_row=feats_row, feats_row=feats_row,
prob=prob prob=prob
)) ))
self._entry_index[entity_id_hash] = new_index self._entry_index[entity_hash] = new_index
return new_index return new_index
cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs): cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs):
@ -136,8 +134,7 @@ cdef class KnowledgeBase:
self.vocab.strings.add("") self.vocab.strings.add("")
self._entries.push_back( self._entries.push_back(
_EntryC( _EntryC(
entity_id_hash=self.vocab.strings[""], entity_hash=self.vocab.strings[""],
entity_name_hash=self.vocab.strings[""],
vector_rows=&dummy_value, vector_rows=&dummy_value,
feats_row=dummy_value, feats_row=dummy_value,
prob=dummy_value prob=dummy_value

View File

@ -5,32 +5,21 @@ from spacy.errors import Errors, Warnings, user_warning
cdef class Candidate: cdef class Candidate:
def __init__(self, KnowledgeBase kb, entity_id_hash, alias_hash, prior_prob): def __init__(self, KnowledgeBase kb, entity_hash, alias_hash, prior_prob):
self.kb = kb self.kb = kb
self.entity_id_hash = entity_id_hash self.entity_hash = entity_hash
self.alias_hash = alias_hash self.alias_hash = alias_hash
self.prior_prob = prior_prob self.prior_prob = prior_prob
@property @property
def entity_id(self): def entity(self):
"""RETURNS (uint64): hash of the entity's KB ID""" """RETURNS (uint64): hash of the entity's KB ID/name"""
return self.entity_id_hash return self.entity_hash
@property @property
def entity_id_(self): def entity_(self):
"""RETURNS (unicode): ID of this entity in the KB""" """RETURNS (unicode): ID/name of this entity in the KB"""
return self.kb.vocab.strings[self.entity_id] return self.kb.vocab.strings[self.entity]
@property
def entity_name(self):
"""RETURNS (uint64): hash of the entity's KB name"""
entry_index = <int64_t>self.kb._entry_index.get(self.entity_id)
return self.kb._entries[entry_index].entity_name_hash
@property
def entity_name_(self):
"""RETURNS (unicode): name of this entity in the KB"""
return self.kb.vocab.strings[self.entity_name]
@property @property
def alias(self): def alias(self):
@ -65,28 +54,25 @@ cdef class KnowledgeBase:
def get_size_aliases(self): def get_size_aliases(self):
return self._aliases_table.size() - 1 # not counting dummy element on index 0 return self._aliases_table.size() - 1 # not counting dummy element on index 0
def add_entity(self, unicode entity_id, unicode entity_name=None, float prob=0.5, vectors=None, features=None): def add_entity(self, unicode entity, float prob=0.5, vectors=None, features=None):
""" """
Add an entity to the KB. Add an entity to the KB.
Return the hash of the entity ID at the end Return the hash of the entity ID at the end
""" """
if not entity_name: cdef hash_t entity_hash = self.vocab.strings.add(entity)
entity_name = entity_id
cdef hash_t id_hash = self.vocab.strings.add(entity_id)
cdef hash_t name_hash = self.vocab.strings.add(entity_name)
# Return if this entity was added before # Return if this entity was added before
if id_hash in self._entry_index: if entity_hash in self._entry_index:
user_warning(Warnings.W018.format(entity=entity_id)) user_warning(Warnings.W018.format(entity=entity))
return return
cdef int32_t dummy_value = 342 cdef int32_t dummy_value = 342
self.c_add_entity(entity_id_hash=id_hash, entity_name_hash=name_hash, prob=prob, self.c_add_entity(entity_hash=entity_hash, prob=prob,
vector_rows=&dummy_value, feats_row=dummy_value) vector_rows=&dummy_value, feats_row=dummy_value)
# TODO self._vectors_table.get_pointer(vectors), # TODO self._vectors_table.get_pointer(vectors),
# self._features_table.get(features)) # self._features_table.get(features))
return id_hash return entity_hash
def add_alias(self, unicode alias, entities, probabilities): def add_alias(self, unicode alias, entities, probabilities):
""" """
@ -118,11 +104,11 @@ cdef class KnowledgeBase:
cdef vector[float] probs cdef vector[float] probs
for entity, prob in zip(entities, probabilities): for entity, prob in zip(entities, probabilities):
entity_id_hash = self.vocab.strings[entity] entity_hash = self.vocab.strings[entity]
if not entity_id_hash in self._entry_index: if not entity_hash in self._entry_index:
raise ValueError(Errors.E134.format(alias=alias, entity=entity)) raise ValueError(Errors.E134.format(alias=alias, entity=entity))
entry_index = <int64_t>self._entry_index.get(entity_id_hash) entry_index = <int64_t>self._entry_index.get(entity_hash)
entry_indices.push_back(int(entry_index)) entry_indices.push_back(int(entry_index))
probs.push_back(float(prob)) probs.push_back(float(prob))
@ -138,7 +124,7 @@ cdef class KnowledgeBase:
alias_entry = self._aliases_table[alias_index] alias_entry = self._aliases_table[alias_index]
return [Candidate(kb=self, return [Candidate(kb=self,
entity_id_hash=self._entries[entry_index].entity_id_hash, entity_hash=self._entries[entry_index].entity_hash,
alias_hash=alias_hash, alias_hash=alias_hash,
prior_prob=prob) prior_prob=prob)
for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs) for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)

View File

@ -1096,7 +1096,7 @@ class EntityLinker(Pipe):
if candidates: if candidates:
best_candidate = max(candidates, key=lambda c: c.prior_prob) best_candidate = max(candidates, key=lambda c: c.prior_prob)
for token in ent: for token in ent:
token.ent_kb_id_ = best_candidate.entity_id_ token.ent_kb_id_ = best_candidate.entity_
def get_loss(self, docs, golds, scores): def get_loss(self, docs, golds, scores):
# TODO # TODO

View File

@ -17,9 +17,9 @@ def test_kb_valid_entities(nlp):
mykb = KnowledgeBase(nlp.vocab) mykb = KnowledgeBase(nlp.vocab)
# adding entities # adding entities
mykb.add_entity(entity_id=u'Q1', prob=0.9) mykb.add_entity(entity=u'Q1', prob=0.9)
mykb.add_entity(entity_id=u'Q2', prob=0.2) mykb.add_entity(entity=u'Q2')
mykb.add_entity(entity_id=u'Q3', prob=0.5) mykb.add_entity(entity=u'Q3', prob=0.5)
# adding aliases # adding aliases
mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.2]) mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.2])
@ -35,9 +35,9 @@ def test_kb_invalid_entities(nlp):
mykb = KnowledgeBase(nlp.vocab) mykb = KnowledgeBase(nlp.vocab)
# adding entities # adding entities
mykb.add_entity(entity_id=u'Q1', prob=0.9) mykb.add_entity(entity=u'Q1', prob=0.9)
mykb.add_entity(entity_id=u'Q2', prob=0.2) mykb.add_entity(entity=u'Q2', prob=0.2)
mykb.add_entity(entity_id=u'Q3', prob=0.5) mykb.add_entity(entity=u'Q3', prob=0.5)
# adding aliases - should fail because one of the given IDs is not valid # adding aliases - should fail because one of the given IDs is not valid
with pytest.raises(ValueError): with pytest.raises(ValueError):
@ -49,9 +49,9 @@ def test_kb_invalid_probabilities(nlp):
mykb = KnowledgeBase(nlp.vocab) mykb = KnowledgeBase(nlp.vocab)
# adding entities # adding entities
mykb.add_entity(entity_id=u'Q1', prob=0.9) mykb.add_entity(entity=u'Q1', prob=0.9)
mykb.add_entity(entity_id=u'Q2', prob=0.2) mykb.add_entity(entity=u'Q2', prob=0.2)
mykb.add_entity(entity_id=u'Q3', prob=0.5) mykb.add_entity(entity=u'Q3', prob=0.5)
# adding aliases - should fail because the sum of the probabilities exceeds 1 # adding aliases - should fail because the sum of the probabilities exceeds 1
with pytest.raises(ValueError): with pytest.raises(ValueError):
@ -63,9 +63,9 @@ def test_kb_invalid_combination(nlp):
mykb = KnowledgeBase(nlp.vocab) mykb = KnowledgeBase(nlp.vocab)
# adding entities # adding entities
mykb.add_entity(entity_id=u'Q1', prob=0.9) mykb.add_entity(entity=u'Q1', prob=0.9)
mykb.add_entity(entity_id=u'Q2', prob=0.2) mykb.add_entity(entity=u'Q2', prob=0.2)
mykb.add_entity(entity_id=u'Q3', prob=0.5) mykb.add_entity(entity=u'Q3', prob=0.5)
# adding aliases - should fail because the entities and probabilities vectors are not of equal length # adding aliases - should fail because the entities and probabilities vectors are not of equal length
with pytest.raises(ValueError): with pytest.raises(ValueError):
@ -77,9 +77,9 @@ def test_candidate_generation(nlp):
mykb = KnowledgeBase(nlp.vocab) mykb = KnowledgeBase(nlp.vocab)
# adding entities # adding entities
mykb.add_entity(entity_id=u'Q1', prob=0.9) mykb.add_entity(entity=u'Q1', prob=0.9)
mykb.add_entity(entity_id=u'Q2', prob=0.2) mykb.add_entity(entity=u'Q2', prob=0.2)
mykb.add_entity(entity_id=u'Q3', prob=0.5) mykb.add_entity(entity=u'Q3', prob=0.5)
# adding aliases # adding aliases
mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.2]) mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.2])