From 8814b9010d139f92bc817378eace25e24a817b7e Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 25 Mar 2019 18:10:41 +0100 Subject: [PATCH] entity as one field instead of both ID and name --- examples/pipeline/dummy_entity_linking.py | 18 ++++---- spacy/kb.pxd | 15 +++---- spacy/kb.pyx | 50 ++++++++--------------- spacy/pipeline/pipes.pyx | 2 +- spacy/tests/pipeline/test_el.py | 30 +++++++------- 5 files changed, 49 insertions(+), 66 deletions(-) diff --git a/examples/pipeline/dummy_entity_linking.py b/examples/pipeline/dummy_entity_linking.py index 84f5ea003..88415d040 100644 --- a/examples/pipeline/dummy_entity_linking.py +++ b/examples/pipeline/dummy_entity_linking.py @@ -12,27 +12,27 @@ def create_kb(vocab): kb = KnowledgeBase(vocab=vocab) # adding entities - entity_0 = "Q1004791" + entity_0 = "Q1004791_Douglas" print("adding entity", entity_0) - kb.add_entity(entity_id=entity_0, entity_name="Douglas", prob=0.5) + kb.add_entity(entity=entity_0, prob=0.5) - entity_1 = "Q42" + entity_1 = "Q42_Douglas_Adams" print("adding entity", entity_1) - kb.add_entity(entity_id=entity_1, entity_name="Douglas Adams", prob=0.5) + kb.add_entity(entity=entity_1, prob=0.5) - entity_2 = "Q5301561" + entity_2 = "Q5301561_Douglas_Haig" print("adding entity", entity_2) - kb.add_entity(entity_id=entity_2, entity_name="Douglas Haig", prob=0.5) + kb.add_entity(entity=entity_2, prob=0.5) # adding aliases print() alias_0 = "Douglas" print("adding alias", alias_0) - kb.add_alias(alias=alias_0, entities=["Q1004791", "Q42", "Q5301561"], probabilities=[0.1, 0.6, 0.2]) + kb.add_alias(alias=alias_0, entities=[entity_0, entity_1, entity_2], probabilities=[0.1, 0.6, 0.2]) alias_1 = "Douglas Adams" print("adding alias", alias_1) - kb.add_alias(alias=alias_1, entities=["Q42"], probabilities=[0.9]) + kb.add_alias(alias=alias_1, entities=[entity_1], probabilities=[0.9]) print() print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases()) @@ -49,7 +49,7 @@ def add_el(kb, nlp): print() print(len(candidates), "candidate(s) for", alias, ":") for c in candidates: - print(" ", c.entity_id_, c.entity_name_, c.prior_prob) + print(" ", c.entity_, c.prior_prob) text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ "Douglas reminds us to always bring our towel. " \ diff --git a/spacy/kb.pxd b/spacy/kb.pxd index dc6701b89..e34a0a9ba 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -14,8 +14,7 @@ from .typedefs cimport hash_t cdef struct _EntryC: # The hash of this entry's unique ID and name in the kB - hash_t entity_id_hash - hash_t entity_name_hash + hash_t entity_hash # Allows retrieval of one or more vectors. # Each element of vector_rows should be an index into a vectors table. @@ -48,7 +47,7 @@ cdef struct _AliasC: cdef class Candidate: cdef readonly KnowledgeBase kb - cdef hash_t entity_id_hash + cdef hash_t entity_hash cdef hash_t alias_hash cdef float prior_prob @@ -97,7 +96,7 @@ cdef class KnowledgeBase: # optional data, we can let users configure a DB as the backend for this. cdef object _features_table - cdef inline int64_t c_add_entity(self, hash_t entity_id_hash, hash_t entity_name_hash, float prob, + cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob, int32_t* vector_rows, int feats_row): """Add an entry to the knowledge base.""" # This is what we'll map the hash key to. It's where the entry will sit @@ -105,13 +104,12 @@ cdef class KnowledgeBase: cdef int64_t new_index = self._entries.size() self._entries.push_back( _EntryC( - entity_id_hash=entity_id_hash, - entity_name_hash=entity_name_hash, + entity_hash=entity_hash, vector_rows=vector_rows, feats_row=feats_row, prob=prob )) - self._entry_index[entity_id_hash] = new_index + self._entry_index[entity_hash] = new_index return new_index cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs): @@ -136,8 +134,7 @@ cdef class KnowledgeBase: self.vocab.strings.add("") self._entries.push_back( _EntryC( - entity_id_hash=self.vocab.strings[""], - entity_name_hash=self.vocab.strings[""], + entity_hash=self.vocab.strings[""], vector_rows=&dummy_value, feats_row=dummy_value, prob=dummy_value diff --git a/spacy/kb.pyx b/spacy/kb.pyx index a6a8ca9ba..3a0a8b918 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -5,32 +5,21 @@ from spacy.errors import Errors, Warnings, user_warning cdef class Candidate: - def __init__(self, KnowledgeBase kb, entity_id_hash, alias_hash, prior_prob): + def __init__(self, KnowledgeBase kb, entity_hash, alias_hash, prior_prob): self.kb = kb - self.entity_id_hash = entity_id_hash + self.entity_hash = entity_hash self.alias_hash = alias_hash self.prior_prob = prior_prob @property - def entity_id(self): - """RETURNS (uint64): hash of the entity's KB ID""" - return self.entity_id_hash + def entity(self): + """RETURNS (uint64): hash of the entity's KB ID/name""" + return self.entity_hash @property - def entity_id_(self): - """RETURNS (unicode): ID of this entity in the KB""" - return self.kb.vocab.strings[self.entity_id] - - @property - def entity_name(self): - """RETURNS (uint64): hash of the entity's KB name""" - entry_index = self.kb._entry_index.get(self.entity_id) - return self.kb._entries[entry_index].entity_name_hash - - @property - def entity_name_(self): - """RETURNS (unicode): name of this entity in the KB""" - return self.kb.vocab.strings[self.entity_name] + def entity_(self): + """RETURNS (unicode): ID/name of this entity in the KB""" + return self.kb.vocab.strings[self.entity] @property def alias(self): @@ -65,28 +54,25 @@ cdef class KnowledgeBase: def get_size_aliases(self): return self._aliases_table.size() - 1 # not counting dummy element on index 0 - def add_entity(self, unicode entity_id, unicode entity_name=None, float prob=0.5, vectors=None, features=None): + def add_entity(self, unicode entity, float prob=0.5, vectors=None, features=None): """ Add an entity to the KB. Return the hash of the entity ID at the end """ - if not entity_name: - entity_name = entity_id - cdef hash_t id_hash = self.vocab.strings.add(entity_id) - cdef hash_t name_hash = self.vocab.strings.add(entity_name) + cdef hash_t entity_hash = self.vocab.strings.add(entity) # Return if this entity was added before - if id_hash in self._entry_index: - user_warning(Warnings.W018.format(entity=entity_id)) + if entity_hash in self._entry_index: + user_warning(Warnings.W018.format(entity=entity)) return cdef int32_t dummy_value = 342 - self.c_add_entity(entity_id_hash=id_hash, entity_name_hash=name_hash, prob=prob, + self.c_add_entity(entity_hash=entity_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value) # TODO self._vectors_table.get_pointer(vectors), # self._features_table.get(features)) - return id_hash + return entity_hash def add_alias(self, unicode alias, entities, probabilities): """ @@ -118,11 +104,11 @@ cdef class KnowledgeBase: cdef vector[float] probs for entity, prob in zip(entities, probabilities): - entity_id_hash = self.vocab.strings[entity] - if not entity_id_hash in self._entry_index: + entity_hash = self.vocab.strings[entity] + if not entity_hash in self._entry_index: raise ValueError(Errors.E134.format(alias=alias, entity=entity)) - entry_index = self._entry_index.get(entity_id_hash) + entry_index = self._entry_index.get(entity_hash) entry_indices.push_back(int(entry_index)) probs.push_back(float(prob)) @@ -138,7 +124,7 @@ cdef class KnowledgeBase: alias_entry = self._aliases_table[alias_index] return [Candidate(kb=self, - entity_id_hash=self._entries[entry_index].entity_id_hash, + entity_hash=self._entries[entry_index].entity_hash, alias_hash=alias_hash, prior_prob=prob) for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 09334948d..70cc46bfe 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1096,7 +1096,7 @@ class EntityLinker(Pipe): if candidates: best_candidate = max(candidates, key=lambda c: c.prior_prob) for token in ent: - token.ent_kb_id_ = best_candidate.entity_id_ + token.ent_kb_id_ = best_candidate.entity_ def get_loss(self, docs, golds, scores): # TODO diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_el.py index d68c84592..61baece68 100644 --- a/spacy/tests/pipeline/test_el.py +++ b/spacy/tests/pipeline/test_el.py @@ -17,9 +17,9 @@ def test_kb_valid_entities(nlp): mykb = KnowledgeBase(nlp.vocab) # adding entities - mykb.add_entity(entity_id=u'Q1', prob=0.9) - mykb.add_entity(entity_id=u'Q2', prob=0.2) - mykb.add_entity(entity_id=u'Q3', prob=0.5) + mykb.add_entity(entity=u'Q1', prob=0.9) + mykb.add_entity(entity=u'Q2') + mykb.add_entity(entity=u'Q3', prob=0.5) # adding aliases mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.2]) @@ -35,9 +35,9 @@ def test_kb_invalid_entities(nlp): mykb = KnowledgeBase(nlp.vocab) # adding entities - mykb.add_entity(entity_id=u'Q1', prob=0.9) - mykb.add_entity(entity_id=u'Q2', prob=0.2) - mykb.add_entity(entity_id=u'Q3', prob=0.5) + mykb.add_entity(entity=u'Q1', prob=0.9) + mykb.add_entity(entity=u'Q2', prob=0.2) + mykb.add_entity(entity=u'Q3', prob=0.5) # adding aliases - should fail because one of the given IDs is not valid with pytest.raises(ValueError): @@ -49,9 +49,9 @@ def test_kb_invalid_probabilities(nlp): mykb = KnowledgeBase(nlp.vocab) # adding entities - mykb.add_entity(entity_id=u'Q1', prob=0.9) - mykb.add_entity(entity_id=u'Q2', prob=0.2) - mykb.add_entity(entity_id=u'Q3', prob=0.5) + mykb.add_entity(entity=u'Q1', prob=0.9) + mykb.add_entity(entity=u'Q2', prob=0.2) + mykb.add_entity(entity=u'Q3', prob=0.5) # adding aliases - should fail because the sum of the probabilities exceeds 1 with pytest.raises(ValueError): @@ -63,9 +63,9 @@ def test_kb_invalid_combination(nlp): mykb = KnowledgeBase(nlp.vocab) # adding entities - mykb.add_entity(entity_id=u'Q1', prob=0.9) - mykb.add_entity(entity_id=u'Q2', prob=0.2) - mykb.add_entity(entity_id=u'Q3', prob=0.5) + mykb.add_entity(entity=u'Q1', prob=0.9) + mykb.add_entity(entity=u'Q2', prob=0.2) + mykb.add_entity(entity=u'Q3', prob=0.5) # adding aliases - should fail because the entities and probabilities vectors are not of equal length with pytest.raises(ValueError): @@ -77,9 +77,9 @@ def test_candidate_generation(nlp): mykb = KnowledgeBase(nlp.vocab) # adding entities - mykb.add_entity(entity_id=u'Q1', prob=0.9) - mykb.add_entity(entity_id=u'Q2', prob=0.2) - mykb.add_entity(entity_id=u'Q3', prob=0.5) + mykb.add_entity(entity=u'Q1', prob=0.9) + mykb.add_entity(entity=u'Q2', prob=0.2) + mykb.add_entity(entity=u'Q3', prob=0.5) # adding aliases mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.2])