From a48241e9a20a3cbb5d3644e09e88bc64f30c0cb3 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 21 Mar 2019 23:17:25 +0100 Subject: [PATCH] use nlp's vocab for stringstore --- examples/pipeline/dummy_entity_linking.py | 22 +++++++++--------- spacy/kb.pxd | 10 ++++---- spacy/kb.pyx | 20 ++++++++-------- spacy/tests/pipeline/test_el.py | 28 ++++++++++++++--------- 4 files changed, 43 insertions(+), 37 deletions(-) diff --git a/examples/pipeline/dummy_entity_linking.py b/examples/pipeline/dummy_entity_linking.py index c51f321e0..43d17c481 100644 --- a/examples/pipeline/dummy_entity_linking.py +++ b/examples/pipeline/dummy_entity_linking.py @@ -6,8 +6,8 @@ import spacy from spacy.kb import KnowledgeBase -def create_kb(): - kb = KnowledgeBase() +def create_kb(vocab): + kb = KnowledgeBase(vocab=vocab) # adding entities entity_0 = "Q1004791" @@ -25,11 +25,11 @@ def create_kb(): # adding aliases print() alias_0 = "Douglas" - print("adding alias", alias_0, "to all three entities") + print("adding alias", alias_0) kb.add_alias(alias=alias_0, entities=["Q1004791", "Q42", "Q5301561"], probabilities=[0.1, 0.6, 0.2]) alias_1 = "Douglas Adams" - print("adding alias", alias_1, "to just the one entity") + print("adding alias", alias_1) kb.add_alias(alias=alias_1, entities=["Q42"], probabilities=[0.9]) print() @@ -38,9 +38,7 @@ def create_kb(): return kb -def add_el(kb): - nlp = spacy.load('en_core_web_sm') - +def add_el(kb, nlp): el_pipe = nlp.create_pipe(name='el', config={"kb": kb}) nlp.add_pipe(el_pipe, last=True) @@ -49,10 +47,11 @@ def add_el(kb): print() print(len(candidates), "candidate(s) for", alias, ":") for c in candidates: - print(" ", c.entity_id_, c.entity_name_, c.alias_, c.prior_prob) + print(" ", c.entity_id_, c.entity_name_, c.prior_prob) text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ - "Douglas reminds us to always bring our towel." + "Douglas reminds us to always bring our towel. " \ + "The main character in Doug's novel is called Arthur Dent." doc = nlp(text) print() @@ -65,5 +64,6 @@ def add_el(kb): if __name__ == "__main__": - mykb = create_kb() - add_el(mykb) + nlp = spacy.load('en_core_web_sm') + my_kb = create_kb(nlp.vocab) + add_el(my_kb, nlp) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 490e05036..dc6701b89 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -4,7 +4,7 @@ from preshed.maps cimport PreshMap from libcpp.vector cimport vector from libc.stdint cimport int32_t, int64_t -from spacy.strings cimport StringStore +from spacy.vocab cimport Vocab from .typedefs cimport hash_t @@ -55,7 +55,7 @@ cdef class Candidate: cdef class KnowledgeBase: cdef Pool mem - cpdef readonly StringStore strings + cpdef readonly Vocab vocab # This maps 64bit keys (hash of unique entity string) # to 64bit values (position of the _EntryC struct in the _entries vector). @@ -133,11 +133,11 @@ cdef class KnowledgeBase: cf. https://github.com/explosion/preshed/issues/17 """ cdef int32_t dummy_value = 0 - self.strings.add("") + self.vocab.strings.add("") self._entries.push_back( _EntryC( - entity_id_hash=self.strings[""], - entity_name_hash=self.strings[""], + entity_id_hash=self.vocab.strings[""], + entity_name_hash=self.vocab.strings[""], vector_rows=&dummy_value, feats_row=dummy_value, prob=dummy_value diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 6d031fb91..186048a41 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -19,7 +19,7 @@ cdef class Candidate: property entity_id_: """RETURNS (unicode): ID of this entity in the KB""" def __get__(self): - return self.kb.strings[self.entity_id] + return self.kb.vocab.strings[self.entity_id] property entity_name: """RETURNS (uint64): hash of the entity's KB name""" @@ -30,7 +30,7 @@ cdef class Candidate: property entity_name_: """RETURNS (unicode): name of this entity in the KB""" def __get__(self): - return self.kb.strings[self.entity_name] + return self.kb.vocab.strings[self.entity_name] property alias: """RETURNS (uint64): hash of the alias""" @@ -40,7 +40,7 @@ cdef class Candidate: property alias_: """RETURNS (unicode): ID of the original alias""" def __get__(self): - return self.kb.strings[self.alias] + return self.kb.vocab.strings[self.alias] property prior_prob: def __get__(self): @@ -49,11 +49,11 @@ cdef class Candidate: cdef class KnowledgeBase: - def __init__(self): + def __init__(self, Vocab vocab): + self.vocab = vocab self._entry_index = PreshMap() self._alias_index = PreshMap() self.mem = Pool() - self.strings = StringStore() self._create_empty_vectors() def __len__(self): @@ -72,8 +72,8 @@ cdef class KnowledgeBase: """ if not entity_name: entity_name = entity_id - cdef hash_t id_hash = self.strings.add(entity_id) - cdef hash_t name_hash = self.strings.add(entity_name) + cdef hash_t id_hash = self.vocab.strings.add(entity_id) + cdef hash_t name_hash = self.vocab.strings.add(entity_name) # Return if this entity was added before if id_hash in self._entry_index: @@ -107,7 +107,7 @@ cdef class KnowledgeBase: raise ValueError("The sum of prior probabilities for alias '" + alias + "' should not exceed 1, " + "but found " + str(prob_sum)) - cdef hash_t alias_hash = self.strings.add(alias) + cdef hash_t alias_hash = self.vocab.strings.add(alias) # Return if this alias was added before if alias_hash in self._alias_index: @@ -120,7 +120,7 @@ cdef class KnowledgeBase: cdef vector[float] probs for entity, prob in zip(entities, probabilities): - entity_id_hash = self.strings[entity] + entity_id_hash = self.vocab.strings[entity] if not entity_id_hash in self._entry_index: raise ValueError("Alias '" + alias + "' defined for unknown entity '" + entity + "'") @@ -135,7 +135,7 @@ cdef class KnowledgeBase: def get_candidates(self, unicode alias): """ TODO: where to put this functionality ?""" - cdef hash_t alias_hash = self.strings[alias] + cdef hash_t alias_hash = self.vocab.strings[alias] alias_index = self._alias_index.get(alias_hash) alias_entry = self._aliases_table[alias_index] diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_el.py index 295b35cce..379661fc1 100644 --- a/spacy/tests/pipeline/test_el.py +++ b/spacy/tests/pipeline/test_el.py @@ -2,11 +2,17 @@ import pytest from spacy.kb import KnowledgeBase +from spacy.lang.en import English -def test_kb_valid_entities(): - """Test the valid construction of a KB with 3 entities and one alias""" - mykb = KnowledgeBase() +@pytest.fixture +def nlp(): + return English() + + +def test_kb_valid_entities(nlp): + """Test the valid construction of a KB with 3 entities and two aliases""" + mykb = KnowledgeBase(nlp.vocab) # adding entities mykb.add_entity(entity_id="Q1", prob=0.9) @@ -22,9 +28,9 @@ def test_kb_valid_entities(): assert(mykb.get_size_aliases() == 2) -def test_kb_invalid_entities(): +def test_kb_invalid_entities(nlp): """Test the invalid construction of a KB with an alias linked to a non-existing entity""" - mykb = KnowledgeBase() + mykb = KnowledgeBase(nlp.vocab) # adding entities mykb.add_entity(entity_id="Q1", prob=0.9) @@ -36,9 +42,9 @@ def test_kb_invalid_entities(): mykb.add_alias(alias="douglas", entities=["Q2", "Q342"], probabilities=[0.8, 0.2]) -def test_kb_invalid_probabilities(): +def test_kb_invalid_probabilities(nlp): """Test the invalid construction of a KB with wrong prior probabilities""" - mykb = KnowledgeBase() + mykb = KnowledgeBase(nlp.vocab) # adding entities mykb.add_entity(entity_id="Q1", prob=0.9) @@ -50,9 +56,9 @@ def test_kb_invalid_probabilities(): mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.8, 0.4]) -def test_kb_invalid_combination(): +def test_kb_invalid_combination(nlp): """Test the invalid construction of a KB with non-matching entity and probability lists""" - mykb = KnowledgeBase() + mykb = KnowledgeBase(nlp.vocab) # adding entities mykb.add_entity(entity_id="Q1", prob=0.9) @@ -64,9 +70,9 @@ def test_kb_invalid_combination(): mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.3, 0.4, 0.1]) -def test_candidate_generation(): +def test_candidate_generation(nlp): """Test correct candidate generation""" - mykb = KnowledgeBase() + mykb = KnowledgeBase(nlp.vocab) # adding entities mykb.add_entity(entity_id="Q1", prob=0.9)