From 173d45ec5ffecc3242b8a7ff2e1fbf9dd46fb9e6 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 6 Mar 2019 19:34:18 +0100 Subject: [PATCH 01/28] adding kb_id as field to token, el as nlp pipeline component --- sandbox_test_sofie/__init__.py | 0 sandbox_test_sofie/testing_el.py | 21 ++++++ spacy/language.py | 3 +- spacy/morphology.pxd | 2 + spacy/morphology.pyx | 3 + spacy/pipeline/__init__.py | 2 +- spacy/pipeline/pipes.pyx | 114 ++++++++++++++++++++++++++++++- spacy/structs.pxd | 2 + spacy/tokens/token.pyx | 8 +++ 9 files changed, 152 insertions(+), 3 deletions(-) create mode 100644 sandbox_test_sofie/__init__.py create mode 100644 sandbox_test_sofie/testing_el.py diff --git a/sandbox_test_sofie/__init__.py b/sandbox_test_sofie/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/sandbox_test_sofie/testing_el.py b/sandbox_test_sofie/testing_el.py new file mode 100644 index 000000000..8d9b0c21d --- /dev/null +++ b/sandbox_test_sofie/testing_el.py @@ -0,0 +1,21 @@ +import spacy + + +def add_el(): + nlp = spacy.load('en_core_web_sm') + print("pipes", nlp.pipe_names) + + el_pipe = nlp.create_pipe(name='el') + nlp.add_pipe(el_pipe, last=True) + + print("pipes", nlp.pipe_names) + print() + + text = "Australian striker John hits century" + doc = nlp(text) + for token in doc: + print("token", token.text, token.tag_, token.pos_, token.kb_id) + + +if __name__ == "__main__": + add_el() diff --git a/spacy/language.py b/spacy/language.py index 0c0cf8854..736899341 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -14,7 +14,7 @@ import srsly from .tokenizer import Tokenizer from .vocab import Vocab from .lemmatizer import Lemmatizer -from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer +from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer, EntityLinker from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens from .pipeline import EntityRuler @@ -114,6 +114,7 @@ class Language(object): "tagger": lambda nlp, **cfg: Tagger(nlp.vocab, **cfg), "parser": lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg), "ner": lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg), + "el": lambda nlp, **cfg: EntityLinker(nlp.vocab, **cfg), "similarity": lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg), "textcat": lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg), "sentencizer": lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg), diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index d0110b300..d674140b0 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -43,6 +43,8 @@ cdef class Morphology: cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1 + cdef int assign_kb_id(self, TokenC* token, kb_id) except -1 + cdef enum univ_morph_t: NIL = 0 diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index bd821d76f..92ca67f18 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -122,6 +122,9 @@ cdef class Morphology: else: flags[0] &= ~(one << flag_id) + cdef int assign_kb_id(self, TokenC* token, kb_id) except -1: + token.kb_id = kb_id + def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False): """Add a special-case rule to the morphological analyser. Tokens whose diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py index d683cc989..170cc5ba7 100644 --- a/spacy/pipeline/__init__.py +++ b/spacy/pipeline/__init__.py @@ -1,7 +1,7 @@ # coding: utf8 from __future__ import unicode_literals -from .pipes import Tagger, DependencyParser, EntityRecognizer # noqa +from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker # noqa from .pipes import TextCategorizer, Tensorizer, Pipe # noqa from .entityruler import EntityRuler # noqa from .hooks import SentenceSegmenter, SimilarityHook # noqa diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index bde794e75..4eb3ecc80 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1040,4 +1040,116 @@ cdef class EntityRecognizer(Parser): if move[0] in ("B", "I", "L", "U"))) -__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer'] +class EntityLinker(Pipe): + name = 'el' + + @classmethod + def Model(cls, nr_class=1, **cfg): + embed_size = util.env_opt("embed_size", 2000) + if "token_vector_width" in cfg: + token_vector_width = cfg["token_vector_width"] + else: + token_vector_width = util.env_opt("token_vector_width", 96) + if cfg.get('architecture') == 'simple_cnn': + tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg) + return None # build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg) + else: + return None # build_text_classifier(nr_class, **cfg) + + + def __init__(self, vocab, model=True, **cfg): + self.vocab = vocab + self.model = model + self._rehearsal_model = None + self.cfg = dict(cfg) + + def __call__(self, doc): + # scores, tensors = self.predict([doc]) + scores, tensors = None, None + self.set_annotations([doc], scores, tensors=tensors) + return doc + + def pipe(self, stream, batch_size=128, n_threads=-1): + for docs in util.minibatch(stream, size=batch_size): + docs = list(docs) + scores, tensors = self.predict(docs) + self.set_annotations(docs, scores, tensors=tensors) + yield from docs + + def predict(self, docs): + # self.require_model() + scores = self.model(docs) + scores = self.model.ops.asarray(scores) + tensors = [doc.tensor for doc in docs] + return scores, tensors + + def set_annotations(self, docs, scores, tensors=None): + # TODO Sofie: actually implement this class instead of dummy implementation + for i, doc in enumerate(docs): + for token in doc: + token.kb_id = 342 + + def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None): + scores, bp_scores = self.model.begin_update(docs, drop=drop) + loss, d_scores = self.get_loss(docs, golds, scores) + bp_scores(d_scores, sgd=sgd) + if losses is not None: + losses.setdefault(self.name, 0.0) + losses[self.name] += loss + + def rehearse(self, docs, drop=0., sgd=None, losses=None): + if self._rehearsal_model is None: + return + scores, bp_scores = self.model.begin_update(docs, drop=drop) + target = self._rehearsal_model(docs) + gradient = scores - target + bp_scores(gradient, sgd=sgd) + if losses is not None: + losses.setdefault(self.name, 0.0) + losses[self.name] += (gradient**2).sum() + + def get_loss(self, docs, golds, scores): + truths = numpy.zeros((len(golds), len(self.labels)), dtype='f') + not_missing = numpy.ones((len(golds), len(self.labels)), dtype='f') + for i, gold in enumerate(golds): + for j, label in enumerate(self.labels): + if label in gold.cats: + truths[i, j] = gold.cats[label] + else: + not_missing[i, j] = 0. + truths = self.model.ops.asarray(truths) + not_missing = self.model.ops.asarray(not_missing) + d_scores = (scores-truths) / scores.shape[0] + d_scores *= not_missing + mean_square_error = (d_scores**2).sum(axis=1).mean() + return float(mean_square_error), d_scores + + def add_label(self, label): + if label in self.labels: + return 0 + if self.model not in (None, True, False): + # This functionality was available previously, but was broken. + # The problem is that we resize the last layer, but the last layer + # is actually just an ensemble. We're not resizing the child layers + # -- a huge problem. + raise ValueError(Errors.E116) + #smaller = self.model._layers[-1] + #larger = Affine(len(self.labels)+1, smaller.nI) + #copy_array(larger.W[:smaller.nO], smaller.W) + #copy_array(larger.b[:smaller.nO], smaller.b) + #self.model._layers[-1] = larger + self.labels = tuple(list(self.labels) + [label]) + return 1 + + def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, + **kwargs): + if self.model is True: + self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors') + self.model = self.Model(len(self.labels), **self.cfg) + link_vectors_to_models(self.vocab) + if sgd is None: + sgd = self.create_optimizer() + return sgd + + +__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer', 'EntityLinker'] diff --git a/spacy/structs.pxd b/spacy/structs.pxd index fa282cae7..86b738a5c 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -71,3 +71,5 @@ cdef struct TokenC: int ent_iob attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth.. hash_t ent_id + + hash_t kb_id diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index a69a0def8..39e408a89 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -307,6 +307,14 @@ cdef class Token: def __set__(self, attr_t tag): self.vocab.morphology.assign_tag(self.c, tag) + property kb_id: + """RETURNS (uint64): ID of entity (after Entity Linking).""" + def __get__(self): + return self.c.kb_id + + def __set__(self, attr_t kb_id): + self.vocab.morphology.assign_kb_id(self.c, kb_id) + property dep: """RETURNS (uint64): ID of syntactic dependency label.""" def __get__(self): From 5f002e9cede44a4ca8ef9ee9a74c6dea0e0455fb Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 14 Mar 2019 15:48:40 +0100 Subject: [PATCH 02/28] annotate kb_id through ents in doc --- sandbox_test_sofie/testing_el.py | 13 +++++++++---- spacy/morphology.pxd | 2 -- spacy/morphology.pyx | 3 --- spacy/pipeline/pipes.pyx | 6 ++++-- spacy/structs.pxd | 3 +-- spacy/tokens/doc.pyx | 16 +++++++++++----- spacy/tokens/span.pxd | 1 + spacy/tokens/span.pyx | 11 ++++++++++- spacy/tokens/token.pyx | 24 ++++++++++++++++-------- 9 files changed, 52 insertions(+), 27 deletions(-) diff --git a/sandbox_test_sofie/testing_el.py b/sandbox_test_sofie/testing_el.py index 8d9b0c21d..7883e44d4 100644 --- a/sandbox_test_sofie/testing_el.py +++ b/sandbox_test_sofie/testing_el.py @@ -3,18 +3,23 @@ import spacy def add_el(): nlp = spacy.load('en_core_web_sm') - print("pipes", nlp.pipe_names) + print("pipes before:", nlp.pipe_names) el_pipe = nlp.create_pipe(name='el') nlp.add_pipe(el_pipe, last=True) - print("pipes", nlp.pipe_names) + print("pipes after:", nlp.pipe_names) print() - text = "Australian striker John hits century" + text = "The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, reminds us to always bring our towel." doc = nlp(text) + for token in doc: - print("token", token.text, token.tag_, token.pos_, token.kb_id) + print("token", token.text, token.ent_type_, token.ent_kb_id_) + + print() + for ent in doc.ents: + print("ent", ent.text, ent.label_, ent.kb_id_) if __name__ == "__main__": diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index d674140b0..d0110b300 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -43,8 +43,6 @@ cdef class Morphology: cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1 - cdef int assign_kb_id(self, TokenC* token, kb_id) except -1 - cdef enum univ_morph_t: NIL = 0 diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 92ca67f18..bd821d76f 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -122,9 +122,6 @@ cdef class Morphology: else: flags[0] &= ~(one << flag_id) - cdef int assign_kb_id(self, TokenC* token, kb_id) except -1: - token.kb_id = kb_id - def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False): """Add a special-case rule to the morphological analyser. Tokens whose diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 4eb3ecc80..e1e5471be 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1086,8 +1086,10 @@ class EntityLinker(Pipe): def set_annotations(self, docs, scores, tensors=None): # TODO Sofie: actually implement this class instead of dummy implementation for i, doc in enumerate(docs): - for token in doc: - token.kb_id = 342 + for ent in doc.ents: + if ent.label_ in ["PERSON", "PER"]: + for token in ent: + token.ent_kb_id_ = "Q42" def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None): scores, bp_scores = self.model.begin_update(docs, drop=drop) diff --git a/spacy/structs.pxd b/spacy/structs.pxd index 86b738a5c..154202c0d 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -70,6 +70,5 @@ cdef struct TokenC: int sent_start int ent_iob attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth.. + attr_t ent_kb_id hash_t ent_id - - hash_t kb_id diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 97ac10f76..7640368ec 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -279,7 +279,7 @@ cdef class Doc: def doc(self): return self - def char_span(self, int start_idx, int end_idx, label=0, vector=None): + def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None): """Create a `Span` object from the slice `doc.text[start : end]`. doc (Doc): The parent document. @@ -287,12 +287,15 @@ cdef class Doc: end (int): The index of the first character after the span. label (uint64 or string): A label to attach to the Span, e.g. for named entities. + kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity. vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. RETURNS (Span): The newly constructed object. """ if not isinstance(label, int): label = self.vocab.strings.add(label) + if not isinstance(kb_id, int): + kb_id = self.vocab.strings.add(kb_id) cdef int start = token_by_start(self.c, self.length, start_idx) if start == -1: return None @@ -301,7 +304,7 @@ cdef class Doc: return None # Currently we have the token index, we want the range-end index end += 1 - cdef Span span = Span(self, start, end, label=label, vector=vector) + cdef Span span = Span(self, start, end, label=label, kb_id=kb_id, vector=vector) return span def similarity(self, other): @@ -438,6 +441,7 @@ cdef class Doc: cdef const TokenC* token cdef int start = -1 cdef attr_t label = 0 + cdef attr_t kb_id = 0 output = [] for i in range(self.length): token = &self.c[i] @@ -447,16 +451,18 @@ cdef class Doc: raise ValueError(Errors.E093.format(seq=' '.join(seq))) elif token.ent_iob == 2 or token.ent_iob == 0: if start != -1: - output.append(Span(self, start, i, label=label)) + output.append(Span(self, start, i, label=label, kb_id=kb_id)) start = -1 label = 0 + kb_id = 0 elif token.ent_iob == 3: if start != -1: - output.append(Span(self, start, i, label=label)) + output.append(Span(self, start, i, label=label, kb_id=kb_id)) start = i label = token.ent_type + kb_id = token.ent_kb_id if start != -1: - output.append(Span(self, start, self.length, label=label)) + output.append(Span(self, start, self.length, label=label, kb_id=kb_id)) return tuple(output) def __set__(self, ents): diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd index 9645189a5..f6f88a23e 100644 --- a/spacy/tokens/span.pxd +++ b/spacy/tokens/span.pxd @@ -11,6 +11,7 @@ cdef class Span: cdef readonly int start_char cdef readonly int end_char cdef readonly attr_t label + cdef readonly attr_t kb_id cdef public _vector cdef public _vector_norm diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index a418fc13f..f65c84ffb 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -45,13 +45,14 @@ cdef class Span: return Underscore.span_extensions.pop(name) def __cinit__(self, Doc doc, int start, int end, label=0, - vector=None, vector_norm=None): + vector=None, vector_norm=None, kb_id=0): """Create a `Span` object from the slice `doc[start : end]`. doc (Doc): The parent document. start (int): The index of the first token of the span. end (int): The index of the first token after the span. label (uint64): A label to attach to the Span, e.g. for named entities. + kb_id (uint64): An identifier from a Knowledge Base to capture the meaning of a named entity. vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. RETURNS (Span): The newly constructed object. @@ -73,6 +74,7 @@ cdef class Span: self.label = label self._vector = vector self._vector_norm = vector_norm + self.kb_id = kb_id def __richcmp__(self, Span other, int op): if other is None: @@ -592,6 +594,13 @@ cdef class Span: def __set__(self, unicode label_): self.label = self.doc.vocab.strings.add(label_) + property kb_id_: + """RETURNS (unicode): The named entity's KB ID.""" + def __get__(self): + return self.doc.vocab.strings[self.kb_id] + def __set__(self, unicode kb_id_): + raise NotImplementedError(TempErrors.T007.format(attr='kb_id_')) + cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1: # Don't allow spaces to be the root, if there are diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 39e408a89..ccf2f8249 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -307,14 +307,6 @@ cdef class Token: def __set__(self, attr_t tag): self.vocab.morphology.assign_tag(self.c, tag) - property kb_id: - """RETURNS (uint64): ID of entity (after Entity Linking).""" - def __get__(self): - return self.c.kb_id - - def __set__(self, attr_t kb_id): - self.vocab.morphology.assign_kb_id(self.c, kb_id) - property dep: """RETURNS (uint64): ID of syntactic dependency label.""" def __get__(self): @@ -699,6 +691,22 @@ cdef class Token: def __set__(self, name): self.c.ent_id = self.vocab.strings.add(name) + property ent_kb_id: + """RETURNS (uint64): Named entity KB ID.""" + def __get__(self): + return self.c.ent_kb_id + + def __set__(self, attr_t ent_kb_id): + self.c.ent_kb_id = ent_kb_id + + property ent_kb_id_: + """RETURNS (unicode): Named entity KB ID.""" + def __get__(self): + return self.vocab.strings[self.c.ent_kb_id] + + def __set__(self, ent_kb_id): + self.c.ent_kb_id = self.vocab.strings.add(ent_kb_id) + property whitespace_: """RETURNS (unicode): The trailing whitespace character, if present. """ From 097e5f3da1abdeca99b5a15b89df2883276a4ec7 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 15 Mar 2019 11:17:35 +0100 Subject: [PATCH 03/28] kb snippet, draft by Matt (wip) --- spacy/kb.pxd | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 spacy/kb.pxd diff --git a/spacy/kb.pxd b/spacy/kb.pxd new file mode 100644 index 000000000..939030098 --- /dev/null +++ b/spacy/kb.pxd @@ -0,0 +1,93 @@ +"""Knowledge-base for entity or concept linking.""" +from cymem.cymem cimport Pool +from preshed.maps cimport PreshMap +from libcpp.vector cimport vector +from libc.stdint cimport int32_t +from spacy.typedefs cimport attr_t + + +# Internal struct, for storage and disambiguation. This isn't what we return +# to the user as the answer to "here's your entity". It's the minimum number +# of bits we need to keep track of the answers. +cdef struct _EntryC: + + # Allows retrieval of one or more vectors. + # Each element of vector_rows should be an index into a vectors table. + # Every entry should have the same number of vectors, so we can avoid storing + # the number of vectors in each knowledge-base struct + const int32_t* vector_rows + + # Allows retrieval of a struct of non-vector features. We could make this a + # pointer, but we have 32 bits left over in the struct after prob, so we'd + # like this to only be 32 bits. We can also set this to -1, for the common + # case where there are no features. + int32_t feats_row + float prob # log probability of entity, based on corpus frequency + + +cdef class KnowledgeBase: + cdef Pool mem + + # This maps 64bit keys to 64bit values. Here the key would be a hash of + # a unique string name for the entity, and the value would be the position + # of the _EntryC struct in our vector. + # The PreshMap is pretty space efficient, as it uses open addressing. So + # the only overhead is the vacancy rate, which is approximately 30%. + cdef PreshMap _index + + # Each entry takes 128 bits, and again we'll have a 30% or so overhead for + # over allocation. + # In total we end up with (N*128*1.3)+(N*128*1.3) bits for N entries. + # Storing 1m entries would take 41.6mb under this scheme. + cdef vector[_EntryC] _entries + + # This is the part which might take more space: storing various + # categorical features for the entries, and storing vectors for disambiguation + # and possibly usage. + # If each entry gets a 300-dimensional vector, for 1m entries we would need + # 1.2gb. That gets expensive fast. What might be better is to avoid learning + # a unique vector for every entity. We could instead have a compositional + # model, that embeds different features of the entities into vectors. We'll + # still want some per-entity features, like the Wikipedia text or entity + # co-occurrence. Hopefully those vectors can be narrow, e.g. 64 dimensions. + cdef object _vectors_table + + # It's very useful to track categorical features, at least for output, even + # if they're not useful in the model itself. For instance, we should be + # able to track stuff like a person's date of birth or whatever. This can + # easily make the KB bigger, but if this isn't needed by the model, and it's + # optional data, we can let users configure a DB as the backend for this. + cdef object _features_table + + # This should map mention hashes to (entry_id, prob) tuples. The probability + # should be P(entity | mention), which is pretty important to know. + # We can pack both pieces of information into a 64-bit vale, to keep things + # efficient. + cdef object _aliases_table + + def __len__(self): + return self._entries.size() + + def add(self, name, float prob, vectors=None, features=None, aliases=None): + if name in self: + return + cdef attr_t orth = get_string_name(name) + self.c_add(orth, prob, self._vectors_table.get_pointer(vectors), + self._features_table.get(features)) + for alias in aliases: + self._aliases_table.add(alias, orth) + + cdef void c_add(self, attr_t orth, float prob, const int32_t* vector_rows, + int feats_row) nogil: + """Add an entry to the knowledge base.""" + # This is what we'll map the orth to. It's where the entry will sit + # in the vector of entries, so we can get it later. + cdef int64_t index = self.c.size() + self._entries.push_back( + _EntryC( + vector_rows=vector_rows, + feats_row=feats_row, + prob=prob + )) + self._index[orth] = index + return index \ No newline at end of file From b6bac4944495eddfc324d8da43569095fad20510 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 15 Mar 2019 11:37:24 +0100 Subject: [PATCH 04/28] documented some comments and todos --- spacy/kb.pxd | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 939030098..1162c078f 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -22,7 +22,9 @@ cdef struct _EntryC: # like this to only be 32 bits. We can also set this to -1, for the common # case where there are no features. int32_t feats_row - float prob # log probability of entity, based on corpus frequency + + # log probability of entity, based on corpus frequency + float prob cdef class KnowledgeBase: @@ -61,7 +63,7 @@ cdef class KnowledgeBase: # This should map mention hashes to (entry_id, prob) tuples. The probability # should be P(entity | mention), which is pretty important to know. - # We can pack both pieces of information into a 64-bit vale, to keep things + # We can pack both pieces of information into a 64-bit value, to keep things # efficient. cdef object _aliases_table @@ -69,20 +71,25 @@ cdef class KnowledgeBase: return self._entries.size() def add(self, name, float prob, vectors=None, features=None, aliases=None): + # TODO: more friendly check for non-unique name if name in self: return + + # TODO: convert name to hash cdef attr_t orth = get_string_name(name) self.c_add(orth, prob, self._vectors_table.get_pointer(vectors), self._features_table.get(features)) - for alias in aliases: - self._aliases_table.add(alias, orth) + + # TODO: hash the aliases? + for alias, prob_alias in aliases: + self._aliases_table.add(alias, orth, prob_alias) cdef void c_add(self, attr_t orth, float prob, const int32_t* vector_rows, int feats_row) nogil: """Add an entry to the knowledge base.""" # This is what we'll map the orth to. It's where the entry will sit # in the vector of entries, so we can get it later. - cdef int64_t index = self.c.size() + cdef int64_t index = self._entries.size() self._entries.push_back( _EntryC( vector_rows=vector_rows, From dc603fb85e86eed939b322eacb6c9faf01437d3f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 15 Mar 2019 15:00:53 +0100 Subject: [PATCH 05/28] hash the entity name --- spacy/kb.pxd | 18 +++++++++--------- spacy/tokens/span.pyx | 2 ++ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 1162c078f..e715cad88 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -2,8 +2,9 @@ from cymem.cymem cimport Pool from preshed.maps cimport PreshMap from libcpp.vector cimport vector -from libc.stdint cimport int32_t -from spacy.typedefs cimport attr_t +from libc.stdint cimport int32_t, int64_t +from .typedefs cimport attr_t, hash_t +from .strings cimport hash_string # Internal struct, for storage and disambiguation. This isn't what we return @@ -70,21 +71,20 @@ cdef class KnowledgeBase: def __len__(self): return self._entries.size() - def add(self, name, float prob, vectors=None, features=None, aliases=None): + def add_entity(self, name, float prob, vectors=None, features=None, aliases=None): # TODO: more friendly check for non-unique name if name in self: return - # TODO: convert name to hash - cdef attr_t orth = get_string_name(name) - self.c_add(orth, prob, self._vectors_table.get_pointer(vectors), + cdef hash_t key = hash_string(name) + self.c_add_entity(key, prob, self._vectors_table.get_pointer(vectors), self._features_table.get(features)) # TODO: hash the aliases? for alias, prob_alias in aliases: - self._aliases_table.add(alias, orth, prob_alias) + self._aliases_table.add(alias, key, prob_alias) - cdef void c_add(self, attr_t orth, float prob, const int32_t* vector_rows, + cdef void c_add_entity(self, hash_t key, float prob, const int32_t* vector_rows, int feats_row) nogil: """Add an entry to the knowledge base.""" # This is what we'll map the orth to. It's where the entry will sit @@ -96,5 +96,5 @@ cdef class KnowledgeBase: feats_row=feats_row, prob=prob )) - self._index[orth] = index + self._index[key] = index return index \ No newline at end of file diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index f65c84ffb..44ca74e9a 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -598,6 +598,8 @@ cdef class Span: """RETURNS (unicode): The named entity's KB ID.""" def __get__(self): return self.doc.vocab.strings[self.kb_id] + + # TODO: custom error msg like for label_ def __set__(self, unicode kb_id_): raise NotImplementedError(TempErrors.T007.format(attr='kb_id_')) From 56b55e3bcdedd0c39a1350a0d0fd1ea500385808 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 15 Mar 2019 16:05:23 +0100 Subject: [PATCH 06/28] add pyx and separate method to add aliases --- spacy/kb.pxd | 21 ++------------------- spacy/kb.pyx | 27 +++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 19 deletions(-) create mode 100644 spacy/kb.pyx diff --git a/spacy/kb.pxd b/spacy/kb.pxd index e715cad88..9d9a21a8c 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -3,8 +3,7 @@ from cymem.cymem cimport Pool from preshed.maps cimport PreshMap from libcpp.vector cimport vector from libc.stdint cimport int32_t, int64_t -from .typedefs cimport attr_t, hash_t -from .strings cimport hash_string +from .typedefs cimport hash_t # Internal struct, for storage and disambiguation. This isn't what we return @@ -68,26 +67,10 @@ cdef class KnowledgeBase: # efficient. cdef object _aliases_table - def __len__(self): - return self._entries.size() - - def add_entity(self, name, float prob, vectors=None, features=None, aliases=None): - # TODO: more friendly check for non-unique name - if name in self: - return - - cdef hash_t key = hash_string(name) - self.c_add_entity(key, prob, self._vectors_table.get_pointer(vectors), - self._features_table.get(features)) - - # TODO: hash the aliases? - for alias, prob_alias in aliases: - self._aliases_table.add(alias, key, prob_alias) - cdef void c_add_entity(self, hash_t key, float prob, const int32_t* vector_rows, int feats_row) nogil: """Add an entry to the knowledge base.""" - # This is what we'll map the orth to. It's where the entry will sit + # This is what we'll map the hash key to. It's where the entry will sit # in the vector of entries, so we can get it later. cdef int64_t index = self._entries.size() self._entries.push_back( diff --git a/spacy/kb.pyx b/spacy/kb.pyx new file mode 100644 index 000000000..ce76f2fc4 --- /dev/null +++ b/spacy/kb.pyx @@ -0,0 +1,27 @@ +from .strings cimport hash_string + + +cdef class KnowledgeBase: + def __len__(self): + return self._entries.size() + + def add_entity(self, name, float prob, vectors=None, features=None, aliases=None): + # TODO: more friendly check for non-unique name + if name in self: + return + + cdef hash_t name_hash = hash_string(name) + self.c_add_entity(name_hash, prob, self._vectors_table.get_pointer(vectors), + self._features_table.get(features)) + + def add_alias(self, alias, entities, probabilities): + """For a given alias, add its potential entities and prior probabilies to the KB.""" + cdef hash_t alias_hash = hash_string(alias) + + # TODO: check len(entities) == len(probabilities) + for entity, prob in zip(entities, probabilities): + cdef hash_t entity_hash = hash_string(entity) + cdef int64_t entity_index = self._index[entity_hash] + # TODO: check that entity is already in this KB (entity_index is OK) + self._aliases_table.add(alias_hash, entity_index, prob) + From 3945fd21b0f15fb8fada0a5e2119821f9e26fbd1 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 18 Mar 2019 10:31:01 +0100 Subject: [PATCH 07/28] fix compile errors --- spacy/kb.pxd | 4 ++-- spacy/kb.pyx | 12 ++++++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 9d9a21a8c..3ba9c8bba 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -67,8 +67,8 @@ cdef class KnowledgeBase: # efficient. cdef object _aliases_table - cdef void c_add_entity(self, hash_t key, float prob, const int32_t* vector_rows, - int feats_row) nogil: + cdef inline int64_t c_add_entity(self, hash_t key, float prob, const int32_t* vector_rows, + int feats_row): """Add an entry to the knowledge base.""" # This is what we'll map the hash key to. It's where the entry will sit # in the vector of entries, so we can get it later. diff --git a/spacy/kb.pyx b/spacy/kb.pyx index ce76f2fc4..46acc2967 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -11,17 +11,21 @@ cdef class KnowledgeBase: return cdef hash_t name_hash = hash_string(name) - self.c_add_entity(name_hash, prob, self._vectors_table.get_pointer(vectors), - self._features_table.get(features)) + cdef int32_t dummy_value = 342 + self.c_add_entity(name_hash, prob, &dummy_value, dummy_value) + # TODO self._vectors_table.get_pointer(vectors), + # self._features_table.get(features)) def add_alias(self, alias, entities, probabilities): """For a given alias, add its potential entities and prior probabilies to the KB.""" cdef hash_t alias_hash = hash_string(alias) + cdef hash_t entity_hash = 0 + cdef int64_t entity_index = 0 # TODO: check len(entities) == len(probabilities) for entity, prob in zip(entities, probabilities): - cdef hash_t entity_hash = hash_string(entity) - cdef int64_t entity_index = self._index[entity_hash] + entity_hash = hash_string(entity) + entity_index = self._index[entity_hash] # TODO: check that entity is already in this KB (entity_index is OK) self._aliases_table.add(alias_hash, entity_index, prob) From 5ac7edf53c328c90ac4701ef687b0964ea4b756c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 18 Mar 2019 12:38:40 +0100 Subject: [PATCH 08/28] adding aliases per entity in the KB --- spacy/kb.pxd | 53 +++++++++++++++++++++++++++++++++++++++------------- spacy/kb.pyx | 25 ++++++++++++++----------- 2 files changed, 54 insertions(+), 24 deletions(-) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 3ba9c8bba..92a0c8b95 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -27,15 +27,25 @@ cdef struct _EntryC: float prob +# Each alias struct stores a list of Entry pointers with their prior probabilities +# for this specific mention/alias. +cdef struct _AliasC: + + # All entry candidates for this alias + const vector[int64_t] entry_indices + + # Prior probability P(entity|alias) - should sum up to (at most) 1. + const vector[float] probs + + cdef class KnowledgeBase: cdef Pool mem - # This maps 64bit keys to 64bit values. Here the key would be a hash of - # a unique string name for the entity, and the value would be the position - # of the _EntryC struct in our vector. + # This maps 64bit keys (hash of unique entity string) + # to 64bit values (position of the _EntryC struct in the _entries vector). # The PreshMap is pretty space efficient, as it uses open addressing. So # the only overhead is the vacancy rate, which is approximately 30%. - cdef PreshMap _index + cdef PreshMap _entry_index # Each entry takes 128 bits, and again we'll have a 30% or so overhead for # over allocation. @@ -43,6 +53,16 @@ cdef class KnowledgeBase: # Storing 1m entries would take 41.6mb under this scheme. cdef vector[_EntryC] _entries + # This maps 64bit keys (hash of unique alias string) + # to 64bit values (position of the _AliasC struct in the _aliases_table vector). + cdef PreshMap _alias_index + + # This should map mention hashes to (entry_id, prob) tuples. The probability + # should be P(entity | mention), which is pretty important to know. + # We can pack both pieces of information into a 64-bit value, to keep things + # efficient. + cdef vector[_AliasC] _aliases_table + # This is the part which might take more space: storing various # categorical features for the entries, and storing vectors for disambiguation # and possibly usage. @@ -61,23 +81,30 @@ cdef class KnowledgeBase: # optional data, we can let users configure a DB as the backend for this. cdef object _features_table - # This should map mention hashes to (entry_id, prob) tuples. The probability - # should be P(entity | mention), which is pretty important to know. - # We can pack both pieces of information into a 64-bit value, to keep things - # efficient. - cdef object _aliases_table - cdef inline int64_t c_add_entity(self, hash_t key, float prob, const int32_t* vector_rows, + cdef inline int64_t c_add_entity(self, hash_t entity_key, float prob, const int32_t* vector_rows, int feats_row): """Add an entry to the knowledge base.""" # This is what we'll map the hash key to. It's where the entry will sit # in the vector of entries, so we can get it later. - cdef int64_t index = self._entries.size() + cdef int64_t entity_index = self._entries.size() self._entries.push_back( _EntryC( vector_rows=vector_rows, feats_row=feats_row, prob=prob )) - self._index[key] = index - return index \ No newline at end of file + self._index[entity_key] = entity_index + return entity_index + + cdef inline int64_t c_add_aliases(self, hash_t alias_key, vector[int64_t] entry_indices, vector[float] probs): + """Connect a mention to a list of potential entities with their prior probabilities .""" + cdef int64_t alias_index = self._aliases_table.size() + + self._aliases_table.push_back( + _AliasC( + entry_indices=entry_indices, + probs=probs + )) + self._alias_index[alias_key] = alias_index + return alias_index \ No newline at end of file diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 46acc2967..0f6a7aecc 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -5,16 +5,16 @@ cdef class KnowledgeBase: def __len__(self): return self._entries.size() - def add_entity(self, name, float prob, vectors=None, features=None, aliases=None): + def add_entity(self, entity_id: str, float prob, vectors=None, features=None): # TODO: more friendly check for non-unique name - if name in self: + if entity_id in self: return - cdef hash_t name_hash = hash_string(name) + cdef hash_t id_hash = hash_string(entity_id) cdef int32_t dummy_value = 342 - self.c_add_entity(name_hash, prob, &dummy_value, dummy_value) + self.c_add_entity(entity_key=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value) # TODO self._vectors_table.get_pointer(vectors), - # self._features_table.get(features)) + # self._features_table.get(features)) def add_alias(self, alias, entities, probabilities): """For a given alias, add its potential entities and prior probabilies to the KB.""" @@ -22,10 +22,13 @@ cdef class KnowledgeBase: cdef hash_t entity_hash = 0 cdef int64_t entity_index = 0 - # TODO: check len(entities) == len(probabilities) - for entity, prob in zip(entities, probabilities): - entity_hash = hash_string(entity) - entity_index = self._index[entity_hash] - # TODO: check that entity is already in this KB (entity_index is OK) - self._aliases_table.add(alias_hash, entity_index, prob) + cdef vector[int64_t] entry_indices = [self._entry_index[hash_string(entity)] for entity in entities] + + self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probabilities) + + # TODO: check that alias hadn't been defined before + # TODO: check that entity is already in this KB (entity_index is OK) + # TODO: check sum(probabilities) <= 1 + # TODO: check len(entities) == len(probabilities) + From a14fb54b172993f2a72d9b83d2d2b8d116c6a609 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 18 Mar 2019 17:27:51 +0100 Subject: [PATCH 09/28] very minimal KB functionality working --- setup.py | 1 + spacy/kb.pxd | 17 ++++++++--- spacy/kb.pyx | 30 ++++++++++++------- .../sandbox_test_sofie}/__init__.py | 0 .../sandbox_test_sofie}/testing_el.py | 15 +++++++++- 5 files changed, 47 insertions(+), 16 deletions(-) rename {sandbox_test_sofie => spacy/sandbox_test_sofie}/__init__.py (100%) rename {sandbox_test_sofie => spacy/sandbox_test_sofie}/testing_el.py (67%) diff --git a/setup.py b/setup.py index 34c92ad2b..c27082c25 100755 --- a/setup.py +++ b/setup.py @@ -40,6 +40,7 @@ MOD_NAMES = [ "spacy.lexeme", "spacy.vocab", "spacy.attrs", + "spacy.kb", "spacy.morphology", "spacy.pipeline.pipes", "spacy.syntax.stateclass", diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 92a0c8b95..43f3e83e8 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -4,6 +4,7 @@ from preshed.maps cimport PreshMap from libcpp.vector cimport vector from libc.stdint cimport int32_t, int64_t from .typedefs cimport hash_t +from .strings cimport hash_string # Internal struct, for storage and disambiguation. This isn't what we return @@ -32,10 +33,10 @@ cdef struct _EntryC: cdef struct _AliasC: # All entry candidates for this alias - const vector[int64_t] entry_indices + vector[int64_t] entry_indices # Prior probability P(entity|alias) - should sum up to (at most) 1. - const vector[float] probs + vector[float] probs cdef class KnowledgeBase: @@ -94,13 +95,21 @@ cdef class KnowledgeBase: feats_row=feats_row, prob=prob )) - self._index[entity_key] = entity_index + self._entry_index[entity_key] = entity_index return entity_index - cdef inline int64_t c_add_aliases(self, hash_t alias_key, vector[int64_t] entry_indices, vector[float] probs): + cdef inline int64_t c_add_aliases(self, hash_t alias_key, entities, probabilities): """Connect a mention to a list of potential entities with their prior probabilities .""" cdef int64_t alias_index = self._aliases_table.size() + cdef vector[int64_t] entry_indices + cdef vector[float] probs + + for entity, prob in zip(entities, probs): + entry_index = self._entry_index[hash_string(entity)] + entry_indices.push_back(entry_index) + probs.push_back(prob) + self._aliases_table.push_back( _AliasC( entry_indices=entry_indices, diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 0f6a7aecc..d2b8fffe1 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -1,34 +1,42 @@ -from .strings cimport hash_string +# cython: profile=True +# coding: utf8 +from preshed.maps import PreshMap cdef class KnowledgeBase: + + def __init__(self): + self._entry_index = PreshMap() + self._alias_index = PreshMap() + self.mem = Pool() + + def __len__(self): return self._entries.size() - def add_entity(self, entity_id: str, float prob, vectors=None, features=None): + def add_entity(self, unicode entity_id, float prob, vectors=None, features=None): + cdef hash_t id_hash = hash_string(entity_id) + # TODO: more friendly check for non-unique name - if entity_id in self: + if id_hash in self._entry_index: return - cdef hash_t id_hash = hash_string(entity_id) + cdef int32_t dummy_value = 342 self.c_add_entity(entity_key=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value) # TODO self._vectors_table.get_pointer(vectors), # self._features_table.get(features)) - def add_alias(self, alias, entities, probabilities): + def add_alias(self, unicode alias, entities, probabilities): """For a given alias, add its potential entities and prior probabilies to the KB.""" cdef hash_t alias_hash = hash_string(alias) - cdef hash_t entity_hash = 0 - cdef int64_t entity_index = 0 - - cdef vector[int64_t] entry_indices = [self._entry_index[hash_string(entity)] for entity in entities] - - self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probabilities) # TODO: check that alias hadn't been defined before # TODO: check that entity is already in this KB (entity_index is OK) # TODO: check sum(probabilities) <= 1 # TODO: check len(entities) == len(probabilities) + self.c_add_aliases(alias_key=alias_hash, entities=entities, probabilities=probabilities) + + diff --git a/sandbox_test_sofie/__init__.py b/spacy/sandbox_test_sofie/__init__.py similarity index 100% rename from sandbox_test_sofie/__init__.py rename to spacy/sandbox_test_sofie/__init__.py diff --git a/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py similarity index 67% rename from sandbox_test_sofie/testing_el.py rename to spacy/sandbox_test_sofie/testing_el.py index 7883e44d4..840d890b5 100644 --- a/sandbox_test_sofie/testing_el.py +++ b/spacy/sandbox_test_sofie/testing_el.py @@ -1,4 +1,16 @@ import spacy +from spacy.kb import KnowledgeBase + + +def create_kb(): + mykb = KnowledgeBase() + print("kb size", len(mykb)) + + entity_id = "Q42" + mykb.add_entity(entity_id=entity_id, prob=0.5) + print("adding entity", entity_id) + + print("kb size", len(mykb)) def add_el(): @@ -23,4 +35,5 @@ def add_el(): if __name__ == "__main__": - add_el() + # add_el() + create_kb() From a4d876d47101523a4b4d7591dddfe2fd780b2601 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 18 Mar 2019 17:50:01 +0100 Subject: [PATCH 10/28] adding and retrieving aliases --- spacy/kb.pxd | 8 +++++++- spacy/kb.pyx | 5 ++++- spacy/sandbox_test_sofie/testing_el.py | 20 +++++++++++++++++--- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 43f3e83e8..7ee7f38be 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -116,4 +116,10 @@ cdef class KnowledgeBase: probs=probs )) self._alias_index[alias_key] = alias_index - return alias_index \ No newline at end of file + return alias_index + + cdef inline c_get_candidates(self, hash_t alias_key): + cdef int64_t alias_index = self._alias_index[alias_key] + cdef _AliasC candidates = self._aliases_table[alias_index] + print("candidates", candidates) + diff --git a/spacy/kb.pyx b/spacy/kb.pyx index d2b8fffe1..f420e0b73 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -38,5 +38,8 @@ cdef class KnowledgeBase: self.c_add_aliases(alias_key=alias_hash, entities=entities, probabilities=probabilities) - + def get_candidates(self, unicode alias): + cdef hash_t alias_hash = hash_string(alias) + cdef _AliasC candidates = self.c_get_candidates(alias_key=alias_hash) + return candidates diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py index 840d890b5..9a5ab638d 100644 --- a/spacy/sandbox_test_sofie/testing_el.py +++ b/spacy/sandbox_test_sofie/testing_el.py @@ -6,12 +6,26 @@ def create_kb(): mykb = KnowledgeBase() print("kb size", len(mykb)) - entity_id = "Q42" - mykb.add_entity(entity_id=entity_id, prob=0.5) - print("adding entity", entity_id) + # adding entities + entity_42 = "Q42" # douglas adams + mykb.add_entity(entity_id=entity_42, prob=0.5) + print("adding entity", entity_42) + + entity_5301561 = "Q5301561" + mykb.add_entity(entity_id=entity_5301561, prob=0.5) + print("adding entity", entity_5301561) print("kb size", len(mykb)) + # adding aliases + alias = "douglas" + print("adding alias", alias) + mykb.add_alias(alias=alias, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2]) + print("kb size", len(mykb)) + + print("aliases for", alias) + mykb.get_candidates(alias) + def add_el(): nlp = spacy.load('en_core_web_sm') From c62cca3368fb451a40ef0815107f630e65ca6b25 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 19 Mar 2019 15:51:56 +0100 Subject: [PATCH 11/28] get candidates by alias --- spacy/kb.pxd | 4 ---- spacy/kb.pyx | 11 ++++++++--- spacy/sandbox_test_sofie/testing_el.py | 18 ++++++++++-------- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 7ee7f38be..d96502f41 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -118,8 +118,4 @@ cdef class KnowledgeBase: self._alias_index[alias_key] = alias_index return alias_index - cdef inline c_get_candidates(self, hash_t alias_key): - cdef int64_t alias_index = self._alias_index[alias_key] - cdef _AliasC candidates = self._aliases_table[alias_index] - print("candidates", candidates) diff --git a/spacy/kb.pyx b/spacy/kb.pyx index f420e0b73..b4369d59b 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -10,10 +10,15 @@ cdef class KnowledgeBase: self._alias_index = PreshMap() self.mem = Pool() - def __len__(self): + return self.get_size_entities() + + def get_size_entities(self): return self._entries.size() + def get_size_aliases(self): + return self._aliases_table.size() + def add_entity(self, unicode entity_id, float prob, vectors=None, features=None): cdef hash_t id_hash = hash_string(entity_id) @@ -40,6 +45,6 @@ cdef class KnowledgeBase: def get_candidates(self, unicode alias): cdef hash_t alias_hash = hash_string(alias) - cdef _AliasC candidates = self.c_get_candidates(alias_key=alias_hash) - return candidates + alias_index = self._alias_index.get(alias_hash) + return self._aliases_table[alias_index] diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py index 9a5ab638d..b6255f9f9 100644 --- a/spacy/sandbox_test_sofie/testing_el.py +++ b/spacy/sandbox_test_sofie/testing_el.py @@ -4,27 +4,29 @@ from spacy.kb import KnowledgeBase def create_kb(): mykb = KnowledgeBase() - print("kb size", len(mykb)) + print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) # adding entities entity_42 = "Q42" # douglas adams mykb.add_entity(entity_id=entity_42, prob=0.5) - print("adding entity", entity_42) + print(" adding entity", entity_42) entity_5301561 = "Q5301561" mykb.add_entity(entity_id=entity_5301561, prob=0.5) - print("adding entity", entity_5301561) + print(" adding entity", entity_5301561) - print("kb size", len(mykb)) + print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) # adding aliases alias = "douglas" - print("adding alias", alias) + print(" adding alias", alias) mykb.add_alias(alias=alias, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2]) - print("kb size", len(mykb)) - print("aliases for", alias) - mykb.get_candidates(alias) + print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) + + print("candidates for", alias) + candidates = mykb.get_candidates(alias) + print(" ", candidates) def add_el(): From 1fba7219fb42a07c8ca8b6a3d9fe191c8ee364af Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 19 Mar 2019 16:15:38 +0100 Subject: [PATCH 12/28] bugfix adding aliases --- spacy/kb.pxd | 10 +--------- spacy/kb.pyx | 12 +++++++++++- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index d96502f41..9f0a5e68d 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -98,18 +98,10 @@ cdef class KnowledgeBase: self._entry_index[entity_key] = entity_index return entity_index - cdef inline int64_t c_add_aliases(self, hash_t alias_key, entities, probabilities): + cdef inline int64_t c_add_aliases(self, hash_t alias_key, vector[int64_t] entry_indices, vector[float] probs): """Connect a mention to a list of potential entities with their prior probabilities .""" cdef int64_t alias_index = self._aliases_table.size() - cdef vector[int64_t] entry_indices - cdef vector[float] probs - - for entity, prob in zip(entities, probs): - entry_index = self._entry_index[hash_string(entity)] - entry_indices.push_back(entry_index) - probs.push_back(prob) - self._aliases_table.push_back( _AliasC( entry_indices=entry_indices, diff --git a/spacy/kb.pyx b/spacy/kb.pyx index b4369d59b..854feb069 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -35,13 +35,23 @@ cdef class KnowledgeBase: def add_alias(self, unicode alias, entities, probabilities): """For a given alias, add its potential entities and prior probabilies to the KB.""" cdef hash_t alias_hash = hash_string(alias) + cdef hash_t entity_hash + + cdef vector[int64_t] entry_indices + cdef vector[float] probs + + for entity, prob in zip(entities, probabilities): + entity_hash = hash_string(entity) + entry_index = self._entry_index.get(entity_hash) + entry_indices.push_back(int(entry_index)) + probs.push_back(float(prob)) # TODO: check that alias hadn't been defined before # TODO: check that entity is already in this KB (entity_index is OK) # TODO: check sum(probabilities) <= 1 # TODO: check len(entities) == len(probabilities) - self.c_add_aliases(alias_key=alias_hash, entities=entities, probabilities=probabilities) + self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probs) def get_candidates(self, unicode alias): cdef hash_t alias_hash = hash_string(alias) From 1d20f19208a33ce737ea467ab84131a005eb3550 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 19 Mar 2019 16:43:23 +0100 Subject: [PATCH 13/28] use StringStore --- spacy/kb.pxd | 4 +++- spacy/kb.pyx | 12 +++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 9f0a5e68d..f4f60d478 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -3,8 +3,9 @@ from cymem.cymem cimport Pool from preshed.maps cimport PreshMap from libcpp.vector cimport vector from libc.stdint cimport int32_t, int64_t + +from spacy.strings cimport StringStore from .typedefs cimport hash_t -from .strings cimport hash_string # Internal struct, for storage and disambiguation. This isn't what we return @@ -41,6 +42,7 @@ cdef struct _AliasC: cdef class KnowledgeBase: cdef Pool mem + cpdef readonly StringStore strings # This maps 64bit keys (hash of unique entity string) # to 64bit values (position of the _EntryC struct in the _entries vector). diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 854feb069..969b43f6d 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -1,7 +1,5 @@ # cython: profile=True # coding: utf8 -from preshed.maps import PreshMap - cdef class KnowledgeBase: @@ -9,6 +7,7 @@ cdef class KnowledgeBase: self._entry_index = PreshMap() self._alias_index = PreshMap() self.mem = Pool() + self.strings = StringStore() def __len__(self): return self.get_size_entities() @@ -20,13 +19,12 @@ cdef class KnowledgeBase: return self._aliases_table.size() def add_entity(self, unicode entity_id, float prob, vectors=None, features=None): - cdef hash_t id_hash = hash_string(entity_id) + cdef hash_t id_hash = self.strings.add(entity_id) # TODO: more friendly check for non-unique name if id_hash in self._entry_index: return - cdef int32_t dummy_value = 342 self.c_add_entity(entity_key=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value) # TODO self._vectors_table.get_pointer(vectors), @@ -34,14 +32,14 @@ cdef class KnowledgeBase: def add_alias(self, unicode alias, entities, probabilities): """For a given alias, add its potential entities and prior probabilies to the KB.""" - cdef hash_t alias_hash = hash_string(alias) + cdef hash_t alias_hash = self.strings.add(alias) cdef hash_t entity_hash cdef vector[int64_t] entry_indices cdef vector[float] probs for entity, prob in zip(entities, probabilities): - entity_hash = hash_string(entity) + entity_hash = self.strings.add(entity) entry_index = self._entry_index.get(entity_hash) entry_indices.push_back(int(entry_index)) probs.push_back(float(prob)) @@ -54,7 +52,7 @@ cdef class KnowledgeBase: self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probs) def get_candidates(self, unicode alias): - cdef hash_t alias_hash = hash_string(alias) + cdef hash_t alias_hash = self.strings.add(alias) alias_index = self._alias_index.get(alias_hash) return self._aliases_table[alias_index] From 19d3a2f9aa637bfd6f813e452df324352fc60621 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 19 Mar 2019 17:39:35 +0100 Subject: [PATCH 14/28] raising error when adding alias for unknown entity + unit test --- spacy/kb.pyx | 6 ++++-- spacy/sandbox_test_sofie/testing_el.py | 6 +++++- spacy/tests/pipeline/test_el.py | 29 ++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 3 deletions(-) create mode 100644 spacy/tests/pipeline/test_el.py diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 969b43f6d..ea23e5373 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -39,13 +39,15 @@ cdef class KnowledgeBase: cdef vector[float] probs for entity, prob in zip(entities, probabilities): - entity_hash = self.strings.add(entity) + entity_hash = self.strings[entity] + if not entity_hash in self._entry_index: + raise ValueError("Alias '" + alias + "' defined for unknown entity '" + entity + "'") + entry_index = self._entry_index.get(entity_hash) entry_indices.push_back(int(entry_index)) probs.push_back(float(prob)) # TODO: check that alias hadn't been defined before - # TODO: check that entity is already in this KB (entity_index is OK) # TODO: check sum(probabilities) <= 1 # TODO: check len(entities) == len(probabilities) diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py index b6255f9f9..b5b529d4b 100644 --- a/spacy/sandbox_test_sofie/testing_el.py +++ b/spacy/sandbox_test_sofie/testing_el.py @@ -7,6 +7,10 @@ def create_kb(): print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) # adding entities + entity_0 = "Q0" # douglas adams + mykb.add_entity(entity_id=entity_0, prob=0.5) + print(" adding entity", entity_0) + entity_42 = "Q42" # douglas adams mykb.add_entity(entity_id=entity_42, prob=0.5) print(" adding entity", entity_42) @@ -18,7 +22,7 @@ def create_kb(): print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) # adding aliases - alias = "douglas" + alias = "douglassss" print(" adding alias", alias) mykb.add_alias(alias=alias, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2]) diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_el.py new file mode 100644 index 000000000..ed88076ce --- /dev/null +++ b/spacy/tests/pipeline/test_el.py @@ -0,0 +1,29 @@ +import pytest + +from spacy.kb import KnowledgeBase + + +def test_kb_valid_entities(): + mykb = KnowledgeBase() + + # adding entities + mykb.add_entity(entity_id="Q1", prob=0.5) + mykb.add_entity(entity_id="Q2", prob=0.5) + mykb.add_entity(entity_id="Q3", prob=0.5) + + # adding aliases + mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.8, 0.2]) + + +def test_kb_invalid_entities(): + mykb = KnowledgeBase() + + # adding entities + mykb.add_entity(entity_id="Q1", prob=0.5) + mykb.add_entity(entity_id="Q2", prob=0.5) + mykb.add_entity(entity_id="Q3", prob=0.5) + + # adding aliases - should fail because one of the given IDs is not valid + with pytest.raises(ValueError): + mykb.add_alias(alias="douglassss", entities=["Q2", "Q342"], probabilities=[0.8, 0.2]) + From 2f2f8216486306e96d06c5e83f63131bcef92990 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 19 Mar 2019 21:35:24 +0100 Subject: [PATCH 15/28] avoid value 0 in preshmap and helpful user warnings --- spacy/kb.pxd | 19 +++++++++++++++++++ spacy/kb.pyx | 13 +++++++++++-- spacy/sandbox_test_sofie/testing_el.py | 20 +++++++++++++++++--- 3 files changed, 47 insertions(+), 5 deletions(-) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index f4f60d478..d0f31ebb4 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -112,4 +112,23 @@ cdef class KnowledgeBase: self._alias_index[alias_key] = alias_index return alias_index + cdef inline create_empty_vectors(self): + """ + Making sure the first element of each vector is a dummy, + because the PreshMap maps pointing to indices in these vectors can not contain 0 as value + cf. https://github.com/explosion/preshed/issues/17 + """ + cdef int32_t dummy_value = 0 + self._entries.push_back( + _EntryC( + vector_rows=&dummy_value, + feats_row=dummy_value, + prob=dummy_value + )) + self._aliases_table.push_back( + _AliasC( + entry_indices=[dummy_value], + probs=[dummy_value] + )) + diff --git a/spacy/kb.pyx b/spacy/kb.pyx index ea23e5373..f67519260 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -1,5 +1,6 @@ # cython: profile=True # coding: utf8 +from spacy.errors import user_warning cdef class KnowledgeBase: @@ -8,6 +9,7 @@ cdef class KnowledgeBase: self._alias_index = PreshMap() self.mem = Pool() self.strings = StringStore() + self.create_empty_vectors() def __len__(self): return self.get_size_entities() @@ -21,8 +23,9 @@ cdef class KnowledgeBase: def add_entity(self, unicode entity_id, float prob, vectors=None, features=None): cdef hash_t id_hash = self.strings.add(entity_id) - # TODO: more friendly check for non-unique name + # Return if this entity was added before if id_hash in self._entry_index: + user_warning("Entity " + entity_id + " already exists in the KB") return cdef int32_t dummy_value = 342 @@ -33,6 +36,12 @@ cdef class KnowledgeBase: def add_alias(self, unicode alias, entities, probabilities): """For a given alias, add its potential entities and prior probabilies to the KB.""" cdef hash_t alias_hash = self.strings.add(alias) + + # Return if this alias was added before + if alias_hash in self._alias_index: + user_warning("Alias " + alias + " already exists in the KB") + return + cdef hash_t entity_hash cdef vector[int64_t] entry_indices @@ -47,12 +56,12 @@ cdef class KnowledgeBase: entry_indices.push_back(int(entry_index)) probs.push_back(float(prob)) - # TODO: check that alias hadn't been defined before # TODO: check sum(probabilities) <= 1 # TODO: check len(entities) == len(probabilities) self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probs) + def get_candidates(self, unicode alias): cdef hash_t alias_hash = self.strings.add(alias) alias_index = self._alias_index.get(alias_hash) diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py index b5b529d4b..734eddd8d 100644 --- a/spacy/sandbox_test_sofie/testing_el.py +++ b/spacy/sandbox_test_sofie/testing_el.py @@ -1,23 +1,28 @@ +# coding: utf-8 import spacy from spacy.kb import KnowledgeBase def create_kb(): mykb = KnowledgeBase() + print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) # adding entities entity_0 = "Q0" # douglas adams - mykb.add_entity(entity_id=entity_0, prob=0.5) print(" adding entity", entity_0) + mykb.add_entity(entity_id=entity_0, prob=0.5) entity_42 = "Q42" # douglas adams - mykb.add_entity(entity_id=entity_42, prob=0.5) print(" adding entity", entity_42) + mykb.add_entity(entity_id=entity_42, prob=0.5) entity_5301561 = "Q5301561" - mykb.add_entity(entity_id=entity_5301561, prob=0.5) print(" adding entity", entity_5301561) + mykb.add_entity(entity_id=entity_5301561, prob=0.5) + + print(" adding entity", entity_5301561) + mykb.add_entity(entity_id=entity_5301561, prob=0.5) print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) @@ -32,6 +37,15 @@ def create_kb(): candidates = mykb.get_candidates(alias) print(" ", candidates) + print(" adding alias", alias) + mykb.add_alias(alias=alias, entities=["Q42"], probabilities=[0.9]) + + print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) + + print("candidates for", alias) + candidates = mykb.get_candidates(alias) + print(" ", candidates) + def add_el(): nlp = spacy.load('en_core_web_sm') From f0decf98f19f13ceb87dfeb78955f6241fecb69e Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 19 Mar 2019 21:43:48 +0100 Subject: [PATCH 16/28] check and unit test in case prior probs exceed 1 --- spacy/kb.pyx | 7 +++++++ spacy/sandbox_test_sofie/testing_el.py | 6 ++++++ spacy/tests/pipeline/test_el.py | 25 +++++++++++++++++++++---- 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/spacy/kb.pyx b/spacy/kb.pyx index f67519260..2b38202f3 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -35,6 +35,13 @@ cdef class KnowledgeBase: def add_alias(self, unicode alias, entities, probabilities): """For a given alias, add its potential entities and prior probabilies to the KB.""" + + # Throw an error if the probabilities sum up to more than 1 + prob_sum = sum(probabilities) + if prob_sum > 1: + raise ValueError("The sum of prior probabilities for alias '" + alias + "' should not exceed 1, " + "but found " + str(prob_sum)) + cdef hash_t alias_hash = self.strings.add(alias) # Return if this alias was added before diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py index 734eddd8d..71fecb7e6 100644 --- a/spacy/sandbox_test_sofie/testing_el.py +++ b/spacy/sandbox_test_sofie/testing_el.py @@ -42,6 +42,12 @@ def create_kb(): print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) + alias2 = "johny" + print(" adding alias2", alias2) + mykb.add_alias(alias=alias2, entities=["Q0", "Q42"], probabilities=[0.3, 1.1]) + + print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) + print("candidates for", alias) candidates = mykb.get_candidates(alias) print(" ", candidates) diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_el.py index ed88076ce..f9533ef82 100644 --- a/spacy/tests/pipeline/test_el.py +++ b/spacy/tests/pipeline/test_el.py @@ -1,14 +1,16 @@ +# coding: utf-8 import pytest from spacy.kb import KnowledgeBase def test_kb_valid_entities(): + """Test the valid construction of a KB with 3 entities and one alias""" mykb = KnowledgeBase() # adding entities - mykb.add_entity(entity_id="Q1", prob=0.5) - mykb.add_entity(entity_id="Q2", prob=0.5) + mykb.add_entity(entity_id="Q1", prob=0.9) + mykb.add_entity(entity_id="Q2", prob=0.2) mykb.add_entity(entity_id="Q3", prob=0.5) # adding aliases @@ -16,14 +18,29 @@ def test_kb_valid_entities(): def test_kb_invalid_entities(): + """Test the invalid construction of a KB with an alias linked to a non-existing entity""" mykb = KnowledgeBase() # adding entities - mykb.add_entity(entity_id="Q1", prob=0.5) - mykb.add_entity(entity_id="Q2", prob=0.5) + mykb.add_entity(entity_id="Q1", prob=0.9) + mykb.add_entity(entity_id="Q2", prob=0.2) mykb.add_entity(entity_id="Q3", prob=0.5) # adding aliases - should fail because one of the given IDs is not valid with pytest.raises(ValueError): mykb.add_alias(alias="douglassss", entities=["Q2", "Q342"], probabilities=[0.8, 0.2]) + +def test_kb_invalid_probabilities(): + """Test the invalid construction of a KB with wrong prior probabilities""" + mykb = KnowledgeBase() + + # adding entities + mykb.add_entity(entity_id="Q1", prob=0.9) + mykb.add_entity(entity_id="Q2", prob=0.2) + mykb.add_entity(entity_id="Q3", prob=0.5) + + # adding aliases - should fail because the sum of the probabilities exceeds 1 + with pytest.raises(ValueError): + mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.8, 0.4]) + From 7402bb4c06095b8a97ade868285cb4a6f999a622 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 19 Mar 2019 21:50:32 +0100 Subject: [PATCH 17/28] correct size, not counting dummy elements in the vector --- spacy/kb.pyx | 4 +-- spacy/sandbox_test_sofie/testing_el.py | 36 +++++++++++--------------- spacy/tests/pipeline/test_el.py | 9 +++++-- 3 files changed, 24 insertions(+), 25 deletions(-) diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 2b38202f3..bc7cddf11 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -15,10 +15,10 @@ cdef class KnowledgeBase: return self.get_size_entities() def get_size_entities(self): - return self._entries.size() + return self._entries.size() - 1 # not counting dummy element on index 0 def get_size_aliases(self): - return self._aliases_table.size() + return self._aliases_table.size() - 1 # not counting dummy element on index 0 def add_entity(self, unicode entity_id, float prob, vectors=None, features=None): cdef hash_t id_hash = self.strings.add(entity_id) diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py index 71fecb7e6..76151f27e 100644 --- a/spacy/sandbox_test_sofie/testing_el.py +++ b/spacy/sandbox_test_sofie/testing_el.py @@ -21,35 +21,29 @@ def create_kb(): print(" adding entity", entity_5301561) mykb.add_entity(entity_id=entity_5301561, prob=0.5) - print(" adding entity", entity_5301561) - mykb.add_entity(entity_id=entity_5301561, prob=0.5) - print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) # adding aliases - alias = "douglassss" - print(" adding alias", alias) - mykb.add_alias(alias=alias, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2]) - - print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) - - print("candidates for", alias) - candidates = mykb.get_candidates(alias) - print(" ", candidates) - - print(" adding alias", alias) - mykb.add_alias(alias=alias, entities=["Q42"], probabilities=[0.9]) - - print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) + alias1 = "douglassss" + print(" adding alias", alias1) + mykb.add_alias(alias=alias1, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2]) alias2 = "johny" - print(" adding alias2", alias2) - mykb.add_alias(alias=alias2, entities=["Q0", "Q42"], probabilities=[0.3, 1.1]) + print(" adding alias", alias2) + mykb.add_alias(alias=alias2, entities=["Q0", "Q42", "Q5301561"], probabilities=[0.3, 0.1, 0.4]) + + alias3 = "adam" + print(" adding alias", alias3) + mykb.add_alias(alias=alias3, entities=["Q42"], probabilities=[1.0]) print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) - print("candidates for", alias) - candidates = mykb.get_candidates(alias) + print("candidates for", alias1) + candidates = mykb.get_candidates(alias1) + print(" ", candidates) + + print("candidates for", alias3) + candidates = mykb.get_candidates(alias3) print(" ", candidates) diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_el.py index f9533ef82..cd71bcb48 100644 --- a/spacy/tests/pipeline/test_el.py +++ b/spacy/tests/pipeline/test_el.py @@ -14,7 +14,12 @@ def test_kb_valid_entities(): mykb.add_entity(entity_id="Q3", prob=0.5) # adding aliases - mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.8, 0.2]) + mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.2]) + mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) + + # test the size of the corresponding KB + assert(mykb.get_size_entities() == 3) + assert(mykb.get_size_aliases() == 2) def test_kb_invalid_entities(): @@ -28,7 +33,7 @@ def test_kb_invalid_entities(): # adding aliases - should fail because one of the given IDs is not valid with pytest.raises(ValueError): - mykb.add_alias(alias="douglassss", entities=["Q2", "Q342"], probabilities=[0.8, 0.2]) + mykb.add_alias(alias="douglas", entities=["Q2", "Q342"], probabilities=[0.8, 0.2]) def test_kb_invalid_probabilities(): From b7ca3de358fd53f87872215987f0a68e90ee3fb9 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 19 Mar 2019 21:55:10 +0100 Subject: [PATCH 18/28] check the length of entities and probabilities vector + unit test --- spacy/kb.pyx | 12 ++++++++---- spacy/tests/pipeline/test_el.py | 14 ++++++++++++++ 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/spacy/kb.pyx b/spacy/kb.pyx index bc7cddf11..ba694ce61 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -36,11 +36,18 @@ cdef class KnowledgeBase: def add_alias(self, unicode alias, entities, probabilities): """For a given alias, add its potential entities and prior probabilies to the KB.""" + # Throw an error if the length of entities and probabilities are not the same + if not len(entities) == len(probabilities): + raise ValueError("The vectors for entities and probabilities for alias '" + alias + + "' should have equal length, but found " + + str(len(entities)) + " and " + str(len(probabilities)) + "respectively.") + + # Throw an error if the probabilities sum up to more than 1 prob_sum = sum(probabilities) if prob_sum > 1: raise ValueError("The sum of prior probabilities for alias '" + alias + "' should not exceed 1, " - "but found " + str(prob_sum)) + + "but found " + str(prob_sum)) cdef hash_t alias_hash = self.strings.add(alias) @@ -63,9 +70,6 @@ cdef class KnowledgeBase: entry_indices.push_back(int(entry_index)) probs.push_back(float(prob)) - # TODO: check sum(probabilities) <= 1 - # TODO: check len(entities) == len(probabilities) - self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probs) diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_el.py index cd71bcb48..068a228d8 100644 --- a/spacy/tests/pipeline/test_el.py +++ b/spacy/tests/pipeline/test_el.py @@ -49,3 +49,17 @@ def test_kb_invalid_probabilities(): with pytest.raises(ValueError): mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.8, 0.4]) + +def test_kb_invalid_combination(): + """Test the invalid construction of a KB with non-matching entity and probability lists""" + mykb = KnowledgeBase() + + # adding entities + mykb.add_entity(entity_id="Q1", prob=0.9) + mykb.add_entity(entity_id="Q2", prob=0.2) + mykb.add_entity(entity_id="Q3", prob=0.5) + + # adding aliases - should fail because the entities and probabilities vectors are not of equal length + with pytest.raises(ValueError): + mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.3, 0.4, 0.1]) + From 81a9030ab7922beffcc307ccbf00bcf993353335 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 21 Mar 2019 00:04:06 +0100 Subject: [PATCH 19/28] create candidate object from entry pointer (not fully functional yet) --- spacy/kb.pxd | 20 ++++++++++-- spacy/kb.pyx | 45 ++++++++++++++++++++++++-- spacy/sandbox_test_sofie/testing_el.py | 8 ++--- 3 files changed, 65 insertions(+), 8 deletions(-) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index d0f31ebb4..c409cf1b4 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -13,11 +13,14 @@ from .typedefs cimport hash_t # of bits we need to keep track of the answers. cdef struct _EntryC: + # The hash of this entry's unique ID + hash_t entity_key + # Allows retrieval of one or more vectors. # Each element of vector_rows should be an index into a vectors table. # Every entry should have the same number of vectors, so we can avoid storing # the number of vectors in each knowledge-base struct - const int32_t* vector_rows + int32_t* vector_rows # Allows retrieval of a struct of non-vector features. We could make this a # pointer, but we have 32 bits left over in the struct after prob, so we'd @@ -40,6 +43,17 @@ cdef struct _AliasC: vector[float] probs +# TODO: document +cdef class Candidate: + + cdef _EntryC* entity + cdef hash_t alias_hash + cdef float prior_prob + + @staticmethod + cdef Candidate from_entry(_EntryC* entity, hash_t alias_hash, float prior_prob) + + cdef class KnowledgeBase: cdef Pool mem cpdef readonly StringStore strings @@ -85,7 +99,7 @@ cdef class KnowledgeBase: cdef object _features_table - cdef inline int64_t c_add_entity(self, hash_t entity_key, float prob, const int32_t* vector_rows, + cdef inline int64_t c_add_entity(self, hash_t entity_key, float prob, int32_t* vector_rows, int feats_row): """Add an entry to the knowledge base.""" # This is what we'll map the hash key to. It's where the entry will sit @@ -93,6 +107,7 @@ cdef class KnowledgeBase: cdef int64_t entity_index = self._entries.size() self._entries.push_back( _EntryC( + entity_key=entity_key, vector_rows=vector_rows, feats_row=feats_row, prob=prob @@ -121,6 +136,7 @@ cdef class KnowledgeBase: cdef int32_t dummy_value = 0 self._entries.push_back( _EntryC( + entity_key=self.strings.add(""), vector_rows=&dummy_value, feats_row=dummy_value, prob=dummy_value diff --git a/spacy/kb.pyx b/spacy/kb.pyx index ba694ce61..38bc48c7f 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -2,6 +2,35 @@ # coding: utf8 from spacy.errors import user_warning + +cdef class Candidate: + + + # def inline __cinit__(self, _EntryC entity, hash_t alias_hash, float prior_prob): + # self.alias_hash = alias_hash + # self.entity = entity + # self.prior_prob = prior_prob + + @staticmethod + cdef Candidate from_entry(_EntryC* entity, hash_t alias_hash, float prior_prob): + """Factory function to create Candidate objects from entity entries.""" + # Call to __new__ bypasses __init__ constructor + cdef Candidate candidate = Candidate.__new__(Candidate) + candidate.entity = entity + candidate.alias_hash = alias_hash + candidate.prior_prob = prior_prob + return candidate + + def __str__(self): + return "alias=" + self.strings[self.alias_hash] + \ + " prior_prob=" + str(self.prior_prob) + + #" entry=" + self.strings[self.entity_hash] + \ + + def __repr__(self): + return self.__str__() + + cdef class KnowledgeBase: def __init__(self): @@ -74,7 +103,19 @@ cdef class KnowledgeBase: def get_candidates(self, unicode alias): - cdef hash_t alias_hash = self.strings.add(alias) + cdef hash_t alias_hash = self.strings[alias] alias_index = self._alias_index.get(alias_hash) - return self._aliases_table[alias_index] + alias_entry = self._aliases_table[alias_index] + + for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs): + entity = <_EntryC>self._entries[entry_index] + # candidate = Candidate(entity=entity, alias_hash=alias_hash, prior_prob=prob) + candidate = Candidate.from_entry(entity=&entity, alias_hash=alias_hash, prior_prob=prob) + print(candidate) + + # return [Candidate(entity=<_EntryC>self._entries[self._entry_index[entry_index]], + # alias_hash=alias_hash, + # prior_prob=prob) + # for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)] + diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py index 76151f27e..c96c5552f 100644 --- a/spacy/sandbox_test_sofie/testing_el.py +++ b/spacy/sandbox_test_sofie/testing_el.py @@ -25,16 +25,16 @@ def create_kb(): # adding aliases alias1 = "douglassss" - print(" adding alias", alias1) + print(" adding alias", alias1, "to Q42 and Q5301561") mykb.add_alias(alias=alias1, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2]) alias2 = "johny" - print(" adding alias", alias2) + print(" adding alias", alias2, "to Q0, Q42 and Q5301561") mykb.add_alias(alias=alias2, entities=["Q0", "Q42", "Q5301561"], probabilities=[0.3, 0.1, 0.4]) alias3 = "adam" - print(" adding alias", alias3) - mykb.add_alias(alias=alias3, entities=["Q42"], probabilities=[1.0]) + print(" adding alias", alias3, "to Q42") + mykb.add_alias(alias=alias3, entities=["Q42"], probabilities=[0.9]) print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) From 0ff4ce6c59234517b8f70b0d5b672d42ea8c607f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 21 Mar 2019 12:31:02 +0100 Subject: [PATCH 20/28] store entity hash instead of pointer --- spacy/kb.pxd | 21 +++++------ spacy/kb.pyx | 51 +++++++++----------------- spacy/sandbox_test_sofie/testing_el.py | 24 ++++++++++-- 3 files changed, 46 insertions(+), 50 deletions(-) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index c409cf1b4..c0998eadb 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -14,7 +14,7 @@ from .typedefs cimport hash_t cdef struct _EntryC: # The hash of this entry's unique ID - hash_t entity_key + hash_t entity_hash # Allows retrieval of one or more vectors. # Each element of vector_rows should be an index into a vectors table. @@ -46,13 +46,10 @@ cdef struct _AliasC: # TODO: document cdef class Candidate: - cdef _EntryC* entity + cdef hash_t entity_hash cdef hash_t alias_hash cdef float prior_prob - @staticmethod - cdef Candidate from_entry(_EntryC* entity, hash_t alias_hash, float prior_prob) - cdef class KnowledgeBase: cdef Pool mem @@ -98,8 +95,7 @@ cdef class KnowledgeBase: # optional data, we can let users configure a DB as the backend for this. cdef object _features_table - - cdef inline int64_t c_add_entity(self, hash_t entity_key, float prob, int32_t* vector_rows, + cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob, int32_t* vector_rows, int feats_row): """Add an entry to the knowledge base.""" # This is what we'll map the hash key to. It's where the entry will sit @@ -107,15 +103,15 @@ cdef class KnowledgeBase: cdef int64_t entity_index = self._entries.size() self._entries.push_back( _EntryC( - entity_key=entity_key, + entity_hash=entity_hash, vector_rows=vector_rows, feats_row=feats_row, prob=prob )) - self._entry_index[entity_key] = entity_index + self._entry_index[entity_hash] = entity_index return entity_index - cdef inline int64_t c_add_aliases(self, hash_t alias_key, vector[int64_t] entry_indices, vector[float] probs): + cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs): """Connect a mention to a list of potential entities with their prior probabilities .""" cdef int64_t alias_index = self._aliases_table.size() @@ -124,7 +120,7 @@ cdef class KnowledgeBase: entry_indices=entry_indices, probs=probs )) - self._alias_index[alias_key] = alias_index + self._alias_index[alias_hash] = alias_index return alias_index cdef inline create_empty_vectors(self): @@ -134,9 +130,10 @@ cdef class KnowledgeBase: cf. https://github.com/explosion/preshed/issues/17 """ cdef int32_t dummy_value = 0 + self.strings.add("") self._entries.push_back( _EntryC( - entity_key=self.strings.add(""), + entity_hash=self.strings.add(""), vector_rows=&dummy_value, feats_row=dummy_value, prob=dummy_value diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 38bc48c7f..cca24d4f8 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -5,30 +5,20 @@ from spacy.errors import user_warning cdef class Candidate: + def __init__(self, entity_hash, alias_hash, prior_prob): + self.entity_hash = entity_hash + self.alias_hash = alias_hash + self.prior_prob = prior_prob - # def inline __cinit__(self, _EntryC entity, hash_t alias_hash, float prior_prob): - # self.alias_hash = alias_hash - # self.entity = entity - # self.prior_prob = prior_prob + def get_entity_name(self, KnowledgeBase kb): + return kb.strings[self.entity_hash] - @staticmethod - cdef Candidate from_entry(_EntryC* entity, hash_t alias_hash, float prior_prob): - """Factory function to create Candidate objects from entity entries.""" - # Call to __new__ bypasses __init__ constructor - cdef Candidate candidate = Candidate.__new__(Candidate) - candidate.entity = entity - candidate.alias_hash = alias_hash - candidate.prior_prob = prior_prob - return candidate + def get_alias_name(self, KnowledgeBase kb): + return kb.strings[self.alias_hash] - def __str__(self): - return "alias=" + self.strings[self.alias_hash] + \ - " prior_prob=" + str(self.prior_prob) - - #" entry=" + self.strings[self.entity_hash] + \ - - def __repr__(self): - return self.__str__() + property prior_prob: + def __get__(self): + return self.prior_prob cdef class KnowledgeBase: @@ -58,7 +48,7 @@ cdef class KnowledgeBase: return cdef int32_t dummy_value = 342 - self.c_add_entity(entity_key=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value) + self.c_add_entity(entity_hash=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value) # TODO self._vectors_table.get_pointer(vectors), # self._features_table.get(features)) @@ -99,7 +89,7 @@ cdef class KnowledgeBase: entry_indices.push_back(int(entry_index)) probs.push_back(float(prob)) - self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probs) + self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs) def get_candidates(self, unicode alias): @@ -107,15 +97,8 @@ cdef class KnowledgeBase: alias_index = self._alias_index.get(alias_hash) alias_entry = self._aliases_table[alias_index] - for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs): - entity = <_EntryC>self._entries[entry_index] - # candidate = Candidate(entity=entity, alias_hash=alias_hash, prior_prob=prob) - candidate = Candidate.from_entry(entity=&entity, alias_hash=alias_hash, prior_prob=prob) - print(candidate) - - # return [Candidate(entity=<_EntryC>self._entries[self._entry_index[entry_index]], - # alias_hash=alias_hash, - # prior_prob=prob) - # for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)] - + return [Candidate(entity_hash=self._entries[entry_index].entity_hash, + alias_hash=alias_hash, + prior_prob=prob) + for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)] diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py index c96c5552f..5c0d6a037 100644 --- a/spacy/sandbox_test_sofie/testing_el.py +++ b/spacy/sandbox_test_sofie/testing_el.py @@ -39,12 +39,28 @@ def create_kb(): print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) print("candidates for", alias1) - candidates = mykb.get_candidates(alias1) - print(" ", candidates) + candidates1 = mykb.get_candidates(alias1) + for candidate in candidates1: + print(" candidate") + print(" name", candidate.get_entity_name(mykb)) + print(" alias", candidate.get_alias_name(mykb)) + print(" prior_prob", candidate.prior_prob) + + print("candidates for", alias2) + candidates2 = mykb.get_candidates(alias2) + for candidate in candidates2: + print(" candidate") + print(" name", candidate.get_entity_name(mykb)) + print(" alias", candidate.get_alias_name(mykb)) + print(" prior_prob", candidate.prior_prob) print("candidates for", alias3) - candidates = mykb.get_candidates(alias3) - print(" ", candidates) + candidates3 = mykb.get_candidates(alias3) + for candidate in candidates3: + print(" candidate") + print(" name", candidate.get_entity_name(mykb)) + print(" alias", candidate.get_alias_name(mykb)) + print(" prior_prob", candidate.prior_prob) def add_el(): From 34969dddebb91d78e419bcaf221002ca1fdca354 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 21 Mar 2019 12:48:59 +0100 Subject: [PATCH 21/28] unit test on number of candidates generated --- spacy/tests/pipeline/test_el.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_el.py index 068a228d8..78ee0f358 100644 --- a/spacy/tests/pipeline/test_el.py +++ b/spacy/tests/pipeline/test_el.py @@ -63,3 +63,20 @@ def test_kb_invalid_combination(): with pytest.raises(ValueError): mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.3, 0.4, 0.1]) + +def test_candidate_generation(): + """Test correct candidate generation""" + mykb = KnowledgeBase() + + # adding entities + mykb.add_entity(entity_id="Q1", prob=0.9) + mykb.add_entity(entity_id="Q2", prob=0.2) + mykb.add_entity(entity_id="Q3", prob=0.5) + + # adding aliases + mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.2]) + mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) + + # test the size of the relevant candidates + assert(len(mykb.get_candidates("douglas")) == 2) + assert(len(mykb.get_candidates("adam")) == 1) From 6ba4079f7c4489967044200a903c252975aebaca Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 21 Mar 2019 13:26:12 +0100 Subject: [PATCH 22/28] property getters and keep track of KB internally --- spacy/kb.pxd | 1 + spacy/kb.pyx | 41 +++++++++++++++++++++----- spacy/sandbox_test_sofie/testing_el.py | 34 +++++++-------------- 3 files changed, 46 insertions(+), 30 deletions(-) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index c0998eadb..54ee49a3f 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -46,6 +46,7 @@ cdef struct _AliasC: # TODO: document cdef class Candidate: + cdef readonly KnowledgeBase kb cdef hash_t entity_hash cdef hash_t alias_hash cdef float prior_prob diff --git a/spacy/kb.pyx b/spacy/kb.pyx index cca24d4f8..52c8ad8f0 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -5,16 +5,31 @@ from spacy.errors import user_warning cdef class Candidate: - def __init__(self, entity_hash, alias_hash, prior_prob): + def __init__(self, KnowledgeBase kb, entity_hash, alias_hash, prior_prob): + self.kb = kb self.entity_hash = entity_hash self.alias_hash = alias_hash self.prior_prob = prior_prob - def get_entity_name(self, KnowledgeBase kb): - return kb.strings[self.entity_hash] + property kb_id_: + """RETURNS (unicode): ID of this entity in the KB""" + def __get__(self): + return self.kb.strings[self.entity_hash] - def get_alias_name(self, KnowledgeBase kb): - return kb.strings[self.alias_hash] + property kb_id: + """RETURNS (uint64): hash of the entity's KB ID""" + def __get__(self): + return self.entity_hash + + property alias_: + """RETURNS (unicode): ID of the original alias""" + def __get__(self): + return self.kb.strings[self.alias_hash] + + property alias: + """RETURNS (uint64): hash of the alias""" + def __get__(self): + return self.alias_hash property prior_prob: def __get__(self): @@ -40,6 +55,10 @@ cdef class KnowledgeBase: return self._aliases_table.size() - 1 # not counting dummy element on index 0 def add_entity(self, unicode entity_id, float prob, vectors=None, features=None): + """ + Add an entity to the KB. + Return the hash of the entity ID at the end + """ cdef hash_t id_hash = self.strings.add(entity_id) # Return if this entity was added before @@ -52,8 +71,13 @@ cdef class KnowledgeBase: # TODO self._vectors_table.get_pointer(vectors), # self._features_table.get(features)) + return id_hash + def add_alias(self, unicode alias, entities, probabilities): - """For a given alias, add its potential entities and prior probabilies to the KB.""" + """ + For a given alias, add its potential entities and prior probabilies to the KB. + Return the alias_hash at the end + """ # Throw an error if the length of entities and probabilities are not the same if not len(entities) == len(probabilities): @@ -91,13 +115,16 @@ cdef class KnowledgeBase: self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs) + return alias_hash + def get_candidates(self, unicode alias): cdef hash_t alias_hash = self.strings[alias] alias_index = self._alias_index.get(alias_hash) alias_entry = self._aliases_table[alias_index] - return [Candidate(entity_hash=self._entries[entry_index].entity_hash, + return [Candidate(kb=self, + entity_hash=self._entries[entry_index].entity_hash, alias_hash=alias_hash, prior_prob=prob) for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)] diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py index 5c0d6a037..3a81effbc 100644 --- a/spacy/sandbox_test_sofie/testing_el.py +++ b/spacy/sandbox_test_sofie/testing_el.py @@ -38,29 +38,17 @@ def create_kb(): print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) - print("candidates for", alias1) - candidates1 = mykb.get_candidates(alias1) - for candidate in candidates1: - print(" candidate") - print(" name", candidate.get_entity_name(mykb)) - print(" alias", candidate.get_alias_name(mykb)) - print(" prior_prob", candidate.prior_prob) - - print("candidates for", alias2) - candidates2 = mykb.get_candidates(alias2) - for candidate in candidates2: - print(" candidate") - print(" name", candidate.get_entity_name(mykb)) - print(" alias", candidate.get_alias_name(mykb)) - print(" prior_prob", candidate.prior_prob) - - print("candidates for", alias3) - candidates3 = mykb.get_candidates(alias3) - for candidate in candidates3: - print(" candidate") - print(" name", candidate.get_entity_name(mykb)) - print(" alias", candidate.get_alias_name(mykb)) - print(" prior_prob", candidate.prior_prob) + for alias in [alias1, alias2, alias3]: + print() + print("candidates for", alias) + candidates = mykb.get_candidates(alias) + for candidate in candidates: + print(" candidate") + print(" kb_id", candidate.kb_id) + print(" kb_id_", candidate.kb_id_) + print(" alias", candidate.alias) + print(" alias_", candidate.alias_) + print(" prior_prob", candidate.prior_prob) def add_el(): From a5d5a0593066aa75877970a12951edb4b5b6a430 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 21 Mar 2019 13:32:21 +0100 Subject: [PATCH 23/28] Entity class --- spacy/kb.pxd | 8 ++++++++ spacy/kb.pyx | 22 ++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 54ee49a3f..4ae34bfa7 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -43,6 +43,14 @@ cdef struct _AliasC: vector[float] probs +# TODO: document +cdef class Entity: + + cdef readonly KnowledgeBase kb + cdef hash_t entity_hash + cdef float confidence + + # TODO: document cdef class Candidate: diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 52c8ad8f0..4776e9d34 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -3,6 +3,28 @@ from spacy.errors import user_warning +cdef class Entity: + + def __init__(self, KnowledgeBase kb, entity_hash, confidence): + self.kb = kb + self.entity_hash = entity_hash + self.confidence = confidence + + property kb_id_: + """RETURNS (unicode): ID of this entity in the KB""" + def __get__(self): + return self.kb.strings[self.entity_hash] + + property kb_id: + """RETURNS (uint64): hash of the entity's KB ID""" + def __get__(self): + return self.entity_hash + + property confidence: + def __get__(self): + return self.confidence + + cdef class Candidate: def __init__(self, KnowledgeBase kb, entity_hash, alias_hash, prior_prob): From 26afa4800f16901a4bda3be8d0b84e64905202b8 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 21 Mar 2019 15:24:40 +0100 Subject: [PATCH 24/28] ensure no candidates are returned for unknown aliases --- spacy/kb.pyx | 3 ++- spacy/sandbox_test_sofie/testing_el.py | 19 +++++-------------- spacy/tests/pipeline/test_el.py | 1 + 3 files changed, 8 insertions(+), 15 deletions(-) diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 4776e9d34..62080e1be 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -149,5 +149,6 @@ cdef class KnowledgeBase: entity_hash=self._entries[entry_index].entity_hash, alias_hash=alias_hash, prior_prob=prob) - for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)] + for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs) + if entry_index != 0] diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py index 3a81effbc..03261806b 100644 --- a/spacy/sandbox_test_sofie/testing_el.py +++ b/spacy/sandbox_test_sofie/testing_el.py @@ -7,6 +7,7 @@ def create_kb(): mykb = KnowledgeBase() print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) + print() # adding entities entity_0 = "Q0" # douglas adams @@ -22,33 +23,23 @@ def create_kb(): mykb.add_entity(entity_id=entity_5301561, prob=0.5) print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) + print() # adding aliases alias1 = "douglassss" print(" adding alias", alias1, "to Q42 and Q5301561") mykb.add_alias(alias=alias1, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2]) - alias2 = "johny" - print(" adding alias", alias2, "to Q0, Q42 and Q5301561") - mykb.add_alias(alias=alias2, entities=["Q0", "Q42", "Q5301561"], probabilities=[0.3, 0.1, 0.4]) - alias3 = "adam" print(" adding alias", alias3, "to Q42") mykb.add_alias(alias=alias3, entities=["Q42"], probabilities=[0.9]) print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) + print() - for alias in [alias1, alias2, alias3]: - print() - print("candidates for", alias) + for alias in [alias1, "rubbish", alias3]: candidates = mykb.get_candidates(alias) - for candidate in candidates: - print(" candidate") - print(" kb_id", candidate.kb_id) - print(" kb_id_", candidate.kb_id_) - print(" alias", candidate.alias) - print(" alias_", candidate.alias_) - print(" prior_prob", candidate.prior_prob) + print(len(candidates), "candidates for", alias) def add_el(): diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_el.py index 78ee0f358..295b35cce 100644 --- a/spacy/tests/pipeline/test_el.py +++ b/spacy/tests/pipeline/test_el.py @@ -80,3 +80,4 @@ def test_candidate_generation(): # test the size of the relevant candidates assert(len(mykb.get_candidates("douglas")) == 2) assert(len(mykb.get_candidates("adam")) == 1) + assert(len(mykb.get_candidates("shrubbery")) == 0) From d0c763ba447282d53ac7d25354afde468f0e4a73 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 21 Mar 2019 17:33:25 +0100 Subject: [PATCH 25/28] minimal EL pipe --- spacy/kb.pxd | 14 ++-- spacy/kb.pyx | 3 +- spacy/language.py | 4 + spacy/pipeline/pipes.pyx | 100 ++++--------------------- spacy/sandbox_test_sofie/testing_el.py | 17 +++-- 5 files changed, 37 insertions(+), 101 deletions(-) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 4ae34bfa7..5fd239998 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -109,7 +109,7 @@ cdef class KnowledgeBase: """Add an entry to the knowledge base.""" # This is what we'll map the hash key to. It's where the entry will sit # in the vector of entries, so we can get it later. - cdef int64_t entity_index = self._entries.size() + cdef int64_t new_index = self._entries.size() self._entries.push_back( _EntryC( entity_hash=entity_hash, @@ -117,22 +117,22 @@ cdef class KnowledgeBase: feats_row=feats_row, prob=prob )) - self._entry_index[entity_hash] = entity_index - return entity_index + self._entry_index[entity_hash] = new_index + return new_index cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs): """Connect a mention to a list of potential entities with their prior probabilities .""" - cdef int64_t alias_index = self._aliases_table.size() + cdef int64_t new_index = self._aliases_table.size() self._aliases_table.push_back( _AliasC( entry_indices=entry_indices, probs=probs )) - self._alias_index[alias_hash] = alias_index - return alias_index + self._alias_index[alias_hash] = new_index + return new_index - cdef inline create_empty_vectors(self): + cdef inline _create_empty_vectors(self): """ Making sure the first element of each vector is a dummy, because the PreshMap maps pointing to indices in these vectors can not contain 0 as value diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 62080e1be..33a79da04 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -65,7 +65,7 @@ cdef class KnowledgeBase: self._alias_index = PreshMap() self.mem = Pool() self.strings = StringStore() - self.create_empty_vectors() + self._create_empty_vectors() def __len__(self): return self.get_size_entities() @@ -151,4 +151,3 @@ cdef class KnowledgeBase: prior_prob=prob) for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs) if entry_index != 0] - diff --git a/spacy/language.py b/spacy/language.py index 736899341..f80d8699d 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -209,6 +209,10 @@ class Language(object): def entity(self): return self.get_pipe("ner") + @property + def linker(self): + return self.get_pipe("el") + @property def matcher(self): return self.get_pipe("matcher") diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index e1e5471be..5866518a7 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1045,44 +1045,28 @@ class EntityLinker(Pipe): @classmethod def Model(cls, nr_class=1, **cfg): - embed_size = util.env_opt("embed_size", 2000) - if "token_vector_width" in cfg: - token_vector_width = cfg["token_vector_width"] - else: - token_vector_width = util.env_opt("token_vector_width", 96) - if cfg.get('architecture') == 'simple_cnn': - tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg) - return None # build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg) - else: - return None # build_text_classifier(nr_class, **cfg) + # TODO: non-dummy EL implementation + return None - - def __init__(self, vocab, model=True, **cfg): - self.vocab = vocab - self.model = model - self._rehearsal_model = None + def __init__(self, model=True, **cfg): + self.model = False self.cfg = dict(cfg) + self.kb = self.cfg["kb"] def __call__(self, doc): - # scores, tensors = self.predict([doc]) - scores, tensors = None, None - self.set_annotations([doc], scores, tensors=tensors) + self.set_annotations([doc], scores=None, tensors=None) return doc def pipe(self, stream, batch_size=128, n_threads=-1): + """Apply the pipe to a stream of documents. + Both __call__ and pipe should delegate to the `predict()` + and `set_annotations()` methods. + """ for docs in util.minibatch(stream, size=batch_size): docs = list(docs) - scores, tensors = self.predict(docs) - self.set_annotations(docs, scores, tensors=tensors) + self.set_annotations(docs, scores=None, tensors=None) yield from docs - def predict(self, docs): - # self.require_model() - scores = self.model(docs) - scores = self.model.ops.asarray(scores) - tensors = [doc.tensor for doc in docs] - return scores, tensors - def set_annotations(self, docs, scores, tensors=None): # TODO Sofie: actually implement this class instead of dummy implementation for i, doc in enumerate(docs): @@ -1091,67 +1075,13 @@ class EntityLinker(Pipe): for token in ent: token.ent_kb_id_ = "Q42" - def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None): - scores, bp_scores = self.model.begin_update(docs, drop=drop) - loss, d_scores = self.get_loss(docs, golds, scores) - bp_scores(d_scores, sgd=sgd) - if losses is not None: - losses.setdefault(self.name, 0.0) - losses[self.name] += loss - - def rehearse(self, docs, drop=0., sgd=None, losses=None): - if self._rehearsal_model is None: - return - scores, bp_scores = self.model.begin_update(docs, drop=drop) - target = self._rehearsal_model(docs) - gradient = scores - target - bp_scores(gradient, sgd=sgd) - if losses is not None: - losses.setdefault(self.name, 0.0) - losses[self.name] += (gradient**2).sum() - def get_loss(self, docs, golds, scores): - truths = numpy.zeros((len(golds), len(self.labels)), dtype='f') - not_missing = numpy.ones((len(golds), len(self.labels)), dtype='f') - for i, gold in enumerate(golds): - for j, label in enumerate(self.labels): - if label in gold.cats: - truths[i, j] = gold.cats[label] - else: - not_missing[i, j] = 0. - truths = self.model.ops.asarray(truths) - not_missing = self.model.ops.asarray(not_missing) - d_scores = (scores-truths) / scores.shape[0] - d_scores *= not_missing - mean_square_error = (d_scores**2).sum(axis=1).mean() - return float(mean_square_error), d_scores + # TODO + pass def add_label(self, label): - if label in self.labels: - return 0 - if self.model not in (None, True, False): - # This functionality was available previously, but was broken. - # The problem is that we resize the last layer, but the last layer - # is actually just an ensemble. We're not resizing the child layers - # -- a huge problem. - raise ValueError(Errors.E116) - #smaller = self.model._layers[-1] - #larger = Affine(len(self.labels)+1, smaller.nI) - #copy_array(larger.W[:smaller.nO], smaller.W) - #copy_array(larger.b[:smaller.nO], smaller.b) - #self.model._layers[-1] = larger - self.labels = tuple(list(self.labels) + [label]) - return 1 - - def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, - **kwargs): - if self.model is True: - self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors') - self.model = self.Model(len(self.labels), **self.cfg) - link_vectors_to_models(self.vocab) - if sgd is None: - sgd = self.create_optimizer() - return sgd + # TODO + pass __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer', 'EntityLinker'] diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py index 03261806b..f6296bf89 100644 --- a/spacy/sandbox_test_sofie/testing_el.py +++ b/spacy/sandbox_test_sofie/testing_el.py @@ -37,16 +37,14 @@ def create_kb(): print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) print() - for alias in [alias1, "rubbish", alias3]: - candidates = mykb.get_candidates(alias) - print(len(candidates), "candidates for", alias) + return mykb -def add_el(): +def add_el(kb): nlp = spacy.load('en_core_web_sm') print("pipes before:", nlp.pipe_names) - el_pipe = nlp.create_pipe(name='el') + el_pipe = nlp.create_pipe(name='el', config={"kb": kb}) nlp.add_pipe(el_pipe, last=True) print("pipes after:", nlp.pipe_names) @@ -62,7 +60,12 @@ def add_el(): for ent in doc.ents: print("ent", ent.text, ent.label_, ent.kb_id_) + print() + for alias in ["douglassss", "rubbish", "adam"]: + candidates = nlp.linker.kb.get_candidates(alias) + print(len(candidates), "candidates for", alias) + if __name__ == "__main__": - # add_el() - create_kb() + mykb = create_kb() + add_el(mykb) From 24a0c4a8d449b64033e80c1986e823cc44443490 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 21 Mar 2019 18:20:57 +0100 Subject: [PATCH 26/28] name per entity --- spacy/kb.pxd | 21 ++++++---- spacy/kb.pyx | 58 ++++++++++++++++---------- spacy/sandbox_test_sofie/testing_el.py | 10 +++-- 3 files changed, 54 insertions(+), 35 deletions(-) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 5fd239998..cffbcd5d1 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -13,8 +13,9 @@ from .typedefs cimport hash_t # of bits we need to keep track of the answers. cdef struct _EntryC: - # The hash of this entry's unique ID - hash_t entity_hash + # The hash of this entry's unique ID and name in the kB + hash_t entity_id_hash + hash_t entity_name_hash # Allows retrieval of one or more vectors. # Each element of vector_rows should be an index into a vectors table. @@ -47,7 +48,7 @@ cdef struct _AliasC: cdef class Entity: cdef readonly KnowledgeBase kb - cdef hash_t entity_hash + cdef hash_t entity_id_hash cdef float confidence @@ -55,7 +56,7 @@ cdef class Entity: cdef class Candidate: cdef readonly KnowledgeBase kb - cdef hash_t entity_hash + cdef hash_t entity_id_hash cdef hash_t alias_hash cdef float prior_prob @@ -104,20 +105,21 @@ cdef class KnowledgeBase: # optional data, we can let users configure a DB as the backend for this. cdef object _features_table - cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob, int32_t* vector_rows, - int feats_row): + cdef inline int64_t c_add_entity(self, hash_t entity_id_hash, hash_t entity_name_hash, float prob, + int32_t* vector_rows, int feats_row): """Add an entry to the knowledge base.""" # This is what we'll map the hash key to. It's where the entry will sit # in the vector of entries, so we can get it later. cdef int64_t new_index = self._entries.size() self._entries.push_back( _EntryC( - entity_hash=entity_hash, + entity_id_hash=entity_id_hash, + entity_name_hash=entity_name_hash, vector_rows=vector_rows, feats_row=feats_row, prob=prob )) - self._entry_index[entity_hash] = new_index + self._entry_index[entity_id_hash] = new_index return new_index cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs): @@ -142,7 +144,8 @@ cdef class KnowledgeBase: self.strings.add("") self._entries.push_back( _EntryC( - entity_hash=self.strings.add(""), + entity_id_hash=self.strings[""], + entity_name_hash=self.strings[""], vector_rows=&dummy_value, feats_row=dummy_value, prob=dummy_value diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 33a79da04..e51cb087d 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -5,20 +5,20 @@ from spacy.errors import user_warning cdef class Entity: - def __init__(self, KnowledgeBase kb, entity_hash, confidence): + def __init__(self, KnowledgeBase kb, entity_id_hash, confidence): self.kb = kb - self.entity_hash = entity_hash + self.entity_id_hash = entity_id_hash self.confidence = confidence property kb_id_: """RETURNS (unicode): ID of this entity in the KB""" def __get__(self): - return self.kb.strings[self.entity_hash] + return self.kb.strings[self.entity_id_hash] property kb_id: """RETURNS (uint64): hash of the entity's KB ID""" def __get__(self): - return self.entity_hash + return self.entity_id_hash property confidence: def __get__(self): @@ -27,32 +27,43 @@ cdef class Entity: cdef class Candidate: - def __init__(self, KnowledgeBase kb, entity_hash, alias_hash, prior_prob): + def __init__(self, KnowledgeBase kb, entity_id_hash, alias_hash, prior_prob): self.kb = kb - self.entity_hash = entity_hash + self.entity_id_hash = entity_id_hash self.alias_hash = alias_hash self.prior_prob = prior_prob - property kb_id_: - """RETURNS (unicode): ID of this entity in the KB""" - def __get__(self): - return self.kb.strings[self.entity_hash] - - property kb_id: + property entity_id: """RETURNS (uint64): hash of the entity's KB ID""" def __get__(self): - return self.entity_hash + return self.entity_id_hash - property alias_: - """RETURNS (unicode): ID of the original alias""" + property entity_id_: + """RETURNS (unicode): ID of this entity in the KB""" def __get__(self): - return self.kb.strings[self.alias_hash] + return self.kb.strings[self.entity_id] + + property entity_name: + """RETURNS (uint64): hash of the entity's KB name""" + def __get__(self): + entry_index = self.kb._entry_index.get(self.entity_id) + return self.kb._entries[entry_index].entity_name_hash + + property entity_name_: + """RETURNS (unicode): name of this entity in the KB""" + def __get__(self): + return self.kb.strings[self.entity_name] property alias: """RETURNS (uint64): hash of the alias""" def __get__(self): return self.alias_hash + property alias_: + """RETURNS (unicode): ID of the original alias""" + def __get__(self): + return self.kb.strings[self.alias] + property prior_prob: def __get__(self): return self.prior_prob @@ -76,12 +87,15 @@ cdef class KnowledgeBase: def get_size_aliases(self): return self._aliases_table.size() - 1 # not counting dummy element on index 0 - def add_entity(self, unicode entity_id, float prob, vectors=None, features=None): + def add_entity(self, unicode entity_id, unicode entity_name=None, float prob=0.5, vectors=None, features=None): """ Add an entity to the KB. Return the hash of the entity ID at the end """ + if not entity_name: + entity_name = entity_id cdef hash_t id_hash = self.strings.add(entity_id) + cdef hash_t name_hash = self.strings.add(entity_name) # Return if this entity was added before if id_hash in self._entry_index: @@ -89,7 +103,7 @@ cdef class KnowledgeBase: return cdef int32_t dummy_value = 342 - self.c_add_entity(entity_hash=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value) + self.c_add_entity(entity_id_hash=id_hash, entity_name_hash=name_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value) # TODO self._vectors_table.get_pointer(vectors), # self._features_table.get(features)) @@ -127,11 +141,11 @@ cdef class KnowledgeBase: cdef vector[float] probs for entity, prob in zip(entities, probabilities): - entity_hash = self.strings[entity] - if not entity_hash in self._entry_index: + entity_id_hash = self.strings[entity] + if not entity_id_hash in self._entry_index: raise ValueError("Alias '" + alias + "' defined for unknown entity '" + entity + "'") - entry_index = self._entry_index.get(entity_hash) + entry_index = self._entry_index.get(entity_id_hash) entry_indices.push_back(int(entry_index)) probs.push_back(float(prob)) @@ -146,7 +160,7 @@ cdef class KnowledgeBase: alias_entry = self._aliases_table[alias_index] return [Candidate(kb=self, - entity_hash=self._entries[entry_index].entity_hash, + entity_id_hash=self._entries[entry_index].entity_id_hash, alias_hash=alias_hash, prior_prob=prob) for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs) diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py index f6296bf89..c7b0a3a07 100644 --- a/spacy/sandbox_test_sofie/testing_el.py +++ b/spacy/sandbox_test_sofie/testing_el.py @@ -12,15 +12,15 @@ def create_kb(): # adding entities entity_0 = "Q0" # douglas adams print(" adding entity", entity_0) - mykb.add_entity(entity_id=entity_0, prob=0.5) + mykb.add_entity(entity_id=entity_0, entity_name="queZero", prob=0.5) entity_42 = "Q42" # douglas adams print(" adding entity", entity_42) - mykb.add_entity(entity_id=entity_42, prob=0.5) + mykb.add_entity(entity_id=entity_42, entity_name="que42", prob=0.5) entity_5301561 = "Q5301561" print(" adding entity", entity_5301561) - mykb.add_entity(entity_id=entity_5301561, prob=0.5) + mykb.add_entity(entity_id=entity_5301561, entity_name="queMore", prob=0.5) print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) print() @@ -63,7 +63,9 @@ def add_el(kb): print() for alias in ["douglassss", "rubbish", "adam"]: candidates = nlp.linker.kb.get_candidates(alias) - print(len(candidates), "candidates for", alias) + print(len(candidates), "candidates for", alias, ":") + for c in candidates: + print(" ", c.entity_id_, c.entity_name_, c.alias_) if __name__ == "__main__": From 6e2433b95e1a6dd5f773cc49e3a8b553ef09421b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 21 Mar 2019 18:55:01 +0100 Subject: [PATCH 27/28] select candidate with highest prior probabiity --- examples/pipeline/dummy_entity_linking.py | 69 +++++++++++++++++++++ spacy/kb.pxd | 10 +--- spacy/kb.pyx | 26 +------- spacy/pipeline/pipes.pyx | 11 +++- spacy/sandbox_test_sofie/testing_el.py | 73 ----------------------- 5 files changed, 81 insertions(+), 108 deletions(-) create mode 100644 examples/pipeline/dummy_entity_linking.py delete mode 100644 spacy/sandbox_test_sofie/testing_el.py diff --git a/examples/pipeline/dummy_entity_linking.py b/examples/pipeline/dummy_entity_linking.py new file mode 100644 index 000000000..c51f321e0 --- /dev/null +++ b/examples/pipeline/dummy_entity_linking.py @@ -0,0 +1,69 @@ +# coding: utf-8 +"""Demonstrate how to build a simple knowledge base and run an Entity Linking algorithm. +Currently still a bit of a dummy algorithm: taking simply the entity with highest probability for a given alias +""" +import spacy +from spacy.kb import KnowledgeBase + + +def create_kb(): + kb = KnowledgeBase() + + # adding entities + entity_0 = "Q1004791" + print("adding entity", entity_0) + kb.add_entity(entity_id=entity_0, entity_name="Douglas", prob=0.5) + + entity_1 = "Q42" + print("adding entity", entity_1) + kb.add_entity(entity_id=entity_1, entity_name="Douglas Adams", prob=0.5) + + entity_2 = "Q5301561" + print("adding entity", entity_2) + kb.add_entity(entity_id=entity_2, entity_name="Douglas Haig", prob=0.5) + + # adding aliases + print() + alias_0 = "Douglas" + print("adding alias", alias_0, "to all three entities") + kb.add_alias(alias=alias_0, entities=["Q1004791", "Q42", "Q5301561"], probabilities=[0.1, 0.6, 0.2]) + + alias_1 = "Douglas Adams" + print("adding alias", alias_1, "to just the one entity") + kb.add_alias(alias=alias_1, entities=["Q42"], probabilities=[0.9]) + + print() + print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases()) + + return kb + + +def add_el(kb): + nlp = spacy.load('en_core_web_sm') + + el_pipe = nlp.create_pipe(name='el', config={"kb": kb}) + nlp.add_pipe(el_pipe, last=True) + + for alias in ["Douglas Adams", "Douglas"]: + candidates = nlp.linker.kb.get_candidates(alias) + print() + print(len(candidates), "candidate(s) for", alias, ":") + for c in candidates: + print(" ", c.entity_id_, c.entity_name_, c.alias_, c.prior_prob) + + text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ + "Douglas reminds us to always bring our towel." + doc = nlp(text) + + print() + for token in doc: + print("token", token.text, token.ent_type_, token.ent_kb_id_) + + print() + for ent in doc.ents: + print("ent", ent.text, ent.label_, ent.kb_id_) + + +if __name__ == "__main__": + mykb = create_kb() + add_el(mykb) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index cffbcd5d1..490e05036 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -44,15 +44,7 @@ cdef struct _AliasC: vector[float] probs -# TODO: document -cdef class Entity: - - cdef readonly KnowledgeBase kb - cdef hash_t entity_id_hash - cdef float confidence - - -# TODO: document +# Object used by the Entity Linker that summarizes one entity-alias candidate combination. cdef class Candidate: cdef readonly KnowledgeBase kb diff --git a/spacy/kb.pyx b/spacy/kb.pyx index e51cb087d..6d031fb91 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -3,28 +3,6 @@ from spacy.errors import user_warning -cdef class Entity: - - def __init__(self, KnowledgeBase kb, entity_id_hash, confidence): - self.kb = kb - self.entity_id_hash = entity_id_hash - self.confidence = confidence - - property kb_id_: - """RETURNS (unicode): ID of this entity in the KB""" - def __get__(self): - return self.kb.strings[self.entity_id_hash] - - property kb_id: - """RETURNS (uint64): hash of the entity's KB ID""" - def __get__(self): - return self.entity_id_hash - - property confidence: - def __get__(self): - return self.confidence - - cdef class Candidate: def __init__(self, KnowledgeBase kb, entity_id_hash, alias_hash, prior_prob): @@ -103,7 +81,8 @@ cdef class KnowledgeBase: return cdef int32_t dummy_value = 342 - self.c_add_entity(entity_id_hash=id_hash, entity_name_hash=name_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value) + self.c_add_entity(entity_id_hash=id_hash, entity_name_hash=name_hash, prob=prob, + vector_rows=&dummy_value, feats_row=dummy_value) # TODO self._vectors_table.get_pointer(vectors), # self._features_table.get(features)) @@ -155,6 +134,7 @@ cdef class KnowledgeBase: def get_candidates(self, unicode alias): + """ TODO: where to put this functionality ?""" cdef hash_t alias_hash = self.strings[alias] alias_index = self._alias_index.get(alias_hash) alias_entry = self._aliases_table[alias_index] diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 5866518a7..b554eb2b6 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1068,12 +1068,17 @@ class EntityLinker(Pipe): yield from docs def set_annotations(self, docs, scores, tensors=None): - # TODO Sofie: actually implement this class instead of dummy implementation + """ + Currently implemented as taking the KB entry with highest prior probability for each named entity + TODO: actually use context etc + """ for i, doc in enumerate(docs): for ent in doc.ents: - if ent.label_ in ["PERSON", "PER"]: + candidates = self.kb.get_candidates(ent.text) + if candidates: + best_candidate = max(candidates, key=lambda c: c.prior_prob) for token in ent: - token.ent_kb_id_ = "Q42" + token.ent_kb_id_ = best_candidate.entity_id_ def get_loss(self, docs, golds, scores): # TODO diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py deleted file mode 100644 index c7b0a3a07..000000000 --- a/spacy/sandbox_test_sofie/testing_el.py +++ /dev/null @@ -1,73 +0,0 @@ -# coding: utf-8 -import spacy -from spacy.kb import KnowledgeBase - - -def create_kb(): - mykb = KnowledgeBase() - - print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) - print() - - # adding entities - entity_0 = "Q0" # douglas adams - print(" adding entity", entity_0) - mykb.add_entity(entity_id=entity_0, entity_name="queZero", prob=0.5) - - entity_42 = "Q42" # douglas adams - print(" adding entity", entity_42) - mykb.add_entity(entity_id=entity_42, entity_name="que42", prob=0.5) - - entity_5301561 = "Q5301561" - print(" adding entity", entity_5301561) - mykb.add_entity(entity_id=entity_5301561, entity_name="queMore", prob=0.5) - - print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) - print() - - # adding aliases - alias1 = "douglassss" - print(" adding alias", alias1, "to Q42 and Q5301561") - mykb.add_alias(alias=alias1, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2]) - - alias3 = "adam" - print(" adding alias", alias3, "to Q42") - mykb.add_alias(alias=alias3, entities=["Q42"], probabilities=[0.9]) - - print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) - print() - - return mykb - - -def add_el(kb): - nlp = spacy.load('en_core_web_sm') - print("pipes before:", nlp.pipe_names) - - el_pipe = nlp.create_pipe(name='el', config={"kb": kb}) - nlp.add_pipe(el_pipe, last=True) - - print("pipes after:", nlp.pipe_names) - print() - - text = "The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, reminds us to always bring our towel." - doc = nlp(text) - - for token in doc: - print("token", token.text, token.ent_type_, token.ent_kb_id_) - - print() - for ent in doc.ents: - print("ent", ent.text, ent.label_, ent.kb_id_) - - print() - for alias in ["douglassss", "rubbish", "adam"]: - candidates = nlp.linker.kb.get_candidates(alias) - print(len(candidates), "candidates for", alias, ":") - for c in candidates: - print(" ", c.entity_id_, c.entity_name_, c.alias_) - - -if __name__ == "__main__": - mykb = create_kb() - add_el(mykb) From 4820b43313f83fcbdc51eacbe270d6fa3d738214 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 21 Mar 2019 23:17:25 +0100 Subject: [PATCH 28/28] use nlp's vocab for stringstore --- examples/pipeline/dummy_entity_linking.py | 22 +++++++++--------- spacy/kb.pxd | 10 ++++---- spacy/kb.pyx | 20 ++++++++-------- spacy/tests/pipeline/test_el.py | 28 ++++++++++++++--------- 4 files changed, 43 insertions(+), 37 deletions(-) diff --git a/examples/pipeline/dummy_entity_linking.py b/examples/pipeline/dummy_entity_linking.py index c51f321e0..43d17c481 100644 --- a/examples/pipeline/dummy_entity_linking.py +++ b/examples/pipeline/dummy_entity_linking.py @@ -6,8 +6,8 @@ import spacy from spacy.kb import KnowledgeBase -def create_kb(): - kb = KnowledgeBase() +def create_kb(vocab): + kb = KnowledgeBase(vocab=vocab) # adding entities entity_0 = "Q1004791" @@ -25,11 +25,11 @@ def create_kb(): # adding aliases print() alias_0 = "Douglas" - print("adding alias", alias_0, "to all three entities") + print("adding alias", alias_0) kb.add_alias(alias=alias_0, entities=["Q1004791", "Q42", "Q5301561"], probabilities=[0.1, 0.6, 0.2]) alias_1 = "Douglas Adams" - print("adding alias", alias_1, "to just the one entity") + print("adding alias", alias_1) kb.add_alias(alias=alias_1, entities=["Q42"], probabilities=[0.9]) print() @@ -38,9 +38,7 @@ def create_kb(): return kb -def add_el(kb): - nlp = spacy.load('en_core_web_sm') - +def add_el(kb, nlp): el_pipe = nlp.create_pipe(name='el', config={"kb": kb}) nlp.add_pipe(el_pipe, last=True) @@ -49,10 +47,11 @@ def add_el(kb): print() print(len(candidates), "candidate(s) for", alias, ":") for c in candidates: - print(" ", c.entity_id_, c.entity_name_, c.alias_, c.prior_prob) + print(" ", c.entity_id_, c.entity_name_, c.prior_prob) text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ - "Douglas reminds us to always bring our towel." + "Douglas reminds us to always bring our towel. " \ + "The main character in Doug's novel is called Arthur Dent." doc = nlp(text) print() @@ -65,5 +64,6 @@ def add_el(kb): if __name__ == "__main__": - mykb = create_kb() - add_el(mykb) + nlp = spacy.load('en_core_web_sm') + my_kb = create_kb(nlp.vocab) + add_el(my_kb, nlp) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 490e05036..dc6701b89 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -4,7 +4,7 @@ from preshed.maps cimport PreshMap from libcpp.vector cimport vector from libc.stdint cimport int32_t, int64_t -from spacy.strings cimport StringStore +from spacy.vocab cimport Vocab from .typedefs cimport hash_t @@ -55,7 +55,7 @@ cdef class Candidate: cdef class KnowledgeBase: cdef Pool mem - cpdef readonly StringStore strings + cpdef readonly Vocab vocab # This maps 64bit keys (hash of unique entity string) # to 64bit values (position of the _EntryC struct in the _entries vector). @@ -133,11 +133,11 @@ cdef class KnowledgeBase: cf. https://github.com/explosion/preshed/issues/17 """ cdef int32_t dummy_value = 0 - self.strings.add("") + self.vocab.strings.add("") self._entries.push_back( _EntryC( - entity_id_hash=self.strings[""], - entity_name_hash=self.strings[""], + entity_id_hash=self.vocab.strings[""], + entity_name_hash=self.vocab.strings[""], vector_rows=&dummy_value, feats_row=dummy_value, prob=dummy_value diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 6d031fb91..186048a41 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -19,7 +19,7 @@ cdef class Candidate: property entity_id_: """RETURNS (unicode): ID of this entity in the KB""" def __get__(self): - return self.kb.strings[self.entity_id] + return self.kb.vocab.strings[self.entity_id] property entity_name: """RETURNS (uint64): hash of the entity's KB name""" @@ -30,7 +30,7 @@ cdef class Candidate: property entity_name_: """RETURNS (unicode): name of this entity in the KB""" def __get__(self): - return self.kb.strings[self.entity_name] + return self.kb.vocab.strings[self.entity_name] property alias: """RETURNS (uint64): hash of the alias""" @@ -40,7 +40,7 @@ cdef class Candidate: property alias_: """RETURNS (unicode): ID of the original alias""" def __get__(self): - return self.kb.strings[self.alias] + return self.kb.vocab.strings[self.alias] property prior_prob: def __get__(self): @@ -49,11 +49,11 @@ cdef class Candidate: cdef class KnowledgeBase: - def __init__(self): + def __init__(self, Vocab vocab): + self.vocab = vocab self._entry_index = PreshMap() self._alias_index = PreshMap() self.mem = Pool() - self.strings = StringStore() self._create_empty_vectors() def __len__(self): @@ -72,8 +72,8 @@ cdef class KnowledgeBase: """ if not entity_name: entity_name = entity_id - cdef hash_t id_hash = self.strings.add(entity_id) - cdef hash_t name_hash = self.strings.add(entity_name) + cdef hash_t id_hash = self.vocab.strings.add(entity_id) + cdef hash_t name_hash = self.vocab.strings.add(entity_name) # Return if this entity was added before if id_hash in self._entry_index: @@ -107,7 +107,7 @@ cdef class KnowledgeBase: raise ValueError("The sum of prior probabilities for alias '" + alias + "' should not exceed 1, " + "but found " + str(prob_sum)) - cdef hash_t alias_hash = self.strings.add(alias) + cdef hash_t alias_hash = self.vocab.strings.add(alias) # Return if this alias was added before if alias_hash in self._alias_index: @@ -120,7 +120,7 @@ cdef class KnowledgeBase: cdef vector[float] probs for entity, prob in zip(entities, probabilities): - entity_id_hash = self.strings[entity] + entity_id_hash = self.vocab.strings[entity] if not entity_id_hash in self._entry_index: raise ValueError("Alias '" + alias + "' defined for unknown entity '" + entity + "'") @@ -135,7 +135,7 @@ cdef class KnowledgeBase: def get_candidates(self, unicode alias): """ TODO: where to put this functionality ?""" - cdef hash_t alias_hash = self.strings[alias] + cdef hash_t alias_hash = self.vocab.strings[alias] alias_index = self._alias_index.get(alias_hash) alias_entry = self._aliases_table[alias_index] diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_el.py index 295b35cce..379661fc1 100644 --- a/spacy/tests/pipeline/test_el.py +++ b/spacy/tests/pipeline/test_el.py @@ -2,11 +2,17 @@ import pytest from spacy.kb import KnowledgeBase +from spacy.lang.en import English -def test_kb_valid_entities(): - """Test the valid construction of a KB with 3 entities and one alias""" - mykb = KnowledgeBase() +@pytest.fixture +def nlp(): + return English() + + +def test_kb_valid_entities(nlp): + """Test the valid construction of a KB with 3 entities and two aliases""" + mykb = KnowledgeBase(nlp.vocab) # adding entities mykb.add_entity(entity_id="Q1", prob=0.9) @@ -22,9 +28,9 @@ def test_kb_valid_entities(): assert(mykb.get_size_aliases() == 2) -def test_kb_invalid_entities(): +def test_kb_invalid_entities(nlp): """Test the invalid construction of a KB with an alias linked to a non-existing entity""" - mykb = KnowledgeBase() + mykb = KnowledgeBase(nlp.vocab) # adding entities mykb.add_entity(entity_id="Q1", prob=0.9) @@ -36,9 +42,9 @@ def test_kb_invalid_entities(): mykb.add_alias(alias="douglas", entities=["Q2", "Q342"], probabilities=[0.8, 0.2]) -def test_kb_invalid_probabilities(): +def test_kb_invalid_probabilities(nlp): """Test the invalid construction of a KB with wrong prior probabilities""" - mykb = KnowledgeBase() + mykb = KnowledgeBase(nlp.vocab) # adding entities mykb.add_entity(entity_id="Q1", prob=0.9) @@ -50,9 +56,9 @@ def test_kb_invalid_probabilities(): mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.8, 0.4]) -def test_kb_invalid_combination(): +def test_kb_invalid_combination(nlp): """Test the invalid construction of a KB with non-matching entity and probability lists""" - mykb = KnowledgeBase() + mykb = KnowledgeBase(nlp.vocab) # adding entities mykb.add_entity(entity_id="Q1", prob=0.9) @@ -64,9 +70,9 @@ def test_kb_invalid_combination(): mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.3, 0.4, 0.1]) -def test_candidate_generation(): +def test_candidate_generation(nlp): """Test correct candidate generation""" - mykb = KnowledgeBase() + mykb = KnowledgeBase(nlp.vocab) # adding entities mykb.add_entity(entity_id="Q1", prob=0.9)