annotate kb_id through ents in doc

This commit is contained in:
svlandeg 2019-03-14 15:48:40 +01:00
parent 173d45ec5f
commit 5f002e9ced
9 changed files with 52 additions and 27 deletions

View File

@ -3,18 +3,23 @@ import spacy
# Demo: add the 'el' pipe to the 'en_core_web_sm' pipeline, run it on a sample
# sentence, and print token-level and entity-level annotations.
# NOTE(review): this listing looks like a diff rendered without +/- markers, so
# superseded and replacement lines (e.g. the two `text = ...` assignments)
# appear back to back — confirm against the checked-in file before running.
def add_el():
nlp = spacy.load('en_core_web_sm')
print("pipes", nlp.pipe_names)
print("pipes before:", nlp.pipe_names)
# Create the 'el' component and append it as the last pipe.
el_pipe = nlp.create_pipe(name='el')
nlp.add_pipe(el_pipe, last=True)
print("pipes", nlp.pipe_names)
print("pipes after:", nlp.pipe_names)
print()
text = "Australian striker John hits century"
text = "The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, reminds us to always bring our towel."
doc = nlp(text)
# Token-level view of the annotations.
for token in doc:
print("token", token.text, token.tag_, token.pos_, token.kb_id)
print("token", token.text, token.ent_type_, token.ent_kb_id_)
print()
# Span-level view: text, label and KB ID of each entity in doc.ents.
for ent in doc.ents:
print("ent", ent.text, ent.label_, ent.kb_id_)
if __name__ == "__main__":

View File

@ -43,8 +43,6 @@ cdef class Morphology:
cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
cdef int assign_kb_id(self, TokenC* token, kb_id) except -1
cdef enum univ_morph_t:
NIL = 0

View File

@ -122,9 +122,6 @@ cdef class Morphology:
else:
flags[0] &= ~(one << flag_id)
# Write `kb_id` straight into the `kb_id` field of the given token struct.
# NOTE(review): the enclosing hunk header (9 lines -> 6) suggests this method
# is removed by the commit — confirm before relying on it.
cdef int assign_kb_id(self, TokenC* token, kb_id) except -1:
token.kb_id = kb_id
def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
force=False):
"""Add a special-case rule to the morphological analyser. Tokens whose

View File

@ -1086,8 +1086,10 @@ class EntityLinker(Pipe):
# Placeholder annotation step: writes hard-coded IDs onto the docs; `scores`
# and `tensors` are not read anywhere in the visible body.
# NOTE(review): the hunk header (8 lines -> 10) suggests the per-token
# `kb_id = 342` loop is the old code and the per-entity loop its replacement;
# both are shown here because the diff markers were lost — confirm.
def set_annotations(self, docs, scores, tensors=None):
# TODO Sofie: actually implement this class instead of dummy implementation
for i, doc in enumerate(docs):
# Stamps the constant 342 on every token's kb_id.
for token in doc:
token.kb_id = 342
# Tags every token inside each PERSON/PER entity with the KB ID "Q42".
for ent in doc.ents:
if ent.label_ in ["PERSON", "PER"]:
for token in ent:
token.ent_kb_id_ = "Q42"
def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
scores, bp_scores = self.model.begin_update(docs, drop=drop)

View File

@ -70,6 +70,5 @@ cdef struct TokenC:
int sent_start
int ent_iob
attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
attr_t ent_kb_id
hash_t ent_id
hash_t kb_id

View File

@ -279,7 +279,7 @@ cdef class Doc:
def doc(self):
return self
# NOTE(review): two signatures and two `Span(...)` constructions appear below,
# and `@ ...` hunk headers interrupt the body; this looks like a diff rendered
# without +/- markers, with a few lines between hunks not shown. Treat the
# variants that mention `kb_id` as the presumably current ones — confirm.
def char_span(self, int start_idx, int end_idx, label=0, vector=None):
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None):
"""Create a `Span` object from the slice `doc.text[start : end]`.
doc (Doc): The parent document.
@ -287,12 +287,15 @@ cdef class Doc:
end (int): The index of the first character after the span.
label (uint64 or string): A label to attach to the Span, e.g. for
named entities.
kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
the span.
RETURNS (Span): The newly constructed object.
"""
# Non-int `label` / `kb_id` values are replaced by whatever
# `vocab.strings.add` returns for them.
if not isinstance(label, int):
label = self.vocab.strings.add(label)
if not isinstance(kb_id, int):
kb_id = self.vocab.strings.add(kb_id)
# token_by_start yields -1 when no token matches start_idx; no Span is
# built in that case.
cdef int start = token_by_start(self.c, self.length, start_idx)
if start == -1:
return None
@ -301,7 +304,7 @@ cdef class Doc:
return None
# Currently we have the token index, we want the range-end index
end += 1
cdef Span span = Span(self, start, end, label=label, vector=vector)
cdef Span span = Span(self, start, end, label=label, kb_id=kb_id, vector=vector)
return span
def similarity(self, other):
@ -438,6 +441,7 @@ cdef class Doc:
cdef const TokenC* token
cdef int start = -1
cdef attr_t label = 0
cdef attr_t kb_id = 0
output = []
for i in range(self.length):
token = &self.c[i]
@ -447,16 +451,18 @@ cdef class Doc:
raise ValueError(Errors.E093.format(seq=' '.join(seq)))
elif token.ent_iob == 2 or token.ent_iob == 0:
if start != -1:
output.append(Span(self, start, i, label=label))
output.append(Span(self, start, i, label=label, kb_id=kb_id))
start = -1
label = 0
kb_id = 0
elif token.ent_iob == 3:
if start != -1:
output.append(Span(self, start, i, label=label))
output.append(Span(self, start, i, label=label, kb_id=kb_id))
start = i
label = token.ent_type
kb_id = token.ent_kb_id
if start != -1:
output.append(Span(self, start, self.length, label=label))
output.append(Span(self, start, self.length, label=label, kb_id=kb_id))
return tuple(output)
def __set__(self, ents):

View File

@ -11,6 +11,7 @@ cdef class Span:
cdef readonly int start_char
cdef readonly int end_char
cdef readonly attr_t label
cdef readonly attr_t kb_id
cdef public _vector
cdef public _vector_norm

View File

@ -45,13 +45,14 @@ cdef class Span:
return Underscore.span_extensions.pop(name)
def __cinit__(self, Doc doc, int start, int end, label=0,
vector=None, vector_norm=None):
vector=None, vector_norm=None, kb_id=0):
"""Create a `Span` object from the slice `doc[start : end]`.
doc (Doc): The parent document.
start (int): The index of the first token of the span.
end (int): The index of the first token after the span.
label (uint64): A label to attach to the Span, e.g. for named entities.
kb_id (uint64): An identifier from a Knowledge Base to capture the meaning of a named entity.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation
of the span.
RETURNS (Span): The newly constructed object.
@ -73,6 +74,7 @@ cdef class Span:
self.label = label
self._vector = vector
self._vector_norm = vector_norm
self.kb_id = kb_id
def __richcmp__(self, Span other, int op):
if other is None:
@ -592,6 +594,13 @@ cdef class Span:
def __set__(self, unicode label_):
self.label = self.doc.vocab.strings.add(label_)
# String counterpart of the span's `kb_id`; readable, but assignment is
# rejected.
property kb_id_:
"""RETURNS (unicode): The named entity's KB ID."""
# Looks the stored `kb_id` up in the parent doc's vocab string store.
def __get__(self):
return self.doc.vocab.strings[self.kb_id]
# Always raises NotImplementedError, using the TempErrors.T007 message
# formatted with attr='kb_id_'.
def __set__(self, unicode kb_id_):
raise NotImplementedError(TempErrors.T007.format(attr='kb_id_'))
cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
# Don't allow spaces to be the root, if there are

View File

@ -307,14 +307,6 @@ cdef class Token:
def __set__(self, attr_t tag):
self.vocab.morphology.assign_tag(self.c, tag)
# NOTE(review): the enclosing hunk header (14 lines -> 6) suggests this whole
# property is removed by the commit — confirm before relying on it.
property kb_id:
"""RETURNS (uint64): ID of entity (after Entity Linking)."""
# Reads the `kb_id` field of the underlying token struct.
def __get__(self):
return self.c.kb_id
# Delegates the write to the vocab's morphology object (`assign_kb_id`).
def __set__(self, attr_t kb_id):
self.vocab.morphology.assign_kb_id(self.c, kb_id)
property dep:
"""RETURNS (uint64): ID of syntactic dependency label."""
def __get__(self):
@ -699,6 +691,22 @@ cdef class Token:
def __set__(self, name):
self.c.ent_id = self.vocab.strings.add(name)
# Integer form of the token's entity KB ID; `ent_kb_id_` exposes the same
# struct field as a string.
property ent_kb_id:
"""RETURNS (uint64): Named entity KB ID."""
# Reads the `ent_kb_id` field of the underlying token struct.
def __get__(self):
return self.c.ent_kb_id
# Stores the given value in the struct field unchanged.
def __set__(self, attr_t ent_kb_id):
self.c.ent_kb_id = ent_kb_id
# String form of the token's entity KB ID, backed by the same struct field as
# `ent_kb_id`.
property ent_kb_id_:
"""RETURNS (unicode): Named entity KB ID."""
# Resolves the stored `ent_kb_id` through the vocab's string store.
def __get__(self):
return self.vocab.strings[self.c.ent_kb_id]
# Adds the given value to the vocab's string store and keeps the value
# returned by `add` in the struct field.
def __set__(self, ent_kb_id):
self.c.ent_kb_id = self.vocab.strings.add(ent_kb_id)
property whitespace_:
"""RETURNS (unicode): The trailing whitespace character, if present.
"""