mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
annotate kb_id through ents in doc
This commit is contained in:
parent
d849eb2455
commit
735fc2a735
|
@ -3,18 +3,23 @@ import spacy
|
||||||
|
|
||||||
def add_el():
|
def add_el():
|
||||||
nlp = spacy.load('en_core_web_sm')
|
nlp = spacy.load('en_core_web_sm')
|
||||||
print("pipes", nlp.pipe_names)
|
print("pipes before:", nlp.pipe_names)
|
||||||
|
|
||||||
el_pipe = nlp.create_pipe(name='el')
|
el_pipe = nlp.create_pipe(name='el')
|
||||||
nlp.add_pipe(el_pipe, last=True)
|
nlp.add_pipe(el_pipe, last=True)
|
||||||
|
|
||||||
print("pipes", nlp.pipe_names)
|
print("pipes after:", nlp.pipe_names)
|
||||||
print()
|
print()
|
||||||
|
|
||||||
text = "Australian striker John hits century"
|
text = "The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, reminds us to always bring our towel."
|
||||||
doc = nlp(text)
|
doc = nlp(text)
|
||||||
|
|
||||||
for token in doc:
|
for token in doc:
|
||||||
print("token", token.text, token.tag_, token.pos_, token.kb_id)
|
print("token", token.text, token.ent_type_, token.ent_kb_id_)
|
||||||
|
|
||||||
|
print()
|
||||||
|
for ent in doc.ents:
|
||||||
|
print("ent", ent.text, ent.label_, ent.kb_id_)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
@ -43,8 +43,6 @@ cdef class Morphology:
|
||||||
|
|
||||||
cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
|
cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
|
||||||
|
|
||||||
cdef int assign_kb_id(self, TokenC* token, kb_id) except -1
|
|
||||||
|
|
||||||
|
|
||||||
cdef enum univ_morph_t:
|
cdef enum univ_morph_t:
|
||||||
NIL = 0
|
NIL = 0
|
||||||
|
|
|
@ -123,9 +123,6 @@ cdef class Morphology:
|
||||||
else:
|
else:
|
||||||
flags[0] &= ~(one << flag_id)
|
flags[0] &= ~(one << flag_id)
|
||||||
|
|
||||||
cdef int assign_kb_id(self, TokenC* token, kb_id) except -1:
|
|
||||||
token.kb_id = kb_id
|
|
||||||
|
|
||||||
def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
|
def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
|
||||||
force=False):
|
force=False):
|
||||||
"""Add a special-case rule to the morphological analyser. Tokens whose
|
"""Add a special-case rule to the morphological analyser. Tokens whose
|
||||||
|
|
|
@ -1104,8 +1104,10 @@ class EntityLinker(Pipe):
|
||||||
def set_annotations(self, docs, scores, tensors=None):
|
def set_annotations(self, docs, scores, tensors=None):
|
||||||
# TODO Sofie: actually implement this class instead of dummy implementation
|
# TODO Sofie: actually implement this class instead of dummy implementation
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
for token in doc:
|
for ent in doc.ents:
|
||||||
token.kb_id = 342
|
if ent.label_ in ["PERSON", "PER"]:
|
||||||
|
for token in ent:
|
||||||
|
token.ent_kb_id_ = "Q42"
|
||||||
|
|
||||||
def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
|
def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
|
||||||
scores, bp_scores = self.model.begin_update(docs, drop=drop)
|
scores, bp_scores = self.model.begin_update(docs, drop=drop)
|
||||||
|
|
|
@ -70,6 +70,5 @@ cdef struct TokenC:
|
||||||
int sent_start
|
int sent_start
|
||||||
int ent_iob
|
int ent_iob
|
||||||
attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
|
attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
|
||||||
|
attr_t ent_kb_id
|
||||||
hash_t ent_id
|
hash_t ent_id
|
||||||
|
|
||||||
hash_t kb_id
|
|
||||||
|
|
|
@ -326,7 +326,7 @@ cdef class Doc:
|
||||||
def doc(self):
|
def doc(self):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def char_span(self, int start_idx, int end_idx, label=0, vector=None):
|
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None):
|
||||||
"""Create a `Span` object from the slice `doc.text[start : end]`.
|
"""Create a `Span` object from the slice `doc.text[start : end]`.
|
||||||
|
|
||||||
doc (Doc): The parent document.
|
doc (Doc): The parent document.
|
||||||
|
@ -334,6 +334,7 @@ cdef class Doc:
|
||||||
end (int): The index of the first character after the span.
|
end (int): The index of the first character after the span.
|
||||||
label (uint64 or string): A label to attach to the Span, e.g. for
|
label (uint64 or string): A label to attach to the Span, e.g. for
|
||||||
named entities.
|
named entities.
|
||||||
|
kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity.
|
||||||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
|
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
|
||||||
the span.
|
the span.
|
||||||
RETURNS (Span): The newly constructed object.
|
RETURNS (Span): The newly constructed object.
|
||||||
|
@ -342,6 +343,8 @@ cdef class Doc:
|
||||||
"""
|
"""
|
||||||
if not isinstance(label, int):
|
if not isinstance(label, int):
|
||||||
label = self.vocab.strings.add(label)
|
label = self.vocab.strings.add(label)
|
||||||
|
if not isinstance(kb_id, int):
|
||||||
|
kb_id = self.vocab.strings.add(kb_id)
|
||||||
cdef int start = token_by_start(self.c, self.length, start_idx)
|
cdef int start = token_by_start(self.c, self.length, start_idx)
|
||||||
if start == -1:
|
if start == -1:
|
||||||
return None
|
return None
|
||||||
|
@ -350,7 +353,7 @@ cdef class Doc:
|
||||||
return None
|
return None
|
||||||
# Currently we have the token index, we want the range-end index
|
# Currently we have the token index, we want the range-end index
|
||||||
end += 1
|
end += 1
|
||||||
cdef Span span = Span(self, start, end, label=label, vector=vector)
|
cdef Span span = Span(self, start, end, label=label, kb_id=kb_id, vector=vector)
|
||||||
return span
|
return span
|
||||||
|
|
||||||
def similarity(self, other):
|
def similarity(self, other):
|
||||||
|
@ -484,6 +487,7 @@ cdef class Doc:
|
||||||
cdef const TokenC* token
|
cdef const TokenC* token
|
||||||
cdef int start = -1
|
cdef int start = -1
|
||||||
cdef attr_t label = 0
|
cdef attr_t label = 0
|
||||||
|
cdef attr_t kb_id = 0
|
||||||
output = []
|
output = []
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
token = &self.c[i]
|
token = &self.c[i]
|
||||||
|
@ -493,16 +497,18 @@ cdef class Doc:
|
||||||
raise ValueError(Errors.E093.format(seq=" ".join(seq)))
|
raise ValueError(Errors.E093.format(seq=" ".join(seq)))
|
||||||
elif token.ent_iob == 2 or token.ent_iob == 0:
|
elif token.ent_iob == 2 or token.ent_iob == 0:
|
||||||
if start != -1:
|
if start != -1:
|
||||||
output.append(Span(self, start, i, label=label))
|
output.append(Span(self, start, i, label=label, kb_id=kb_id))
|
||||||
start = -1
|
start = -1
|
||||||
label = 0
|
label = 0
|
||||||
|
kb_id = 0
|
||||||
elif token.ent_iob == 3:
|
elif token.ent_iob == 3:
|
||||||
if start != -1:
|
if start != -1:
|
||||||
output.append(Span(self, start, i, label=label))
|
output.append(Span(self, start, i, label=label, kb_id=kb_id))
|
||||||
start = i
|
start = i
|
||||||
label = token.ent_type
|
label = token.ent_type
|
||||||
|
kb_id = token.ent_kb_id
|
||||||
if start != -1:
|
if start != -1:
|
||||||
output.append(Span(self, start, self.length, label=label))
|
output.append(Span(self, start, self.length, label=label, kb_id=kb_id))
|
||||||
return tuple(output)
|
return tuple(output)
|
||||||
|
|
||||||
def __set__(self, ents):
|
def __set__(self, ents):
|
||||||
|
|
|
@ -11,6 +11,7 @@ cdef class Span:
|
||||||
cdef readonly int start_char
|
cdef readonly int start_char
|
||||||
cdef readonly int end_char
|
cdef readonly int end_char
|
||||||
cdef readonly attr_t label
|
cdef readonly attr_t label
|
||||||
|
cdef readonly attr_t kb_id
|
||||||
|
|
||||||
cdef public _vector
|
cdef public _vector
|
||||||
cdef public _vector_norm
|
cdef public _vector_norm
|
||||||
|
|
|
@ -85,13 +85,14 @@ cdef class Span:
|
||||||
return Underscore.span_extensions.pop(name)
|
return Underscore.span_extensions.pop(name)
|
||||||
|
|
||||||
def __cinit__(self, Doc doc, int start, int end, label=0, vector=None,
|
def __cinit__(self, Doc doc, int start, int end, label=0, vector=None,
|
||||||
vector_norm=None):
|
vector_norm=None, kb_id=0):
|
||||||
"""Create a `Span` object from the slice `doc[start : end]`.
|
"""Create a `Span` object from the slice `doc[start : end]`.
|
||||||
|
|
||||||
doc (Doc): The parent document.
|
doc (Doc): The parent document.
|
||||||
start (int): The index of the first token of the span.
|
start (int): The index of the first token of the span.
|
||||||
end (int): The index of the first token after the span.
|
end (int): The index of the first token after the span.
|
||||||
label (uint64): A label to attach to the Span, e.g. for named entities.
|
label (uint64): A label to attach to the Span, e.g. for named entities.
|
||||||
|
kb_id (uint64): An identifier from a Knowledge Base to capture the meaning of a named entity.
|
||||||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation
|
vector (ndarray[ndim=1, dtype='float32']): A meaning representation
|
||||||
of the span.
|
of the span.
|
||||||
RETURNS (Span): The newly constructed object.
|
RETURNS (Span): The newly constructed object.
|
||||||
|
@ -115,6 +116,7 @@ cdef class Span:
|
||||||
self.label = label
|
self.label = label
|
||||||
self._vector = vector
|
self._vector = vector
|
||||||
self._vector_norm = vector_norm
|
self._vector_norm = vector_norm
|
||||||
|
self.kb_id = kb_id
|
||||||
|
|
||||||
def __richcmp__(self, Span other, int op):
|
def __richcmp__(self, Span other, int op):
|
||||||
if other is None:
|
if other is None:
|
||||||
|
@ -655,6 +657,13 @@ cdef class Span:
|
||||||
label_ = ''
|
label_ = ''
|
||||||
raise NotImplementedError(Errors.E129.format(start=self.start, end=self.end, label=label_))
|
raise NotImplementedError(Errors.E129.format(start=self.start, end=self.end, label=label_))
|
||||||
|
|
||||||
|
property kb_id_:
|
||||||
|
"""RETURNS (unicode): The named entity's KB ID."""
|
||||||
|
def __get__(self):
|
||||||
|
return self.doc.vocab.strings[self.kb_id]
|
||||||
|
def __set__(self, unicode kb_id_):
|
||||||
|
raise NotImplementedError(TempErrors.T007.format(attr='kb_id_'))
|
||||||
|
|
||||||
|
|
||||||
cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
|
cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
|
||||||
# Don't allow spaces to be the root, if there are
|
# Don't allow spaces to be the root, if there are
|
||||||
|
|
|
@ -354,14 +354,6 @@ cdef class Token:
|
||||||
def __set__(self, attr_t tag):
|
def __set__(self, attr_t tag):
|
||||||
self.vocab.morphology.assign_tag(self.c, tag)
|
self.vocab.morphology.assign_tag(self.c, tag)
|
||||||
|
|
||||||
property kb_id:
|
|
||||||
"""RETURNS (uint64): ID of entity (after Entity Linking)."""
|
|
||||||
def __get__(self):
|
|
||||||
return self.c.kb_id
|
|
||||||
|
|
||||||
def __set__(self, attr_t kb_id):
|
|
||||||
self.vocab.morphology.assign_kb_id(self.c, kb_id)
|
|
||||||
|
|
||||||
property dep:
|
property dep:
|
||||||
"""RETURNS (uint64): ID of syntactic dependency label."""
|
"""RETURNS (uint64): ID of syntactic dependency label."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
@ -777,6 +769,22 @@ cdef class Token:
|
||||||
def __set__(self, name):
|
def __set__(self, name):
|
||||||
self.c.ent_id = self.vocab.strings.add(name)
|
self.c.ent_id = self.vocab.strings.add(name)
|
||||||
|
|
||||||
|
property ent_kb_id:
|
||||||
|
"""RETURNS (uint64): Named entity KB ID."""
|
||||||
|
def __get__(self):
|
||||||
|
return self.c.ent_kb_id
|
||||||
|
|
||||||
|
def __set__(self, attr_t ent_kb_id):
|
||||||
|
self.c.ent_kb_id = ent_kb_id
|
||||||
|
|
||||||
|
property ent_kb_id_:
|
||||||
|
"""RETURNS (unicode): Named entity KB ID."""
|
||||||
|
def __get__(self):
|
||||||
|
return self.vocab.strings[self.c.ent_kb_id]
|
||||||
|
|
||||||
|
def __set__(self, ent_kb_id):
|
||||||
|
self.c.ent_kb_id = self.vocab.strings.add(ent_kb_id)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def whitespace_(self):
|
def whitespace_(self):
|
||||||
"""RETURNS (unicode): The trailing whitespace character, if present."""
|
"""RETURNS (unicode): The trailing whitespace character, if present."""
|
||||||
|
|
Loading…
Reference in New Issue
Block a user