adding kb_id as field to token, el as nlp pipeline component

svlandeg 2019-03-06 19:34:18 +01:00
parent d811c97da1
commit d849eb2455
9 changed files with 153 additions and 3 deletions

@@ -0,0 +1,21 @@
+import spacy
+
+
+def add_el():
+    nlp = spacy.load('en_core_web_sm')
+    print("pipes", nlp.pipe_names)
+
+    el_pipe = nlp.create_pipe(name='el')
+    nlp.add_pipe(el_pipe, last=True)
+    print("pipes", nlp.pipe_names)
+    print()
+
+    text = "Australian striker John hits century"
+    doc = nlp(text)
+
+    for token in doc:
+        print("token", token.text, token.tag_, token.pos_, token.kb_id)
+
+
+if __name__ == "__main__":
+    add_el()
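For reference, with the placeholder set_annotations in this commit (which stamps kb_id 342 on every token; see pipes.pyx below), the script should print something along these lines. The exact tags depend on the en_core_web_sm model version:

    pipes ['tagger', 'parser', 'ner']
    pipes ['tagger', 'parser', 'ner', 'el']

    token Australian JJ ADJ 342
    token striker NN NOUN 342
    token John NNP PROPN 342
    token hits VBZ VERB 342
    token century NN NOUN 342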

spacy/language.py

@@ -14,7 +14,7 @@ import srsly
 from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .lemmatizer import Lemmatizer
-from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
+from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer, EntityLinker
 from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
 from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
 from .pipeline import EntityRuler
@@ -117,6 +117,7 @@ class Language(object):
         "tagger": lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
         "parser": lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
         "ner": lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
+        "el": lambda nlp, **cfg: EntityLinker(nlp.vocab, **cfg),
         "similarity": lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
         "textcat": lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg),
         "sentencizer": lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),

spacy/morphology.pxd

@@ -43,6 +43,8 @@ cdef class Morphology:

     cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1

+    cdef int assign_kb_id(self, TokenC* token, kb_id) except -1
+

 cdef enum univ_morph_t:
     NIL = 0

spacy/morphology.pyx

@@ -123,6 +123,9 @@ cdef class Morphology:
         else:
             flags[0] &= ~(one << flag_id)

+    cdef int assign_kb_id(self, TokenC* token, kb_id) except -1:
+        token.kb_id = kb_id
+
     def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
                          force=False):
         """Add a special-case rule to the morphological analyser. Tokens whose

spacy/pipeline/__init__.py

@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

-from .pipes import Tagger, DependencyParser, EntityRecognizer
+from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
 from .pipes import TextCategorizer, Tensorizer, Pipe
 from .entityruler import EntityRuler
 from .hooks import SentenceSegmenter, SimilarityHook
@@ -11,6 +11,7 @@ __all__ = [
     "Tagger",
     "DependencyParser",
     "EntityRecognizer",
+    "EntityLinker",
     "TextCategorizer",
     "Tensorizer",
     "Pipe",

spacy/pipeline/pipes.pyx

@@ -1058,4 +1058,116 @@ cdef class EntityRecognizer(Parser):
                                 if move[0] in ("B", "I", "L", "U")))

-__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer"]
+
+class EntityLinker(Pipe):
+    name = 'el'
+
+    @classmethod
+    def Model(cls, nr_class=1, **cfg):
+        embed_size = util.env_opt("embed_size", 2000)
+        if "token_vector_width" in cfg:
+            token_vector_width = cfg["token_vector_width"]
+        else:
+            token_vector_width = util.env_opt("token_vector_width", 96)
+        if cfg.get('architecture') == 'simple_cnn':
+            tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg)
+            return None  # build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg)
+        else:
+            return None  # build_text_classifier(nr_class, **cfg)
+
+    def __init__(self, vocab, model=True, **cfg):
+        self.vocab = vocab
+        self.model = model
+        self._rehearsal_model = None
+        self.cfg = dict(cfg)
+
+    def __call__(self, doc):
+        # scores, tensors = self.predict([doc])
+        scores, tensors = None, None
+        self.set_annotations([doc], scores, tensors=tensors)
+        return doc
+
+    def pipe(self, stream, batch_size=128, n_threads=-1):
+        for docs in util.minibatch(stream, size=batch_size):
+            docs = list(docs)
+            scores, tensors = self.predict(docs)
+            self.set_annotations(docs, scores, tensors=tensors)
+            yield from docs
+
+    def predict(self, docs):
+        # self.require_model()
+        scores = self.model(docs)
+        scores = self.model.ops.asarray(scores)
+        tensors = [doc.tensor for doc in docs]
+        return scores, tensors
+
+    def set_annotations(self, docs, scores, tensors=None):
+        # TODO Sofie: actually implement this class instead of dummy implementation
+        for i, doc in enumerate(docs):
+            for token in doc:
+                token.kb_id = 342
+
+    def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
+        scores, bp_scores = self.model.begin_update(docs, drop=drop)
+        loss, d_scores = self.get_loss(docs, golds, scores)
+        bp_scores(d_scores, sgd=sgd)
+        if losses is not None:
+            losses.setdefault(self.name, 0.0)
+            losses[self.name] += loss
+
+    def rehearse(self, docs, drop=0., sgd=None, losses=None):
+        if self._rehearsal_model is None:
+            return
+        scores, bp_scores = self.model.begin_update(docs, drop=drop)
+        target = self._rehearsal_model(docs)
+        gradient = scores - target
+        bp_scores(gradient, sgd=sgd)
+        if losses is not None:
+            losses.setdefault(self.name, 0.0)
+            losses[self.name] += (gradient**2).sum()
+
+    def get_loss(self, docs, golds, scores):
+        truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
+        not_missing = numpy.ones((len(golds), len(self.labels)), dtype='f')
+        for i, gold in enumerate(golds):
+            for j, label in enumerate(self.labels):
+                if label in gold.cats:
+                    truths[i, j] = gold.cats[label]
+                else:
+                    not_missing[i, j] = 0.
+        truths = self.model.ops.asarray(truths)
+        not_missing = self.model.ops.asarray(not_missing)
+        d_scores = (scores-truths) / scores.shape[0]
+        d_scores *= not_missing
+        mean_square_error = (d_scores**2).sum(axis=1).mean()
+        return float(mean_square_error), d_scores
+
+    def add_label(self, label):
+        if label in self.labels:
+            return 0
+        if self.model not in (None, True, False):
+            # This functionality was available previously, but was broken.
+            # The problem is that we resize the last layer, but the last layer
+            # is actually just an ensemble. We're not resizing the child layers
+            # -- a huge problem.
+            raise ValueError(Errors.E116)
+            # smaller = self.model._layers[-1]
+            # larger = Affine(len(self.labels)+1, smaller.nI)
+            # copy_array(larger.W[:smaller.nO], smaller.W)
+            # copy_array(larger.b[:smaller.nO], smaller.b)
+            # self.model._layers[-1] = larger
+        self.labels = tuple(list(self.labels) + [label])
+        return 1
+
+    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
+                       **kwargs):
+        if self.model is True:
+            self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
+            self.model = self.Model(len(self.labels), **self.cfg)
+            link_vectors_to_models(self.vocab)
+        if sgd is None:
+            sgd = self.create_optimizer()
+        return sgd
+
+
+__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker"]

spacy/structs.pxd

@@ -71,3 +71,5 @@ cdef struct TokenC:
     int ent_iob
     attr_t ent_type  # TODO: Is there a better way to do this? Multiple sources of truth..
     hash_t ent_id
+
+    hash_t kb_id
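kb_id is stored as hash_t, the same unsigned 64-bit type as ent_id, so a string identifier from a knowledge base would be interned through the StringStore. A sketch of the round trip via the Token.kb_id property added below, assuming this branch is installed; the "Q41319" ID is just an illustrative placeholder:

    import spacy

    nlp = spacy.blank('en')
    doc = nlp("Byron")
    doc[0].kb_id = doc.vocab.strings.add("Q41319")  # store the 64-bit hash of the ID
    assert doc.vocab.strings[doc[0].kb_id] == "Q41319"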

spacy/tokens/token.pyx

@@ -354,6 +354,14 @@ cdef class Token:
         def __set__(self, attr_t tag):
             self.vocab.morphology.assign_tag(self.c, tag)

+    property kb_id:
+        """RETURNS (uint64): ID of entity (after Entity Linking)."""
+        def __get__(self):
+            return self.c.kb_id
+
+        def __set__(self, attr_t kb_id):
+            self.vocab.morphology.assign_kb_id(self.c, kb_id)
+
     property dep:
         """RETURNS (uint64): ID of syntactic dependency label."""
         def __get__(self):