mirror of https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00

adding kb_id as field to token, el as nlp pipeline component

This commit is contained in:
parent d811c97da1
commit d849eb2455

sandbox_test_sofie/__init__.py    new file, 0 lines
sandbox_test_sofie/testing_el.py  new file, 21 lines

sandbox_test_sofie/testing_el.py
@@ -0,0 +1,21 @@
+import spacy
+
+
+def add_el():
+    nlp = spacy.load('en_core_web_sm')
+    print("pipes", nlp.pipe_names)
+
+    el_pipe = nlp.create_pipe(name='el')
+    nlp.add_pipe(el_pipe, last=True)
+
+    print("pipes", nlp.pipe_names)
+    print()
+
+    text = "Australian striker John hits century"
+    doc = nlp(text)
+    for token in doc:
+        print("token", token.text, token.tag_, token.pos_, token.kb_id)
+
+
+if __name__ == "__main__":
+    add_el()
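
With the placeholder EntityLinker wired in further down in this commit, set_annotations() simply stamps the dummy value 342 on every token, so running this script against a build of this branch should print roughly the following (the exact tag and POS values depend on the en_core_web_sm model and are only illustrative):

    pipes ['tagger', 'parser', 'ner']
    pipes ['tagger', 'parser', 'ner', 'el']

    token Australian JJ ADJ 342
    token striker NN NOUN 342
    token John NNP PROPN 342
    token hits VBZ VERB 342
    token century NN NOUN 342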

spacy/language.py
@@ -14,7 +14,7 @@ import srsly
 from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .lemmatizer import Lemmatizer
-from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
+from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer, EntityLinker
 from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
 from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
 from .pipeline import EntityRuler

@@ -117,6 +117,7 @@ class Language(object):
         "tagger": lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
         "parser": lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
         "ner": lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
+        "el": lambda nlp, **cfg: EntityLinker(nlp.vocab, **cfg),
         "similarity": lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
         "textcat": lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg),
         "sentencizer": lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
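
For context, a minimal sketch of how this factory entry is reached (assuming spaCy 2.x behaviour, where Language.create_pipe() looks the string name up in this factories dict and calls the lambda with the nlp object):

    import spacy
    from spacy.pipeline import EntityLinker

    nlp = spacy.load("en_core_web_sm")

    # create_pipe("el") resolves through Language.factories["el"], so the two
    # lines below construct equivalent components.
    el_pipe = nlp.create_pipe("el")
    # el_pipe = EntityLinker(nlp.vocab)

    nlp.add_pipe(el_pipe, last=True)   # pipe name comes from EntityLinker.name == 'el'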

spacy/morphology.pxd
@@ -43,6 +43,8 @@ cdef class Morphology:
 
     cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
 
+    cdef int assign_kb_id(self, TokenC* token, kb_id) except -1
+
 
 cdef enum univ_morph_t:
     NIL = 0

spacy/morphology.pyx
@@ -123,6 +123,9 @@ cdef class Morphology:
         else:
             flags[0] &= ~(one << flag_id)
 
+    cdef int assign_kb_id(self, TokenC* token, kb_id) except -1:
+        token.kb_id = kb_id
+
     def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
                          force=False):
         """Add a special-case rule to the morphological analyser. Tokens whose

spacy/pipeline/__init__.py
@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from .pipes import Tagger, DependencyParser, EntityRecognizer
+from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
 from .pipes import TextCategorizer, Tensorizer, Pipe
 from .entityruler import EntityRuler
 from .hooks import SentenceSegmenter, SimilarityHook

@@ -11,6 +11,7 @@ __all__ = [
     "Tagger",
     "DependencyParser",
     "EntityRecognizer",
+    "EntityLinker",
     "TextCategorizer",
     "Tensorizer",
     "Pipe",

spacy/pipeline/pipes.pyx
@@ -1058,4 +1058,116 @@ cdef class EntityRecognizer(Parser):
                         if move[0] in ("B", "I", "L", "U")))
 
 
-__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer"]
+class EntityLinker(Pipe):
+    name = 'el'
+
+    @classmethod
+    def Model(cls, nr_class=1, **cfg):
+        embed_size = util.env_opt("embed_size", 2000)
+        if "token_vector_width" in cfg:
+            token_vector_width = cfg["token_vector_width"]
+        else:
+            token_vector_width = util.env_opt("token_vector_width", 96)
+        if cfg.get('architecture') == 'simple_cnn':
+            tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg)
+            return None  # build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg)
+        else:
+            return None  # build_text_classifier(nr_class, **cfg)
+
+    def __init__(self, vocab, model=True, **cfg):
+        self.vocab = vocab
+        self.model = model
+        self._rehearsal_model = None
+        self.cfg = dict(cfg)
+
+    def __call__(self, doc):
+        # scores, tensors = self.predict([doc])
+        scores, tensors = None, None
+        self.set_annotations([doc], scores, tensors=tensors)
+        return doc
+
+    def pipe(self, stream, batch_size=128, n_threads=-1):
+        for docs in util.minibatch(stream, size=batch_size):
+            docs = list(docs)
+            scores, tensors = self.predict(docs)
+            self.set_annotations(docs, scores, tensors=tensors)
+            yield from docs
+
+    def predict(self, docs):
+        # self.require_model()
+        scores = self.model(docs)
+        scores = self.model.ops.asarray(scores)
+        tensors = [doc.tensor for doc in docs]
+        return scores, tensors
+
+    def set_annotations(self, docs, scores, tensors=None):
+        # TODO Sofie: actually implement this class instead of dummy implementation
+        for i, doc in enumerate(docs):
+            for token in doc:
+                token.kb_id = 342
+
+    def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
+        scores, bp_scores = self.model.begin_update(docs, drop=drop)
+        loss, d_scores = self.get_loss(docs, golds, scores)
+        bp_scores(d_scores, sgd=sgd)
+        if losses is not None:
+            losses.setdefault(self.name, 0.0)
+            losses[self.name] += loss
+
+    def rehearse(self, docs, drop=0., sgd=None, losses=None):
+        if self._rehearsal_model is None:
+            return
+        scores, bp_scores = self.model.begin_update(docs, drop=drop)
+        target = self._rehearsal_model(docs)
+        gradient = scores - target
+        bp_scores(gradient, sgd=sgd)
+        if losses is not None:
+            losses.setdefault(self.name, 0.0)
+            losses[self.name] += (gradient**2).sum()
+
+    def get_loss(self, docs, golds, scores):
+        truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
+        not_missing = numpy.ones((len(golds), len(self.labels)), dtype='f')
+        for i, gold in enumerate(golds):
+            for j, label in enumerate(self.labels):
+                if label in gold.cats:
+                    truths[i, j] = gold.cats[label]
+                else:
+                    not_missing[i, j] = 0.
+        truths = self.model.ops.asarray(truths)
+        not_missing = self.model.ops.asarray(not_missing)
+        d_scores = (scores-truths) / scores.shape[0]
+        d_scores *= not_missing
+        mean_square_error = (d_scores**2).sum(axis=1).mean()
+        return float(mean_square_error), d_scores
+
+    def add_label(self, label):
+        if label in self.labels:
+            return 0
+        if self.model not in (None, True, False):
+            # This functionality was available previously, but was broken.
+            # The problem is that we resize the last layer, but the last layer
+            # is actually just an ensemble. We're not resizing the child layers
+            # -- a huge problem.
+            raise ValueError(Errors.E116)
+            #smaller = self.model._layers[-1]
+            #larger = Affine(len(self.labels)+1, smaller.nI)
+            #copy_array(larger.W[:smaller.nO], smaller.W)
+            #copy_array(larger.b[:smaller.nO], smaller.b)
+            #self.model._layers[-1] = larger
+        self.labels = tuple(list(self.labels) + [label])
+        return 1
+
+    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
+                       **kwargs):
+        if self.model is True:
+            self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
+            self.model = self.Model(len(self.labels), **self.cfg)
+            link_vectors_to_models(self.vocab)
+        if sgd is None:
+            sgd = self.create_optimizer()
+        return sgd
+
+
+__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker"]
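
A minimal sketch of what this placeholder class does end to end, assuming a local build of this branch: calling the component on a Doc skips predict() entirely and lets the dummy set_annotations() stamp kb_id 342 on every token.

    import spacy
    from spacy.pipeline import EntityLinker  # exported by this commit

    nlp = spacy.load("en_core_web_sm")
    el = EntityLinker(nlp.vocab)

    doc = el(nlp("Australian striker John hits century"))
    print(set(token.kb_id for token in doc))  # {342} -- the hard-coded dummy value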

spacy/structs.pxd
@@ -71,3 +71,5 @@ cdef struct TokenC:
     int ent_iob
     attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
     hash_t ent_id
+
+    hash_t kb_id

spacy/tokens/token.pyx
@@ -354,6 +354,14 @@ cdef class Token:
         def __set__(self, attr_t tag):
             self.vocab.morphology.assign_tag(self.c, tag)
 
+    property kb_id:
+        """RETURNS (uint64): ID of entity (after Entity Linking)."""
+        def __get__(self):
+            return self.c.kb_id
+
+        def __set__(self, attr_t kb_id):
+            self.vocab.morphology.assign_kb_id(self.c, kb_id)
+
     property dep:
         """RETURNS (uint64): ID of syntactic dependency label."""
         def __get__(self):
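
A short sketch of how the new attribute could be used from Python. The Wikidata-style identifier and the round trip through the StringStore are illustrative assumptions, not part of this commit; the commit itself only stores raw uint64 values (the sandbox script and the dummy pipe use plain integers like 342).

    import spacy

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("Douglas Adams wrote it")

    # kb_id is a plain uint64 slot on the underlying TokenC struct, so a readable
    # identifier such as "Q42" has to be hashed, e.g. via the shared StringStore.
    doc[0].kb_id = nlp.vocab.strings.add("Q42")
    print(doc[0].kb_id)                      # the 64-bit hash
    print(nlp.vocab.strings[doc[0].kb_id])   # -> 'Q42'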