diff --git a/sandbox_test_sofie/__init__.py b/sandbox_test_sofie/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/sandbox_test_sofie/testing_el.py b/sandbox_test_sofie/testing_el.py
new file mode 100644
index 000000000..8d9b0c21d
--- /dev/null
+++ b/sandbox_test_sofie/testing_el.py
@@ -0,0 +1,21 @@
+import spacy
+
+
+def add_el():
+    nlp = spacy.load('en_core_web_sm')
+    print("pipes", nlp.pipe_names)
+
+    el_pipe = nlp.create_pipe(name='el')
+    nlp.add_pipe(el_pipe, last=True)
+
+    print("pipes", nlp.pipe_names)
+    print()
+
+    text = "Australian striker John hits century"
+    doc = nlp(text)
+    for token in doc:
+        print("token", token.text, token.tag_, token.pos_, token.kb_id)
+
+
+if __name__ == "__main__":
+    add_el()
diff --git a/spacy/language.py b/spacy/language.py
index 0c0cf8854..736899341 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -14,7 +14,7 @@ import srsly
 from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .lemmatizer import Lemmatizer
-from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
+from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer, EntityLinker
 from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
 from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
 from .pipeline import EntityRuler
@@ -114,6 +114,7 @@ class Language(object):
         "tagger": lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
         "parser": lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
         "ner": lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
+        "el": lambda nlp, **cfg: EntityLinker(nlp.vocab, **cfg),
         "similarity": lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
         "textcat": lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg),
         "sentencizer": lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index d0110b300..d674140b0 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -43,6 +43,8 @@ cdef class Morphology:
     cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id,
                             bint value) except -1
 
+    cdef int assign_kb_id(self, TokenC* token, kb_id) except -1
+
 
 cdef enum univ_morph_t:
     NIL = 0
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index bd821d76f..92ca67f18 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -122,6 +122,9 @@ cdef class Morphology:
         else:
             flags[0] &= ~(one << flag_id)
 
+    cdef int assign_kb_id(self, TokenC* token, kb_id) except -1:
+        token.kb_id = kb_id
+
     def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
                          force=False):
         """Add a special-case rule to the morphological analyser. Tokens whose
diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index d683cc989..170cc5ba7 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from .pipes import Tagger, DependencyParser, EntityRecognizer  # noqa
+from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker  # noqa
 from .pipes import TextCategorizer, Tensorizer, Pipe  # noqa
 from .entityruler import EntityRuler  # noqa
 from .hooks import SentenceSegmenter, SimilarityHook  # noqa
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index bde794e75..4eb3ecc80 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1040,4 +1040,116 @@
             if move[0] in ("B", "I", "L", "U")))
 
 
-__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer']
+class EntityLinker(Pipe):
+    name = 'el'
+
+    @classmethod
+    def Model(cls, nr_class=1, **cfg):
+        embed_size = util.env_opt("embed_size", 2000)
+        if "token_vector_width" in cfg:
+            token_vector_width = cfg["token_vector_width"]
+        else:
+            token_vector_width = util.env_opt("token_vector_width", 96)
+        if cfg.get('architecture') == 'simple_cnn':
+            tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg)
+            return None # build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg)
+        else:
+            return None # build_text_classifier(nr_class, **cfg)
+
+
+    def __init__(self, vocab, model=True, **cfg):
+        self.vocab = vocab
+        self.model = model
+        self._rehearsal_model = None
+        self.cfg = dict(cfg)
+
+    def __call__(self, doc):
+        # scores, tensors = self.predict([doc])
+        scores, tensors = None, None
+        self.set_annotations([doc], scores, tensors=tensors)
+        return doc
+
+    def pipe(self, stream, batch_size=128, n_threads=-1):
+        for docs in util.minibatch(stream, size=batch_size):
+            docs = list(docs)
+            scores, tensors = self.predict(docs)
+            self.set_annotations(docs, scores, tensors=tensors)
+            yield from docs
+
+    def predict(self, docs):
+        # self.require_model()
+        scores = self.model(docs)
+        scores = self.model.ops.asarray(scores)
+        tensors = [doc.tensor for doc in docs]
+        return scores, tensors
+
+    def set_annotations(self, docs, scores, tensors=None):
+        # TODO Sofie: actually implement this class instead of dummy implementation
+        for i, doc in enumerate(docs):
+            for token in doc:
+                token.kb_id = 342
+
+    def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
+        scores, bp_scores = self.model.begin_update(docs, drop=drop)
+        loss, d_scores = self.get_loss(docs, golds, scores)
+        bp_scores(d_scores, sgd=sgd)
+        if losses is not None:
+            losses.setdefault(self.name, 0.0)
+            losses[self.name] += loss
+
+    def rehearse(self, docs, drop=0., sgd=None, losses=None):
+        if self._rehearsal_model is None:
+            return
+        scores, bp_scores = self.model.begin_update(docs, drop=drop)
+        target = self._rehearsal_model(docs)
+        gradient = scores - target
+        bp_scores(gradient, sgd=sgd)
+        if losses is not None:
+            losses.setdefault(self.name, 0.0)
+            losses[self.name] += (gradient**2).sum()
+
+    def get_loss(self, docs, golds, scores):
+        truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
+        not_missing = numpy.ones((len(golds), len(self.labels)), dtype='f')
+        for i, gold in enumerate(golds):
+            for j, label in enumerate(self.labels):
+                if label in gold.cats:
+                    truths[i, j] = gold.cats[label]
+                else:
+                    not_missing[i, j] = 0.
+        truths = self.model.ops.asarray(truths)
+        not_missing = self.model.ops.asarray(not_missing)
+        d_scores = (scores-truths) / scores.shape[0]
+        d_scores *= not_missing
+        mean_square_error = (d_scores**2).sum(axis=1).mean()
+        return float(mean_square_error), d_scores
+
+    def add_label(self, label):
+        if label in self.labels:
+            return 0
+        if self.model not in (None, True, False):
+            # This functionality was available previously, but was broken.
+            # The problem is that we resize the last layer, but the last layer
+            # is actually just an ensemble. We're not resizing the child layers
+            # -- a huge problem.
+            raise ValueError(Errors.E116)
+            #smaller = self.model._layers[-1]
+            #larger = Affine(len(self.labels)+1, smaller.nI)
+            #copy_array(larger.W[:smaller.nO], smaller.W)
+            #copy_array(larger.b[:smaller.nO], smaller.b)
+            #self.model._layers[-1] = larger
+        self.labels = tuple(list(self.labels) + [label])
+        return 1
+
+    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
+                       **kwargs):
+        if self.model is True:
+            self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
+            self.model = self.Model(len(self.labels), **self.cfg)
+            link_vectors_to_models(self.vocab)
+        if sgd is None:
+            sgd = self.create_optimizer()
+        return sgd
+
+
+__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer', 'EntityLinker']
diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index fa282cae7..86b738a5c 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -71,3 +71,5 @@ cdef struct TokenC:
     int ent_iob
     attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
     hash_t ent_id
+
+    hash_t kb_id
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index a69a0def8..39e408a89 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -307,6 +307,14 @@ cdef class Token:
         def __set__(self, attr_t tag):
             self.vocab.morphology.assign_tag(self.c, tag)
 
+    property kb_id:
+        """RETURNS (uint64): ID of entity (after Entity Linking)."""
+        def __get__(self):
+            return self.c.kb_id
+
+        def __set__(self, attr_t kb_id):
+            self.vocab.morphology.assign_kb_id(self.c, kb_id)
+
     property dep:
         """RETURNS (uint64): ID of syntactic dependency label."""
         def __get__(self):
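Note: with the dummy set_annotations above, every token receives the hard-coded kb_id 342. Running sandbox_test_sofie/testing_el.py against en_core_web_sm should therefore print the pipeline once without and once with the 'el' component, and 342 for every token, roughly as below (the tag and POS columns depend on the statistical model, so they are illustrative only):

    pipes ['tagger', 'parser', 'ner']
    pipes ['tagger', 'parser', 'ner', 'el']

    token Australian JJ ADJ 342
    token striker NN NOUN 342
    token John NNP PROPN 342
    token hits VBZ VERB 342
    token century NN NOUN 342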
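Note on EntityLinker.get_loss: this looks to be the masked mean-squared-error computation carried over from TextCategorizer as a placeholder. Labels absent from gold.cats are zeroed out of the gradient through the not_missing mask, so unannotated labels contribute no update. A minimal standalone sketch of the arithmetic in plain numpy, with invented scores and gold values:

    import numpy

    scores = numpy.asarray([[0.9, 0.2], [0.4, 0.7]], dtype="f")       # model output: 2 examples x 2 labels
    truths = numpy.asarray([[1.0, 0.0], [0.0, 0.0]], dtype="f")       # gold labels
    not_missing = numpy.asarray([[1.0, 1.0], [1.0, 0.0]], dtype="f")  # 0.0 where no gold annotation exists

    d_scores = (scores - truths) / scores.shape[0]  # gradient of the squared error, averaged over examples
    d_scores *= not_missing                         # masked entries contribute no gradient
    mean_square_error = (d_scores ** 2).sum(axis=1).mean()
    print(float(mean_square_error), d_scores)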