From 8d1e64be69f13de8ae141b2a5f312ca1e8e18e6d Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 21 May 2017 17:52:30 -0500
Subject: [PATCH] Add experimental NeuralLabeller

---
 spacy/language.py  |  4 +++-
 spacy/pipeline.pyx | 42 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/spacy/language.py b/spacy/language.py
index 288799834..2f14ea3de 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -16,6 +16,7 @@ from .syntax.parser import get_templates
 from .syntax.nonproj import PseudoProjectivity
 from .pipeline import NeuralDependencyParser, EntityRecognizer
 from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer
+from .pipeline import NeuralLabeller
 from .compat import json_dumps
 from .attrs import IS_STOP
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
@@ -230,7 +231,7 @@ class Language(object):
         for doc, gold in docs_golds:
             yield doc, gold
 
-    def begin_training(self, gold_tuples, **cfg):
+    def begin_training(self, get_gold_tuples, **cfg):
         """Allocate models, pre-process training data and acquire a trainer and
         optimizer. Used as a contextmanager.
 
@@ -244,6 +245,7 @@ class Language(object):
         >>> for docs, golds in epoch:
         >>>     state = nlp.update(docs, golds, sgd=optimizer)
         """
+        self.pipeline.append(NeuralLabeller(self.vocab))
         # Populate vocab
         for _, annots_brackets in get_gold_tuples():
             for annots, _ in annots_brackets:
diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index 91217b80b..6f949a5b9 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -31,6 +31,7 @@ from .syntax.stateclass cimport StateClass
 from .gold cimport GoldParse
 from .morphology cimport Morphology
 from .vocab cimport Vocab
+from .syntax.nonproj import PseudoProjectivity
 
 from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
 from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
@@ -148,6 +149,7 @@ class TokenVectorEncoder(object):
         if self.model is True:
             self.model = self.Model()
 
+
     def use_params(self, params):
         """Replace weights of models in the pipeline with those provided in the
         params dictionary.
@@ -252,6 +254,46 @@ class NeuralTagger(object):
         with self.model.use_params(params):
             yield
 
+class NeuralLabeller(NeuralTagger):
+    name = 'nn_labeller'
+    def __init__(self, vocab, model=True):
+        self.vocab = vocab
+        self.model = model
+        self.labels = {}
+
+    def set_annotations(self, docs, dep_ids):
+        pass
+
+    def begin_training(self, gold_tuples, pipeline=None):
+        gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)
+        for raw_text, annots_brackets in gold_tuples:
+            for annots, brackets in annots_brackets:
+                ids, words, tags, heads, deps, ents = annots
+                for dep in deps:
+                    if dep not in self.labels:
+                        self.labels[dep] = len(self.labels)
+        token_vector_width = pipeline[0].model.nO
+        self.model = with_flatten(
+            Softmax(len(self.labels), token_vector_width))
+
+    def get_loss(self, docs, golds, scores):
+        scores = self.model.ops.flatten(scores)
+        cdef int idx = 0
+        correct = numpy.zeros((scores.shape[0],), dtype='i')
+        guesses = scores.argmax(axis=1)
+        for gold in golds:
+            for tag in gold.labels:
+                if tag is None:
+                    correct[idx] = guesses[idx]
+                else:
+                    correct[idx] = self.labels[tag]
+                idx += 1
+        correct = self.model.ops.xp.array(correct, dtype='i')
+        d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
+        loss = (d_scores**2).sum()
+        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
+        return float(loss), d_scores
+
 
 cdef class EntityRecognizer(LinearParser):
     """Annotate named entities on Doc objects."""
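
Note (editor's sketch, not part of the commit): NeuralLabeller is an
auxiliary multi-task objective. It shares the TokenVectorEncoder's token
vectors (pipeline[0].model.nO sets the softmax's input width), and its
set_annotations() is a no-op, so its predictions are never written to the
Doc; only the gradient it sends back into the shared encoder matters. The
snippet below reproduces the arithmetic of get_loss() in plain numpy:
numpy.eye(...)[correct] stands in for thinc's to_categorical(), and the
labels/scores/gold values are invented for illustration.

    import numpy

    labels = {'nsubj': 0, 'ROOT': 1, 'dobj': 2}   # label -> class id
    scores = numpy.asarray([[0.7, 0.2, 0.1],      # model output, one row
                            [0.1, 0.8, 0.1],      # of class probabilities
                            [0.2, 0.3, 0.5]])     # per token
    gold = ['nsubj', 'ROOT', None]                # None = missing annotation

    guesses = scores.argmax(axis=1)
    # Tokens with no gold label fall back to the model's own guess, so they
    # are not pushed toward any other class.
    correct = numpy.asarray([labels[tag] if tag is not None else guesses[i]
                             for i, tag in enumerate(gold)], dtype='i')
    truth = numpy.eye(scores.shape[1], dtype='f')[correct]  # one-hot targets
    d_scores = scores - truth   # the gradient the patch backpropagates
    loss = (d_scores ** 2).sum()

Treating a missing label as "the model's current guess is correct" keeps
partially annotated tokens from dragging the shared representation toward
an arbitrary class, while fully annotated tokens train as usual.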