From 8d1e64be69f13de8ae141b2a5f312ca1e8e18e6d Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 21 May 2017 17:52:30 -0500
Subject: [PATCH] Add experimental NeuralLabeller

---
 spacy/language.py  |  4 +++-
 spacy/pipeline.pyx | 42 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/spacy/language.py b/spacy/language.py
index 288799834..2f14ea3de 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -16,6 +16,7 @@ from .syntax.parser import get_templates
 from .syntax.nonproj import PseudoProjectivity
 from .pipeline import NeuralDependencyParser, EntityRecognizer
 from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer
+from .pipeline import NeuralLabeller
 from .compat import json_dumps
 from .attrs import IS_STOP
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
@@ -230,7 +231,7 @@ class Language(object):
         for doc, gold in docs_golds:
             yield doc, gold
 
-    def begin_training(self, gold_tuples, **cfg):
+    def begin_training(self, get_gold_tuples, **cfg):
         """Allocate models, pre-process training data and acquire a trainer and
         optimizer. Used as a contextmanager.
 
@@ -244,6 +245,7 @@ class Language(object):
         >>> for docs, golds in epoch:
         >>>     state = nlp.update(docs, golds, sgd=optimizer)
         """
+        self.pipeline.append(NeuralLabeller(self.vocab))
         # Populate vocab
         for _, annots_brackets in get_gold_tuples():
             for annots, _ in annots_brackets:
diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index 91217b80b..6f949a5b9 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -31,6 +31,7 @@ from .syntax.stateclass cimport StateClass
 from .gold cimport GoldParse
 from .morphology cimport Morphology
 from .vocab cimport Vocab
+from .syntax.nonproj import PseudoProjectivity
 
 from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
 from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
@@ -148,6 +149,7 @@ class TokenVectorEncoder(object):
         if self.model is True:
             self.model = self.Model()
 
+
     def use_params(self, params):
         """Replace weights of models in the pipeline with those provided in the
         params dictionary.
@@ -252,6 +254,46 @@ class NeuralTagger(object):
         with self.model.use_params(params):
             yield
 
+class NeuralLabeller(NeuralTagger):
+    name = 'nn_labeller'
+    def __init__(self, vocab, model=True):
+        self.vocab = vocab
+        self.model = model
+        self.labels = {}
+
+    def set_annotations(self, docs, dep_ids):
+        pass
+
+    def begin_training(self, gold_tuples, pipeline=None):
+        gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)
+        for raw_text, annots_brackets in gold_tuples:
+            for annots, brackets in annots_brackets:
+                ids, words, tags, heads, deps, ents = annots
+                for dep in deps:
+                    if dep not in self.labels:
+                        self.labels[dep] = len(self.labels)
+        token_vector_width = pipeline[0].model.nO
+        self.model = with_flatten(
+            Softmax(len(self.labels), token_vector_width))
+
+    def get_loss(self, docs, golds, scores):
+        scores = self.model.ops.flatten(scores)
+        cdef int idx = 0
+        correct = numpy.zeros((scores.shape[0],), dtype='i')
+        guesses = scores.argmax(axis=1)
+        for gold in golds:
+            for tag in gold.labels:
+                if tag is None:
+                    correct[idx] = guesses[idx]
+                else:
+                    correct[idx] = self.labels[tag]
+                idx += 1
+        correct = self.model.ops.xp.array(correct, dtype='i')
+        d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
+        loss = (d_scores**2).sum()
+        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
+        return float(loss), d_scores
+
 
 cdef class EntityRecognizer(LinearParser):
     """Annotate named entities on Doc objects."""
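
Note (editor's sketch, not part of the commit): NeuralLabeller is an
auxiliary multi-task objective. It shares the TokenVectorEncoder's token
vectors (pipeline[0].model.nO sets the softmax's input width), and its
set_annotations() is a no-op, so its predictions are never written to the
Doc; only the gradient it sends back into the shared encoder matters. The
snippet below reproduces the arithmetic of get_loss() in plain numpy:
numpy.eye(...)[correct] stands in for thinc's to_categorical(), and the
labels/scores/gold values are invented for illustration.

    import numpy

    labels = {'nsubj': 0, 'ROOT': 1, 'dobj': 2}   # label -> class id
    scores = numpy.asarray([[0.7, 0.2, 0.1],      # model output, one row
                            [0.1, 0.8, 0.1],      # of class probabilities
                            [0.2, 0.3, 0.5]])     # per token
    gold = ['nsubj', 'ROOT', None]                # None = missing annotation

    guesses = scores.argmax(axis=1)
    # Tokens with no gold label fall back to the model's own guess, so they
    # are not pushed toward any other class.
    correct = numpy.asarray([labels[tag] if tag is not None else guesses[i]
                             for i, tag in enumerate(gold)], dtype='i')
    truth = numpy.eye(scores.shape[1], dtype='f')[correct]  # one-hot targets
    d_scores = scores - truth   # the gradient the patch backpropagates
    loss = (d_scores ** 2).sum()

Treating a missing label as "the model's current guess is correct" keeps
partially annotated tokens from dragging the shared representation toward
an arbitrary class, while fully annotated tokens train as usual.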