From 755d7d486c298962e718ed1cd738d68390431cab Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 14 Mar 2017 21:28:43 +0100
Subject: [PATCH] WIP on hash kernel

---
 setup.py                   |   1 +
 spacy/_ml.pxd              |  31 +++++++
 spacy/_ml.pyx              | 151 ++++++++++++++++++++++++++++++++
 spacy/about.py             |   6 +-
 spacy/syntax/_state.pxd    |  12 +--
 spacy/syntax/arc_eager.pyx |   8 +-
 spacy/syntax/parser.pxd    |  11 ++-
 spacy/syntax/parser.pyx    | 170 +++++++++++++++++++++++--------------
 spacy/tagger.pxd           |  15 ++--
 spacy/tagger.pyx           | 153 +++++++++++++--------------------
 spacy/train.py             |   4 +-
 11 files changed, 383 insertions(+), 179 deletions(-)
 create mode 100644 spacy/_ml.pxd
 create mode 100644 spacy/_ml.pyx

diff --git a/setup.py b/setup.py
index 26f395ea5..373d5af9d 100644
--- a/setup.py
+++ b/setup.py
@@ -56,6 +56,7 @@ MOD_NAMES = [
     'spacy.lexeme',
     'spacy.vocab',
     'spacy.attrs',
+    'spacy._ml',
     'spacy.morphology',
     'spacy.tagger',
     'spacy.pipeline',
diff --git a/spacy/_ml.pxd b/spacy/_ml.pxd
new file mode 100644
index 000000000..4f2f42427
--- /dev/null
+++ b/spacy/_ml.pxd
@@ -0,0 +1,31 @@
+from thinc.linear.features cimport ConjunctionExtracter
+from thinc.typedefs cimport atom_t, weight_t
+from thinc.structs cimport FeatureC
+from libc.stdint cimport uint32_t
+cimport numpy as np
+from cymem.cymem cimport Pool
+
+
+cdef class LinearModel:
+    cdef ConjunctionExtracter extracter  # built from the templates passed to __init__
+    cdef readonly int nr_class  # number of scores held in each weight row
+    cdef readonly uint32_t nr_weight  # number of weight rows; hashed keys are taken modulo this
+    cdef public weight_t learn_rate  # step size applied in set_gradientC
+    cdef Pool mem  # allocates W and d_W
+    cdef weight_t* W  # nr_weight * nr_class weights; row idx starts at W[idx * nr_class]
+    cdef weight_t* d_W  # allocated like W in __init__; no method in _ml.pyx uses it yet
+    # Writes +1 for the top-scoring class and -1 for the best zero-cost class.
+    cdef void hinge_lossC(self, weight_t* d_scores,
+            const weight_t* scores, const weight_t* costs) nogil
+    # Writes a softmax-style gradient; classes with cost <= 0 count as gold.
+    cdef void log_lossC(self, weight_t* d_scores,
+            const weight_t* scores, const weight_t* costs) nogil
+    # Writes a scores[i] + costs[i] gradient for classes not below the best gold.
+    cdef void regression_lossC(self, weight_t* d_scores,
+            const weight_t* scores, const weight_t* costs) nogil
+    # Adds the weight rows selected by the features into scores.
+    cdef void set_scoresC(self, weight_t* scores,
+            const FeatureC* features, int nr_feat) nogil
+    # Subtracts a clipped, learn_rate-scaled d_scores from the selected rows.
+    cdef void set_gradientC(self, const weight_t* d_scores, const FeatureC*
+            features, int nr_feat) nogil
diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx
new file mode 100644
index 000000000..c3413f561
--- /dev/null
+++ b/spacy/_ml.pyx
@@ -0,0 +1,151 @@
+# cython: infer_types=True
+# cython: profile=True
+# cython: cdivision=True
+
+from libcpp.vector cimport vector
+from libc.stdint cimport uint64_t, uint32_t, int32_t
+from libc.string cimport memcpy, memset
+cimport libcpp.algorithm
+from libc.math cimport exp
+
+from cymem.cymem cimport Pool
+from thinc.linalg cimport Vec, VecVec
+from murmurhash.mrmr cimport hash64
+cimport numpy as np
+import numpy
+np.import_array()
+
+
+cdef class LinearModel:
+    def __init__(self, int nr_class, templates, weight_t learn_rate=0.001,
+            size=2**18):
+        """Create a model with `size` hashed weight rows of `nr_class` each."""
+        self.extracter = ConjunctionExtracter(templates)
+        self.nr_weight = size
+        self.nr_class = nr_class
+        self.learn_rate = learn_rate
+        self.mem = Pool()
+        n = <size_t>self.nr_weight * self.nr_class  # size_t: no 32-bit wrap
+        self.W = <weight_t*>self.mem.alloc(n, sizeof(weight_t))
+        self.d_W = <weight_t*>self.mem.alloc(n, sizeof(weight_t))
+
+    cdef void hinge_lossC(self, weight_t* d_scores,
+            const weight_t* scores, const weight_t* costs) nogil:
+        guess = 0  # running argmax over all classes
+        best = -1  # highest-scoring class with zero cost; -1 if there is none
+        for i in range(self.nr_class):  # from 0, so class 0 can become `best`
+            if scores[i] > scores[guess]:
+                guess = i
+            if costs[i] == 0 and (best == -1 or scores[i] > scores[best]):
+                best = i
+        if best != -1 and scores[guess] >= scores[best]:
+            d_scores[guess] = 1.
+            d_scores[best] = -1.  # overwrites the +1 when guess == best
+
+    cdef void log_lossC(self, weight_t* d_scores,
+            const weight_t* scores, const weight_t* costs) nogil:
+        for i in range(self.nr_class):
+            if costs[i] <= 0:
+                break
+        else:
+            return  # no gold (cost <= 0) class: leave d_scores untouched
+        cdef double Z = 1e-10  # normaliser over all classes
+        cdef double gZ = 1e-10  # normaliser over gold classes only
+        cdef double max_ = scores[0]
+        cdef double g_max = -9000  # floor for the gold maximum, raised below
+        for i in range(self.nr_class):
+            max_ = max(max_, scores[i])
+            if costs[i] <= 0:
+                g_max = max(g_max, scores[i])
+        for i in range(self.nr_class):
+            Z += exp(scores[i]-max_)  # shift by the max before exponentiating
+            if costs[i] <= 0:
+                gZ += exp(scores[i]-g_max)
+        for i in range(self.nr_class):
+            score = exp(scores[i]-max_)
+            if costs[i] > 0:  # same gold test as for gZ above (was `>= 1`)
+                d_scores[i] = score / Z
+            else:
+                g_score = exp(scores[i]-g_max)
+                d_scores[i] = (score / Z) - (g_score / gZ)
+
+    cdef void regression_lossC(self, weight_t* d_scores,
+            const weight_t* scores, const weight_t* costs) nogil:
+        # Regression-style gradient, scores[i] + costs[i], for classes that
+        # are not scored below the best gold (cost <= 0) class.
+        best = -1
+        for i in range(self.nr_class):
+            if costs[i] <= 0 and (best == -1 or scores[i] > scores[best]):
+                best = i
+        if best == -1:
+            return
+        for i in range(self.nr_class):
+            if scores[i] < scores[best]:
+                d_scores[i] = 0
+            # NOTE(review): compares a score with the class index `best`.
+            elif costs[i] <= 0 and scores[i] == best:
+                continue
+            else:
+                d_scores[i] = scores[i] + costs[i]
+
+    cdef void set_scoresC(self, weight_t* scores,
+            const FeatureC* features, int nr_feat) nogil:
+        # Accumulates into `scores`; the buffer is not zeroed here.
+        cdef uint64_t nr_weight = self.nr_weight
+        cdef int nr_class = self.nr_class
+        cdef vector[uint64_t] indices
+        # Collect all feature indices
+        cdef uint32_t[2] hashed  # first 8 bytes of feat.key as two uint32
+        cdef FeatureC feat
+        for feat in features[:nr_feat]:
+            if feat.value == 0:
+                continue
+            memcpy(hashed, &feat.key, sizeof(hashed))
+            indices.push_back(hashed[0] % nr_weight)
+            indices.push_back(hashed[1] % nr_weight)
+
+        # Sort them, to improve memory access pattern
+        libcpp.algorithm.sort(indices.begin(), indices.end())
+        for idx in indices:
+            W = &self.W[idx * nr_class]
+            for clas in range(nr_class):
+                scores[clas] += W[clas]
+
+    cdef void set_gradientC(self, const weight_t* d_scores, const FeatureC*
+            features, int nr_feat) nogil:
+        # Updates self.W in place with d_scores clipped to [-10, 10].
+        cdef uint64_t nr_weight = self.nr_weight
+        cdef int nr_class = self.nr_class
+        cdef vector[uint64_t] indices
+        # Collect all feature indices
+        cdef uint32_t[2] hashed  # first 8 bytes of feat.key as two uint32
+        for feat in features[:nr_feat]:
+            if feat.value == 0:
+                continue
+            memcpy(hashed, &feat.key, sizeof(hashed))
+            indices.push_back(hashed[0] % nr_weight)
+            indices.push_back(hashed[1] % nr_weight)
+
+        # Sort them, to improve memory access pattern
+        libcpp.algorithm.sort(indices.begin(), indices.end())
+        for idx in indices:
+            W = &self.W[idx * nr_class]
+            for clas in range(nr_class):
+                if d_scores[clas] < 0:
+                    W[clas] -= self.learn_rate * max(-10., d_scores[clas])
+                else:
+                    W[clas] -= self.learn_rate * min(10., d_scores[clas])
+
+    @property
+    def nr_active_feat(self):
+        return self.nr_weight  # the fixed number of hashed weight rows
+
+    @property
+    def nr_feat(self):
+        return self.extracter.nr_templ  # callers size FeatureC buffers with this
+
+    def end_training(self, *args, **kwargs):
+        pass  # no-op: accepts and ignores any arguments
+
+    def dump(self, *args, **kwargs):
+        pass  # no-op: nothing is written
diff --git a/spacy/about.py b/spacy/about.py
index d51dea286..57e845a5c 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -4,13 +4,13 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
 
 __title__ = 'spacy'
-__version__ = '1.6.0'
+__version__ = '1.7.0'
 __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
 __uri__ = 'https://spacy.io'
 __author__ = 'Matthew Honnibal'
 __email__ = 'matt@explosion.ai'
 __license__ = 'MIT'
 __models__ = {
-    'en': 'en>=1.1.0,<1.2.0',
-    'de': 'de>=1.0.0,<1.1.0',
+    'en': 'en>=1.2.0,<1.3.0',
+    'de': 'de>=1.2.0,<1.3.0',
 }
diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd
index c764e877d..383e91faa 100644
--- a/spacy/syntax/_state.pxd
+++ b/spacy/syntax/_state.pxd
@@ -304,11 +304,13 @@ cdef cppclass StateC:
             this._break = this._b_i
 
     void clone(const StateC* src) nogil:
-        memcpy(this._sent, src._sent, this.length * sizeof(TokenC))
-        memcpy(this._stack, src._stack, this.length * sizeof(int))
-        memcpy(this._buffer, src._buffer, this.length * sizeof(int))
-        memcpy(this._ents, src._ents, this.length * sizeof(Entity))
-        memcpy(this.shifted, src.shifted, this.length * sizeof(this.shifted[0]))
+        # This is still quadratic, but make it a bit faster.
+        # Not carefully reviewed for accuracy yet.
+        memcpy(this._sent, src._sent, this.B(1) * sizeof(TokenC))
+        memcpy(this._stack, src._stack, this._s_i * sizeof(int))
+        memcpy(this._buffer, src._buffer, this._b_i * sizeof(int))
+        memcpy(this._ents, src._ents, this._e_i * sizeof(Entity))
+        memcpy(this.shifted, src.shifted, this.B(2) * sizeof(this.shifted[0]))
         this.length = src.length
         this._b_i = src._b_i
         this._s_i = src._s_i
diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index 7049b8595..a0e2bf4d0 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -70,7 +70,7 @@ cdef weight_t push_cost(StateClass stcls, const GoldParseC* gold, int target) no
 cdef weight_t pop_cost(StateClass stcls, const GoldParseC* gold, int target) nogil:
     cdef weight_t cost = 0
     cdef int i, B_i
-    for i in range(stcls.buffer_length()):
+    for i in range(min(30, stcls.buffer_length())):
         B_i = stcls.B(i)
         cost += gold.heads[B_i] == target
         cost += gold.heads[target] == B_i
@@ -268,10 +268,12 @@ cdef class Break:
         cdef int i, j, S_i, B_i
         for i in range(s.stack_depth()):
             S_i = s.S(i)
-            for j in range(s.buffer_length()):
+            for j in range(min(30, s.buffer_length())):
                 B_i = s.B(j)
                 cost += gold.heads[S_i] == B_i
                 cost += gold.heads[B_i] == S_i
+                if cost != 0:
+                    break
         # Check for sentence boundary --- if it's here, we can't have any deps
         # between stack and buffer, so rest of action is irrelevant.
         s0_root = _get_root(s.S(0), gold)
@@ -462,7 +464,7 @@ cdef class ArcEager(TransitionSystem):
         cdef int* labels = gold.c.labels
         cdef int* heads = gold.c.heads
 
-        n_gold = 0
+        cdef int n_gold = 0
         for i in range(self.n_moves):
             if self.c[i].is_valid(stcls.c, self.c[i].label):
                 is_valid[i] = True
diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd
index aaed10303..020e1e793 100644
--- a/spacy/syntax/parser.pxd
+++ b/spacy/syntax/parser.pxd
@@ -1,5 +1,6 @@
 from thinc.linear.avgtron cimport AveragedPerceptron
-from thinc.typedefs cimport atom_t
+from thinc.linear.features cimport ConjunctionExtracter
+from thinc.typedefs cimport atom_t, weight_t
 from thinc.structs cimport FeatureC
 
 from .stateclass cimport StateClass
@@ -8,17 +9,19 @@ from ..vocab cimport Vocab
 from ..tokens.doc cimport Doc
 from ..structs cimport TokenC
 from ._state cimport StateC
+from .._ml cimport LinearModel
 
 
-cdef class ParserModel(AveragedPerceptron):
+cdef class ParserModel(LinearModel):
     cdef int set_featuresC(self, atom_t* context, FeatureC* features,
                             const StateC* state) nogil
-
-
+ 
+ 
 cdef class Parser:
     cdef readonly Vocab vocab
     cdef readonly ParserModel model
     cdef readonly TransitionSystem moves
     cdef readonly object cfg
+    cdef public object optimizer
 
     cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) with gil
diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index 804542cc8..dc157d13d 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -1,4 +1,6 @@
 # cython: infer_types=True
+# cython: cdivision=True
+# cython: profile=True
 """
 MALT-style dependency parser
 """
@@ -20,15 +22,22 @@ import shutil
 import json
 import sys
 from .nonproj import PseudoProjectivity
+import numpy
+import random
+cimport numpy as np
+np.import_array()
 
 from cymem.cymem cimport Pool, Address
-from murmurhash.mrmr cimport hash64
+from murmurhash.mrmr cimport hash64, hash32
 from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
 from thinc.linear.avgtron cimport AveragedPerceptron
 from thinc.linalg cimport VecVec
 from thinc.structs cimport SparseArrayC
 from preshed.maps cimport MapStruct
 from preshed.maps cimport map_get
+from thinc.neural.ops import NumpyOps
+from thinc.neural.optimizers import Adam
+from thinc.neural.optimizers import SGD
 
 from thinc.structs cimport FeatureC
 from thinc.structs cimport ExampleC
@@ -51,6 +60,7 @@ from ._parse_features cimport CONTEXT_SIZE
 from ._parse_features cimport fill_context
 from .stateclass cimport StateClass
 from ._state cimport StateC
+from .._ml cimport LinearModel
 
 
 DEBUG = False
@@ -72,57 +82,65 @@ def get_templates(name):
                 pf.tree_shape + pf.trigrams)
 
 
-cdef class ParserModel(AveragedPerceptron):
+#cdef class ParserModel(AveragedPerceptron):
+#    cdef int set_featuresC(self, atom_t* context, FeatureC* features,
+#            const StateC* state) nogil:
+#        fill_context(context, state)
+#        nr_feat = self.extracter.set_features(features, context)
+#        return nr_feat
+#
+#    def update(self, Example eg, itn=0):
+#        '''Does regression on negative cost. Sort of cute?'''
+#        self.time += 1
+#        best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class)
+#        guess = eg.guess
+#        cdef weight_t loss = 0.0
+#        if guess == best:
+#            return loss
+#        for clas in [guess, best]:
+#            loss += (-eg.c.costs[clas] - eg.c.scores[clas]) ** 2
+#            d_loss = eg.c.scores[clas] - -eg.c.costs[clas]
+#            for feat in eg.c.features[:eg.c.nr_feat]:
+#                self.update_weight_ftrl(feat.key, clas, feat.value * d_loss)
+#        return loss
+#
+#    def update_from_histories(self, TransitionSystem moves, Doc doc, histories, weight_t min_grad=0.0):
+#        cdef Pool mem = Pool()
+#        features = <FeatureC*>mem.alloc(self.nr_feat, sizeof(FeatureC))
+#
+#        cdef StateClass stcls
+#
+#        cdef class_t clas
+#        self.time += 1
+#        cdef atom_t[CONTEXT_SIZE] atoms
+#        histories = [(grad, hist) for grad, hist in histories if abs(grad) >= min_grad and hist]
+#        if not histories:
+#            return None
+#        gradient = [Counter() for _ in range(max([max(h)+1 for _, h in histories]))]
+#        for d_loss, history in histories:
+#            stcls = StateClass.init(doc.c, doc.length)
+#            moves.initialize_state(stcls.c)
+#            for clas in history:
+#                nr_feat = self.set_featuresC(atoms, features, stcls.c)
+#                clas_grad = gradient[clas]
+#                for feat in features[:nr_feat]:
+#                    clas_grad[feat.key] += d_loss * feat.value
+#                moves.c[clas].do(stcls.c, moves.c[clas].label)
+#        cdef feat_t key
+#        cdef weight_t d_feat
+#        for clas, clas_grad in enumerate(gradient):
+#            for key, d_feat in clas_grad.items():
+#                if d_feat != 0:
+#                    self.update_weight_ftrl(key, clas, d_feat)
+#
+
+cdef class ParserModel(LinearModel):
     cdef int set_featuresC(self, atom_t* context, FeatureC* features,
             const StateC* state) nogil:
         fill_context(context, state)
         nr_feat = self.extracter.set_features(features, context)
         return nr_feat
 
-    def update(self, Example eg, itn=0):
-        '''Does regression on negative cost. Sort of cute?'''
-        self.time += 1
-        best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class)
-        guess = eg.guess
-        cdef weight_t loss = 0.0
-        if guess == best:
-            return loss
-        for clas in [guess, best]:
-            loss += (-eg.c.costs[clas] - eg.c.scores[clas]) ** 2
-            d_loss = eg.c.scores[clas] - -eg.c.costs[clas]
-            for feat in eg.c.features[:eg.c.nr_feat]:
-                self.update_weight_ftrl(feat.key, clas, feat.value * d_loss)
-        return loss
-
-    def update_from_histories(self, TransitionSystem moves, Doc doc, histories, weight_t min_grad=0.0):
-        cdef Pool mem = Pool()
-        features = <FeatureC*>mem.alloc(self.nr_feat, sizeof(FeatureC))
-
-        cdef StateClass stcls
-
-        cdef class_t clas
-        self.time += 1
-        cdef atom_t[CONTEXT_SIZE] atoms
-        histories = [(grad, hist) for grad, hist in histories if abs(grad) >= min_grad and hist]
-        if not histories:
-            return None
-        gradient = [Counter() for _ in range(max([max(h)+1 for _, h in histories]))]
-        for d_loss, history in histories:
-            stcls = StateClass.init(doc.c, doc.length)
-            moves.initialize_state(stcls.c)
-            for clas in history:
-                nr_feat = self.set_featuresC(atoms, features, stcls.c)
-                clas_grad = gradient[clas]
-                for feat in features[:nr_feat]:
-                    clas_grad[feat.key] += d_loss * feat.value
-                moves.c[clas].do(stcls.c, moves.c[clas].label)
-        cdef feat_t key
-        cdef weight_t d_feat
-        for clas, clas_grad in enumerate(gradient):
-            for key, d_feat in clas_grad.items():
-                if d_feat != 0:
-                    self.update_weight_ftrl(key, clas, d_feat)
-
 
 cdef class Parser:
     """Base class of the DependencyParser and EntityRecognizer."""
@@ -174,9 +192,14 @@ cdef class Parser:
             cfg['features'] = get_templates(cfg['features'])
         elif 'features' not in cfg:
             cfg['features'] = self.feature_templates
-        self.model = ParserModel(cfg['features'])
-        self.model.l1_penalty = cfg.get('L1', 1e-8)
-        self.model.learn_rate = cfg.get('learn_rate', 0.001)
+        self.model = ParserModel(self.moves.n_moves, cfg['features'],
+                                 size=2**18,
+                                 learn_rate=cfg.get('learn_rate', 0.001))
+        #self.model.l1_penalty = cfg.get('L1', 1e-8)
+        #self.model.learn_rate = cfg.get('learn_rate', 0.001)
+
+        self.optimizer = SGD(NumpyOps(), cfg.get('learn_rate', 0.001),
+                             momentum=0.9)
 
         self.cfg = cfg
 
@@ -300,27 +323,48 @@ cdef class Parser:
         self.moves.preprocess_gold(gold)
         cdef StateClass stcls = StateClass.init(tokens.c, tokens.length)
         self.moves.initialize_state(stcls.c)
+
+        cdef int nr_class = self.model.nr_class
         cdef Pool mem = Pool()
-        cdef Example eg = Example(
-                nr_class=self.moves.n_moves,
-                nr_atom=CONTEXT_SIZE,
-                nr_feat=self.model.nr_feat)
+        d_scores = <weight_t*>mem.alloc(nr_class, sizeof(weight_t))
+        scores = <weight_t*>mem.alloc(nr_class, sizeof(weight_t))
+        costs = <weight_t*>mem.alloc(nr_class, sizeof(weight_t))
+        features = <FeatureC*>mem.alloc(self.model.nr_feat, sizeof(FeatureC))
+        is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
+        cdef atom_t[CONTEXT_SIZE] context
+
         cdef weight_t loss = 0
         cdef Transition action
+        words = [w.text for w in tokens]
+ 
         while not stcls.is_final():
-            eg.c.nr_feat = self.model.set_featuresC(eg.c.atoms, eg.c.features,
-                                                    stcls.c)
-            self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold)
-            self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat)
-            guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
-            self.model.update(eg)
+
+            nr_feat = self.model.set_featuresC(context, features, stcls.c)
+            self.moves.set_costs(is_valid, costs, stcls, gold)
+            self.model.set_scoresC(scores, features, nr_feat)
+
+            guess = VecVec.arg_max_if_true(scores, is_valid, nr_class)
+            best = arg_max_if_gold(scores, costs, nr_class)
+
+            self.model.regression_lossC(d_scores, scores, costs)
+            self.model.set_gradientC(d_scores, features, nr_feat) 
 
             action = self.moves.c[guess]
             action.do(stcls.c, action.label)
-            loss += eg.costs[guess]
-            eg.fill_scores(0, eg.c.nr_class)
-            eg.fill_costs(0, eg.c.nr_class)
-            eg.fill_is_valid(1, eg.c.nr_class)
+            #print(scores[guess], scores[best], d_scores[guess], costs[guess],
+            #    self.moves.move_name(action.move, action.label), stcls.print_state(words))
+
+            loss += scores[guess]
+            memset(context, 0, sizeof(context))
+            memset(features, 0, sizeof(features[0]) * nr_feat)
+            memset(scores, 0, sizeof(scores[0]) * nr_class)
+            memset(d_scores, 0, sizeof(d_scores[0]) * nr_class)
+            memset(costs, 0, sizeof(costs[0]) * nr_class)
+            for i in range(nr_class):
+                is_valid[i] = 1
+        #if itn % 100 == 0:
+        #    self.optimizer(self.model.model[0].ravel(),
+        #        self.model.model[1].ravel(), key=1)
         return loss
 
     def step_through(self, Doc doc):
diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd
index ed4e3d9c4..deab79fab 100644
--- a/spacy/tagger.pxd
+++ b/spacy/tagger.pxd
@@ -1,15 +1,14 @@
-from thinc.linear.avgtron cimport AveragedPerceptron
-from thinc.extra.eg cimport Example
-from thinc.structs cimport ExampleC
-from thinc.linear.features cimport ConjunctionExtracter
-
 from .structs cimport TokenC
 from .vocab cimport Vocab
+from ._ml cimport LinearModel
+from thinc.structs cimport FeatureC
+from thinc.typedefs cimport atom_t
 
 
-cdef class TaggerModel:
-    cdef ConjunctionExtracter extracter
-    cdef object model
+cdef class TaggerModel(LinearModel):  # LinearModel plus tagger feature extraction
+    cdef int set_featuresC(self, FeatureC* features, atom_t* context,
+            const TokenC* tokens, int i) nogil  # returns the extracter's count
+ 
 
 
 cdef class Tagger:
diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx
index 1c11387b3..76807b328 100644
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@@ -16,9 +16,8 @@ from thinc.extra.eg cimport Example
 from thinc.structs cimport ExampleC
 from thinc.linear.avgtron cimport AveragedPerceptron
 from thinc.linalg cimport Vec, VecVec
-from thinc.linear.linear import LinearModel
 from thinc.structs cimport FeatureC
-from thinc.neural.optimizers import Adam
+from thinc.neural.optimizers import Adam, SGD
 from thinc.neural.ops import NumpyOps
 
 from .typedefs cimport attr_t
@@ -80,69 +79,16 @@ cpdef enum:
     N_CONTEXT_FIELDS
 
 
-cdef class TaggerModel:
-    def __init__(self, int nr_tag, templates):
-        self.extracter = ConjunctionExtracter(templates)
-        self.model = LinearModel(nr_tag)
-
-    def begin_update(self, atom_t[:, ::1] contexts, drop=0.):
-        cdef vector[uint64_t]* keys = new vector[uint64_t]()
-        cdef vector[float]* values = new vector[float]()
-        cdef vector[int64_t]* lengths = new vector[int64_t]()
-        features = new vector[FeatureC](self.extracter.nr_templ)
-        features.resize(self.extracter.nr_templ)
-        cdef FeatureC feat
-        cdef int i, j
-        for i in range(contexts.shape[0]):
-            nr_feat = self.extracter.set_features(features.data(), &contexts[i, 0])
-            for j in range(nr_feat):
-                keys.push_back(features.at(j).key)
-                values.push_back(features.at(j).value)
-            lengths.push_back(nr_feat)
-        cdef np.ndarray[uint64_t, ndim=1] py_keys
-        cdef np.ndarray[float, ndim=1] py_values
-        cdef np.ndarray[long, ndim=1] py_lengths
-        py_keys = vector_uint64_2numpy(keys)
-        py_values = vector_float_2numpy(values)
-        py_lengths = vector_long_2numpy(lengths)
-        instance = (py_keys, py_values, py_lengths)
-        del keys
-        del values
-        del lengths
-        del features
-        return self.model.begin_update(instance, drop=drop)
-
-    def end_training(self, *args, **kwargs):
-        pass
-
-    def dump(self, *args, **kwargs):
-        pass
-
-
-cdef np.ndarray[uint64_t, ndim=1] vector_uint64_2numpy(vector[uint64_t]* vec):
-    cdef np.ndarray[uint64_t, ndim=1, mode="c"] arr = np.zeros(vec.size(), dtype='uint64')
-    memcpy(arr.data, vec.data(), sizeof(uint64_t) * vec.size())
-    return arr
-
-
-cdef np.ndarray[long, ndim=1] vector_long_2numpy(vector[int64_t]* vec):
-    cdef np.ndarray[long, ndim=1, mode="c"] arr = np.zeros(vec.size(), dtype='int64')
-    memcpy(arr.data, vec.data(), sizeof(int64_t) * vec.size())
-    return arr
-
-
-cdef np.ndarray[float, ndim=1] vector_float_2numpy(vector[float]* vec):
-    cdef np.ndarray[float, ndim=1, mode="c"] arr = np.zeros(vec.size(), dtype='float32')
-    memcpy(arr.data, vec.data(), sizeof(float) * vec.size())
-    return arr
-
-
-cdef void fill_context(atom_t* context, const TokenC* tokens, int i) nogil:
-    _fill_from_token(&context[P2_orth], &tokens[i-2])
-    _fill_from_token(&context[P1_orth], &tokens[i-1])
-    _fill_from_token(&context[W_orth], &tokens[i])
-    _fill_from_token(&context[N1_orth], &tokens[i+1])
-    _fill_from_token(&context[N2_orth], &tokens[i+2])
+cdef class TaggerModel(LinearModel):
+    cdef int set_featuresC(self, FeatureC* features, atom_t* context,
+            const TokenC* tokens, int i) nogil:
+        # Reads tokens[i-2]..tokens[i+2], so those slots must be addressable.
+        _fill_from_token(&context[P2_orth], &tokens[i-2])
+        _fill_from_token(&context[P1_orth], &tokens[i-1])
+        _fill_from_token(&context[W_orth], &tokens[i])
+        _fill_from_token(&context[N1_orth], &tokens[i+1])
+        _fill_from_token(&context[N2_orth], &tokens[i+2])
+        return self.extracter.set_features(features, context)
 
 
 cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
@@ -213,8 +159,10 @@ cdef class Tagger:
             The newly constructed object.
         """
         if model is None:
+            print("Create tagger")
             model = TaggerModel(vocab.morphology.n_tags,
-                        cfg.get('features', self.feature_templates))
+                        cfg.get('features', self.feature_templates),
+                        learn_rate=0.01, size=2**18)
         self.vocab = vocab
         self.model = model
         # TODO: Move this to tag map
@@ -223,7 +171,7 @@ cdef class Tagger:
             self.freqs[TAG][self.vocab.strings[tag]] = 1
         self.freqs[TAG][0] = 1
         self.cfg = cfg
-        self.optimizer = Adam(NumpyOps(), 0.001)
+        self.optimizer = SGD(NumpyOps(), 0.001, momentum=0.9)
 
     @property
     def tag_names(self):
@@ -250,20 +198,22 @@ cdef class Tagger:
         if tokens.length == 0:
             return 0
 
-        cdef atom_t[1][N_CONTEXT_FIELDS] c_context
-        memset(c_context, 0, sizeof(c_context))
-        cdef atom_t[:, ::1] context = c_context
-        cdef float[:, ::1] scores
+        cdef atom_t[N_CONTEXT_FIELDS] context
 
         cdef int nr_class = self.vocab.morphology.n_tags
+        cdef Pool mem = Pool()
+        scores = <weight_t*>mem.alloc(nr_class, sizeof(weight_t))
+        features = <FeatureC*>mem.alloc(self.model.nr_feat, sizeof(FeatureC))
         for i in range(tokens.length):
             if tokens.c[i].pos == 0:
-                fill_context(&context[0, 0], tokens.c, i)
-                scores, _ = self.model.begin_update(context)
-
-                guess = Vec.arg_max(&scores[0, 0], nr_class)
+                nr_feat = self.model.set_featuresC(features, context, tokens.c, i)
+                self.model.set_scoresC(scores,
+                    features, nr_feat)
+                guess = Vec.arg_max(scores, nr_class)
                 self.vocab.morphology.assign_tag_id(&tokens.c[i], guess)
-                memset(&scores[0, 0], 0, sizeof(float) * scores.size)
+                memset(scores, 0, sizeof(weight_t) * nr_class)
+                memset(features, 0, sizeof(FeatureC) * nr_feat)
+                memset(context, 0, sizeof(N_CONTEXT_FIELDS))
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
 
@@ -295,7 +245,6 @@ cdef class Tagger:
         Returns (int):
             Number of tags correct.
         """
-        cdef int nr_class = self.vocab.morphology.n_tags
         gold_tag_strs = gold.tags
         assert len(tokens) == len(gold_tag_strs)
         for tag in gold_tag_strs:
@@ -303,27 +252,47 @@ cdef class Tagger:
                 msg = ("Unrecognized gold tag: %s. tag_map.json must contain all "
                        "gold tags, to maintain coarse-grained mapping.")
                 raise ValueError(msg % tag)
-        golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs]
+        cdef Pool mem = Pool()
+        golds = <int*>mem.alloc(sizeof(int), len(gold_tag_strs))
+        for i, g in enumerate(gold_tag_strs):
+            golds[i] = self.tag_names.index(g) if g is not None else -1
+
+        cdef atom_t[N_CONTEXT_FIELDS] context
+        cdef int nr_class = self.model.nr_class
+        costs = <weight_t*>mem.alloc(sizeof(weight_t), nr_class)
+        features = <FeatureC*>mem.alloc(sizeof(FeatureC), self.model.nr_feat)
+        scores = <weight_t*>mem.alloc(sizeof(weight_t), nr_class)
+        d_scores = <weight_t*>mem.alloc(sizeof(weight_t), nr_class)
+
         cdef int correct = 0
-
-        cdef atom_t[:, ::1] context = np.zeros((1, N_CONTEXT_FIELDS), dtype='uint64')
-        cdef float[:, ::1] scores
-
         for i in range(tokens.length):
-            fill_context(&context[0, 0], tokens.c, i)
-            scores, finish_update = self.model.begin_update(context)
-            guess = Vec.arg_max(&scores[0, 0], nr_class)
-            self.vocab.morphology.assign_tag_id(&tokens.c[i], guess)
+            nr_feat = self.model.set_featuresC(features, context, tokens.c, i)
+            self.model.set_scoresC(scores,
+                features, nr_feat)
 
             if golds[i] != -1:
-                scores[0, golds[i]] -= 1 
-                finish_update(scores, lambda *args, **kwargs: None)
+                for j in range(nr_class):
+                    costs[j] = 1
+                costs[golds[i]] = 0
+            self.model.log_lossC(d_scores, scores, costs)
+            self.model.set_gradientC(d_scores, features, nr_feat)
+
+            guess = Vec.arg_max(scores, nr_class)
+            #print(tokens[i].text, golds[i], guess, [features[i].key for i in range(nr_feat)])
+
+            self.vocab.morphology.assign_tag_id(&tokens.c[i], guess)
 
-            if (golds[i] in (guess, -1)):
-                correct += 1
             self.freqs[TAG][tokens.c[i].tag] += 1
-        self.optimizer(self.model.model.weights, self.model.model.d_weights,
-            key=self.model.model.id)
+            correct += costs[guess] == 0
+
+            memset(features, 0, sizeof(FeatureC) * nr_feat)
+            memset(costs, 0, sizeof(weight_t) * nr_class)
+            memset(scores, 0, sizeof(weight_t) * nr_class)
+            memset(d_scores, 0, sizeof(weight_t) * nr_class)
+ 
+        #if itn % 10 == 0:
+        #    self.optimizer(self.model.weights.ravel(), self.model.d_weights.ravel(),
+        #                   key=1)
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
         return correct
diff --git a/spacy/train.py b/spacy/train.py
index 175c99cf2..2f8748791 100644
--- a/spacy/train.py
+++ b/spacy/train.py
@@ -14,6 +14,7 @@ class Trainer(object):
         self.nlp = nlp
         self.gold_tuples = gold_tuples
         self.nr_epoch = 0
+        self.nr_itn = 0
 
     def epochs(self, nr_epoch, augment_data=None, gold_preproc=False):
         cached_golds = {}
@@ -36,6 +37,7 @@ class Trainer(object):
                     golds = self.make_golds(docs, paragraph_tuples)
                 for doc, gold in zip(docs, golds):
                     yield doc, gold
+                    self.nr_itn += 1
 
         indices = list(range(len(self.gold_tuples)))
         for itn in range(nr_epoch):
@@ -46,7 +48,7 @@ class Trainer(object):
     def update(self, doc, gold):
         for process in self.nlp.pipeline:
             if hasattr(process, 'update'):
-                loss = process.update(doc, gold, itn=self.nr_epoch)
+                loss = process.update(doc, gold, itn=self.nr_itn)
             process(doc)
         return doc