From 75aeccc0644c09dcd59d126cc1f627c2823bbce0 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Fri, 26 Jun 2015 06:25:36 +0200
Subject: [PATCH] * Rejig parser interface to use new thinc.api.Example class,
 in prep of theano model. Comment out beam search

---
 spacy/_ml.pyx                      |  36 ++++++
 spacy/syntax/arc_eager.pyx         |  29 ++---
 spacy/syntax/ner.pyx               |  21 ----
 spacy/syntax/parser.pxd            |   3 -
 spacy/syntax/parser.pyx            | 178 ++++++++++++-----------------
 spacy/syntax/transition_system.pxd |   8 +-
 spacy/syntax/transition_system.pyx |  35 ++----
 7 files changed, 132 insertions(+), 178 deletions(-)

diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx
index be647c2dd..df66a1791 100644
--- a/spacy/_ml.pyx
+++ b/spacy/_ml.pyx
@@ -10,6 +10,7 @@ import cython
 import numpy.random
 
 from thinc.features cimport Feature, count_feats
+from thinc.api cimport Example
 
 
 cdef int arg_max(const weight_t* scores, const int n_classes) nogil:
@@ -23,6 +24,30 @@ cdef int arg_max(const weight_t* scores, const int n_classes) nogil:
     return best
 
 
+cdef int arg_max_if_true(const weight_t* scores, const bint* is_valid,
+                         const int n_classes) nogil:
+    cdef int i
+    cdef int best = 0
+    cdef weight_t mode = -900000
+    for i in range(n_classes):
+        if is_valid[i] and scores[i] > mode:
+            mode = scores[i]
+            best = i
+    return best
+
+
+cdef int arg_max_if_zero(const weight_t* scores, const int* costs,
+                         const int n_classes) nogil:
+    cdef int i
+    cdef int best = 0
+    cdef weight_t mode = -900000
+    for i in range(n_classes):
+        if costs[i] == 0 and scores[i] > mode:
+            mode = scores[i]
+            best = i
+    return best
+
+
 cdef class Model:
     def __init__(self, n_classes, templates, model_loc=None):
         if model_loc is not None and path.isdir(model_loc):
@@ -34,6 +59,17 @@ cdef class Model:
         if self.model_loc and path.exists(self.model_loc):
             self._model.load(self.model_loc, freq_thresh=0)
 
+    def predict(self, Example eg):
+        self.set_scores(eg.scores, eg.atoms)
+        eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.n_classes)
+
+    def train(self, Example eg):
+        self.set_scores(eg.scores, eg.atoms)
+        eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.n_classes)
+        eg.best = arg_max_if_zero(eg.scores, eg.costs, self.n_classes)
+        eg.cost = eg.costs[eg.guess]
+        self.update(eg.atoms, eg.guess, eg.best, eg.cost)
+
     cdef const weight_t* score(self, atom_t* context) except NULL:
         cdef int n_feats
         feats = self._extractor.get_feats(context, &n_feats)
diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index 29e62cb4e..a83e19ec2 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -398,7 +398,8 @@ cdef class ArcEager(TransitionSystem):
             n_valid += output[i]
         assert n_valid >= 1
 
-    cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1:
+    cdef int set_costs(self, bint* is_valid, int* costs, 
+                       StateClass stcls, GoldParse gold) except -1:
         cdef int i, move, label
         cdef label_cost_func_t[N_MOVES] label_cost_funcs
         cdef move_cost_func_t[N_MOVES] move_cost_funcs
@@ -423,30 +424,14 @@ cdef class ArcEager(TransitionSystem):
         n_gold = 0
         for i in range(self.n_moves):
             if self.c[i].is_valid(stcls, self.c[i].label):
+                is_valid[i] = True
                 move = self.c[i].move
                 label = self.c[i].label
                 if move_costs[move] == -1:
                     move_costs[move] = move_cost_funcs[move](stcls, &gold.c)
-                output[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label)
-                n_gold += output[i] == 0
+                costs[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label)
+                n_gold += costs[i] == 0
             else:
-                output[i] = 9000
+                is_valid[i] = False
+                costs[i] = 9000
         assert n_gold >= 1
-
-    cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *:
-        cdef bint[N_MOVES] is_valid
-        is_valid[SHIFT] = Shift.is_valid(stcls, -1)
-        is_valid[REDUCE] = Reduce.is_valid(stcls, -1)
-        is_valid[LEFT] = LeftArc.is_valid(stcls, -1)
-        is_valid[RIGHT] = RightArc.is_valid(stcls, -1)
-        is_valid[BREAK] = Break.is_valid(stcls, -1)
-        cdef Transition best
-        cdef weight_t score = MIN_SCORE
-        cdef int i
-        for i in range(self.n_moves):
-            if scores[i] > score and is_valid[self.c[i].move]:
-                best = self.c[i]
-                score = scores[i]
-        assert best.clas < self.n_moves
-        assert score > MIN_SCORE, (stcls.stack_depth(), stcls.buffer_length(), stcls.is_final(), stcls._b_i, stcls.length)
-        return best
diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx
index 4a47a20a8..b145df7ac 100644
--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@@ -128,27 +128,6 @@ cdef class BiluoPushDown(TransitionSystem):
             raise Exception(move)
         return t
 
-    cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *:
-        cdef int best = -1
-        cdef weight_t score = -90000
-        cdef const Transition* m
-        cdef int i
-        for i in range(self.n_moves):
-            m = &self.c[i]
-            if m.is_valid(stcls, m.label) and scores[i] > score:
-                best = i
-                score = scores[i]
-        assert best >= 0
-        cdef Transition t = self.c[best]
-        t.score = score
-        return t
-
-    cdef int set_valid(self, bint* output, StateClass stcls) except -1:
-        cdef int i
-        for i in range(self.n_moves):
-            m = &self.c[i]
-            output[i] = m.is_valid(stcls, m.label)
-
 
 cdef class Missing:
     @staticmethod
diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd
index 103ff9c02..11ac6bbb8 100644
--- a/spacy/syntax/parser.pxd
+++ b/spacy/syntax/parser.pxd
@@ -11,6 +11,3 @@ cdef class Parser:
     cdef readonly object cfg
     cdef readonly Model model
     cdef readonly TransitionSystem moves
-
-    cdef int _greedy_parse(self, Tokens tokens) except -1
-    cdef int _beam_parse(self, Tokens tokens) except -1
diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index e36d10a38..2f6c3cd98 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -19,17 +19,10 @@ from cymem.cymem cimport Pool, Address
 from murmurhash.mrmr cimport hash64
 from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
 
-
 from util import Config
 
-from thinc.features cimport Extractor
-from thinc.features cimport Feature
-from thinc.features cimport count_feats
+from thinc.api cimport Example
 
-from thinc.learner cimport LinearModel
-
-from thinc.search cimport Beam
-from thinc.search cimport MaxViolation
 
 from ..tokens cimport Tokens, TokenC
 from ..strings cimport StringStore
@@ -72,35 +65,86 @@ cdef class Parser:
         self.model = Model(self.moves.n_moves, templates, model_dir)
 
     def __call__(self, Tokens tokens):
-        if self.cfg.get('beam_width', 1) < 1:
-            self._greedy_parse(tokens)
-        else:
-            self._beam_parse(tokens)
+        cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
+        self.moves.initialize_state(stcls)
+
+        cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE)
+        while not stcls.is_final():
+            eg.wipe()
+            fill_context(eg.atoms, stcls)
+            self.moves.set_valid(eg.is_valid, stcls)
+
+            self.model.predict(eg)
+
+            self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label)
+        self.moves.finalize_state(stcls)
+        tokens.set_parse(stcls._sent)
 
     def train(self, Tokens tokens, GoldParse gold):
         self.moves.preprocess_gold(gold)
-        if self.cfg.beam_width < 1:
-            return self._greedy_train(tokens, gold)
-        else:
-            return self._beam_train(tokens, gold)
-
-    cdef int _greedy_parse(self, Tokens tokens) except -1:
-        cdef atom_t[CONTEXT_SIZE] context
-        cdef int n_feats
-        cdef Pool mem = Pool()
         cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
         self.moves.initialize_state(stcls)
-        cdef Transition guess
-        words = [w.orth_ for w in tokens]
+        cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE)
+        cdef int cost = 0
         while not stcls.is_final():
-            fill_context(context, stcls)
-            scores = self.model.score(context)
-            guess = self.moves.best_valid(scores, stcls)
-            #print self.moves.move_name(guess.move, guess.label), stcls.print_state(words)
-            guess.do(stcls, guess.label)
-            assert stcls._s_i >= 0
-        self.moves.finalize_state(stcls)
-        tokens.set_parse(stcls._sent)
+            eg.wipe()
+            fill_context(eg.atoms, stcls)
+            self.moves.set_costs(eg.is_valid, eg.costs, stcls, gold)
+
+            self.model.train(eg)
+
+            self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label)
+            cost += eg.cost
+        return cost
+
+
+# These are passed as callbacks to thinc.search.Beam
+"""
+cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
+    dest = <StateClass>_dest
+    src = <StateClass>_src
+    moves = <const Transition*>_moves
+    dest.clone(src)
+    moves[clas].do(dest, moves[clas].label)
+
+
+cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
+    cdef StateClass st = StateClass.init(<const TokenC*>tokens, length)
+    st.fast_forward()
+    Py_INCREF(st)
+    return <void*>st
+
+
+cdef int _check_final_state(void* _state, void* extra_args) except -1:
+    return (<StateClass>_state).is_final()
+
+
+def _cleanup(Beam beam):
+    for i in range(beam.width):
+        Py_XDECREF(<PyObject*>beam._states[i].content)
+        Py_XDECREF(<PyObject*>beam._parents[i].content)
+
+cdef hash_t _hash_state(void* _state, void* _) except 0:
+    return <hash_t>_state
+    
+    #state = <const State*>_state
+    #cdef atom_t[10] rep
+
+    #rep[0] = state.stack[0] if state.stack_len >= 1 else 0
+    #rep[1] = state.stack[-1] if state.stack_len >= 2 else 0
+    #rep[2] = state.stack[-2] if state.stack_len >= 3 else 0
+    #rep[3] = state.i
+    #rep[4] = state.sent[state.stack[0]].l_kids if state.stack_len >= 1 else 0
+    #rep[5] = state.sent[state.stack[0]].r_kids if state.stack_len >= 1 else 0
+    #rep[6] = state.sent[state.stack[0]].dep if state.stack_len >= 1 else 0
+    #rep[7] = state.sent[state.stack[-1]].dep if state.stack_len >= 2 else 0
+    #if get_left(state, get_n0(state), 1) != NULL:
+    #    rep[8] = get_left(state, get_n0(state), 1).dep 
+    #else:
+    #    rep[8] = 0
+    #rep[9] = state.sent[state.i].l_kids
+    #return hash64(rep, sizeof(atom_t) * 10, 0)
+
 
     cdef int _beam_parse(self, Tokens tokens) except -1:
         cdef Beam beam = Beam(self.moves.n_moves, self.cfg.beam_width)
@@ -114,30 +158,6 @@ cdef class Parser:
         tokens.set_parse(state._sent)
         _cleanup(beam)
 
-    def _greedy_train(self, Tokens tokens, GoldParse gold):
-        cdef Pool mem = Pool()
-        cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
-        self.moves.initialize_state(stcls)
-
-        cdef int cost
-        cdef const Feature* feats
-        cdef const weight_t* scores
-        cdef Transition guess
-        cdef Transition best
-        cdef atom_t[CONTEXT_SIZE] context
-        loss = 0
-        words = [w.orth_ for w in tokens]
-        history = []
-        while not stcls.is_final():
-            fill_context(context, stcls)
-            scores = self.model.score(context)
-            guess = self.moves.best_valid(scores, stcls)
-            best = self.moves.best_gold(scores, stcls, gold)
-            cost = guess.get_cost(stcls, &gold.c, guess.label)
-            self.model.update(context, guess.clas, best.clas, cost)
-            guess.do(stcls, guess.label)
-            loss += cost
-        return loss
 
     def _beam_train(self, Tokens tokens, GoldParse gold_parse):
         cdef Beam pred = Beam(self.moves.n_moves, self.cfg.beam_width)
@@ -200,50 +220,4 @@ cdef class Parser:
             count_feats(counts[clas], feats, n_feats, inc)
             self.moves.c[clas].do(stcls, self.moves.c[clas].label)
 
-
-# These are passed as callbacks to thinc.search.Beam
-
-cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
-    dest = <StateClass>_dest
-    src = <StateClass>_src
-    moves = <const Transition*>_moves
-    dest.clone(src)
-    moves[clas].do(dest, moves[clas].label)
-
-
-cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
-    cdef StateClass st = StateClass.init(<const TokenC*>tokens, length)
-    st.fast_forward()
-    Py_INCREF(st)
-    return <void*>st
-
-
-cdef int _check_final_state(void* _state, void* extra_args) except -1:
-    return (<StateClass>_state).is_final()
-
-
-def _cleanup(Beam beam):
-    for i in range(beam.width):
-        Py_XDECREF(<PyObject*>beam._states[i].content)
-        Py_XDECREF(<PyObject*>beam._parents[i].content)
-
-cdef hash_t _hash_state(void* _state, void* _) except 0:
-    return <hash_t>_state
-    
-    #state = <const State*>_state
-    #cdef atom_t[10] rep
-
-    #rep[0] = state.stack[0] if state.stack_len >= 1 else 0
-    #rep[1] = state.stack[-1] if state.stack_len >= 2 else 0
-    #rep[2] = state.stack[-2] if state.stack_len >= 3 else 0
-    #rep[3] = state.i
-    #rep[4] = state.sent[state.stack[0]].l_kids if state.stack_len >= 1 else 0
-    #rep[5] = state.sent[state.stack[0]].r_kids if state.stack_len >= 1 else 0
-    #rep[6] = state.sent[state.stack[0]].dep if state.stack_len >= 1 else 0
-    #rep[7] = state.sent[state.stack[-1]].dep if state.stack_len >= 2 else 0
-    #if get_left(state, get_n0(state), 1) != NULL:
-    #    rep[8] = get_left(state, get_n0(state), 1).dep 
-    #else:
-    #    rep[8] = 0
-    #rep[9] = state.sent[state.i].l_kids
-    #return hash64(rep, sizeof(atom_t) * 10, 0)
+"""
diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd
index d9bd2b3e6..35f0ada30 100644
--- a/spacy/syntax/transition_system.pxd
+++ b/spacy/syntax/transition_system.pxd
@@ -46,9 +46,5 @@ cdef class TransitionSystem:
 
     cdef int set_valid(self, bint* output, StateClass state) except -1
     
-    cdef int set_costs(self, int* output, StateClass state, GoldParse gold) except -1
-
-    cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *
-
-    cdef Transition best_gold(self, const weight_t* scores, StateClass state,
-                              GoldParse gold) except *
+    cdef int set_costs(self, bint* is_valid, int* costs,
+                       StateClass state, GoldParse gold) except -1
diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx
index 927498cba..b13c75ba3 100644
--- a/spacy/syntax/transition_system.pyx
+++ b/spacy/syntax/transition_system.pyx
@@ -43,30 +43,17 @@ cdef class TransitionSystem:
     cdef Transition init_transition(self, int clas, int move, int label) except *:
         raise NotImplementedError
 
-    cdef Transition best_valid(self, const weight_t* scores, StateClass s) except *:
-        raise NotImplementedError
-    
-    cdef int set_valid(self, bint* output, StateClass state) except -1:
-        raise NotImplementedError
-
-    cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1:
+    cdef int set_valid(self, bint* is_valid, StateClass stcls) except -1:
         cdef int i
         for i in range(self.n_moves):
-            if self.c[i].is_valid(stcls, self.c[i].label):
-                output[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label)
+            is_valid[i] = self.c[i].is_valid(stcls, self.c[i].label)
+
+    cdef int set_costs(self, bint* is_valid, int* costs,
+                       StateClass stcls, GoldParse gold) except -1:
+        cdef int i
+        self.set_valid(is_valid, stcls)
+        for i in range(self.n_moves):
+            if is_valid[i]:
+                costs[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label)
             else:
-                output[i] = 9000
-
-    cdef Transition best_gold(self, const weight_t* scores, StateClass stcls,
-                              GoldParse gold) except *:
-        cdef Transition best
-        cdef weight_t score = MIN_SCORE
-        cdef int i
-        for i in range(self.n_moves):
-            if self.c[i].is_valid(stcls, self.c[i].label):
-                cost = self.c[i].get_cost(stcls, &gold.c, self.c[i].label)
-                if scores[i] > score and cost == 0:
-                    best = self.c[i]
-                    score = scores[i]
-        assert score > MIN_SCORE
-        return best
+                costs[i] = 9000