From bdaac7ab445c247e8137950ee66d698806a4830c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 23 May 2017 02:59:31 -0500
Subject: [PATCH] WIP on improving parser efficiency

---
 spacy/cli/train.py                      |  17 ++-
 spacy/gold.pyx                          |  19 ++--
 spacy/language.py                       |   2 +-
 spacy/matcher.pyx                       |   3 +-
 spacy/pipeline.pyx                      |  20 +---
 spacy/syntax/nn_parser.pxd              |   4 +-
 spacy/syntax/nn_parser.pyx              | 139 +++++++++++++++---------
 spacy/tests/regression/test_issue429.py |   5 +-
 spacy/tests/test_matcher.py             |   4 +-
 9 files changed, 119 insertions(+), 94 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 2945794e7..07e97fe1e 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -9,6 +9,7 @@ from pathlib import Path
 import dill
 import tqdm
 from thinc.neural.optimizers import linear_decay
+from timeit import default_timer as timer
 
 from ..tokens.doc import Doc
 from ..scorer import Scorer
@@ -81,8 +82,13 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                 batch_size = min(batch_size, max_batch_size)
                 dropout = linear_decay(orig_dropout, dropout_decay, i*n_train_docs+idx)
         with nlp.use_params(optimizer.averages):
+            start = timer()
             scorer = nlp.evaluate(corpus.dev_docs(nlp))
-        print_progress(i, {}, scorer.scores)
+            end = timer()
+            n_words = scorer.tokens.tp + scorer.tokens.fn
+            assert n_words != 0
+            wps = n_words / (end-start)
+        print_progress(i, {}, scorer.scores, wps=wps)
     with (output_path / 'model.bin').open('wb') as file_:
         with nlp.use_params(optimizer.averages):
             dill.dump(nlp, file_, -1)
@@ -98,14 +104,14 @@ def _render_parses(i, to_render):
         file_.write(html)
 
 
-def print_progress(itn, losses, dev_scores):
-    # TODO: Fix!
+def print_progress(itn, losses, dev_scores, wps=0.0):
     scores = {}
     for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
-                'ents_p', 'ents_r', 'ents_f']:
+                'ents_p', 'ents_r', 'ents_f', 'wps']:
         scores[col] = 0.0
     scores.update(losses)
     scores.update(dev_scores)
+    scores['wps'] = wps
     tpl = '\t'.join((
         '{:d}',
         '{dep_loss:.3f}',
@@ -115,7 +121,8 @@ def print_progress(itn, losses, dev_scores):
         '{ents_r:.3f}',
         '{ents_f:.3f}',
         '{tags_acc:.3f}',
-        '{token_acc:.3f}'))
+        '{token_acc:.3f}',
+        '{wps:.1f}'))
     print(tpl.format(itn, **scores))
 
 
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 651cefe2f..53bd25890 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -144,7 +144,7 @@ def _min_edit_path(cand_words, gold_words):
 class GoldCorpus(object):
     """An annotated corpus, using the JSON file format. Manages
     annotations for tagging, dependency parsing and NER."""
-    def __init__(self, train_path, dev_path, limit=None):
+    def __init__(self, train_path, dev_path, gold_preproc=True, limit=None):
         """Create a GoldCorpus.
 
         train_path (unicode or Path): File or directory of training data.
@@ -184,7 +184,7 @@ class GoldCorpus(object):
             n += 1
         return n
 
-    def train_docs(self, nlp, shuffle=0, gold_preproc=True,
+    def train_docs(self, nlp, shuffle=0, gold_preproc=False,
                    projectivize=False):
         train_tuples = self.train_tuples
         if projectivize:
@@ -195,7 +195,7 @@ class GoldCorpus(object):
         gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc)
         yield from gold_docs
 
-    def dev_docs(self, nlp, gold_preproc=True):
+    def dev_docs(self, nlp, gold_preproc=False):
         gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
         gold_docs = nlp.preprocess_gold(gold_docs)
         yield from gold_docs
@@ -203,6 +203,11 @@ class GoldCorpus(object):
     @classmethod
     def iter_gold_docs(cls, nlp, tuples, gold_preproc):
         for raw_text, paragraph_tuples in tuples:
+            if gold_preproc:
+                raw_text = None
+            else:
+                paragraph_tuples = merge_sents(paragraph_tuples)
+
             docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
                                   gold_preproc)
             golds = cls._make_golds(docs, paragraph_tuples)
@@ -211,15 +216,11 @@ class GoldCorpus(object):
 
     @classmethod
     def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc):
-        if gold_preproc:
-            return [Doc(nlp.vocab, words=sent_tuples[0][1])
-                for sent_tuples in paragraph_tuples]
-        elif raw_text is not None:
+        if raw_text is not None:
             return [nlp.make_doc(raw_text)]
         else:
-            docs = [Doc(nlp.vocab, words=sent_tuples[0][1])
+            return [Doc(nlp.vocab, words=sent_tuples[0][1])
                 for sent_tuples in paragraph_tuples]
-            return merge_sents(docs)
 
     @classmethod
     def _make_golds(cls, docs, paragraph_tuples):
diff --git a/spacy/language.py b/spacy/language.py
index 37f7ae207..cc4c29867 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -334,7 +334,7 @@ class Language(object):
             >>>     for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
             >>>         assert doc.is_parsed
         """
-        #docs = (self.make_doc(text) for text in texts)
+        docs = (self.make_doc(text) for text in texts)
         docs = texts
         for proc in self.pipeline:
             name = getattr(proc, 'name', None)
diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index 24bb7b65e..20e2a8993 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -215,7 +215,7 @@ cdef class Matcher:
         """
         return len(self._patterns)
 
-    def add(self, key, on_match, *patterns):
+    def add(self, key, *patterns, **kwargs):
         """Add a match-rule to the matcher.
         A match-rule consists of: an ID key, an on_match callback, and one or
         more patterns. If the key exists, the patterns are appended to the
@@ -227,6 +227,7 @@ cdef class Matcher:
         descriptors can also include quantifiers. There are currently important
         known problems with the quantifiers – see the docs.
         """
+        on_match = kwargs.get('on_match', None)
         for pattern in patterns:
             if len(pattern) == 0:
                 msg = ("Cannot add pattern for zero tokens to matcher.\n"
diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index cb68846af..af71b1ad6 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -167,7 +167,7 @@ class NeuralTagger(object):
         self.model = model
 
     def __call__(self, doc):
-        tags = self.predict(doc.tensor)
+        tags = self.predict([doc.tensor])
         self.set_annotations([doc], tags)
 
     def pipe(self, stream, batch_size=128, n_threads=-1):
@@ -340,24 +340,6 @@ cdef class NeuralEntityRecognizer(NeuralParser):
 
     nr_feature = 6
 
-    def get_token_ids(self, states):
-        cdef StateClass state
-        cdef int n_tokens = 6
-        ids = numpy.zeros((len(states), n_tokens), dtype='i', order='c')
-        for i, state in enumerate(states):
-            ids[i, 0] = state.c.B(0)-1
-            ids[i, 1] = state.c.B(0)
-            ids[i, 2] = state.c.B(1)
-            ids[i, 3] = state.c.E(0)
-            ids[i, 4] = state.c.E(0)-1
-            ids[i, 5] = state.c.E(0)+1
-            for j in range(6):
-                if ids[i, j] >= state.c.length:
-                    ids[i, j] = -1
-                if ids[i, j] >= 0:
-                    ids[i, j] += state.c.offset
-        return ids
-
 
 cdef class BeamDependencyParser(BeamParser):
     TransitionSystem = ArcEager
diff --git a/spacy/syntax/nn_parser.pxd b/spacy/syntax/nn_parser.pxd
index 8692185e5..f6963ea18 100644
--- a/spacy/syntax/nn_parser.pxd
+++ b/spacy/syntax/nn_parser.pxd
@@ -15,7 +15,7 @@ cdef class Parser:
     cdef readonly object cfg
 
     cdef void _parse_step(self, StateC* state,
-            const float* feat_weights,
-            int nr_class, int nr_feat) nogil
+            int* token_ids, float* scores, int* is_valid,
+            const float* feat_weights, int nr_class, int nr_feat) nogil
 
     #cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 995ff5278..1b96bae36 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -19,6 +19,7 @@ import numpy.random
 cimport numpy as np
 
 from libcpp.vector cimport vector
+from libcpp.pair cimport pair
 from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
 from cpython.exc cimport PyErr_CheckSignals
 from libc.stdint cimport uint32_t, uint64_t
@@ -68,6 +69,9 @@ def set_debug(val):
     DEBUG = val
 
 
+ctypedef pair[int, StateC*] step_t
+
+
 cdef class precompute_hiddens:
     '''Allow a model to be "primed" by pre-computing input features in bulk.
 
@@ -119,6 +123,9 @@ cdef class precompute_hiddens:
             self._is_synchronized = True
         return <float*>self._cached.data
 
+    def get_bp_hiddens(self):
+        return self._bp_hiddens
+
     def __call__(self, X):
         return self.begin_update(X)[0]
 
@@ -308,7 +315,6 @@ cdef class Parser:
         cdef:
             precompute_hiddens state2vec
             StateClass state
-            Pool mem
             const float* feat_weights
             StateC* st
             vector[StateC*] next_step, this_step
@@ -336,7 +342,14 @@ cdef class Parser:
         cdef int i
         while not next_step.empty():
             for i in cython.parallel.prange(next_step.size(), num_threads=4, nogil=True):
-                self._parse_step(next_step[i], feat_weights, nr_class, nr_feat)
+                token_ids = <int*>calloc(nr_feat, sizeof(int))
+                scores = <float*>calloc(nr_class, sizeof(float))
+                is_valid = <int*>calloc(nr_class, sizeof(int))
+                self._parse_step(next_step[i], token_ids, scores, is_valid,
+                                 feat_weights, nr_class, nr_feat)
+                free(is_valid)
+                free(scores)
+                free(token_ids)
             this_step, next_step = next_step, this_step
             next_step.clear()
             for st in this_step:
@@ -345,12 +358,8 @@ cdef class Parser:
         return states
 
     cdef void _parse_step(self, StateC* state,
-            const float* feat_weights,
-            int nr_class, int nr_feat) nogil:
-        token_ids = <int*>calloc(nr_feat, sizeof(int))
-        scores = <float*>calloc(nr_class, sizeof(float))
-        is_valid = <int*>calloc(nr_class, sizeof(int))
-
+            int* token_ids, float* scores, int* is_valid,
+            const float* feat_weights, int nr_class, int nr_feat) nogil:
         state.set_context_tokens(token_ids, nr_feat)
         sum_state_features(scores,
             feat_weights, token_ids, 1, nr_feat, nr_class)
@@ -359,66 +368,90 @@ cdef class Parser:
         action = self.moves.c[guess]
         action.do(state, action.label)
 
-        free(is_valid)
-        free(scores)
-        free(token_ids)
-
     def update(self, docs_tokvecs, golds, drop=0., sgd=None):
+        cdef:
+            precompute_hiddens state2vec
+            StateClass state
+            const float* feat_weights
+            StateC* st
+            vector[step_t] next_step, this_step
+            cdef int[:, ::1] is_valid, token_ids
+            cdef float[:, ::1] scores, d_scores, costs
+            int nr_state, nr_feat, nr_class
+
         docs, tokvec_lists = docs_tokvecs
-        tokvecs = self.model[0].ops.flatten(tokvec_lists)
         if isinstance(docs, Doc) and isinstance(golds, GoldParse):
             docs = [docs]
             golds = [golds]
+        assert len(docs) == len(golds) == len(tokvec_lists)
 
+        nr_state = len(docs)
+        nr_feat = self.nr_feature
+        nr_class = self.moves.n_moves
+
+        token_ids = numpy.zeros((nr_state, nr_feat), dtype='i')
+        is_valid = numpy.zeros((nr_state, nr_class), dtype='i')
+        scores = numpy.zeros((nr_state, nr_class), dtype='f')
+        d_scores = numpy.zeros((nr_state, nr_class), dtype='f')
+        costs = numpy.zeros((nr_state, nr_class), dtype='f')
+
+        tokvecs = self.model[0].ops.flatten(tokvec_lists)
         cuda_stream = get_cuda_stream()
+        state2vec, vec2scores = self.get_batch_model(nr_state, tokvecs,
+                                                     cuda_stream, drop)
+
         golds = [self.moves.preprocess_gold(g) for g in golds]
-
         states = self.moves.init_batch(docs)
-        state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream,
-                                                      drop)
-
-        todo = [(s, g) for (s, g) in zip(states, golds)
-                if not s.is_final() and g is not None]
+        cdef step_t step
+        cdef int i
+        for i, state in enumerate(states):
+            if not state.c.is_final():
+                step.first = i
+                step.second = state.c
+                next_step.push_back(step)
+                self.moves.set_costs(&is_valid[i, 0], &costs[i, 0], state, golds[i])
 
+        feat_weights = state2vec.get_feat_weights()
+        bp_hiddens = state2vec.get_bp_hiddens()
+        d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
         backprops = []
-        cdef float loss = 0.
-        while len(todo) >= 3:
-            states, golds = zip(*todo)
 
-            token_ids = self.get_token_ids(states)
-            vector, bp_vector = state2vec.begin_update(token_ids, drop=drop)
-            scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
+        while next_step.size():
+            # Allocate these each step, so copy can be async
+            np_token_ids = numpy.zeros((nr_state, nr_feat), dtype='i')
+            np_d_scores = numpy.zeros((nr_state, nr_class), dtype='f')
+            token_ids = np_token_ids
+            d_scores = np_d_scores
+            for step in next_step:
+                i = step.first
+                st = step.second
+                self._parse_step(st, &token_ids[i, 0],
+                    &scores[i, 0], &is_valid[i, 0],
+                    feat_weights, nr_class, nr_feat)
+                cpu_log_loss(&d_scores[i, 0],
+                    &costs[i, 0], &is_valid[i, 0], &scores[i, 0], nr_class)
+            backprops.append((
+                get_async(cuda_stream, np_token_ids),
+                get_async(cuda_stream, np_d_scores)))
+            this_step, next_step = next_step, this_step
+            next_step.clear()
+            for step in this_step:
+                i = step.first
+                st = step.second
+                if not st.is_final():
+                    next_step.push_back(step)
+                    self.moves.set_costs(&is_valid[i, 0], &costs[i, 0],
+                                         states[i], golds[i])
+        cuda_stream.synchronize()
+        for gpu_token_ids, gpu_d_scores in backprops:
+            d_features = bp_hiddens((gpu_d_scores, gpu_token_ids), sgd)
+            d_features *= (gpu_token_ids >= 0).reshape((nr_state, nr_feat, 1))
 
-            d_scores = self.get_batch_loss(states, golds, scores)
-            d_vector = bp_scores(d_scores, sgd=sgd)
-
-            if isinstance(self.model[0].ops, CupyOps) \
-            and not isinstance(token_ids, state2vec.ops.xp.ndarray):
-                # Move token_ids and d_vector to CPU, asynchronously
-                backprops.append((
-                    get_async(cuda_stream, token_ids),
-                    get_async(cuda_stream, d_vector),
-                    bp_vector
-                ))
-            else:
-                backprops.append((token_ids, d_vector, bp_vector))
-            self.transition_batch(states, scores)
-            todo = [st for st in todo if not st[0].is_final()]
-        # Tells CUDA to block, so our async copies complete.
-        if cuda_stream is not None:
-            cuda_stream.synchronize()
-        d_tokvecs = state2vec.ops.allocate(tokvecs.shape)
-        xp = state2vec.ops.xp # Handle for numpy/cupy
-        for token_ids, d_vector, bp_vector in backprops:
-            d_state_features = bp_vector(d_vector, sgd=sgd)
-            active_feats = token_ids * (token_ids >= 0)
-            active_feats = active_feats.reshape((token_ids.shape[0], token_ids.shape[1], 1))
+            xp = self.model[0].ops.xp
             if hasattr(xp, 'scatter_add'):
-                xp.scatter_add(d_tokvecs,
-                    token_ids, d_state_features * active_feats)
+                xp.scatter_add(d_tokvecs, gpu_token_ids, d_features)
             else:
-                xp.add.at(d_tokvecs,
-                    token_ids, d_state_features * active_feats)
+                xp.add.at(d_tokvecs, gpu_token_ids, d_features)
         return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
 
     def get_batch_model(self, batch_size, tokvecs, stream, dropout):
diff --git a/spacy/tests/regression/test_issue429.py b/spacy/tests/regression/test_issue429.py
index 2782a0fb2..c5dc6989b 100644
--- a/spacy/tests/regression/test_issue429.py
+++ b/spacy/tests/regression/test_issue429.py
@@ -17,8 +17,9 @@ def test_issue429(EN):
 
     doc = EN('a')
     matcher = Matcher(EN.vocab)
-    matcher.add('TEST', on_match=merge_phrases, [{'ORTH': 'a'}])
-    doc = EN.tokenizer('a b c')
+    matcher.add('TEST', [{'ORTH': 'a'}], on_match=merge_phrases)
+    doc = EN.make_doc('a b c')
+
     EN.tagger(doc)
     matcher(doc)
     EN.entity(doc)
diff --git a/spacy/tests/test_matcher.py b/spacy/tests/test_matcher.py
index 2f6764e06..9bbc9b24d 100644
--- a/spacy/tests/test_matcher.py
+++ b/spacy/tests/test_matcher.py
@@ -1,8 +1,8 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...matcher import Matcher, PhraseMatcher
-from ..util import get_doc
+from ..matcher import Matcher, PhraseMatcher
+from .util import get_doc
 
 import pytest