From dd54511c4fa2554386925c6fe5c861afc1f824f8 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 27 Mar 2018 09:39:59 +0000
Subject: [PATCH 1/4] Pass data as a function in begin_training methods

---
 spacy/language.py          |  2 +-
 spacy/pipeline.pyx         | 22 +++++++++++-----------
 spacy/syntax/nn_parser.pyx |  8 ++++----
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 4e74327a3..33225da48 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -463,7 +463,7 @@ class Language(object):
         self._optimizer = sgd
         for name, proc in self.pipeline:
             if hasattr(proc, 'begin_training'):
-                proc.begin_training(get_gold_tuples(),
+                proc.begin_training(get_gold_tuples,
                                     pipeline=self.pipeline,
                                     sgd=self._optimizer,
                                     **cfg)
diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index bcf42b724..01a2b16e4 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -184,7 +184,7 @@ class Pipe(object):
         return create_default_optimizer(self.model.ops,
                                         **self.cfg.get('optimizer', {}))
 
-    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None,
+    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
                        **kwargs):
         """Initialize the pipe for training, using data exampes if available.
         If no model has been initialized yet, the model is added."""
@@ -386,7 +386,7 @@ class Tensorizer(Pipe):
         loss = (d_scores**2).sum()
         return loss, d_scores
 
-    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None,
+    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
                         **kwargs):
         """Allocate models, pre-process training data and acquire an
         optimizer.
@@ -510,11 +510,11 @@ class Tagger(Pipe):
         d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
         return float(loss), d_scores
 
-    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None,
+    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
                        **kwargs):
         orig_tag_map = dict(self.vocab.morphology.tag_map)
         new_tag_map = OrderedDict()
-        for raw_text, annots_brackets in gold_tuples:
+        for raw_text, annots_brackets in get_gold_tuples():
             for annots, brackets in annots_brackets:
                 ids, words, tags, heads, deps, ents = annots
                 for tag in tags:
@@ -687,9 +687,9 @@ class MultitaskObjective(Tagger):
     def set_annotations(self, docs, dep_ids, tensors=None):
         pass
 
-    def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None,
+    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, tok2vec=None,
                        sgd=None, **kwargs):
-        gold_tuples = nonproj.preprocess_training_data(gold_tuples)
+        gold_tuples = nonproj.preprocess_training_data(get_gold_tuples())
         for raw_text, annots_brackets in gold_tuples:
             for annots, brackets in annots_brackets:
                 ids, words, tags, heads, deps, ents = annots
@@ -962,7 +962,7 @@ class TextCategorizer(Pipe):
         self.labels.append(label)
         return 1
 
-    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None):
+    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None):
         if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer':
             token_vector_width = pipeline[0].model.nO
         else:
@@ -990,10 +990,10 @@ cdef class DependencyParser(Parser):
         labeller = MultitaskObjective(self.vocab, target=target)
         self._multitasks.append(labeller)
 
-    def init_multitask_objectives(self, gold_tuples, pipeline, sgd=None, **cfg):
+    def init_multitask_objectives(self, get_gold_tuples, pipeline, sgd=None, **cfg):
         for labeller in self._multitasks:
             tok2vec = self.model[0]
-            labeller.begin_training(gold_tuples, pipeline=pipeline,
+            labeller.begin_training(get_gold_tuples, pipeline=pipeline,
                                     tok2vec=tok2vec, sgd=sgd)
 
     def __reduce__(self):
@@ -1011,10 +1011,10 @@ cdef class EntityRecognizer(Parser):
         labeller = MultitaskObjective(self.vocab, target=target)
         self._multitasks.append(labeller)
 
-    def init_multitask_objectives(self, gold_tuples, pipeline, sgd=None, **cfg):
+    def init_multitask_objectives(self, get_gold_tuples, pipeline, sgd=None, **cfg):
         for labeller in self._multitasks:
             tok2vec = self.model[0]
-            labeller.begin_training(gold_tuples, pipeline=pipeline,
+            labeller.begin_training(get_gold_tuples, pipeline=pipeline,
                                     tok2vec=tok2vec)
 
     def __reduce__(self):
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 759ccacab..af38b2c2a 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -833,11 +833,11 @@ cdef class Parser:
             copy_array(larger.b[:smaller.nO], smaller.b)
             self.model[-1]._layers[-1] = larger
 
-    def begin_training(self, gold_tuples, pipeline=None, sgd=None, **cfg):
+    def begin_training(self, get_gold_tuples, pipeline=None, sgd=None, **cfg):
         if 'model' in cfg:
             self.model = cfg['model']
         cfg.setdefault('min_action_freq', 30)
-        actions = self.moves.get_actions(gold_parses=gold_tuples,
+        actions = self.moves.get_actions(gold_parses=get_gold_tuples(),
                                          min_freq=cfg.get('min_action_freq', 30))
         self.moves.initialize_actions(actions)
         cfg.setdefault('token_vector_width', 128)
@@ -849,7 +849,7 @@ cdef class Parser:
             self.model[1].begin_training(
                 self.model[1].ops.allocate((5, cfg['token_vector_width'])))
             if pipeline is not None:
-                self.init_multitask_objectives(gold_tuples, pipeline, sgd=sgd, **cfg)
+                self.init_multitask_objectives(get_gold_tuples, pipeline, sgd=sgd, **cfg)
             link_vectors_to_models(self.vocab)
         else:
             if sgd is None:
@@ -863,7 +863,7 @@ cdef class Parser:
         # Defined in subclasses, to avoid circular import
         raise NotImplementedError
     
-    def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
+    def init_multitask_objectives(self, get_gold_tuples, pipeline, **cfg):
         '''Setup models for secondary objectives, to benefit from multi-task
         learning. This method is intended to be overridden by subclasses.
 

From 8bbd26579cad81a8a347ed8c097e389110569312 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 27 Mar 2018 09:53:35 +0000
Subject: [PATCH 2/4] Support GPU in UD training script

---
 spacy/cli/ud_train.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/spacy/cli/ud_train.py b/spacy/cli/ud_train.py
index b827d4a4f..853cff9b3 100644
--- a/spacy/cli/ud_train.py
+++ b/spacy/cli/ud_train.py
@@ -254,7 +254,7 @@ def load_nlp(corpus, config):
         nlp.vocab.from_disk(Path(config.vectors) / 'vocab')
     return nlp
 
-def initialize_pipeline(nlp, docs, golds, config):
+def initialize_pipeline(nlp, docs, golds, config, device):
     nlp.add_pipe(nlp.create_pipe('parser'))
     if config.multitask_tag:
         nlp.parser.add_multitask_objective('tag')
@@ -265,7 +265,7 @@ def initialize_pipeline(nlp, docs, golds, config):
         for tag in gold.tags:
             if tag is not None:
                 nlp.tagger.add_label(tag)
-    return nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds))
+    return nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds), device=device)
 
 
 ########################
@@ -318,15 +318,14 @@ class TreebankPaths(object):
             "positional", None, str),
     parses_dir=("Directory to write the development parses", "positional", None, Path),
     config=("Path to json formatted config file", "positional"),
-    limit=("Size limit", "option", "n", int)
+    limit=("Size limit", "option", "n", int),
+    use_gpu=("Use GPU", "option", "g", int)
 )
-def main(ud_dir, parses_dir, config, corpus, limit=0):
+def main(ud_dir, parses_dir, config, corpus, limit=0, use_gpu=-1):
+    spacy.util.fix_random_seed()
     lang.zh.Chinese.Defaults.use_jieba = False
     lang.ja.Japanese.Defaults.use_janome = False
 
-    random.seed(0)
-    numpy.random.seed(0)
-
     config = Config.load(config)
     paths = TreebankPaths(ud_dir, corpus)
     if not (parses_dir / corpus).exists():
@@ -337,9 +336,9 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
     docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),
                             max_doc_length=config.max_doc_length, limit=limit)
 
-    optimizer = initialize_pipeline(nlp, docs, golds, config)
+    optimizer = initialize_pipeline(nlp, docs, golds, config, use_gpu)
 
-    batch_sizes = compounding(config.batch_size //10, config.batch_size, 1.001)
+    batch_sizes = compounding(config.batch_size//10, config.batch_size, 1.001)
     for i in range(config.nr_epoch):
         docs = [nlp.make_doc(doc.text) for doc in docs]
         Xs = list(zip(docs, golds))

From 987e1533a4907e35468c5cc55ca4b80131bf5036 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 27 Mar 2018 10:08:12 +0000
Subject: [PATCH 3/4] Use 8 features in parser

---
 spacy/syntax/nn_parser.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index af38b2c2a..d772be40b 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -736,7 +736,7 @@ cdef class Parser:
                                        lower, stream, drop=0.0)
         return (tokvecs, bp_tokvecs), state2vec, upper
 
-    nr_feature = 13
+    nr_feature = 8
 
     def get_token_ids(self, states):
         cdef StateClass state

From 25280b7013a041a003f7eefe9c51a3f216d7c33d Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 27 Mar 2018 10:08:38 +0000
Subject: [PATCH 4/4] Try to make sum_state_features faster

---
 spacy/syntax/nn_parser.pyx | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index d772be40b..8fd870939 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -165,16 +165,17 @@ cdef void sum_state_features(float* output,
     cdef const float* feature
     padding = cached
     cached += F * O
+    cdef int id_stride = F*O
+    cdef float one = 1.
     for b in range(B):
         for f in range(F):
             if token_ids[f] < 0:
                 feature = &padding[f*O]
             else:
-                idx = token_ids[f] * F * O + f*O
+                idx = token_ids[f] * id_stride + f*O
                 feature = &cached[idx]
-            VecVec.add_i(output,
-                feature, 1., O)
-        output += O
+            openblas.simple_axpy(&output[b*O], O,
+                feature, one)
         token_ids += F