From fc8d3a112c77eadbcf60d1f97161af2d0530b05e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 18 May 2017 04:36:53 -0500
Subject: [PATCH] Add util.env_opt support: Can set hyper params through
 environment variables.

---
 spacy/cli/train.py         | 22 +++++++++++-----------
 spacy/gold.pyx             |  4 +++-
 spacy/syntax/nn_parser.pyx | 12 ++++++++----
 spacy/train.py             | 34 ++++++++++++++++++++++++----------
 spacy/util.py              | 19 ++++++++++++++++++-
 5 files changed, 64 insertions(+), 27 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 5c23587bc..33501800c 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -13,11 +13,11 @@ from ..gold import GoldParse, merge_sents
 from ..gold import read_json_file as read_gold_json
 from ..util import prints
 from .. import util
-from .. import displacy 
+from .. import displacy
 
 
 def train(language, output_dir, train_data, dev_data, n_iter, n_sents,
-          tagger, parser, ner, parser_L1):
+          use_gpu, tagger, parser, ner, parser_L1):
     output_path = util.ensure_path(output_dir)
     train_path = util.ensure_path(train_data)
     dev_path = util.ensure_path(dev_data)
@@ -46,7 +46,7 @@ def train(language, output_dir, train_data, dev_data, n_iter, n_sents,
     gold_train = list(read_gold_json(train_path, limit=n_sents))
     gold_dev = list(read_gold_json(dev_path, limit=n_sents)) if dev_path else None
 
-    train_model(lang, gold_train, gold_dev, output_path, n_iter)
+    train_model(lang, gold_train, gold_dev, output_path, n_iter, use_gpu=use_gpu)
     if gold_dev:
         scorer = evaluate(lang, gold_dev, output_path)
         print_results(scorer)
@@ -65,28 +65,28 @@ def train_config(config):
 def train_model(Language, train_data, dev_data, output_path, n_iter, **cfg):
     print("Itn.\tDep. Loss\tUAS\tNER F.\tTag %\tToken %")
 
-    nlp = Language(pipeline=['token_vectors', 'tags', 'dependencies'])
-
+    nlp = Language(pipeline=['token_vectors', 'tags']) #, 'dependencies'])
+    dropout = util.env_opt('dropout', 0.0)
     # TODO: Get spaCy using Thinc's trainer and optimizer
     with nlp.begin_training(train_data, **cfg) as (trainer, optimizer):
         for itn, epoch in enumerate(trainer.epochs(n_iter, gold_preproc=True)):
             losses = defaultdict(float)
             to_render = []
             for i, (docs, golds) in enumerate(epoch):
-                state = nlp.update(docs, golds, drop=0., sgd=optimizer)
+                state = nlp.update(docs, golds, drop=dropout, sgd=optimizer)
                 losses['dep_loss'] += state.get('parser_loss', 0.0)
+                losses['tag_loss'] += state.get('tagger_loss', 0.0)
                 to_render.insert(0, nlp(docs[-1].text))
                 to_render[0].user_data['title'] = "Batch %d" % i
                 with Path('/tmp/entities.html').open('w') as file_:
-                    html = displacy.render(to_render[:5], style='ent', page=True,
-                                           options={'compact': True})
+                    html = displacy.render(to_render[:5], style='ent', page=True)
                     file_.write(html)
                 with Path('/tmp/parses.html').open('w') as file_:
-                    html = displacy.render(to_render[:5], style='dep', page=True,
-                                           options={'compact': True})
+                    html = displacy.render(to_render[:5], style='dep', page=True)
                     file_.write(html)
             if dev_data:
-                dev_scores = trainer.evaluate(dev_data).scores
+                with nlp.use_params(optimizer.averages):
+                    dev_scores = trainer.evaluate(dev_data).scores
             else:
                 dev_scores = defaultdict(float)
             print_progress(itn, losses, dev_scores)
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index e7098843b..7e00030a4 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -8,6 +8,7 @@ import ujson
 
 from .syntax import nonproj
 from .util import ensure_path
+from . import util
 
 
 def tags_to_entities(tags):
@@ -138,7 +139,8 @@ def _min_edit_path(cand_words, gold_words):
     return prev_costs[n_gold], previous_row[-1]
 
 
-def read_json_file(loc, docs_filter=None, make_supertags=False, limit=None):
+def read_json_file(loc, docs_filter=None, make_supertags=True, limit=None):
+    make_supertags = util.env_opt('make_supertags', make_supertags)
     loc = ensure_path(loc)
     if loc.is_dir():
         for filename in loc.iterdir():
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 7178ffa6b..8c04a327a 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -134,7 +134,7 @@ cdef class precompute_hiddens:
             <float*>hiddens.data, &ids[0,0],
             token_ids.shape[0], self.nF, self.nO*self.nP)
 
-        output, bp_output = self._apply_nonlinearity(state_vector) 
+        output, bp_output = self._apply_nonlinearity(state_vector)
 
         def backward(d_output, sgd=None):
             # This will usually be on GPU
@@ -220,10 +220,13 @@ cdef class Parser:
     """
     @classmethod
     def Model(cls, nr_class, token_vector_width=128, hidden_width=128, **cfg):
+        token_vector_width = util.env_opt('token_vector_width', token_vector_width)
+        hidden_width = util.env_opt('hidden_width', hidden_width)
+        maxout_pieces = util.env_opt('parser_maxout_pieces', 1)
         lower = PrecomputableMaxouts(hidden_width,
                     nF=cls.nr_feature,
                     nI=token_vector_width,
-                    pieces=cfg.get('maxout_pieces', 1))
+                    pieces=maxout_pieces)
 
         with Model.use_device('cpu'):
             upper = chain(
@@ -346,7 +349,8 @@ cdef class Parser:
 
         backprops = []
         cdef float loss = 0.
-        while todo:
+        cutoff = max(1, len(todo) // 10)
+        while len(todo) >= cutoff:
             states, golds = zip(*todo)
 
             token_ids = self.get_token_ids(states)
@@ -398,7 +402,7 @@ cdef class Parser:
     def get_token_ids(self, states):
         cdef StateClass state
         cdef int n_tokens = self.nr_feature
-        ids = numpy.zeros((len(states), n_tokens), dtype='i', order='c')
+        ids = numpy.zeros((len(states), n_tokens), dtype='i', order='C')
         for i, state in enumerate(states):
             state.set_context_tokens(ids[i])
         return ids
diff --git a/spacy/train.py b/spacy/train.py
index b62cfeb99..7eeb83900 100644
--- a/spacy/train.py
+++ b/spacy/train.py
@@ -7,25 +7,32 @@ from cytoolz import partition_all
 
 from thinc.neural.optimizers import Adam
 from thinc.neural.ops import NumpyOps, CupyOps
+from thinc.neural.train import Trainer as ThincTrainer
 
 from .syntax.nonproj import PseudoProjectivity
 from .gold import GoldParse, merge_sents
 from .scorer import Scorer
 from .tokens.doc import Doc
+from . import util
 
 
 class Trainer(object):
     """
     Manage training of an NLP pipeline.
     """
-    def __init__(self, nlp, gold_tuples):
+    def __init__(self, nlp, gold_tuples, **cfg):
         self.nlp = nlp
         self.nr_epoch = 0
         self.optimizer = Adam(NumpyOps(), 0.001)
         self.gold_tuples = gold_tuples
+        self.cfg = cfg
+        self.batch_size = float(util.env_opt('min_batch_size', 4))
+        self.max_batch_size = util.env_opt('max_batch_size', 64)
+        self.accel_batch_size = util.env_opt('batch_accel', 1.001)
 
     def epochs(self, nr_epoch, augment_data=None, gold_preproc=False):
         cached_golds = {}
+        cached_docs = {}
         def _epoch(indices):
             all_docs = []
             all_golds = []
@@ -36,20 +43,26 @@ class Trainer(object):
                 else:
                     paragraph_tuples = merge_sents(paragraph_tuples)
                 if augment_data is None:
-                    docs = self.make_docs(raw_text, paragraph_tuples)
-                    if i in cached_golds:
-                        golds = cached_golds[i]
-                    else:
-                        golds = self.make_golds(docs, paragraph_tuples)
+                    if i not in cached_docs:
+                        cached_docs[i] = self.make_docs(raw_text, paragraph_tuples)
+                    docs = cached_docs[i]
+                    if i not in cached_golds:
+                        cached_golds[i] = self.make_golds(docs, paragraph_tuples)
+                    golds = cached_golds[i]
                 else:
                     raw_text, paragraph_tuples = augment_data(raw_text, paragraph_tuples)
                     docs = self.make_docs(raw_text, paragraph_tuples)
                     golds = self.make_golds(docs, paragraph_tuples)
                 all_docs.extend(docs)
                 all_golds.extend(golds)
-            for batch in partition_all(12, zip(tqdm.tqdm(all_docs), all_golds)):
-                X, y = zip(*batch)
+
+            thinc_trainer = ThincTrainer(self.nlp.pipeline[0].model)
+            thinc_trainer.batch_size = int(self.batch_size)
+            thinc_trainer.nb_epoch = 1
+            for X, y in thinc_trainer.iterate(all_docs, all_golds):
                 yield X, y
+                thinc_trainer.batch_size = min(int(self.batch_size), self.max_batch_size)
+                self.batch_size *= self.accel_batch_size
 
         indices = list(range(len(self.gold_tuples)))
         for itn in range(nr_epoch):
@@ -78,8 +91,9 @@ class Trainer(object):
         if raw_text is not None:
             return [self.nlp.make_doc(raw_text)]
         else:
-            return [Doc(self.nlp.vocab, words=sent_tuples[0][1])
-                    for sent_tuples in paragraph_tuples]
+            return [
+                Doc(self.nlp.vocab, words=sent_tuples[0][1])
+                for sent_tuples in paragraph_tuples]
 
     def make_golds(self, docs, paragraph_tuples):
         if len(docs) == 1:
diff --git a/spacy/util.py b/spacy/util.py
index 717e4f160..ef2e78b3b 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals, print_function
 
+import os
 import ujson
 import pip
 import importlib
@@ -160,7 +161,23 @@ def get_async(stream, numpy_array):
     if cupy is None:
         return numpy_array
     else:
-        return cupy.array(numpy_array, stream=stream)
+        array = cupy.ndarray(numpy_array.shape, order='C',
+                           dtype=numpy_array.dtype)
+        array.set(numpy_array, stream=stream)
+        return array
+
+
+def env_opt(name, default=None):
+    """Get `name` or SPACY_<NAME> from the env, cast to type(default)."""
+    cast = str if default is None else type(default)  # NoneType() can't cast
+    if isinstance(default, bool):  # bool('0') is True, so parse the text
+        cast = lambda value: value.lower() not in ('', '0', 'false', 'no')
+    for key in (name, 'SPACY_' + name.upper()):  # exact name wins
+        if key in os.environ:
+            print("Get from env", name, os.environ[key])
+            return cast(os.environ[key])
+    print("Default", name, default)
+    return default
 
 
 def read_regex(path):