Add util.env_opt support: hyperparameters can now be set through environment variables.

Matthew Honnibal 2017-05-18 04:36:53 -05:00
parent d2626fdb45
commit fc8d3a112c
5 changed files with 64 additions and 27 deletions
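As a rough sketch of what this change enables (hedged: the option names below come from the env_opt() calls in the diff, while the surrounding snippet is illustrative and not part of the commit), hyperparameters can now be overridden from the environment, with no code edits, before training is started:

# Illustrative only: override hyperparameters via environment variables before
# starting training. The option names match the env_opt() calls in this commit.
import os

os.environ['SPACY_DROPOUT'] = '0.2'         # read by env_opt('dropout', 0.0) -> 0.2 (float)
os.environ['parser_maxout_pieces'] = '3'    # read by env_opt('parser_maxout_pieces', 1) -> 3 (int)
os.environ['SPACY_MIN_BATCH_SIZE'] = '8'    # read by env_opt('min_batch_size', 4) -> 8 (int)

# Training is then run as usual; the overrides are picked up wherever
# env_opt() is called while the pipeline is built and trained.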

View File

@@ -17,7 +17,7 @@ from .. import displacy
 def train(language, output_dir, train_data, dev_data, n_iter, n_sents,
-          tagger, parser, ner, parser_L1):
+          use_gpu, tagger, parser, ner, parser_L1):
     output_path = util.ensure_path(output_dir)
     train_path = util.ensure_path(train_data)
     dev_path = util.ensure_path(dev_data)
@@ -46,7 +46,7 @@ def train(language, output_dir, train_data, dev_data, n_iter, n_sents,
     gold_train = list(read_gold_json(train_path, limit=n_sents))
     gold_dev = list(read_gold_json(dev_path, limit=n_sents)) if dev_path else None
-    train_model(lang, gold_train, gold_dev, output_path, n_iter)
+    train_model(lang, gold_train, gold_dev, output_path, n_iter, use_gpu=use_gpu)
     if gold_dev:
         scorer = evaluate(lang, gold_dev, output_path)
         print_results(scorer)
@@ -65,27 +65,27 @@ def train_config(config):
 def train_model(Language, train_data, dev_data, output_path, n_iter, **cfg):
     print("Itn.\tDep. Loss\tUAS\tNER F.\tTag %\tToken %")
-    nlp = Language(pipeline=['token_vectors', 'tags', 'dependencies'])
+    nlp = Language(pipeline=['token_vectors', 'tags']) #, 'dependencies'])
+    dropout = util.env_opt('dropout', 0.0)
     # TODO: Get spaCy using Thinc's trainer and optimizer
     with nlp.begin_training(train_data, **cfg) as (trainer, optimizer):
         for itn, epoch in enumerate(trainer.epochs(n_iter, gold_preproc=True)):
             losses = defaultdict(float)
             to_render = []
             for i, (docs, golds) in enumerate(epoch):
-                state = nlp.update(docs, golds, drop=0., sgd=optimizer)
+                state = nlp.update(docs, golds, drop=dropout, sgd=optimizer)
                 losses['dep_loss'] += state.get('parser_loss', 0.0)
+                losses['tag_loss'] += state.get('tagger_loss', 0.0)
                 to_render.insert(0, nlp(docs[-1].text))
                 to_render[0].user_data['title'] = "Batch %d" % i
                 with Path('/tmp/entities.html').open('w') as file_:
-                    html = displacy.render(to_render[:5], style='ent', page=True,
-                                           options={'compact': True})
+                    html = displacy.render(to_render[:5], style='ent', page=True)
                     file_.write(html)
                 with Path('/tmp/parses.html').open('w') as file_:
-                    html = displacy.render(to_render[:5], style='dep', page=True,
-                                           options={'compact': True})
+                    html = displacy.render(to_render[:5], style='dep', page=True)
                     file_.write(html)
             if dev_data:
-                dev_scores = trainer.evaluate(dev_data).scores
+                with nlp.use_params(optimizer.averages):
+                    dev_scores = trainer.evaluate(dev_data).scores
             else:
                 dev_scores = defaultdict(float)

View File

@@ -8,6 +8,7 @@ import ujson
 from .syntax import nonproj
 from .util import ensure_path
+from . import util


 def tags_to_entities(tags):
@@ -138,7 +139,8 @@ def _min_edit_path(cand_words, gold_words):
     return prev_costs[n_gold], previous_row[-1]


-def read_json_file(loc, docs_filter=None, make_supertags=False, limit=None):
+def read_json_file(loc, docs_filter=None, make_supertags=True, limit=None):
+    make_supertags = util.env_opt('make_supertags', make_supertags)
     loc = ensure_path(loc)
     if loc.is_dir():
         for filename in loc.iterdir():

View File

@@ -220,10 +220,13 @@ cdef class Parser:
     """
     @classmethod
     def Model(cls, nr_class, token_vector_width=128, hidden_width=128, **cfg):
+        token_vector_width = util.env_opt('token_vector_width', token_vector_width)
+        hidden_width = util.env_opt('hidden_width', hidden_width)
+        maxout_pieces = util.env_opt('parser_maxout_pieces', 1)
         lower = PrecomputableMaxouts(hidden_width,
                     nF=cls.nr_feature,
                     nI=token_vector_width,
-                    pieces=cfg.get('maxout_pieces', 1))
+                    pieces=maxout_pieces)
         with Model.use_device('cpu'):
             upper = chain(
@@ -346,7 +349,8 @@ cdef class Parser:
         backprops = []
         cdef float loss = 0.
-        while todo:
+        cutoff = max(1, len(todo) // 10)
+        while len(todo) >= cutoff:
             states, golds = zip(*todo)
             token_ids = self.get_token_ids(states)
@@ -398,7 +402,7 @@ cdef class Parser:
     def get_token_ids(self, states):
         cdef StateClass state
         cdef int n_tokens = self.nr_feature
-        ids = numpy.zeros((len(states), n_tokens), dtype='i', order='c')
+        ids = numpy.zeros((len(states), n_tokens), dtype='i', order='C')
         for i, state in enumerate(states):
             state.set_context_tokens(ids[i])
         return ids

View File

@@ -7,25 +7,32 @@ from cytoolz import partition_all
 from thinc.neural.optimizers import Adam
 from thinc.neural.ops import NumpyOps, CupyOps
+from thinc.neural.train import Trainer as ThincTrainer

 from .syntax.nonproj import PseudoProjectivity
 from .gold import GoldParse, merge_sents
 from .scorer import Scorer
 from .tokens.doc import Doc
+from . import util


 class Trainer(object):
     """
     Manage training of an NLP pipeline.
     """
-    def __init__(self, nlp, gold_tuples):
+    def __init__(self, nlp, gold_tuples, **cfg):
         self.nlp = nlp
         self.nr_epoch = 0
         self.optimizer = Adam(NumpyOps(), 0.001)
         self.gold_tuples = gold_tuples
+        self.cfg = cfg
+        self.batch_size = float(util.env_opt('min_batch_size', 4))
+        self.max_batch_size = util.env_opt('max_batch_size', 64)
+        self.accel_batch_size = util.env_opt('batch_accel', 1.001)

     def epochs(self, nr_epoch, augment_data=None, gold_preproc=False):
         cached_golds = {}
+        cached_docs = {}
         def _epoch(indices):
             all_docs = []
             all_golds = []
@@ -36,20 +43,26 @@ class Trainer(object):
                 else:
                     paragraph_tuples = merge_sents(paragraph_tuples)
                 if augment_data is None:
-                    docs = self.make_docs(raw_text, paragraph_tuples)
-                    if i in cached_golds:
-                        golds = cached_golds[i]
-                    else:
-                        golds = self.make_golds(docs, paragraph_tuples)
+                    if i not in cached_docs:
+                        cached_docs[i] = self.make_docs(raw_text, paragraph_tuples)
+                    docs = cached_docs[i]
+                    if i not in cached_golds:
+                        cached_golds[i] = self.make_golds(docs, paragraph_tuples)
+                    golds = cached_golds[i]
                 else:
                     raw_text, paragraph_tuples = augment_data(raw_text, paragraph_tuples)
                     docs = self.make_docs(raw_text, paragraph_tuples)
                     golds = self.make_golds(docs, paragraph_tuples)
                 all_docs.extend(docs)
                 all_golds.extend(golds)
-            for batch in partition_all(12, zip(tqdm.tqdm(all_docs), all_golds)):
-                X, y = zip(*batch)
+
+            thinc_trainer = ThincTrainer(self.nlp.pipeline[0].model)
+            thinc_trainer.batch_size = int(self.batch_size)
+            thinc_trainer.nb_epoch = 1
+            for X, y in thinc_trainer.iterate(all_docs, all_golds):
                 yield X, y
+                thinc_trainer.batch_size = min(int(self.batch_size), self.max_batch_size)
+                self.batch_size *= self.accel_batch_size

         indices = list(range(len(self.gold_tuples)))
         for itn in range(nr_epoch):
@@ -78,7 +91,8 @@ class Trainer(object):
         if raw_text is not None:
             return [self.nlp.make_doc(raw_text)]
         else:
-            return [Doc(self.nlp.vocab, words=sent_tuples[0][1])
-                    for sent_tuples in paragraph_tuples]
+            return [
+                Doc(self.nlp.vocab, words=sent_tuples[0][1])
+                for sent_tuples in paragraph_tuples]

     def make_golds(self, docs, paragraph_tuples):

View File

@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals, print_function

+import os
 import ujson
 import pip
 import importlib
@@ -160,7 +161,23 @@ def get_async(stream, numpy_array):
     if cupy is None:
         return numpy_array
     else:
-        return cupy.array(numpy_array, stream=stream)
+        array = cupy.ndarray(numpy_array.shape, order='C',
+                             dtype=numpy_array.dtype)
+        array.set(numpy_array, stream=stream)
+        return array
+
+
+def env_opt(name, default=None):
+    type_convert = type(default)
+    if name in os.environ:
+        print("Get from env", name, os.environ[name])
+        return type_convert(os.environ[name])
+    elif 'SPACY_' + name.upper() in os.environ:
+        print("Get from env", name, os.environ['SPACY_' + name.upper()])
+        return type_convert(os.environ['SPACY_' + name.upper()])
+    else:
+        print("Default", name, default)
+        return default
+
+
 def read_regex(path):
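For reference, a minimal usage sketch of the new helper (assuming it is imported from spaCy's util module, as the other files in this commit do; the values are made up for illustration):

# Demonstrates env_opt()'s lookup order and type conversion.
import os
from spacy import util

os.environ['SPACY_HIDDEN_WIDTH'] = '256'
util.env_opt('hidden_width', 128)   # -> 256: falls back to the SPACY_-prefixed name, converted to int
util.env_opt('dropout', 0.0)        # -> 0.0: no matching variable, the default is returned

os.environ['dropout'] = '0.5'
util.env_opt('dropout', 0.0)        # -> 0.5: an exact-name variable takes precedence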