diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 5c23587bc..33501800c 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -13,11 +13,11 @@ from ..gold import GoldParse, merge_sents
 from ..gold import read_json_file as read_gold_json
 from ..util import prints
 from .. import util
-from .. import displacy
+from .. import displacy
 
 
 def train(language, output_dir, train_data, dev_data, n_iter, n_sents,
-          tagger, parser, ner, parser_L1):
+          use_gpu, tagger, parser, ner, parser_L1):
     output_path = util.ensure_path(output_dir)
     train_path = util.ensure_path(train_data)
     dev_path = util.ensure_path(dev_data)
@@ -46,7 +46,7 @@ def train(language, output_dir, train_data, dev_data, n_iter, n_sents,
     gold_train = list(read_gold_json(train_path, limit=n_sents))
     gold_dev = list(read_gold_json(dev_path, limit=n_sents)) if dev_path else None
 
-    train_model(lang, gold_train, gold_dev, output_path, n_iter)
+    train_model(lang, gold_train, gold_dev, output_path, n_iter, use_gpu=use_gpu)
     if gold_dev:
         scorer = evaluate(lang, gold_dev, output_path)
         print_results(scorer)
@@ -65,28 +65,28 @@ def train_config(config):
 def train_model(Language, train_data, dev_data, output_path, n_iter, **cfg):
     print("Itn.\tDep. Loss\tUAS\tNER F.\tTag %\tToken %")
 
-    nlp = Language(pipeline=['token_vectors', 'tags', 'dependencies'])
-
+    nlp = Language(pipeline=['token_vectors', 'tags']) #, 'dependencies'])
+    dropout = util.env_opt('dropout', 0.0)
     # TODO: Get spaCy using Thinc's trainer and optimizer
     with nlp.begin_training(train_data, **cfg) as (trainer, optimizer):
         for itn, epoch in enumerate(trainer.epochs(n_iter, gold_preproc=True)):
             losses = defaultdict(float)
             to_render = []
             for i, (docs, golds) in enumerate(epoch):
-                state = nlp.update(docs, golds, drop=0., sgd=optimizer)
+                state = nlp.update(docs, golds, drop=dropout, sgd=optimizer)
                 losses['dep_loss'] += state.get('parser_loss', 0.0)
+                losses['tag_loss'] += state.get('tagger_loss', 0.0)
                 to_render.insert(0, nlp(docs[-1].text))
                 to_render[0].user_data['title'] = "Batch %d" % i
             with Path('/tmp/entities.html').open('w') as file_:
-                html = displacy.render(to_render[:5], style='ent', page=True,
-                                       options={'compact': True})
+                html = displacy.render(to_render[:5], style='ent', page=True)
                 file_.write(html)
             with Path('/tmp/parses.html').open('w') as file_:
-                html = displacy.render(to_render[:5], style='dep', page=True,
-                                       options={'compact': True})
+                html = displacy.render(to_render[:5], style='dep', page=True)
                 file_.write(html)
             if dev_data:
-                dev_scores = trainer.evaluate(dev_data).scores
+                with nlp.use_params(optimizer.averages):
+                    dev_scores = trainer.evaluate(dev_data).scores
             else:
                 dev_scores = defaultdict(float)
             print_progress(itn, losses, dev_scores)
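Aside: the new `with nlp.use_params(optimizer.averages):` block evaluates on the averaged weights the optimizer has been accumulating, then restores the live weights for the next epoch. A minimal sketch of that swap-and-restore pattern, with a hypothetical `Model` class standing in for the spaCy/Thinc one (only the context-manager idea is taken from the diff):

```python
from contextlib import contextmanager

import numpy


class Model(object):
    """Hypothetical stand-in for a model with a single weight array."""
    def __init__(self, nO, nI):
        self.W = numpy.zeros((nO, nI), dtype='f')

    @contextmanager
    def use_params(self, averages):
        # Swap in the averaged weights for the duration of the block,
        # then restore the live weights so training can continue.
        live = self.W
        self.W = averages['W']
        try:
            yield
        finally:
            self.W = live


model = Model(2, 4)
averages = {'W': numpy.ones((2, 4), dtype='f')}
with model.use_params(averages):
    print(model.W.sum())  # 8.0 -- evaluation sees the averaged weights
print(model.W.sum())      # 0.0 -- the live weights are back
```

Averaged parameters usually generalise a little better than the most recent SGD iterate, which is presumably why the dev evaluation now runs under the averages.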
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index e7098843b..7e00030a4 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -8,6 +8,7 @@ import ujson
 
 from .syntax import nonproj
 from .util import ensure_path
+from . import util
 
 
 def tags_to_entities(tags):
@@ -138,7 +139,8 @@ def _min_edit_path(cand_words, gold_words):
     return prev_costs[n_gold], previous_row[-1]
 
 
-def read_json_file(loc, docs_filter=None, make_supertags=False, limit=None):
+def read_json_file(loc, docs_filter=None, make_supertags=True, limit=None):
+    make_supertags = util.env_opt('make_supertags', make_supertags)
     loc = ensure_path(loc)
     if loc.is_dir():
         for filename in loc.iterdir():
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 7178ffa6b..8c04a327a 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -134,7 +134,7 @@ cdef class precompute_hiddens:
             hiddens.data,
            &ids[0,0], token_ids.shape[0],
            self.nF, self.nO*self.nP)
-        output, bp_output = self._apply_nonlinearity(state_vector)
+        output, bp_output = self._apply_nonlinearity(state_vector)
 
         def backward(d_output, sgd=None):
             # This will usually be on GPU
@@ -220,10 +220,13 @@ cdef class Parser:
     """
     @classmethod
     def Model(cls, nr_class, token_vector_width=128, hidden_width=128, **cfg):
+        token_vector_width = util.env_opt('token_vector_width', token_vector_width)
+        hidden_width = util.env_opt('hidden_width', hidden_width)
+        maxout_pieces = util.env_opt('parser_maxout_pieces', 1)
         lower = PrecomputableMaxouts(hidden_width,
                     nF=cls.nr_feature,
                     nI=token_vector_width,
-                    pieces=cfg.get('maxout_pieces', 1))
+                    pieces=maxout_pieces)
 
         with Model.use_device('cpu'):
             upper = chain(
@@ -346,7 +349,8 @@ cdef class Parser:
 
         backprops = []
         cdef float loss = 0.
-        while todo:
+        cutoff = max(1, len(todo) // 10)
+        while len(todo) >= cutoff:
             states, golds = zip(*todo)
             token_ids = self.get_token_ids(states)
 
@@ -398,7 +402,7 @@ cdef class Parser:
     def get_token_ids(self, states):
         cdef StateClass state
         cdef int n_tokens = self.nr_feature
-        ids = numpy.zeros((len(states), n_tokens), dtype='i', order='c')
+        ids = numpy.zeros((len(states), n_tokens), dtype='i', order='C')
         for i, state in enumerate(states):
             state.set_context_tokens(ids[i])
         return ids
diff --git a/spacy/train.py b/spacy/train.py
index b62cfeb99..7eeb83900 100644
--- a/spacy/train.py
+++ b/spacy/train.py
@@ -7,25 +7,32 @@ from cytoolz import partition_all
 
 from thinc.neural.optimizers import Adam
 from thinc.neural.ops import NumpyOps, CupyOps
+from thinc.neural.train import Trainer as ThincTrainer
 
 from .syntax.nonproj import PseudoProjectivity
 from .gold import GoldParse, merge_sents
 from .scorer import Scorer
 from .tokens.doc import Doc
+from . import util
 
 
 class Trainer(object):
     """
     Manage training of an NLP pipeline.
     """
-    def __init__(self, nlp, gold_tuples):
+    def __init__(self, nlp, gold_tuples, **cfg):
         self.nlp = nlp
         self.nr_epoch = 0
         self.optimizer = Adam(NumpyOps(), 0.001)
         self.gold_tuples = gold_tuples
+        self.cfg = cfg
+        self.batch_size = float(util.env_opt('min_batch_size', 4))
+        self.max_batch_size = util.env_opt('max_batch_size', 64)
+        self.accel_batch_size = util.env_opt('batch_accel', 1.001)
 
     def epochs(self, nr_epoch, augment_data=None, gold_preproc=False):
         cached_golds = {}
+        cached_docs = {}
         def _epoch(indices):
             all_docs = []
             all_golds = []
@@ -36,20 +43,26 @@ class Trainer(object):
                 else:
                     paragraph_tuples = merge_sents(paragraph_tuples)
                 if augment_data is None:
-                    docs = self.make_docs(raw_text, paragraph_tuples)
-                    if i in cached_golds:
-                        golds = cached_golds[i]
-                    else:
-                        golds = self.make_golds(docs, paragraph_tuples)
+                    if i not in cached_docs:
+                        cached_docs[i] = self.make_docs(raw_text, paragraph_tuples)
+                    docs = cached_docs[i]
+                    if i not in cached_golds:
+                        cached_golds[i] = self.make_golds(docs, paragraph_tuples)
+                    golds = cached_golds[i]
                 else:
                     raw_text, paragraph_tuples = augment_data(raw_text, paragraph_tuples)
                     docs = self.make_docs(raw_text, paragraph_tuples)
                     golds = self.make_golds(docs, paragraph_tuples)
                 all_docs.extend(docs)
                 all_golds.extend(golds)
-            for batch in partition_all(12, zip(tqdm.tqdm(all_docs), all_golds)):
-                X, y = zip(*batch)
+
+            thinc_trainer = ThincTrainer(self.nlp.pipeline[0].model)
+            thinc_trainer.batch_size = int(self.batch_size)
+            thinc_trainer.nb_epoch = 1
+            for X, y in thinc_trainer.iterate(all_docs, all_golds):
                 yield X, y
+                thinc_trainer.batch_size = min(int(self.batch_size), self.max_batch_size)
+                self.batch_size *= self.accel_batch_size
 
         indices = list(range(len(self.gold_tuples)))
         for itn in range(nr_epoch):
@@ -78,8 +91,9 @@ class Trainer(object):
         if raw_text is not None:
             return [self.nlp.make_doc(raw_text)]
         else:
-            return [Doc(self.nlp.vocab, words=sent_tuples[0][1])
-                    for sent_tuples in paragraph_tuples]
+            return [
+                Doc(self.nlp.vocab, words=sent_tuples[0][1])
+                for sent_tuples in paragraph_tuples]
 
     def make_golds(self, docs, paragraph_tuples):
         if len(docs) == 1:
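The `Trainer` changes above replace the fixed batch size of 12 with a compounding schedule: batches start at `min_batch_size`, are multiplied by `batch_accel` after each batch, and are capped at `max_batch_size`. A rough standalone sketch of that schedule; the `compounding_batches` helper is illustrative, not part of the patch:

```python
def compounding_batches(items, min_size=4.0, max_size=64, accel=1.001):
    """Yield successive batches whose size grows by `accel` per batch."""
    size = float(min_size)
    start = 0
    while start < len(items):
        batch = items[start:start + int(size)]
        start += len(batch)
        yield batch
        size = min(size * accel, max_size)


sizes = [len(b) for b in compounding_batches(list(range(100000)))]
print(sizes[0], max(sizes))  # starts at 4, grows to the 64 cap
```

The apparent motivation is that small early batches give frequent (if noisy) updates while the weights are still far from a good region, and the slowly growing batch size then smooths the gradients as training stabilises.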
""" - def __init__(self, nlp, gold_tuples): + def __init__(self, nlp, gold_tuples, **cfg): self.nlp = nlp self.nr_epoch = 0 self.optimizer = Adam(NumpyOps(), 0.001) self.gold_tuples = gold_tuples + self.cfg = cfg + self.batch_size = float(util.env_opt('min_batch_size', 4)) + self.max_batch_size = util.env_opt('max_batch_size', 64) + self.accel_batch_size = util.env_opt('batch_accel', 1.001) def epochs(self, nr_epoch, augment_data=None, gold_preproc=False): cached_golds = {} + cached_docs = {} def _epoch(indices): all_docs = [] all_golds = [] @@ -36,20 +43,26 @@ class Trainer(object): else: paragraph_tuples = merge_sents(paragraph_tuples) if augment_data is None: - docs = self.make_docs(raw_text, paragraph_tuples) - if i in cached_golds: - golds = cached_golds[i] - else: - golds = self.make_golds(docs, paragraph_tuples) + if i not in cached_docs: + cached_docs[i] = self.make_docs(raw_text, paragraph_tuples) + docs = cached_docs[i] + if i not in cached_golds: + cached_golds[i] = self.make_golds(docs, paragraph_tuples) + golds = cached_golds[i] else: raw_text, paragraph_tuples = augment_data(raw_text, paragraph_tuples) docs = self.make_docs(raw_text, paragraph_tuples) golds = self.make_golds(docs, paragraph_tuples) all_docs.extend(docs) all_golds.extend(golds) - for batch in partition_all(12, zip(tqdm.tqdm(all_docs), all_golds)): - X, y = zip(*batch) + + thinc_trainer = ThincTrainer(self.nlp.pipeline[0].model) + thinc_trainer.batch_size = int(self.batch_size) + thinc_trainer.nb_epoch = 1 + for X, y in thinc_trainer.iterate(all_docs, all_golds): yield X, y + thinc_trainer.batch_size = min(int(self.batch_size), self.max_batch_size) + self.batch_size *= self.accel_batch_size indices = list(range(len(self.gold_tuples))) for itn in range(nr_epoch): @@ -78,8 +91,9 @@ class Trainer(object): if raw_text is not None: return [self.nlp.make_doc(raw_text)] else: - return [Doc(self.nlp.vocab, words=sent_tuples[0][1]) - for sent_tuples in paragraph_tuples] + return [ + Doc(self.nlp.vocab, words=sent_tuples[0][1]) + for sent_tuples in paragraph_tuples] def make_golds(self, docs, paragraph_tuples): if len(docs) == 1: diff --git a/spacy/util.py b/spacy/util.py index 717e4f160..ef2e78b3b 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1,6 +1,7 @@ # coding: utf8 from __future__ import unicode_literals, print_function +import os import ujson import pip import importlib @@ -160,7 +161,23 @@ def get_async(stream, numpy_array): if cupy is None: return numpy_array else: - return cupy.array(numpy_array, stream=stream) + array = cupy.ndarray(numpy_array.shape, order='C', + dtype=numpy_array.dtype) + array.set(numpy_array, stream=stream) + return array + + +def env_opt(name, default=None): + type_convert = type(default) + if name in os.environ: + print("Get from env", name, os.environ[name]) + return type_convert(os.environ[name]) + elif 'SPACY_' + name.upper() in os.environ: + print("Get from env", name, os.environ['SPACY_' + name.upper()]) + return type_convert(os.environ['SPACY_' + name.upper()]) + else: + print("Default", name, default) + return default def read_regex(path):