From 4c9202249d820d45dde13abae5e3f6b448785225 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 21 May 2017 09:07:06 -0500
Subject: [PATCH] Refactor training, to fix memory leak

---
 spacy/__main__.py  | 26 ++++++++++++--
 spacy/cli/train.py | 89 ++++++++++++++--------------------------------
 spacy/language.py  | 38 ++++++++++++++------
 3 files changed, 77 insertions(+), 76 deletions(-)

diff --git a/spacy/__main__.py b/spacy/__main__.py
index e0f042a62..2bfec1920 100644
--- a/spacy/__main__.py
+++ b/spacy/__main__.py
@@ -129,9 +129,31 @@ class CLI(object):
         print("\n   Command %r does not exist."
               "\n   Use the --help flag for a list of available commands.\n" % name)
 
+
+@plac.annotations(
+    lang=("model language", "positional", None, str),
+    output_dir=("output directory to store model in", "positional", None, str),
+    train_data=("location of JSON-formatted training data", "positional", None, str),
+    dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
+    n_iter=("number of iterations", "option", "n", int),
+    nsents=("number of sentences", "option", None, int),
+    use_gpu=("Use GPU", "flag", "g", bool),
+    no_tagger=("Don't train tagger", "flag", "T", bool),
+    no_parser=("Don't train parser", "flag", "P", bool),
+    no_entities=("Don't train NER", "flag", "N", bool)
+)
+def train(self, lang, output_dir, train_data, dev_data=None, n_iter=15,
+          nsents=0, use_gpu=False,
+          no_tagger=False, no_parser=False, no_entities=False):
+    """
+    Train a model. Expects data in spaCy's JSON format.
+    """
+    nsents = nsents or None
+    cli_train(lang, output_dir, train_data, dev_data, n_iter, nsents,
+              use_gpu, no_tagger, no_parser, no_entities)
+
 
 if __name__ == '__main__':
     import plac
     import sys
-    sys.argv[0] = 'spacy'
-    plac.Interpreter.call(CLI)
+    if sys.argv[1] == 'train':
+        plac.call(train)
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index fa7d85798..98fb61fa2 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -6,18 +6,19 @@
 from collections import defaultdict
 import cytoolz
 from pathlib import Path
 import dill
+import tqdm
 
 from ..tokens.doc import Doc
 from ..scorer import Scorer
 from ..gold import GoldParse, merge_sents
-from ..gold import read_json_file as read_gold_json
+from ..gold import GoldCorpus
 from ..util import prints
 from .. import util
 from .. import displacy
 
-def train(language, output_dir, train_data, dev_data, n_iter, n_sents,
-          use_gpu, no_tagger, no_parser, no_entities, parser_L1):
+def train(lang_id, output_dir, train_data, dev_data, n_iter, n_sents,
+          use_gpu, no_tagger, no_parser, no_entities):
     output_path = util.ensure_path(output_dir)
     train_path = util.ensure_path(train_data)
     dev_path = util.ensure_path(dev_data)
@@ -28,70 +29,32 @@ def train(language, output_dir, train_data, dev_data, n_iter, n_sents,
     if dev_path and not dev_path.exists():
         prints(dev_path, title="Development data not found", exits=True)
 
-    lang = util.get_lang_class(language)
-    parser_cfg = {
-        'pseudoprojective': True,
-        'L1': parser_L1,
-        'n_iter': n_iter,
-        'lang': language,
-        'features': lang.Defaults.parser_features}
-    entity_cfg = {
-        'n_iter': n_iter,
-        'lang': language,
-        'features': lang.Defaults.entity_features}
-    tagger_cfg = {
-        'n_iter': n_iter,
-        'lang': language,
-        'features': lang.Defaults.tagger_features}
-    gold_train = list(read_gold_json(train_path, limit=n_sents))
-    gold_dev = list(read_gold_json(dev_path, limit=n_sents))
-
-    train_model(lang, gold_train, gold_dev, output_path, n_iter,
-                no_tagger=no_tagger, no_parser=no_parser, no_entities=no_entities,
-                use_gpu=use_gpu)
-    if gold_dev:
-        scorer = evaluate(lang, gold_dev, output_path)
-        print_results(scorer)
-
-
-def train_config(config):
-    config_path = util.ensure_path(config)
-    if not config_path.is_file():
-        prints(config_path, title="Config file not found", exits=True)
-    config = json.load(config_path)
-    for setting in []:
-        if setting not in config.keys():
-            prints("%s not found in config file." % setting, title="Missing setting")
-
-
-def train_model(Language, train_data, dev_data, output_path, n_iter, **cfg):
-    print("Itn.\tDep. Loss\tUAS\tNER F.\tTag %\tToken %")
+    lang_class = util.get_lang_class(lang_id)
     pipeline = ['token_vectors', 'tags', 'dependencies', 'entities']
-    if cfg.get('no_tagger') and 'tags' in pipeline:
-        pipeline.remove('tags')
-    if cfg.get('no_parser') and 'dependencies' in pipeline:
-        pipeline.remove('dependencies')
-    if cfg.get('no_entities') and 'entities' in pipeline:
-        pipeline.remove('entities')
-    print(pipeline)
-    nlp = Language(pipeline=pipeline)
+    if no_tagger and 'tags' in pipeline: pipeline.remove('tags')
+    if no_parser and 'dependencies' in pipeline: pipeline.remove('dependencies')
+    if no_entities and 'entities' in pipeline: pipeline.remove('entities')
+
+    nlp = lang_class(pipeline=pipeline)
+    corpus = GoldCorpus(train_path, dev_path)
+
     dropout = util.env_opt('dropout', 0.0)
-    # TODO: Get spaCy using Thinc's trainer and optimizer
-    with nlp.begin_training(train_data, **cfg) as (trainer, optimizer):
-        for itn, epoch in enumerate(trainer.epochs(n_iter, gold_preproc=False)):
-            losses = defaultdict(float)
-            for i, (docs, golds) in enumerate(epoch):
+
+    optimizer = nlp.begin_training(lambda: corpus.train_tuples, use_gpu=use_gpu)
+    n_train_docs = corpus.count_train()
+    print("Itn.\tDep. Loss\tUAS\tNER F.\tTag %\tToken %")
+    for i in range(n_iter):
+        with tqdm.tqdm(total=n_train_docs) as pbar:
+            train_docs = corpus.train_docs(nlp, shuffle=i)
+            for batch in cytoolz.partition_all(20, train_docs):
+                docs, golds = zip(*batch)
+                docs = list(docs)
+                golds = list(golds)
                 nlp.update(docs, golds, drop=dropout, sgd=optimizer)
-                for doc in docs:
-                    doc.tensor = None
-                    doc._py_tokens = []
-            if dev_data:
-                with nlp.use_params(optimizer.averages):
-                    dev_scores = trainer.evaluate(dev_data, gold_preproc=False).scores
-            else:
-                dev_scores = defaultdict(float)
-            print_progress(itn, losses, dev_scores)
+                pbar.update(len(docs))
+        scorer = nlp.evaluate(corpus.dev_docs(nlp))
+        print_progress(i, {}, scorer.scores)
     with (output_path / 'model.bin').open('wb') as file_:
         dill.dump(nlp, file_, -1)
diff --git a/spacy/language.py b/spacy/language.py
index 6538b9e27..12964784c 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -6,12 +6,12 @@
 import dill
 import numpy
 from thinc.neural import Model
 from thinc.neural.ops import NumpyOps, CupyOps
+from thinc.neural.optimizers import Adam
 
 from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .tagger import Tagger
 from .lemmatizer import Lemmatizer
-from .train import Trainer
 from .syntax.parser import get_templates
 from .syntax.nonproj import PseudoProjectivity
 from .pipeline import NeuralDependencyParser, EntityRecognizer
@@ -23,6 +23,7 @@ from .lang.tokenizer_exceptions import TOKEN_MATCH
 from .lang.tag_map import TAG_MAP
 from .lang.lex_attrs import LEX_ATTRS
 from . import util
+from .scorer import Scorer
 
 
 class BaseDefaults(object):
@@ -181,8 +182,8 @@ class Language(object):
         for proc in self.pipeline[1:]:
             grads = {}
             tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
-            d_tokvecses = proc.update((docs, tokvecses), golds, sgd=get_grads, drop=drop)
-            bp_tokvecses(d_tokvecses, sgd=get_grads)
+            d_tokvecses = proc.update((docs, tokvecses), golds, sgd=sgd, drop=drop)
+            bp_tokvecses(d_tokvecses, sgd=sgd)
             if sgd is not None:
                 for key, (W, dW) in grads.items():
                     # TODO: Unhack this when thinc improves
@@ -191,16 +192,24 @@
                     else:
                         sgd.ops = CupyOps()
                     sgd(W, dW, key=key)
+        for key in list(grads.keys()):
+            grads.pop(key)
+        for doc in docs:
+            doc.tensor = None
 
-    @contextmanager
-    def begin_training(self, gold_tuples, **cfg):
+    def preprocess_gold(self, docs_golds):
+        for proc in self.pipeline:
+            if hasattr(proc, 'preprocess_gold'):
+                docs_golds = proc.preprocess_gold(docs_golds)
+        for doc, gold in docs_golds:
+            yield doc, gold
+
+    def begin_training(self, get_gold_tuples, **cfg):
         # Populate vocab
-        for _, annots_brackets in gold_tuples:
+        for _, annots_brackets in get_gold_tuples():
             for annots, _ in annots_brackets:
                 for word in annots[1]:
                     _ = self.vocab[word]
-        # Handle crossing dependencies
-        gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)
         contexts = []
         if cfg.get('use_gpu'):
             Model.ops = CupyOps()
@@ -208,11 +217,18 @@
             print("Use GPU")
         for proc in self.pipeline:
            if hasattr(proc, 'begin_training'):
-                context = proc.begin_training(gold_tuples,
+                context = proc.begin_training(get_gold_tuples(),
                                               pipeline=self.pipeline)
                 contexts.append(context)
-        trainer = Trainer(self, gold_tuples, **cfg)
-        yield trainer, trainer.optimizer
+        optimizer = Adam(Model.ops, 0.001)
+        return optimizer
+
+    def evaluate(self, docs_golds):
+        docs, golds = zip(*docs_golds)
+        scorer = Scorer()
+        for doc, gold in zip(self.pipe(docs), golds):
+            scorer.score(doc, gold)
+        return scorer
 
     @contextmanager
     def use_params(self, params, **cfg):
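
Notes on the refactor, after the fold:

The heart of the change is the new training loop in spacy/cli/train.py:
batches are streamed from GoldCorpus rather than materialised up front
with read_json_file(), begin_training() takes a callable so components
can re-iterate the gold tuples without the corpus being held in memory,
and it returns a plain Adam optimizer instead of acting as a context
manager that yields a Trainer. Condensed into a stand-alone sketch,
assuming the same develop-branch API as the patch; the 'en' language
code, the train.json/dev.json paths, the reduced pipeline, and the
epoch count are placeholders:

    import cytoolz
    import tqdm

    from spacy import util
    from spacy.gold import GoldCorpus

    nlp = util.get_lang_class('en')(pipeline=['token_vectors', 'tags'])
    corpus = GoldCorpus('train.json', 'dev.json')

    dropout = util.env_opt('dropout', 0.0)
    # begin_training() now takes a callable and returns the optimizer directly
    optimizer = nlp.begin_training(lambda: corpus.train_tuples)

    for i in range(15):
        with tqdm.tqdm(total=corpus.count_train()) as pbar:
            # train_docs() re-streams (and reshuffles) the corpus each epoch
            for batch in cytoolz.partition_all(20, corpus.train_docs(nlp, shuffle=i)):
                docs, golds = zip(*batch)
                # update() now sets doc.tensor = None itself, so Doc objects
                # no longer pin their activations between batches -- the
                # memory leak this refactor is after
                nlp.update(list(docs), list(golds), drop=dropout, sgd=optimizer)
                pbar.update(len(docs))
        print(i, nlp.evaluate(corpus.dev_docs(nlp)).scores)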
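
A subtlety in the spacy/__main__.py hunk: train() is module-level but
still takes a `self` parameter. That looks deliberate rather than a
leftover: plac.call() binds positional arguments from sys.argv[1:], so
for `spacy train en /output train.json` the literal token 'train' has
to land somewhere, and the throwaway `self` slot absorbs it before the
real arguments start at `lang`. A minimal stand-alone illustration; the
stub train() and the demo.py name below are hypothetical, not the
patched function:

    import sys
    import plac

    def train(self, lang, output_dir):
        print(self, lang, output_dir)   # self == 'train'

    if __name__ == '__main__':
        if sys.argv[1] == 'train':      # IndexError when run with no subcommand
            plac.call(train)            # run as: python demo.py train en /models

Note that the new dispatch also drops the plac.Interpreter fallback, so
after this patch only the `train` subcommand is reachable via __main__.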
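
Training still ends by dill-pickling the whole pipeline to model.bin
(the unchanged context lines at the end of the cli/train.py hunk).
Loading is symmetric; a sketch, with the path standing in for whatever
was passed as output_dir on the command line:

    import dill

    with open('/your/output_dir/model.bin', 'rb') as file_:
        nlp = dill.load(file_)

    doc = nlp(u'This is a test.')
    print([(w.text, w.tag_) for w in doc])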