spaCy/spacy/train.py

# coding: utf8
from __future__ import absolute_import, unicode_literals

import random
import tqdm
from cytoolz import partition_all

from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.train import Trainer as ThincTrainer

from .syntax.nonproj import PseudoProjectivity
from .gold import GoldParse, merge_sents
from .scorer import Scorer
from .tokens.doc import Doc
from . import util


class Trainer(object):
    """
    Manage training of an NLP pipeline.

    Keeps the pipeline, the gold-standard training data, an optimizer and a
    batch-size schedule. The schedule starts at `batch_size`, is multiplied by
    `accel_batch_size` after every batch, and never exceeds `max_batch_size`.
    """
    def __init__(self, nlp, gold_tuples, **cfg):
        """
        Create the trainer.

        nlp: The pipeline to train. This class uses its `pipeline`, `vocab`,
            `make_doc` and `pipe` attributes.
        gold_tuples: Indexable, sized collection of
            (raw_text, paragraph_tuples) pairs.
        **cfg: Extra settings, stored unchanged on `self.cfg`.
        """
        self.nlp = nlp
        self.nr_epoch = 0
        self.optimizer = Adam(NumpyOps(), 0.001)
        self.gold_tuples = gold_tuples
        self.cfg = cfg
        # Hyper-parameters are looked up via util.env_opt, with the defaults
        # given here. The batch size is kept as a float so that a growth
        # factor close to 1.0 can accumulate across batches.
        self.batch_size = float(util.env_opt('min_batch_size', 4))
        self.max_batch_size = util.env_opt('max_batch_size', 64)
        self.accel_batch_size = util.env_opt('batch_accel', 1.001)

    def _capped_batch_size(self):
        """
        Get the current batch size, truncated to an integer and limited to
        `max_batch_size`.

        RETURNS: The smaller of int(self.batch_size) and self.max_batch_size.
        """
        return min(int(self.batch_size), self.max_batch_size)

    def epochs(self, nr_epoch, augment_data=None, gold_preproc=False):
        """
        Iterate over training epochs.

        nr_epoch (int): Number of epochs to produce.
        augment_data (callable): Optional function taking and returning
            (raw_text, paragraph_tuples). When given, docs and golds are
            rebuilt on every epoch instead of being cached.
        gold_preproc (bool): If True, discard the raw text and build one doc
            per entry of paragraph_tuples. If False, pass paragraph_tuples
            through merge_sents and build the doc from the raw text.
        YIELDS: One generator per epoch. Each of those yields the (X, y)
            batches produced by the thinc trainer's `iterate` method.
        """
        # Caches are shared by all epochs of this call and are only used when
        # no augmentation function is supplied.
        cached_golds = {}
        cached_docs = {}

        def _epoch(indices):
            all_docs = []
            all_golds = []
            for i in indices:
                raw_text, paragraph_tuples = self.gold_tuples[i]
                if gold_preproc:
                    raw_text = None
                else:
                    paragraph_tuples = merge_sents(paragraph_tuples)
                if augment_data is None:
                    if i not in cached_docs:
                        cached_docs[i] = self.make_docs(raw_text,
                                                        paragraph_tuples)
                    docs = cached_docs[i]
                    if i not in cached_golds:
                        cached_golds[i] = self.make_golds(docs,
                                                          paragraph_tuples)
                    golds = cached_golds[i]
                else:
                    raw_text, paragraph_tuples = augment_data(raw_text,
                                                              paragraph_tuples)
                    docs = self.make_docs(raw_text, paragraph_tuples)
                    golds = self.make_golds(docs, paragraph_tuples)
                all_docs.extend(docs)
                all_golds.extend(golds)

            thinc_trainer = ThincTrainer(self.nlp.pipeline[0].model)
            # Apply the cap here as well: the schedule carries over between
            # epochs, so without it a later epoch could start above
            # max_batch_size.
            thinc_trainer.batch_size = self._capped_batch_size()
            thinc_trainer.nb_epoch = 1
            for X, y in thinc_trainer.iterate(all_docs, all_golds):
                yield X, y
                thinc_trainer.batch_size = self._capped_batch_size()
                # Clamp the float as well as the integer batch size, so that
                # it cannot grow without bound (eventually reaching inf, which
                # int() rejects) over a long run.
                self.batch_size = min(self.batch_size * self.accel_batch_size,
                                      self.max_batch_size)

        indices = list(range(len(self.gold_tuples)))
        for _ in range(nr_epoch):
            random.shuffle(indices)
            yield _epoch(indices)
            self.nr_epoch += 1

    def evaluate(self, dev_sents, gold_preproc=False):
        """
        Score the pipeline on held-out data.

        dev_sents: Iterable of (raw_text, paragraph_tuples) pairs.
        gold_preproc (bool): Same meaning as in `epochs`.
        RETURNS (Scorer): The scorer, after `score` has been called with each
            doc returned by `nlp.pipe` and its gold parse.
        """
        all_docs = []
        all_golds = []
        for raw_text, paragraph_tuples in dev_sents:
            if gold_preproc:
                raw_text = None
            else:
                paragraph_tuples = merge_sents(paragraph_tuples)
            docs = self.make_docs(raw_text, paragraph_tuples)
            golds = self.make_golds(docs, paragraph_tuples)
            all_docs.extend(docs)
            all_golds.extend(golds)
        scorer = Scorer()
        for doc, gold in zip(self.nlp.pipe(all_docs), all_golds):
            scorer.score(doc, gold)
        return scorer

    def make_docs(self, raw_text, paragraph_tuples):
        """
        Build the docs for one training example.

        raw_text: The text to pass to `nlp.make_doc`, or None to build the
            docs from the annotations instead.
        paragraph_tuples: Sequence of per-sentence annotation entries.
        RETURNS (list): A single-element list if raw_text is given, otherwise
            one Doc per entry of paragraph_tuples.
        """
        if raw_text is not None:
            return [self.nlp.make_doc(raw_text)]
        else:
            # sent_tuples[0][1] is passed as the Doc's words; presumably the
            # sentence's token strings (layout defined by the caller's data).
            return [
                Doc(self.nlp.vocab, words=sent_tuples[0][1])
                for sent_tuples in paragraph_tuples]

    def make_golds(self, docs, paragraph_tuples):
        """
        Build the gold parses matching the output of `make_docs`.

        docs (list): The docs for one training example.
        paragraph_tuples: Sequence of per-sentence annotation entries.
        RETURNS (list): One GoldParse per entry of paragraph_tuples. If there
            is exactly one doc, every entry is paired with it; otherwise docs
            and entries are paired in order.
        """
        if len(docs) == 1:
            return [GoldParse.from_annot_tuples(docs[0], sent_tuples[0])
                    for sent_tuples in paragraph_tuples]
        else:
            return [GoldParse.from_annot_tuples(doc, sent_tuples[0])
                    for doc, sent_tuples in zip(docs, paragraph_tuples)]
Clean up imports, unused code, whitespace, docstrings 2017-04-15 13:05:47 +03:00			`# coding: utf8`
			`from __future__ import absolute_import, unicode_literals`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 13:24:24 +03:00
			`import random`
Improve output on trainer 2017-03-11 20:12:48 +03:00			`import tqdm`
Get data flowing through pipeline. Needs redesign 2017-05-16 12:21:59 +03:00			`from cytoolz import partition_all`
Improve integration of NN parser, to support unified training API 2017-05-15 22:46:08 +03:00
			`from thinc.neural.optimizers import Adam`
			`from thinc.neural.ops import NumpyOps, CupyOps`
Add util.env_opt support: Can set hyper params through environment variables. 2017-05-18 12:36:53 +03:00			`from thinc.neural.train import Trainer as ThincTrainer`
Improve integration of NN parser, to support unified training API 2017-05-15 22:46:08 +03:00
Redesign training to integrate NN components * Obsolete .parser, .entity etc names in favour of .pipeline * Components no longer create models on initialization * Models created by loading method (from_disk(), from_bytes() etc), or .begin_training() * Add .predict(), .set_annotations() methods in components * Pass state through pipeline, to allow components to share information more flexibly. 2017-05-16 17:17:30 +03:00			`from .syntax.nonproj import PseudoProjectivity`
Clean up imports, unused code, whitespace, docstrings 2017-04-15 13:05:47 +03:00			`from .gold import GoldParse, merge_sents`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 13:24:24 +03:00			`from .scorer import Scorer`
Get data flowing through pipeline. Needs redesign 2017-05-16 12:21:59 +03:00			`from .tokens.doc import Doc`
Add util.env_opt support: Can set hyper params through environment variables. 2017-05-18 12:36:53 +03:00			`from . import util`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 13:24:24 +03:00

			`class Trainer(object):`
Use consistent formatting for docstrings 2017-04-15 12:59:21 +03:00			`"""`
			`Manage training of an NLP pipeline.`
			`"""`
Add util.env_opt support: Can set hyper params through environment variables. 2017-05-18 12:36:53 +03:00			`def __init__(self, nlp, gold_tuples, **cfg):`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 13:24:24 +03:00			`self.nlp = nlp`
Improve output on trainer 2017-03-11 20:12:48 +03:00			`self.nr_epoch = 0`
Get data flowing through pipeline. Needs redesign 2017-05-16 12:21:59 +03:00			`self.optimizer = Adam(NumpyOps(), 0.001)`
Get spaCy train command working with neural network * Integrate models into pipeline * Add basic serialization (maybe incorrect) * Fix pickle on vocab 2017-05-17 13:04:50 +03:00			`self.gold_tuples = gold_tuples`
Add util.env_opt support: Can set hyper params through environment variables. 2017-05-18 12:36:53 +03:00			`self.cfg = cfg`
			`self.batch_size = float(util.env_opt('min_batch_size', 4))`
			`self.max_batch_size = util.env_opt('max_batch_size', 64)`
			`self.accel_batch_size = util.env_opt('batch_accel', 1.001)`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 13:24:24 +03:00
Update train method 2016-10-13 04:24:53 +03:00			`def epochs(self, nr_epoch, augment_data=None, gold_preproc=False):`
Fix train.py for 1.0 2016-11-25 17:55:33 +03:00			`cached_golds = {}`
Add util.env_opt support: Can set hyper params through environment variables. 2017-05-18 12:36:53 +03:00			`cached_docs = {}`
Fix train.py for 1.0 2016-11-25 17:55:33 +03:00			`def _epoch(indices):`
Get spaCy train command working with neural network * Integrate models into pipeline * Add basic serialization (maybe incorrect) * Fix pickle on vocab 2017-05-17 13:04:50 +03:00			`all_docs = []`
			`all_golds = []`
			`for i in indices:`
Fix train.py for 1.0 2016-11-25 17:55:33 +03:00			`raw_text, paragraph_tuples = self.gold_tuples[i]`
Update train method 2016-10-13 04:24:53 +03:00			`if gold_preproc:`
			`raw_text = None`
			`else:`
			`paragraph_tuples = merge_sents(paragraph_tuples)`
Fix train.py for 1.0 2016-11-25 17:55:33 +03:00			`if augment_data is None:`
Add util.env_opt support: Can set hyper params through environment variables. 2017-05-18 12:36:53 +03:00			`if i not in cached_docs:`
			`cached_docs[i] = self.make_docs(raw_text, paragraph_tuples)`
			`docs = cached_docs[i]`
			`if i not in cached_golds:`
			`cached_golds[i] = self.make_golds(docs, paragraph_tuples)`
			`golds = cached_golds[i]`
Fix train.py for 1.0 2016-11-25 17:55:33 +03:00			`else:`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 13:24:24 +03:00			`raw_text, paragraph_tuples = augment_data(raw_text, paragraph_tuples)`
Fix train.py for 1.0 2016-11-25 17:55:33 +03:00			`docs = self.make_docs(raw_text, paragraph_tuples)`
			`golds = self.make_golds(docs, paragraph_tuples)`
Get spaCy train command working with neural network * Integrate models into pipeline * Add basic serialization (maybe incorrect) * Fix pickle on vocab 2017-05-17 13:04:50 +03:00			`all_docs.extend(docs)`
			`all_golds.extend(golds)`
Add util.env_opt support: Can set hyper params through environment variables. 2017-05-18 12:36:53 +03:00
			`thinc_trainer = ThincTrainer(self.nlp.pipeline[0].model)`
			`thinc_trainer.batch_size = int(self.batch_size)`
			`thinc_trainer.nb_epoch = 1`
			`for X, y in thinc_trainer.iterate(all_docs, all_golds):`
Get spaCy train command working with neural network * Integrate models into pipeline * Add basic serialization (maybe incorrect) * Fix pickle on vocab 2017-05-17 13:04:50 +03:00			`yield X, y`
Add util.env_opt support: Can set hyper params through environment variables. 2017-05-18 12:36:53 +03:00			`thinc_trainer.batch_size = min(int(self.batch_size), self.max_batch_size)`
			`self.batch_size *= self.accel_batch_size`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 13:24:24 +03:00
Fix train.py for 1.0 2016-11-25 17:55:33 +03:00			`indices = list(range(len(self.gold_tuples)))`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 13:24:24 +03:00			`for itn in range(nr_epoch):`
Fix train.py for 1.0 2016-11-25 17:55:33 +03:00			`random.shuffle(indices)`
			`yield _epoch(indices)`
Improve output on trainer 2017-03-11 20:12:48 +03:00			`self.nr_epoch += 1`

Update train method 2016-10-13 04:24:53 +03:00			`def evaluate(self, dev_sents, gold_preproc=False):`
Fix GPU evaluation 2017-05-18 16:31:15 +03:00			`all_docs = []`
			`all_golds = []`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 13:24:24 +03:00			`for raw_text, paragraph_tuples in dev_sents:`
Update train method 2016-10-13 04:24:53 +03:00			`if gold_preproc:`
			`raw_text = None`
			`else:`
			`paragraph_tuples = merge_sents(paragraph_tuples)`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 13:24:24 +03:00			`docs = self.make_docs(raw_text, paragraph_tuples)`
			`golds = self.make_golds(docs, paragraph_tuples)`
Fix GPU evaluation 2017-05-18 16:31:15 +03:00			`all_docs.extend(docs)`
			`all_golds.extend(golds)`
			`scorer = Scorer()`
			`for doc, gold in zip(self.nlp.pipe(all_docs), all_golds):`
			`scorer.score(doc, gold)`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 13:24:24 +03:00			`return scorer`

			`def make_docs(self, raw_text, paragraph_tuples):`
			`if raw_text is not None:`
Get data flowing through pipeline. Needs redesign 2017-05-16 12:21:59 +03:00			`return [self.nlp.make_doc(raw_text)]`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 13:24:24 +03:00			`else:`
Add util.env_opt support: Can set hyper params through environment variables. 2017-05-18 12:36:53 +03:00			`return [`
			`Doc(self.nlp.vocab, words=sent_tuples[0][1])`
			`for sent_tuples in paragraph_tuples]`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 13:24:24 +03:00
			`def make_golds(self, docs, paragraph_tuples):`
			`if len(docs) == 1:`
Fix train.py for 1.0 2016-11-25 17:55:33 +03:00			`return [GoldParse.from_annot_tuples(docs[0], sent_tuples[0])`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 13:24:24 +03:00			`for sent_tuples in paragraph_tuples]`
			`else:`
Fix train.py for 1.0 2016-11-25 17:55:33 +03:00			`return [GoldParse.from_annot_tuples(doc, sent_tuples[0])`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 13:24:24 +03:00			`for doc, sent_tuples in zip(docs, paragraph_tuples)]`