# coding: utf8 from __future__ import absolute_import, unicode_literals import random import tqdm from cytoolz import partition_all from thinc.neural.optimizers import Adam from thinc.neural.ops import NumpyOps, CupyOps from .gold import GoldParse, merge_sents from .scorer import Scorer from .tokens.doc import Doc class Trainer(object): """ Manage training of an NLP pipeline. """ def __init__(self, nlp, gold_tuples): self.nlp = nlp self.gold_tuples = gold_tuples self.nr_epoch = 0 self.optimizer = Adam(NumpyOps(), 0.001) def epochs(self, nr_epoch, augment_data=None, gold_preproc=False): cached_golds = {} def _epoch(indices): for i in tqdm.tqdm(indices): raw_text, paragraph_tuples = self.gold_tuples[i] if gold_preproc: raw_text = None else: paragraph_tuples = merge_sents(paragraph_tuples) if augment_data is None: docs = self.make_docs(raw_text, paragraph_tuples) if i in cached_golds: golds = cached_golds[i] else: golds = self.make_golds(docs, paragraph_tuples) else: raw_text, paragraph_tuples = augment_data(raw_text, paragraph_tuples) docs = self.make_docs(raw_text, paragraph_tuples) golds = self.make_golds(docs, paragraph_tuples) for doc, gold in zip(docs, golds): yield doc, gold indices = list(range(len(self.gold_tuples))) for itn in range(nr_epoch): random.shuffle(indices) yield _epoch(indices) self.nr_epoch += 1 def update(self, docs, golds, drop=0.): for process in self.nlp.pipeline: if hasattr(process, 'update'): loss = process.update(doc, gold, sgd=self.sgd, drop=drop, itn=self.nr_epoch) self.sgd.finish_update() else: process(doc) return doc def evaluate(self, dev_sents, gold_preproc=False): scorer = Scorer() for raw_text, paragraph_tuples in dev_sents: if gold_preproc: raw_text = None else: paragraph_tuples = merge_sents(paragraph_tuples) docs = self.make_docs(raw_text, paragraph_tuples) golds = self.make_golds(docs, paragraph_tuples) for doc, gold in zip(docs, golds): for process in self.nlp.pipeline: process(doc) scorer.score(doc, gold) return scorer def make_docs(self, raw_text, paragraph_tuples): if raw_text is not None: return [self.nlp.make_doc(raw_text)] else: return [Doc(self.nlp.vocab, words=sent_tuples[0][1]) for sent_tuples in paragraph_tuples] def make_golds(self, docs, paragraph_tuples): if len(docs) == 1: return [GoldParse.from_annot_tuples(docs[0], sent_tuples[0]) for sent_tuples in paragraph_tuples] else: return [GoldParse.from_annot_tuples(doc, sent_tuples[0]) for doc, sent_tuples in zip(docs, paragraph_tuples)]