2017-04-15 13:05:47 +03:00
|
|
|
# coding: utf8
|
|
|
|
from __future__ import absolute_import, unicode_literals
|
2016-10-09 13:24:24 +03:00
|
|
|
|
|
|
|
import random
|
2017-03-11 20:12:48 +03:00
|
|
|
import tqdm
|
2017-05-16 12:21:59 +03:00
|
|
|
from cytoolz import partition_all
|
2017-05-15 22:46:08 +03:00
|
|
|
|
|
|
|
from thinc.neural.optimizers import Adam
|
|
|
|
from thinc.neural.ops import NumpyOps, CupyOps
|
|
|
|
|
2017-05-16 17:17:30 +03:00
|
|
|
from .syntax.nonproj import PseudoProjectivity
|
2017-04-15 13:05:47 +03:00
|
|
|
from .gold import GoldParse, merge_sents
|
2016-10-09 13:24:24 +03:00
|
|
|
from .scorer import Scorer
|
2017-05-16 12:21:59 +03:00
|
|
|
from .tokens.doc import Doc
|
2016-10-09 13:24:24 +03:00
|
|
|
|
|
|
|
|
|
|
|
class Trainer(object):
    """Manage training of an NLP pipeline.

    The trainer holds the pipeline object (``nlp``), the preprocessed
    training examples, an optimizer, and a counter of completed epochs.
    It can iterate over the training data in shuffled epochs and score
    the pipeline on held-out data.
    """

    def __init__(self, nlp, gold_tuples):
        """Store the pipeline and prepare the training data.

        nlp: The pipeline object. This class uses its ``make_doc``, ``vocab``
            and ``pipeline`` attributes.
        gold_tuples: Training examples, passed through
            ``PseudoProjectivity.preprocess_training_data`` before being
            stored. The stored result is indexed as a sequence of
            ``(raw_text, paragraph_tuples)`` pairs by ``epochs``.
        """
        self.nlp = nlp
        self.gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)
        # Number of epochs handed out by `epochs()` and subsequently advanced past.
        self.nr_epoch = 0
        # NOTE(review): 0.001 is presumably the learning rate -- confirm
        # against the thinc Adam signature.
        self.optimizer = Adam(NumpyOps(), 0.001)

    def epochs(self, nr_epoch, augment_data=None, gold_preproc=False):
        """Yield one lazy generator of ``(docs, golds)`` pairs per epoch.

        nr_epoch: Number of epochs to produce.
        augment_data: Optional callable taking ``(raw_text, paragraph_tuples)``
            and returning a replacement ``(raw_text, paragraph_tuples)`` pair,
            applied to every example before docs and golds are built.
        gold_preproc: If true, the raw text is discarded so that docs are
            built from the gold-standard words of each sentence; otherwise
            the sentences of a paragraph are combined with ``merge_sents``.

        Yields: for each epoch, a generator over ``(docs, golds)`` pairs in a
            freshly shuffled order. ``self.nr_epoch`` is incremented when the
            caller advances past an epoch.
        """
        def _epoch(order):
            for i in tqdm.tqdm(order):
                raw_text, paragraph_tuples = self.gold_tuples[i]
                if gold_preproc:
                    raw_text = None
                else:
                    paragraph_tuples = merge_sents(paragraph_tuples)
                if augment_data is not None:
                    raw_text, paragraph_tuples = augment_data(raw_text, paragraph_tuples)
                # Docs and golds are rebuilt on every pass; nothing is cached
                # between epochs.
                docs = self.make_docs(raw_text, paragraph_tuples)
                golds = self.make_golds(docs, paragraph_tuples)
                yield docs, golds

        indices = list(range(len(self.gold_tuples)))
        for _ in range(nr_epoch):
            random.shuffle(indices)
            # Hand each epoch its own snapshot of the order. The epoch
            # generator is lazy, so sharing `indices` would let a later
            # in-place shuffle reorder an epoch that has not been fully
            # consumed yet.
            yield _epoch(list(indices))
            self.nr_epoch += 1

    def evaluate(self, dev_sents, gold_preproc=False):
        """Run the pipeline over development data and score the result.

        dev_sents: Iterable of ``(raw_text, paragraph_tuples)`` pairs.
        gold_preproc: Same meaning as in ``epochs``.

        Returns: the ``Scorer`` that every ``(doc, gold)`` pair was scored with.
        """
        scorer = Scorer()
        for raw_text, paragraph_tuples in dev_sents:
            if gold_preproc:
                raw_text = None
            else:
                paragraph_tuples = merge_sents(paragraph_tuples)
            docs = self.make_docs(raw_text, paragraph_tuples)
            golds = self.make_golds(docs, paragraph_tuples)
            for doc, gold in zip(docs, golds):
                # Each pipeline component receives the state returned by the
                # previous one, starting from an empty dict.
                state = {}
                for process in self.nlp.pipeline:
                    # A component that returned None would break the chain;
                    # `process.name` is the component about to run.
                    assert state is not None, process.name
                    state = process(doc, state=state)
                scorer.score(doc, gold)
        return scorer

    def make_docs(self, raw_text, paragraph_tuples):
        """Build the docs for one training or evaluation example.

        raw_text: Text to pass to ``self.nlp.make_doc``, or None.
        paragraph_tuples: Per-sentence annotations; only read when
            ``raw_text`` is None, in which case ``sent_tuples[0][1]`` of each
            entry supplies the words of one doc.

        Returns: a single-element list when ``raw_text`` is given, otherwise
            one ``Doc`` per entry of ``paragraph_tuples``.
        """
        if raw_text is not None:
            return [self.nlp.make_doc(raw_text)]
        return [Doc(self.nlp.vocab, words=sent_tuples[0][1])
                for sent_tuples in paragraph_tuples]

    def make_golds(self, docs, paragraph_tuples):
        """Build the ``GoldParse`` objects matching ``docs``.

        docs: Docs as produced by ``make_docs``.
        paragraph_tuples: Per-sentence annotations; ``sent_tuples[0]`` of each
            entry is passed to ``GoldParse.from_annot_tuples``.

        Returns: a list of ``GoldParse`` objects. With exactly one doc, every
            entry of ``paragraph_tuples`` is paired with that doc; otherwise
            docs and entries are paired positionally with ``zip``.
        """
        if len(docs) == 1:
            only_doc = docs[0]
            return [GoldParse.from_annot_tuples(only_doc, sent_tuples[0])
                    for sent_tuples in paragraph_tuples]
        return [GoldParse.from_annot_tuples(doc, sent_tuples[0])
                for doc, sent_tuples in zip(docs, paragraph_tuples)]
|