spaCy/spacy/train.py
Matthew Honnibal 8cf097ca88 Redesign training to integrate NN components
* Obsolete .parser, .entity etc names in favour of .pipeline
* Components no longer create models on initialization
* Models created by loading method (from_disk(), from_bytes() etc), or
    .begin_training()
* Add .predict(), .set_annotations() methods in components
* Pass state through pipeline, to allow components to share information
    more flexibly.
2017-05-16 16:17:30 +02:00

85 lines
3.1 KiB
Python

# coding: utf8
from __future__ import absolute_import, unicode_literals
import random
import tqdm
from cytoolz import partition_all
from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps, CupyOps
from .syntax.nonproj import PseudoProjectivity
from .gold import GoldParse, merge_sents
from .scorer import Scorer
from .tokens.doc import Doc
class Trainer(object):
"""
Manage training of an NLP pipeline.
"""
def __init__(self, nlp, gold_tuples):
self.nlp = nlp
self.gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)
self.nr_epoch = 0
self.optimizer = Adam(NumpyOps(), 0.001)
def epochs(self, nr_epoch, augment_data=None, gold_preproc=False):
cached_golds = {}
def _epoch(indices):
for i in tqdm.tqdm(indices):
raw_text, paragraph_tuples = self.gold_tuples[i]
if gold_preproc:
raw_text = None
else:
paragraph_tuples = merge_sents(paragraph_tuples)
if augment_data is None:
docs = self.make_docs(raw_text, paragraph_tuples)
if i in cached_golds:
golds = cached_golds[i]
else:
golds = self.make_golds(docs, paragraph_tuples)
else:
raw_text, paragraph_tuples = augment_data(raw_text, paragraph_tuples)
docs = self.make_docs(raw_text, paragraph_tuples)
golds = self.make_golds(docs, paragraph_tuples)
yield docs, golds
indices = list(range(len(self.gold_tuples)))
for itn in range(nr_epoch):
random.shuffle(indices)
yield _epoch(indices)
self.nr_epoch += 1
def evaluate(self, dev_sents, gold_preproc=False):
scorer = Scorer()
for raw_text, paragraph_tuples in dev_sents:
if gold_preproc:
raw_text = None
else:
paragraph_tuples = merge_sents(paragraph_tuples)
docs = self.make_docs(raw_text, paragraph_tuples)
golds = self.make_golds(docs, paragraph_tuples)
for doc, gold in zip(docs, golds):
state = {}
for process in self.nlp.pipeline:
assert state is not None, process.name
state = process(doc, state=state)
scorer.score(doc, gold)
return scorer
def make_docs(self, raw_text, paragraph_tuples):
if raw_text is not None:
return [self.nlp.make_doc(raw_text)]
else:
return [Doc(self.nlp.vocab, words=sent_tuples[0][1])
for sent_tuples in paragraph_tuples]
def make_golds(self, docs, paragraph_tuples):
if len(docs) == 1:
return [GoldParse.from_annot_tuples(docs[0], sent_tuples[0])
for sent_tuples in paragraph_tuples]
else:
return [GoldParse.from_annot_tuples(doc, sent_tuples[0])
for doc, sent_tuples in zip(docs, paragraph_tuples)]