spaCy/spacy/train.py

70 lines
2.5 KiB
Python
Raw Normal View History

from __future__ import absolute_import
from __future__ import unicode_literals
import random
from .gold import GoldParse
from .scorer import Scorer
2016-10-13 04:24:53 +03:00
from .gold import merge_sents
class Trainer(object):
    '''Drive training and evaluation of an NLP pipeline.

    The trainer holds a pipeline object (``nlp``) and a collection of
    annotated examples (``gold_tuples``). Each example is unpacked as a
    ``(raw_text, paragraph_tuples)`` pair.
    '''

    def __init__(self, nlp, gold_tuples):
        '''Store the pipeline and the training examples.

        nlp: object exposing ``tokenizer`` and ``pipeline`` attributes.
        gold_tuples: examples, each a ``(raw_text, paragraph_tuples)`` pair.
            The same object is kept (not copied) and is shuffled in place
            by ``epochs``, so it must be a mutable sequence for training.
        '''
        self.nlp = nlp
        self.gold_tuples = gold_tuples

    def epochs(self, nr_epoch, augment_data=None, gold_preproc=False):
        '''Yield one lazy iterator of ``(doc, gold)`` pairs per epoch.

        nr_epoch: number of passes over the training data.
        augment_data: optional callable taking ``(raw_text,
            paragraph_tuples)`` and returning a replacement pair. It is
            applied after the gold-preprocessing step.
        gold_preproc: if true, the raw text is discarded and documents are
            built from the gold tokens; otherwise the sentences of each
            paragraph are merged with ``merge_sents``.

        ``self.gold_tuples`` is shuffled in place immediately before each
        epoch iterator is yielded.
        '''
        def one_pass():
            for text, paragraph in self.gold_tuples:
                text, paragraph = self._apply_gold_preproc(
                    text, paragraph, gold_preproc)
                if augment_data is not None:
                    text, paragraph = augment_data(text, paragraph)
                for pair in self._pair_docs_with_golds(text, paragraph):
                    yield pair

        for _ in range(nr_epoch):
            random.shuffle(self.gold_tuples)
            yield one_pass()

    def update(self, doc, gold):
        '''Run ``doc`` through the pipeline, training components on the way.

        For every pipeline component, in order: if it has an ``update``
        attribute, ``update(doc, gold)`` is called first; the component is
        then called on ``doc`` regardless. Returns the same ``doc``.
        '''
        for component in self.nlp.pipeline:
            is_trainable = hasattr(component, 'update')
            if is_trainable:
                component.update(doc, gold)
            component(doc)
        return doc

    def evaluate(self, dev_sents, gold_preproc=False):
        '''Score the pipeline on held-out examples and return the ``Scorer``.

        dev_sents: iterable of ``(raw_text, paragraph_tuples)`` pairs.
        gold_preproc: same meaning as in ``epochs``.

        Each document is passed through every pipeline component (without
        any ``update`` call) before being scored against its gold parse.
        '''
        scorer = Scorer()
        for text, paragraph in dev_sents:
            text, paragraph = self._apply_gold_preproc(
                text, paragraph, gold_preproc)
            for doc, gold in self._pair_docs_with_golds(text, paragraph):
                for component in self.nlp.pipeline:
                    component(doc)
                scorer.score(doc, gold)
        return scorer

    def make_docs(self, raw_text, paragraph_tuples):
        '''Build the list of documents for one example.

        With raw text available, the text is tokenized into a single
        document. When ``raw_text`` is None, one document is built per
        entry of ``paragraph_tuples`` from ``entry[0][1]`` (presumably the
        gold word list -- confirm against the gold-tuple format).
        '''
        if raw_text is None:
            docs = []
            for sent_tuples in paragraph_tuples:
                words = sent_tuples[0][1]
                docs.append(self.nlp.tokenizer.tokens_from_list(words))
            return docs
        return [self.nlp.tokenizer(raw_text)]

    def make_golds(self, docs, paragraph_tuples):
        '''Build a ``GoldParse`` for each entry of ``paragraph_tuples``.

        If there is exactly one document, every entry is paired with that
        document. Otherwise documents and entries are paired positionally
        (stopping at the shorter of the two). The annotation handed to
        ``GoldParse`` is ``entry[0]``.
        '''
        if len(docs) != 1:
            return [GoldParse(doc, sent_tuples[0])
                    for doc, sent_tuples in zip(docs, paragraph_tuples)]
        only_doc = docs[0]
        return [GoldParse(only_doc, sent_tuples[0])
                for sent_tuples in paragraph_tuples]

    def _apply_gold_preproc(self, raw_text, paragraph_tuples, gold_preproc):
        '''Return the ``(raw_text, paragraph_tuples)`` pair to build docs from.

        With ``gold_preproc`` set the raw text is replaced by None and the
        paragraph is left untouched; otherwise the raw text is kept and the
        paragraph is passed through ``merge_sents``.
        '''
        if not gold_preproc:
            return raw_text, merge_sents(paragraph_tuples)
        return None, paragraph_tuples

    def _pair_docs_with_golds(self, raw_text, paragraph_tuples):
        '''Build docs and their gold parses and zip them together.'''
        docs = self.make_docs(raw_text, paragraph_tuples)
        golds = self.make_golds(docs, paragraph_tuples)
        return zip(docs, golds)