mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			84 lines
		
	
	
		
			3.0 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			84 lines
		
	
	
		
			3.0 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
# coding: utf8
 | 
						|
from __future__ import absolute_import, unicode_literals
 | 
						|
 | 
						|
import random
 | 
						|
import tqdm
 | 
						|
from .gold import GoldParse, merge_sents
 | 
						|
from .scorer import Scorer
 | 
						|
 | 
						|
 | 
						|
class Trainer(object):
    """
    Manage training of an NLP pipeline.

    Holds the pipeline object, the gold-standard training data and a count of
    the epochs started so far, and offers helpers to iterate over training
    examples, update the pipeline components and score held-out data.
    """
    def __init__(self, nlp, gold_tuples):
        """
        Store the pipeline and its training data.

        nlp: Object exposing `tokenizer` and `pipeline` attributes.
        gold_tuples: Indexable, sized collection of
            `(raw_text, paragraph_tuples)` pairs.
        """
        self.nlp = nlp
        self.gold_tuples = gold_tuples
        # Passed to each component's `update()` as `itn`.
        self.nr_epoch = 0

    def epochs(self, nr_epoch, augment_data=None, gold_preproc=False):
        """
        Yield one lazy generator of `(doc, gold)` pairs per epoch.

        The training examples are visited in a freshly shuffled order every
        epoch, and iteration is wrapped in a `tqdm` progress bar.

        nr_epoch: Number of epochs to yield.
        augment_data: Optional callable taking `(raw_text, paragraph_tuples)`
            and returning a replacement pair; applied to every example.
        gold_preproc: If true, the raw text is discarded so that docs are
            built from the gold tokens, one doc per sentence. Otherwise the
            sentences of each paragraph are combined with `merge_sents`.
        """
        def _epoch(indices):
            for i in tqdm.tqdm(indices):
                raw_text, paragraph_tuples = self.gold_tuples[i]
                if gold_preproc:
                    raw_text = None
                else:
                    paragraph_tuples = merge_sents(paragraph_tuples)
                if augment_data is not None:
                    raw_text, paragraph_tuples = augment_data(raw_text, paragraph_tuples)
                # Docs and golds are rebuilt on every pass; nothing is cached
                # between epochs.
                docs = self.make_docs(raw_text, paragraph_tuples)
                golds = self.make_golds(docs, paragraph_tuples)
                for doc, gold in zip(docs, golds):
                    yield doc, gold

        indices = list(range(len(self.gold_tuples)))
        for _ in range(nr_epoch):
            random.shuffle(indices)
            yield _epoch(indices)
            # Runs only once the caller asks for the next epoch, so
            # `nr_epoch` stays constant while an epoch is being consumed.
            self.nr_epoch += 1

    def update(self, doc, gold):
        """
        Run `doc` through the pipeline, training components that support it.

        Components with an `update` attribute are called as
        `update(doc, gold, itn=self.nr_epoch)`; all others are simply applied
        to the doc. Returns the same `doc` object.
        """
        for process in self.nlp.pipeline:
            if hasattr(process, 'update'):
                process.update(doc, gold, itn=self.nr_epoch)
            else:
                process(doc)
        return doc

    def evaluate(self, dev_sents, gold_preproc=False):
        """
        Score the pipeline on held-out data and return the `Scorer`.

        dev_sents: Iterable of `(raw_text, paragraph_tuples)` pairs.
        gold_preproc: Same meaning as in `epochs()`.
        """
        scorer = Scorer()
        for raw_text, paragraph_tuples in dev_sents:
            if gold_preproc:
                raw_text = None
            else:
                paragraph_tuples = merge_sents(paragraph_tuples)
            docs = self.make_docs(raw_text, paragraph_tuples)
            golds = self.make_golds(docs, paragraph_tuples)
            for doc, gold in zip(docs, golds):
                # Every component is applied in prediction mode here; no
                # `update()` is called during evaluation.
                for process in self.nlp.pipeline:
                    process(doc)
                scorer.score(doc, gold)
        return scorer

    def make_docs(self, raw_text, paragraph_tuples):
        """
        Build the list of docs for one training example.

        With raw text, the tokenizer is run on it and a one-element list is
        returned. Without it, one doc per entry of `paragraph_tuples` is
        created via `tokenizer.tokens_from_list(sent_tuples[0][1])`
        (presumably the gold token strings -- confirm against the data
        format).
        """
        if raw_text is not None:
            return [self.nlp.tokenizer(raw_text)]
        else:
            return [self.nlp.tokenizer.tokens_from_list(sent_tuples[0][1])
                    for sent_tuples in paragraph_tuples]

    def make_golds(self, docs, paragraph_tuples):
        """
        Build `GoldParse` objects from the annotations in `paragraph_tuples`.

        If there is exactly one doc, every entry of `paragraph_tuples` is
        paired with that doc; otherwise docs and entries are paired
        positionally. Each gold is created from `sent_tuples[0]`.
        """
        if len(docs) == 1:
            return [GoldParse.from_annot_tuples(docs[0], sent_tuples[0])
                    for sent_tuples in paragraph_tuples]
        else:
            return [GoldParse.from_annot_tuples(doc, sent_tuples[0])
                    for doc, sent_tuples in zip(docs, paragraph_tuples)]
 |