mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	* remove _convert_examples * fix test_gold, raise TypeError if tuples are used instead of Example's * throwing proper errors when the wrong type of objects are passed * fix deprecated format in tests * fix deprecated format in parser tests * fix tests for NEL, morph, senter, tagger, textcat * update regression tests with new Example format * use make_doc * more fixes to nlp.update calls * few more small fixes for rehearse and evaluate * only import ml_datasets if really necessary
		
			
				
	
	
		
			97 lines
		
	
	
		
			3.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			97 lines
		
	
	
		
			3.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
"""Prevent catastrophic forgetting with rehearsal updates."""
 | 
						|
import plac
 | 
						|
import random
 | 
						|
import warnings
 | 
						|
import srsly
 | 
						|
import spacy
 | 
						|
from spacy.gold import Example
 | 
						|
from spacy.util import minibatch, compounding
 | 
						|
 | 
						|
# TODO: further fix and test this script for spaCy v3 (note: read_gold_data is never called)
 | 
						|
 | 
						|
# Entity label that this script adds to the NER component.
LABEL = "ANIMAL"

# Training set as (text, annotations) pairs. Each entity is a
# (start, end, label) triple whose offsets index characters of the text.
TRAIN_DATA = [
    (
        "Horses are too tall and they pretend to care about your feelings",
        {"entities": [(0, 6, LABEL)]},
    ),
    # Negative example: no entities at all.
    ("Do they bite?", {"entities": []}),
    (
        "horses are too tall and they pretend to care about your feelings",
        {"entities": [(0, 6, LABEL)]},
    ),
    (
        "horses pretend to care about your feelings",
        {"entities": [(0, 6, LABEL)]},
    ),
    (
        "they pretend to care about your feelings, those horses",
        {"entities": [(48, 54, LABEL)]},
    ),
    ("horses?", {"entities": [(0, 6, LABEL)]}),
]
 | 
						|
 | 
						|
 | 
						|
def read_raw_data(nlp, jsonl_loc):
    """Yield an annotation-free Example for each non-blank text in a JSONL file.

    Every record read from ``jsonl_loc`` must have a "text" key. Records whose
    text is empty or whitespace-only are skipped; the rest are tokenized with
    ``nlp.make_doc`` and wrapped via ``Example.from_dict`` with an empty
    annotation dict.
    """
    for record in srsly.read_jsonl(jsonl_loc):
        text = record["text"]
        if not text.strip():
            continue
        yield Example.from_dict(nlp.make_doc(text), {})
 | 
						|
 | 
						|
 | 
						|
def read_gold_data(nlp, gold_loc):
    """Return a list of Examples with entity annotations read from a JSONL file.

    Every record read from ``gold_loc`` must have a "text" key and a "spans"
    key; each span must provide "start", "end" and "label", which are combined
    into the (start, end, label) triples passed as the "entities" annotation.
    """

    def to_example(record):
        # Tokenize first, then collect the spans, matching the record order.
        doc = nlp.make_doc(record["text"])
        entities = [
            (span["start"], span["end"], span["label"]) for span in record["spans"]
        ]
        return Example.from_dict(doc, {"entities": entities})

    return [to_example(record) for record in srsly.read_jsonl(gold_loc)]
 | 
						|
 | 
						|
 | 
						|
def main(model_name, unlabelled_loc):
    """Train NER on TRAIN_DATA while rehearsing on unlabelled text.

    Loads the pipeline ``model_name``, adds LABEL to its "ner" component and,
    for a fixed number of iterations, alternates a supervised ``nlp.update``
    on a batch of TRAIN_DATA with an ``nlp.rehearse`` call on a batch of raw
    examples. Losses are printed after every iteration, and the entities
    predicted for a short test sentence are printed at the end.

    model_name: name or path handed to ``spacy.load``.
    unlabelled_loc: path of a JSONL file, read with ``read_raw_data``.
    """
    n_iter = 10
    dropout = 0.2
    batch_size = 4
    nlp = spacy.load(model_name)
    nlp.get_pipe("ner").add_label(LABEL)
    raw_examples = list(read_raw_data(nlp, unlabelled_loc))
    optimizer = nlp.resume_training()
    # Avoid use of Adam when resuming training. I don't understand this well
    # yet, but I'm getting weird results from Adam. Try commenting out the
    # nlp.update(), and using Adam -- you'll find the models drift apart.
    # I guess Adam is losing precision, introducing gradient noise?
    optimizer.learn_rate = 0.1
    optimizer.b1 = 0.0
    optimizer.b2 = 0.0
    # Created once, so the batch-size schedule carries on across iterations.
    sizes = compounding(1.0, 4.0, 1.001)

    train_examples = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in TRAIN_DATA
    ]

    # The two context managers must be comma-separated: `a and b` evaluates to
    # a single object, so only one of them would actually be entered/exited.
    with nlp.select_pipes(enable="ner"), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module="spacy")

        for itn in range(n_iter):
            random.shuffle(train_examples)
            random.shuffle(raw_examples)
            losses = {}
            r_losses = {}
            # batch up the examples using spaCy's minibatch
            raw_batches = minibatch(raw_examples, size=batch_size)
            for batch in minibatch(train_examples, size=sizes):
                nlp.update(batch, sgd=optimizer, drop=dropout, losses=losses)
                raw_batch = next(raw_batches, None)
                if raw_batch is None:
                    # Raw text ran out before the training batches did: start
                    # over instead of letting StopIteration abort the run.
                    raw_batches = minibatch(raw_examples, size=batch_size)
                    raw_batch = next(raw_batches, None)
                if raw_batch:
                    nlp.rehearse(list(raw_batch), sgd=optimizer, losses=r_losses)
                # else: no raw examples at all, so there is nothing to rehearse.
            print("Losses", losses)
            print("R. Losses", r_losses)
    print(nlp.get_pipe("ner").model.unseen_classes)
    test_text = "Do you like horses?"
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)
 | 
						|
 | 
						|
 | 
						|
# Script entry point: plac builds the command line from main's signature.
if __name__ == "__main__":
    plac.call(main)
 |