From b5c74dbb34f035b71732e8bc37f0a43c859459ae Mon Sep 17 00:00:00 2001
From: ines
Date: Thu, 26 Oct 2017 15:15:37 +0200
Subject: [PATCH] Update parser training example

---
 examples/training/train_parser.py | 153 +++++++++++++++++++-----------
 1 file changed, 95 insertions(+), 58 deletions(-)

diff --git a/examples/training/train_parser.py b/examples/training/train_parser.py
index 8c3119704..d2c15c4c2 100644
--- a/examples/training/train_parser.py
+++ b/examples/training/train_parser.py
@@ -1,75 +1,112 @@
+#!/usr/bin/env python
+# coding: utf8
+"""
+Example of training spaCy's dependency parser, starting off with an existing
+model or a blank model.
+
+For more details, see the documentation:
+* Training: https://alpha.spacy.io/usage/training
+* Dependency Parse: https://alpha.spacy.io/usage/linguistic-features#dependency-parse
+
+Developed for: spaCy 2.0.0a18
+Last updated for: spaCy 2.0.0a18
+"""
 from __future__ import unicode_literals, print_function
-import json
-import pathlib
+
 import random
+from pathlib import Path
 import spacy
-from spacy.pipeline import DependencyParser
 from spacy.gold import GoldParse
 from spacy.tokens import Doc
 
 
-def train_parser(nlp, train_data, left_labels, right_labels):
-    parser = DependencyParser(
-        nlp.vocab,
-        left_labels=left_labels,
-        right_labels=right_labels)
-    for itn in range(1000):
-        random.shuffle(train_data)
-        loss = 0
-        for words, heads, deps in train_data:
-            doc = Doc(nlp.vocab, words=words)
-            gold = GoldParse(doc, heads=heads, deps=deps)
-            loss += parser.update(doc, gold)
-    parser.model.end_training()
-    return parser
+# training data
+TRAIN_DATA = [
+    (
+        ['They', 'trade', 'mortgage', '-', 'backed', 'securities', '.'],
+        [1, 1, 4, 4, 5, 1, 1],
+        ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct']
+    ),
+    (
+        ['I', 'like', 'London', 'and', 'Berlin', '.'],
+        [1, 1, 1, 2, 2, 1],
+        ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct']
+    )
+]
 
 
-def main(model_dir=None):
-    if model_dir is not None:
-        model_dir = pathlib.Path(model_dir)
-        if not model_dir.exists():
-            model_dir.mkdir()
-        assert model_dir.is_dir()
+def main(model=None, output_dir=None, n_iter=1000):
+    """Load the model, set up the pipeline and train the parser.
 
-    nlp = spacy.load('en', tagger=False, parser=False, entity=False, add_vectors=False)
+    model (unicode): Model name to start off with. If None, a blank English
+        Language class is created.
+    output_dir (unicode / Path): Optional output directory. If None, no model
+        will be saved.
+    n_iter (int): Number of iterations during training.
+    """
+    if model is not None:
+        nlp = spacy.load(model)  # load existing spaCy model
+        print("Loaded model '%s'" % model)
+    else:
+        nlp = spacy.blank('en')  # create blank Language class
+        print("Created blank 'en' model")
 
-    train_data = [
-        (
-            ['They', 'trade', 'mortgage', '-', 'backed', 'securities', '.'],
-            [1, 1, 4, 4, 5, 1, 1],
-            ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct']
-        ),
-        (
-            ['I', 'like', 'London', 'and', 'Berlin', '.'],
-            [1, 1, 1, 2, 2, 1],
-            ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct']
-        )
-    ]
-    left_labels = set()
-    right_labels = set()
-    for _, heads, deps in train_data:
-        for i, (head, dep) in enumerate(zip(heads, deps)):
-            if i < head:
-                left_labels.add(dep)
-            elif i > head:
-                right_labels.add(dep)
-    parser = train_parser(nlp, train_data, sorted(left_labels), sorted(right_labels))
+    # add the parser to the pipeline if it doesn't exist
+    # nlp.create_pipe works for built-ins that are registered with spaCy
+    if 'parser' not in nlp.pipe_names:
+        parser = nlp.create_pipe('parser')
+        nlp.add_pipe(parser, first=True)
+    # otherwise, get it, so we can add labels to it
+    else:
+        parser = nlp.get_pipe('parser')
 
-    doc = Doc(nlp.vocab, words=['I', 'like', 'securities', '.'])
-    parser(doc)
-    for word in doc:
-        print(word.text, word.dep_, word.head.text)
+    # add labels to the parser
+    for _, heads, deps in TRAIN_DATA:
+        for dep in deps:
+            parser.add_label(dep)
 
-    if model_dir is not None:
-        with (model_dir / 'config.json').open('w') as file_:
-            json.dump(parser.cfg, file_)
-        parser.model.dump(str(model_dir / 'model'))
+    # get names of other pipes to disable them during training
+    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser']
+    with nlp.disable_pipes(*other_pipes):  # only train parser
+        optimizer = nlp.begin_training(lambda: [])
+        for itn in range(n_iter):
+            random.shuffle(TRAIN_DATA)
+            losses = {}
+            for words, heads, deps in TRAIN_DATA:
+                doc = Doc(nlp.vocab, words=words)
+                gold = GoldParse(doc, heads=heads, deps=deps)
+                nlp.update([doc], [gold], sgd=optimizer, losses=losses)
+            print(losses)
+
+    # test the trained model
+    test_text = "I like securities."
+    doc = nlp(test_text)
+    print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc])
+
+    # save model to output directory
+    if output_dir is not None:
+        output_dir = Path(output_dir)
+        if not output_dir.exists():
+            output_dir.mkdir()
+        nlp.to_disk(output_dir)
+        print("Saved model to", output_dir)
+
+        # test the saved model
+        print("Loading from", output_dir)
+        nlp2 = spacy.load(output_dir)
+        doc = nlp2(test_text)
+        print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc])
 
 
 if __name__ == '__main__':
-    main()
-    # I nsubj like
-    # like ROOT like
-    # securities dobj like
-    # . cc securities
+    import plac
+    plac.call(main)
+
+    # expected result:
+    # [
+    #   ('I', 'nsubj', 'like'),
+    #   ('like', 'ROOT', 'like'),
+    #   ('securities', 'dobj', 'like'),
+    #   ('.', 'punct', 'like')
+    # ]