Update parser training example

ines 2017-10-26 15:15:37 +02:00
parent 586b9047fd
commit b5c74dbb34


@@ -1,75 +1,112 @@
 #!/usr/bin/env python
 # coding: utf8
+"""
+Example of training spaCy's dependency parser, starting off with an existing
+model or a blank model.
+
+For more details, see the documentation:
+* Training: https://alpha.spacy.io/usage/training
+* Dependency Parse: https://alpha.spacy.io/usage/linguistic-features#dependency-parse
+
+Developed for: spaCy 2.0.0a18
+Last updated for: spaCy 2.0.0a18
+"""
 from __future__ import unicode_literals, print_function

-import json
-import pathlib
 import random
+from pathlib import Path

 import spacy
-from spacy.pipeline import DependencyParser
 from spacy.gold import GoldParse
 from spacy.tokens import Doc


-def train_parser(nlp, train_data, left_labels, right_labels):
-    parser = DependencyParser(
-        nlp.vocab,
-        left_labels=left_labels,
-        right_labels=right_labels)
-    for itn in range(1000):
-        random.shuffle(train_data)
-        loss = 0
-        for words, heads, deps in train_data:
-            doc = Doc(nlp.vocab, words=words)
-            gold = GoldParse(doc, heads=heads, deps=deps)
-            loss += parser.update(doc, gold)
-    parser.model.end_training()
-    return parser
+# training data
+TRAIN_DATA = [
+    (
+        ['They', 'trade', 'mortgage', '-', 'backed', 'securities', '.'],
+        [1, 1, 4, 4, 5, 1, 1],
+        ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct']
+    ),
+    (
+        ['I', 'like', 'London', 'and', 'Berlin', '.'],
+        [1, 1, 1, 2, 2, 1],
+        ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct']
+    )
+]


-def main(model_dir=None):
-    if model_dir is not None:
-        model_dir = pathlib.Path(model_dir)
-        if not model_dir.exists():
-            model_dir.mkdir()
-        assert model_dir.is_dir()
-    nlp = spacy.load('en', tagger=False, parser=False, entity=False, add_vectors=False)
+def main(model=None, output_dir=None, n_iter=1000):
+    """Load the model, set up the pipeline and train the parser.
+
+    model (unicode): Model name to start off with. If None, a blank English
+        Language class is created.
+    output_dir (unicode / Path): Optional output directory. If None, no model
+        will be saved.
+    n_iter (int): Number of iterations during training.
+    """
+    if model is not None:
+        nlp = spacy.load(model)  # load existing spaCy model
+        print("Loaded model '%s'" % model)
+    else:
+        nlp = spacy.blank('en')  # create blank Language class
+        print("Created blank 'en' model")

-    train_data = [
-        (
-            ['They', 'trade', 'mortgage', '-', 'backed', 'securities', '.'],
-            [1, 1, 4, 4, 5, 1, 1],
-            ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct']
-        ),
-        (
-            ['I', 'like', 'London', 'and', 'Berlin', '.'],
-            [1, 1, 1, 2, 2, 1],
-            ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct']
-        )
-    ]
-    left_labels = set()
-    right_labels = set()
-    for _, heads, deps in train_data:
-        for i, (head, dep) in enumerate(zip(heads, deps)):
-            if i < head:
-                left_labels.add(dep)
-            elif i > head:
-                right_labels.add(dep)
-    parser = train_parser(nlp, train_data, sorted(left_labels), sorted(right_labels))
+    # add the parser to the pipeline if it doesn't exist
+    # nlp.create_pipe works for built-ins that are registered with spaCy
+    if 'parser' not in nlp.pipe_names:
+        parser = nlp.create_pipe('parser')
+        nlp.add_pipe(parser, first=True)
+    # otherwise, get it, so we can add labels to it
+    else:
+        parser = nlp.get_pipe('parser')

-    doc = Doc(nlp.vocab, words=['I', 'like', 'securities', '.'])
-    parser(doc)
-    for word in doc:
-        print(word.text, word.dep_, word.head.text)
+    # add labels to the parser
+    for _, heads, deps in TRAIN_DATA:
+        for dep in deps:
+            parser.add_label(dep)

-    if model_dir is not None:
-        with (model_dir / 'config.json').open('w') as file_:
-            json.dump(parser.cfg, file_)
-        parser.model.dump(str(model_dir / 'model'))
+    # get names of other pipes to disable them during training
+    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser']
+    with nlp.disable_pipes(*other_pipes) as disabled:  # only train parser
+        optimizer = nlp.begin_training(lambda: [])
+        for itn in range(n_iter):
+            random.shuffle(TRAIN_DATA)
+            losses = {}
+            for words, heads, deps in TRAIN_DATA:
+                doc = Doc(nlp.vocab, words=words)
+                gold = GoldParse(doc, heads=heads, deps=deps)
+                nlp.update([doc], [gold], sgd=optimizer, losses=losses)
+            print(losses)
+
+    # test the trained model
+    test_text = "I like securities."
+    doc = nlp(test_text)
+    print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc])
+
+    # save model to output directory
+    if output_dir is not None:
+        output_dir = Path(output_dir)
+        if not output_dir.exists():
+            output_dir.mkdir()
+        nlp.to_disk(output_dir)
+        print("Saved model to", output_dir)
+
+        # test the saved model
+        print("Loading from", output_dir)
+        nlp2 = spacy.load(output_dir)
+        doc = nlp2(test_text)
+        print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc])


 if __name__ == '__main__':
-    main()
-    # I nsubj like
-    # like ROOT like
-    # securities dobj like
-    # . cc securities
+    import plac
+    plac.call(main)
+
+    # expected result:
+    # [
+    # ('I', 'nsubj', 'like'),
+    # ('like', 'ROOT', 'like'),
+    # ('securities', 'dobj', 'like'),
+    # ('.', 'punct', 'like')
+    # ]