Update parser training example

ines 2017-10-26 15:15:37 +02:00
parent 586b9047fd
commit b5c74dbb34


@@ -1,40 +1,28 @@
+#!/usr/bin/env python
+# coding: utf8
+"""
+Example of training spaCy dependency parser, starting off with an existing model
+or a blank model.
+For more details, see the documentation:
+* Training: https://alpha.spacy.io/usage/training
+* Dependency Parse: https://alpha.spacy.io/usage/linguistic-features#dependency-parse
+
+Developed for: spaCy 2.0.0a18
+Last updated for: spaCy 2.0.0a18
+"""
 from __future__ import unicode_literals, print_function
-import json
-import pathlib
 import random
+from pathlib import Path
 import spacy
-from spacy.pipeline import DependencyParser
 from spacy.gold import GoldParse
 from spacy.tokens import Doc
-def train_parser(nlp, train_data, left_labels, right_labels):
-    parser = DependencyParser(
-        nlp.vocab,
-        left_labels=left_labels,
-        right_labels=right_labels)
-    for itn in range(1000):
-        random.shuffle(train_data)
-        loss = 0
-        for words, heads, deps in train_data:
-            doc = Doc(nlp.vocab, words=words)
-            gold = GoldParse(doc, heads=heads, deps=deps)
-            loss += parser.update(doc, gold)
-    parser.model.end_training()
-    return parser
-def main(model_dir=None):
-    if model_dir is not None:
-        model_dir = pathlib.Path(model_dir)
-        if not model_dir.exists():
-            model_dir.mkdir()
-        assert model_dir.is_dir()
-    nlp = spacy.load('en', tagger=False, parser=False, entity=False, add_vectors=False)
-    train_data = [
+# training data
+TRAIN_DATA = [
     (
         ['They', 'trade', 'mortgage', '-', 'backed', 'securities', '.'],
         [1, 1, 4, 4, 5, 1, 1],
@@ -46,30 +34,79 @@ def main(model_dir=None):
         ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct']
     )
 ]
-    left_labels = set()
-    right_labels = set()
-    for _, heads, deps in train_data:
-        for i, (head, dep) in enumerate(zip(heads, deps)):
-            if i < head:
-                left_labels.add(dep)
-            elif i > head:
-                right_labels.add(dep)
-    parser = train_parser(nlp, train_data, sorted(left_labels), sorted(right_labels))
-    doc = Doc(nlp.vocab, words=['I', 'like', 'securities', '.'])
-    parser(doc)
-    for word in doc:
-        print(word.text, word.dep_, word.head.text)
-    if model_dir is not None:
-        with (model_dir / 'config.json').open('w') as file_:
-            json.dump(parser.cfg, file_)
-        parser.model.dump(str(model_dir / 'model'))
+
+
+def main(model=None, output_dir=None, n_iter=1000):
+    """Load the model, set up the pipeline and train the parser.
+
+    model (unicode): Model name to start off with. If None, a blank English
+        Language class is created.
+    output_dir (unicode / Path): Optional output directory. If None, no model
+        will be saved.
+    n_iter (int): Number of iterations during training.
+    """
+    if model is not None:
+        nlp = spacy.load(model)  # load existing spaCy model
+        print("Loaded model '%s'" % model)
+    else:
+        nlp = spacy.blank('en')  # create blank Language class
+        print("Created blank 'en' model")
+
+    # add the parser to the pipeline if it doesn't exist
+    # nlp.create_pipe works for built-ins that are registered with spaCy
+    if 'parser' not in nlp.pipe_names:
+        parser = nlp.create_pipe('parser')
+        nlp.add_pipe(parser, first=True)
+    # otherwise, get it, so we can add labels to it
+    else:
+        parser = nlp.get_pipe('parser')
+
+    # add labels to the parser
+    for _, heads, deps in TRAIN_DATA:
+        for dep in deps:
+            parser.add_label(dep)
+
+    # get names of other pipes to disable them during training
+    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser']
+    with nlp.disable_pipes(*other_pipes) as disabled:  # only train parser
+        optimizer = nlp.begin_training(lambda: [])
+        for itn in range(n_iter):
+            random.shuffle(TRAIN_DATA)
+            losses = {}
+            for words, heads, deps in TRAIN_DATA:
+                doc = Doc(nlp.vocab, words=words)
+                gold = GoldParse(doc, heads=heads, deps=deps)
+                nlp.update([doc], [gold], sgd=optimizer, losses=losses)
+            print(losses)
+
+    # test the trained model
+    test_text = "I like securities."
+    doc = nlp(test_text)
+    print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc])
+
+    # save model to output directory
+    if output_dir is not None:
+        output_dir = Path(output_dir)
+        if not output_dir.exists():
+            output_dir.mkdir()
+        nlp.to_disk(output_dir)
+        print("Saved model to", output_dir)
+
+        # test the saved model
+        print("Loading from", output_dir)
+        nlp2 = spacy.load(output_dir)
+        doc = nlp2(test_text)
+        print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc])
 if __name__ == '__main__':
-    main()
-    # I nsubj like
-    # like ROOT like
-    # securities dobj like
-    # . cc securities
+    import plac
+    plac.call(main)
+
+    # expected result:
+    # [
+    #   ('I', 'nsubj', 'like'),
+    #   ('like', 'ROOT', 'like'),
+    #   ('securities', 'dobj', 'like'),
+    #   ('.', 'punct', 'like')
+    # ]
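
A note on the data format used above: each TRAIN_DATA entry is a (words, heads, deps) triple, where heads[i] is the absolute index of token i's syntactic head within the sentence and the root token attaches to itself. A minimal sketch that checks and prints that structure, assuming the example above is saved as train_parser.py on the import path:

    # illustrative only, not part of the commit
    from train_parser import TRAIN_DATA

    for words, heads, deps in TRAIN_DATA:
        # one head index and one dependency label per token
        assert len(words) == len(heads) == len(deps)
        # exactly one self-attached token, and it carries the ROOT label
        roots = [i for i, head in enumerate(heads) if head == i]
        assert len(roots) == 1 and deps[roots[0]] == 'ROOT'
        for word, head, dep in zip(words, heads, deps):
            print('%s -%s-> %s' % (word, dep, words[head]))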
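
To experiment without the plac command line, main() can also be called directly; the model name and output path below are illustrative assumptions, and starting from an existing model requires that it is installed:

    # illustrative only, not part of the commit
    from train_parser import main

    # train a blank 'en' model for a few iterations without saving
    main(model=None, output_dir=None, n_iter=50)

    # or start from an installed model and save the result:
    # main(model='en_core_web_sm', output_dir='/tmp/parser_model', n_iter=1000)

From a shell, python train_parser.py runs the same entry point through plac.call(main).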