💫 Update training examples and use minibatching (#2830)

<!--- Provide a general summary of your changes in the title. -->

## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and then experience slow and unsatisfying results.
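
For reference, all five scripts now share the same loop shape — roughly the sketch below, where `nlp`, `optimizer` and `TRAIN_DATA` (a list of `(text, annotations)` pairs) are assumed to be set up as in the example scripts:

```python
import random
from spacy.util import minibatch, compounding

# one training epoch, using the batching helpers; batch sizes start at 4
# and compound by a factor of 1.001 per batch, up to a maximum of 32
random.shuffle(TRAIN_DATA)
losses = {}
for batch in minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001)):
    texts, annotations = zip(*batch)  # transpose pairs into two tuples
    nlp.update(texts, annotations, sgd=optimizer, losses=losses)
print('Losses', losses)
```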

### Types of change
enhancements

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
Ines Montani 2018-10-10 01:40:29 +02:00 committed by GitHub
parent f784e42ffe
commit 4cd9ec0f00
5 changed files with 41 additions and 20 deletions

```diff
@@ -21,8 +21,9 @@ from __future__ import unicode_literals, print_function
 import plac
 import random
-import spacy
 from pathlib import Path
+import spacy
+from spacy.util import minibatch, compounding

 # training data: texts, heads and dependency labels
@@ -63,7 +64,7 @@ TRAIN_DATA = [
     model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
     output_dir=("Optional output directory", "option", "o", Path),
     n_iter=("Number of training iterations", "option", "n", int))
-def main(model=None, output_dir=None, n_iter=5):
+def main(model=None, output_dir=None, n_iter=15):
     """Load the model, set up the pipeline and train the parser."""
     if model is not None:
         nlp = spacy.load(model)  # load existing spaCy model
@@ -89,9 +90,12 @@ def main(model=None, output_dir=None, n_iter=5):
     for itn in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for text, annotations in TRAIN_DATA:
-            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
-        print(losses)
+        # batch up the examples using spaCy's minibatch
+        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
+        for batch in batches:
+            texts, annotations = zip(*batch)
+            nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+        print('Losses', losses)

     # test the trained model
     test_model(nlp)
@@ -135,7 +139,8 @@ if __name__ == '__main__':
     # [
     #     ('find', 'ROOT', 'find'),
     #     ('cheapest', 'QUALITY', 'gym'),
-    #     ('gym', 'PLACE', 'find')
+    #     ('gym', 'PLACE', 'find'),
+    #     ('near', 'ATTRIBUTE', 'gym'),
     #     ('work', 'LOCATION', 'near')
     # ]
     # show me the best hotel in berlin
```
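
A note on the schedule used throughout: `compounding(4., 32., 1.001)` is a generator that starts at 4.0 and multiplies by 1.001 on every step, capped at 32.0, and `minibatch` pulls `int(size)` items per batch. A quick sketch of the intended behaviour:

```python
from spacy.util import minibatch, compounding

# the schedule yields 4.0, then ~4.004, ~4.008, ... capped at 32.0
sizes = compounding(4., 32., 1.001)
print(next(sizes), next(sizes), next(sizes))

# minibatch() truncates the float, so early batches hold 4 items each
batches = minibatch(range(20), size=compounding(4., 32., 1.001))
print([len(batch) for batch in batches])  # -> [4, 4, 4, 4, 4]
```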

```diff
@@ -15,6 +15,7 @@ import plac
 import random
 from pathlib import Path
 import spacy
+from spacy.util import minibatch, compounding

 # training data
@@ -62,14 +63,17 @@ def main(model=None, output_dir=None, n_iter=100):
     for itn in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for text, annotations in TRAIN_DATA:
+        # batch up the examples using spaCy's minibatch
+        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
+        for batch in batches:
+            texts, annotations = zip(*batch)
             nlp.update(
-                [text],  # batch of texts
-                [annotations],  # batch of annotations
+                texts,  # batch of texts
+                annotations,  # batch of annotations
                 drop=0.5,  # dropout - make it harder to memorise data
                 sgd=optimizer,  # callable to update weights
                 losses=losses)
-        print(losses)
+        print('Losses', losses)

     # test the trained model
     for text, _ in TRAIN_DATA:
```
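
The `texts, annotations = zip(*batch)` line is plain Python rather than spaCy API: it transposes a batch of `(text, annotations)` pairs into one tuple of texts and one tuple of annotations, the shape `nlp.update` expects. With two made-up NER examples:

```python
# transpose a batch of (text, annotations) pairs; the data here is
# invented purely to show the shapes involved
batch = [
    ("Who is Shaka Khan?", {'entities': [(7, 17, 'PERSON')]}),
    ("I like London.", {'entities': [(7, 13, 'LOC')]}),
]
texts, annotations = zip(*batch)
print(texts)        # ('Who is Shaka Khan?', 'I like London.')
print(annotations)  # ({'entities': [(7, 17, 'PERSON')]}, ...)
```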

```diff
@@ -31,6 +31,7 @@ import plac
 import random
 from pathlib import Path
 import spacy
+from spacy.util import minibatch, compounding

 # new entity label
@@ -73,7 +74,7 @@ TRAIN_DATA = [
     new_model_name=("New model name for model meta.", "option", "nm", str),
     output_dir=("Optional output directory", "option", "o", Path),
     n_iter=("Number of training iterations", "option", "n", int))
-def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):
+def main(model=None, new_model_name='animal', output_dir=None, n_iter=10):
     """Set up the pipeline and entity recognizer, and train the new entity."""
     if model is not None:
         nlp = spacy.load(model)  # load existing spaCy model
@@ -104,10 +105,13 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):
     for itn in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for text, annotations in TRAIN_DATA:
-            nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
+        # batch up the examples using spaCy's minibatch
+        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
+        for batch in batches:
+            texts, annotations = zip(*batch)
+            nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                        losses=losses)
-        print(losses)
+        print('Losses', losses)

     # test the trained model
     test_text = 'Do you like horses?'
```

```diff
@@ -13,6 +13,7 @@ import plac
 import random
 from pathlib import Path
 import spacy
+from spacy.util import minibatch, compounding

 # training data
@@ -62,9 +63,12 @@ def main(model=None, output_dir=None, n_iter=10):
     for itn in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for text, annotations in TRAIN_DATA:
-            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
-        print(losses)
+        # batch up the examples using spaCy's minibatch
+        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
+        for batch in batches:
+            texts, annotations = zip(*batch)
+            nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+        print('Losses', losses)

     # test the trained model
     test_text = "I like securities."
```

```diff
@@ -16,6 +16,7 @@ import plac
 import random
 from pathlib import Path
 import spacy
+from spacy.util import minibatch, compounding

 # You need to define a mapping from your data's part-of-speech tag names to the
@@ -63,9 +64,12 @@ def main(lang='en', output_dir=None, n_iter=25):
     for i in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for text, annotations in TRAIN_DATA:
-            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
-        print(losses)
+        # batch up the examples using spaCy's minibatch
+        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
+        for batch in batches:
+            texts, annotations = zip(*batch)
+            nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+        print('Losses', losses)

     # test the trained model
     test_text = "I like blue eggs"
```