mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-15 10:42:34 +03:00
💫 Update training examples and use minibatching (#2830)
<!--- Provide a general summary of your changes in the title. --> ## Description Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experience slow and unsatisfying results. ### Types of change enhancements ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
This commit is contained in:
parent
f784e42ffe
commit
4cd9ec0f00
|
@@ -21,8 +21,9 @@ from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
import random
|
import random
|
||||||
import spacy
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
import spacy
|
||||||
|
from spacy.util import minibatch, compounding
|
||||||
|
|
||||||
|
|
||||||
# training data: texts, heads and dependency labels
|
# training data: texts, heads and dependency labels
|
||||||
|
@@ -63,7 +64,7 @@ TRAIN_DATA = [
|
||||||
model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
|
model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
|
||||||
output_dir=("Optional output directory", "option", "o", Path),
|
output_dir=("Optional output directory", "option", "o", Path),
|
||||||
n_iter=("Number of training iterations", "option", "n", int))
|
n_iter=("Number of training iterations", "option", "n", int))
|
||||||
def main(model=None, output_dir=None, n_iter=5):
|
def main(model=None, output_dir=None, n_iter=15):
|
||||||
"""Load the model, set up the pipeline and train the parser."""
|
"""Load the model, set up the pipeline and train the parser."""
|
||||||
if model is not None:
|
if model is not None:
|
||||||
nlp = spacy.load(model) # load existing spaCy model
|
nlp = spacy.load(model) # load existing spaCy model
|
||||||
|
@@ -89,9 +90,12 @@ def main(model=None, output_dir=None, n_iter=5):
|
||||||
for itn in range(n_iter):
|
for itn in range(n_iter):
|
||||||
random.shuffle(TRAIN_DATA)
|
random.shuffle(TRAIN_DATA)
|
||||||
losses = {}
|
losses = {}
|
||||||
for text, annotations in TRAIN_DATA:
|
# batch up the examples using spaCy's minibatch
|
||||||
nlp.update([text], [annotations], sgd=optimizer, losses=losses)
|
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
|
||||||
print(losses)
|
for batch in batches:
|
||||||
|
texts, annotations = zip(*batch)
|
||||||
|
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
|
||||||
|
print('Losses', losses)
|
||||||
|
|
||||||
# test the trained model
|
# test the trained model
|
||||||
test_model(nlp)
|
test_model(nlp)
|
||||||
|
@@ -135,7 +139,8 @@ if __name__ == '__main__':
|
||||||
# [
|
# [
|
||||||
# ('find', 'ROOT', 'find'),
|
# ('find', 'ROOT', 'find'),
|
||||||
# ('cheapest', 'QUALITY', 'gym'),
|
# ('cheapest', 'QUALITY', 'gym'),
|
||||||
# ('gym', 'PLACE', 'find')
|
# ('gym', 'PLACE', 'find'),
|
||||||
|
# ('near', 'ATTRIBUTE', 'gym'),
|
||||||
# ('work', 'LOCATION', 'near')
|
# ('work', 'LOCATION', 'near')
|
||||||
# ]
|
# ]
|
||||||
# show me the best hotel in berlin
|
# show me the best hotel in berlin
|
||||||
|
|
|
@@ -15,6 +15,7 @@ import plac
|
||||||
import random
|
import random
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import spacy
|
import spacy
|
||||||
|
from spacy.util import minibatch, compounding
|
||||||
|
|
||||||
|
|
||||||
# training data
|
# training data
|
||||||
|
@@ -62,14 +63,17 @@ def main(model=None, output_dir=None, n_iter=100):
|
||||||
for itn in range(n_iter):
|
for itn in range(n_iter):
|
||||||
random.shuffle(TRAIN_DATA)
|
random.shuffle(TRAIN_DATA)
|
||||||
losses = {}
|
losses = {}
|
||||||
for text, annotations in TRAIN_DATA:
|
# batch up the examples using spaCy's minibatch
|
||||||
|
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
|
||||||
|
for batch in batches:
|
||||||
|
texts, annotations = zip(*batch)
|
||||||
nlp.update(
|
nlp.update(
|
||||||
[text], # batch of texts
|
texts, # batch of texts
|
||||||
[annotations], # batch of annotations
|
annotations, # batch of annotations
|
||||||
drop=0.5, # dropout - make it harder to memorise data
|
drop=0.5, # dropout - make it harder to memorise data
|
||||||
sgd=optimizer, # callable to update weights
|
sgd=optimizer, # callable to update weights
|
||||||
losses=losses)
|
losses=losses)
|
||||||
print(losses)
|
print('Losses', losses)
|
||||||
|
|
||||||
# test the trained model
|
# test the trained model
|
||||||
for text, _ in TRAIN_DATA:
|
for text, _ in TRAIN_DATA:
|
||||||
|
|
|
@@ -31,6 +31,7 @@ import plac
|
||||||
import random
|
import random
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import spacy
|
import spacy
|
||||||
|
from spacy.util import minibatch, compounding
|
||||||
|
|
||||||
|
|
||||||
# new entity label
|
# new entity label
|
||||||
|
@@ -73,7 +74,7 @@ TRAIN_DATA = [
|
||||||
new_model_name=("New model name for model meta.", "option", "nm", str),
|
new_model_name=("New model name for model meta.", "option", "nm", str),
|
||||||
output_dir=("Optional output directory", "option", "o", Path),
|
output_dir=("Optional output directory", "option", "o", Path),
|
||||||
n_iter=("Number of training iterations", "option", "n", int))
|
n_iter=("Number of training iterations", "option", "n", int))
|
||||||
def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):
|
def main(model=None, new_model_name='animal', output_dir=None, n_iter=10):
|
||||||
"""Set up the pipeline and entity recognizer, and train the new entity."""
|
"""Set up the pipeline and entity recognizer, and train the new entity."""
|
||||||
if model is not None:
|
if model is not None:
|
||||||
nlp = spacy.load(model) # load existing spaCy model
|
nlp = spacy.load(model) # load existing spaCy model
|
||||||
|
@@ -104,10 +105,13 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):
|
||||||
for itn in range(n_iter):
|
for itn in range(n_iter):
|
||||||
random.shuffle(TRAIN_DATA)
|
random.shuffle(TRAIN_DATA)
|
||||||
losses = {}
|
losses = {}
|
||||||
for text, annotations in TRAIN_DATA:
|
# batch up the examples using spaCy's minibatch
|
||||||
nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
|
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
|
||||||
|
for batch in batches:
|
||||||
|
texts, annotations = zip(*batch)
|
||||||
|
nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
|
||||||
losses=losses)
|
losses=losses)
|
||||||
print(losses)
|
print('Losses', losses)
|
||||||
|
|
||||||
# test the trained model
|
# test the trained model
|
||||||
test_text = 'Do you like horses?'
|
test_text = 'Do you like horses?'
|
||||||
|
|
|
@@ -13,6 +13,7 @@ import plac
|
||||||
import random
|
import random
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import spacy
|
import spacy
|
||||||
|
from spacy.util import minibatch, compounding
|
||||||
|
|
||||||
|
|
||||||
# training data
|
# training data
|
||||||
|
@@ -62,9 +63,12 @@ def main(model=None, output_dir=None, n_iter=10):
|
||||||
for itn in range(n_iter):
|
for itn in range(n_iter):
|
||||||
random.shuffle(TRAIN_DATA)
|
random.shuffle(TRAIN_DATA)
|
||||||
losses = {}
|
losses = {}
|
||||||
for text, annotations in TRAIN_DATA:
|
# batch up the examples using spaCy's minibatch
|
||||||
nlp.update([text], [annotations], sgd=optimizer, losses=losses)
|
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
|
||||||
print(losses)
|
for batch in batches:
|
||||||
|
texts, annotations = zip(*batch)
|
||||||
|
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
|
||||||
|
print('Losses', losses)
|
||||||
|
|
||||||
# test the trained model
|
# test the trained model
|
||||||
test_text = "I like securities."
|
test_text = "I like securities."
|
||||||
|
|
|
@@ -16,6 +16,7 @@ import plac
|
||||||
import random
|
import random
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import spacy
|
import spacy
|
||||||
|
from spacy.util import minibatch, compounding
|
||||||
|
|
||||||
|
|
||||||
# You need to define a mapping from your data's part-of-speech tag names to the
|
# You need to define a mapping from your data's part-of-speech tag names to the
|
||||||
|
@@ -63,9 +64,12 @@ def main(lang='en', output_dir=None, n_iter=25):
|
||||||
for i in range(n_iter):
|
for i in range(n_iter):
|
||||||
random.shuffle(TRAIN_DATA)
|
random.shuffle(TRAIN_DATA)
|
||||||
losses = {}
|
losses = {}
|
||||||
for text, annotations in TRAIN_DATA:
|
# batch up the examples using spaCy's minibatch
|
||||||
nlp.update([text], [annotations], sgd=optimizer, losses=losses)
|
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
|
||||||
print(losses)
|
for batch in batches:
|
||||||
|
texts, annotations = zip(*batch)
|
||||||
|
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
|
||||||
|
print('Losses', losses)
|
||||||
|
|
||||||
# test the trained model
|
# test the trained model
|
||||||
test_text = "I like blue eggs"
|
test_text = "I like blue eggs"
|
||||||
|
|
Loading…
Reference in New Issue
Block a user