From fe498b3d5e1e29fb756c203d50e217e8f258e561 Mon Sep 17 00:00:00 2001
From: ines
Date: Mon, 6 Nov 2017 23:14:04 +0100
Subject: [PATCH] Update training examples to use "simple style"

---
 examples/training/train_intent_parser.py   | 81 ++++++++++------------
 examples/training/train_ner.py             | 63 ++++++++---------
 examples/training/train_new_entity_type.py | 55 ++++++---------
 examples/training/train_parser.py          | 33 ++++-----
 examples/training/train_tagger.py          | 21 +++---
 examples/training/train_textcat.py         | 16 ++---
 6 files changed, 116 insertions(+), 153 deletions(-)

diff --git a/examples/training/train_intent_parser.py b/examples/training/train_intent_parser.py
index b51a4a10c..7e678a3d1 100644
--- a/examples/training/train_intent_parser.py
+++ b/examples/training/train_intent_parser.py
@@ -14,55 +14,49 @@ following types of relations: ROOT, PLACE, QUALITY, ATTRIBUTE, TIME, LOCATION.
 ('best', 'QUALITY', 'hotel') --> hotel with QUALITY best
 ('hotel', 'PLACE', 'show') --> show PLACE hotel
 ('berlin', 'LOCATION', 'hotel') --> hotel with LOCATION berlin
+
+Developed for: spaCy 2.0.0a18
+Last updated for: spaCy 2.0.0a19
 """
 from __future__ import unicode_literals, print_function
 
 import plac
 import random
 import spacy
-from spacy.gold import GoldParse
-from spacy.tokens import Doc
 from pathlib import Path
 
 
-# training data: words, head and dependency labels
+# training data: texts, heads and dependency labels
 # for no relation, we simply chose an arbitrary dependency label, e.g. '-'
 TRAIN_DATA = [
-    (
-        ['find', 'a', 'cafe', 'with', 'great', 'wifi'],
-        [0, 2, 0, 5, 5, 2],  # index of token head
-        ['ROOT', '-', 'PLACE', '-', 'QUALITY', 'ATTRIBUTE']
-    ),
-    (
-        ['find', 'a', 'hotel', 'near', 'the', 'beach'],
-        [0, 2, 0, 5, 5, 2],
-        ['ROOT', '-', 'PLACE', 'QUALITY', '-', 'ATTRIBUTE']
-    ),
-    (
-        ['find', 'me', 'the', 'closest', 'gym', 'that', "'s", 'open', 'late'],
-        [0, 0, 4, 4, 0, 6, 4, 6, 6],
-        ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'ATTRIBUTE', 'TIME']
-    ),
-    (
-        ['show', 'me', 'the', 'cheapest', 'store', 'that', 'sells', 'flowers'],
-        [0, 0, 4, 4, 0, 4, 4, 4],  # attach "flowers" to store!
-        ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'PRODUCT']
-    ),
-    (
-        ['find', 'a', 'nice', 'restaurant', 'in', 'london'],
-        [0, 3, 3, 0, 3, 3],
-        ['ROOT', '-', 'QUALITY', 'PLACE', '-', 'LOCATION']
-    ),
-    (
-        ['show', 'me', 'the', 'coolest', 'hostel', 'in', 'berlin'],
-        [0, 0, 4, 4, 0, 4, 4],
-        ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', 'LOCATION']
-    ),
-    (
-        ['find', 'a', 'good', 'italian', 'restaurant', 'near', 'work'],
-        [0, 4, 4, 4, 0, 4, 5],
-        ['ROOT', '-', 'QUALITY', 'ATTRIBUTE', 'PLACE', 'ATTRIBUTE', 'LOCATION']
-    )
+    ("find a cafe with great wifi", {
+        'heads': [0, 2, 0, 5, 5, 2],  # index of token head
+        'deps': ['ROOT', '-', 'PLACE', '-', 'QUALITY', 'ATTRIBUTE']
+    }),
+    ("find a hotel near the beach", {
+        'heads': [0, 2, 0, 5, 5, 2],
+        'deps': ['ROOT', '-', 'PLACE', 'QUALITY', '-', 'ATTRIBUTE']
+    }),
+    ("find me the closest gym that's open late", {
+        'heads': [0, 0, 4, 4, 0, 6, 4, 6, 6],
+        'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'ATTRIBUTE', 'TIME']
+    }),
+    ("show me the cheapest store that sells flowers", {
+        'heads': [0, 0, 4, 4, 0, 4, 4, 4],  # attach "flowers" to store!
+        'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'PRODUCT']
+    }),
+    ("find a nice restaurant in london", {
+        'heads': [0, 3, 3, 0, 3, 3],
+        'deps': ['ROOT', '-', 'QUALITY', 'PLACE', '-', 'LOCATION']
+    }),
+    ("show me the coolest hostel in berlin", {
+        'heads': [0, 0, 4, 4, 0, 4, 4],
+        'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', 'LOCATION']
+    }),
+    ("find a good italian restaurant near work", {
+        'heads': [0, 4, 4, 4, 0, 4, 5],
+        'deps': ['ROOT', '-', 'QUALITY', 'ATTRIBUTE', 'PLACE', 'ATTRIBUTE', 'LOCATION']
+    })
 ]
 
 
@@ -88,8 +82,8 @@ def main(model=None, output_dir=None, n_iter=100):
     else:
         parser = nlp.get_pipe('parser')
 
-    for _, _, deps in TRAIN_DATA:
-        for dep in deps:
+    for text, annotations in TRAIN_DATA:
+        for dep in annotations.get('deps', []):
             parser.add_label(dep)
 
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser']
@@ -98,10 +92,8 @@ def main(model=None, output_dir=None, n_iter=100):
     for itn in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for words, heads, deps in TRAIN_DATA:
-            doc = Doc(nlp.vocab, words=words)
-            gold = GoldParse(doc, heads=heads, deps=deps)
-            nlp.update([doc], [gold], sgd=optimizer, losses=losses)
+        for text, annotations in TRAIN_DATA:
+            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
         print(losses)
 
     # test the trained model
@@ -147,6 +139,7 @@ if __name__ == '__main__':
     # ('find', 'ROOT', 'find'),
     # ('cheapest', 'QUALITY', 'gym'),
     # ('gym', 'PLACE', 'find')
+    # ('work', 'LOCATION', 'near')
     # ]
     # show me the best hotel in berlin
     # [
diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py
index e95cce4c9..79b74535d 100644
--- a/examples/training/train_ner.py
+++ b/examples/training/train_ner.py
@@ -8,22 +8,24 @@ For more details, see the documentation:
 * NER: https://alpha.spacy.io/usage/linguistic-features#named-entities
 
 Developed for: spaCy 2.0.0a18
-Last updated for: spaCy 2.0.0a18
+Last updated for: spaCy 2.0.0a19
 """
 from __future__ import unicode_literals, print_function
 
 import plac
 import random
 from pathlib import Path
-
 import spacy
-from spacy.gold import GoldParse, biluo_tags_from_offsets
 
 
 # training data
 TRAIN_DATA = [
-    ('Who is Shaka Khan?', [(7, 17, 'PERSON')]),
-    ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])
+    ('Who is Shaka Khan?', {
+        'entities': [(7, 17, 'PERSON')]
+    }),
+    ('I like London and Berlin.', {
+        'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
+    })
 ]
 
 
@@ -45,25 +47,28 @@ def main(model=None, output_dir=None, n_iter=100):
     if 'ner' not in nlp.pipe_names:
         ner = nlp.create_pipe('ner')
         nlp.add_pipe(ner, last=True)
+    # otherwise, get it so we can add labels
+    else:
+        ner = nlp.get_pipe('ner')
 
-    # function that allows begin_training to get the training data
-    get_data = lambda: reformat_train_data(nlp.tokenizer, TRAIN_DATA)
+    # add labels
+    for _, annotations in TRAIN_DATA:
+        for ent in annotations.get('entities'):
+            ner.add_label(ent[2])
 
     # get names of other pipes to disable them during training
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
     with nlp.disable_pipes(*other_pipes):  # only train NER
-        optimizer = nlp.begin_training(get_data)
+        optimizer = nlp.begin_training()
         for itn in range(n_iter):
             random.shuffle(TRAIN_DATA)
             losses = {}
-            for raw_text, entity_offsets in TRAIN_DATA:
-                doc = nlp.make_doc(raw_text)
-                gold = GoldParse(doc, entities=entity_offsets)
+            for text, annotations in TRAIN_DATA:
                 nlp.update(
-                    [doc],  # Batch of Doc objects
-                    [gold],  # Batch of GoldParse objects
-                    drop=0.5,  # Dropout -- make it harder to memorise data
-                    sgd=optimizer,  # Callable to update weights
+                    [text],  # batch of texts
+                    [annotations],  # batch of annotations
+                    drop=0.5,  # dropout - make it harder to memorise data
+                    sgd=optimizer,  # callable to update weights
                     losses=losses)
             print(losses)
 
@@ -90,25 +95,13 @@ def main(model=None, output_dir=None, n_iter=100):
             print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
 
 
-def reformat_train_data(tokenizer, examples):
-    """Reformat data to match JSON format.
-    https://alpha.spacy.io/api/annotation#json-input
-
-    tokenizer (Tokenizer): Tokenizer to process the raw text.
-    examples (list): The trainig data.
-    RETURNS (list): The reformatted training data."""
-    output = []
-    for i, (text, entity_offsets) in enumerate(examples):
-        doc = tokenizer(text)
-        ner_tags = biluo_tags_from_offsets(tokenizer(text), entity_offsets)
-        words = [w.text for w in doc]
-        tags = ['-'] * len(doc)
-        heads = [0] * len(doc)
-        deps = [''] * len(doc)
-        sentence = (range(len(doc)), words, tags, heads, deps, ner_tags)
-        output.append((text, [(sentence, [])]))
-    return output
-
-
 if __name__ == '__main__':
     plac.call(main)
+
+    # Expected output:
+    # Entities [('Shaka Khan', 'PERSON')]
+    # Tokens [('Who', '', 2), ('is', '', 2), ('Shaka', 'PERSON', 3),
+    # ('Khan', 'PERSON', 1), ('?', '', 2)]
+    # Entities [('London', 'LOC'), ('Berlin', 'LOC')]
+    # Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3),
+    # ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]
diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py
index b43c5b61f..7ce7dc1d5 100644
--- a/examples/training/train_new_entity_type.py
+++ b/examples/training/train_new_entity_type.py
@@ -24,16 +24,14 @@ For more details, see the documentation:
 * NER: https://alpha.spacy.io/usage/linguistic-features#named-entities
 
 Developed for: spaCy 2.0.0a18
-Last updated for: spaCy 2.0.0a18
+Last updated for: spaCy 2.0.0a19
 """
 from __future__ import unicode_literals, print_function
 
 import plac
 import random
 from pathlib import Path
-
 import spacy
-from spacy.gold import GoldParse, minibatch
 
 
 # new entity label
@@ -45,20 +43,29 @@ LABEL = 'ANIMAL'
 # model might learn the new type, but "forget" what it previously knew.
 # https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
 TRAIN_DATA = [
-    ("Horses are too tall and they pretend to care about your feelings",
-     [(0, 6, 'ANIMAL')]),
+    ("Horses are too tall and they pretend to care about your feelings", {
+        'entities': [(0, 6, 'ANIMAL')]
+    }),
 
-    ("Do they bite?", []),
+    ("Do they bite?", {
+        'entities': []
+    }),
 
-    ("horses are too tall and they pretend to care about your feelings",
-     [(0, 6, 'ANIMAL')]),
+    ("horses are too tall and they pretend to care about your feelings", {
+        'entities': [(0, 6, 'ANIMAL')]
+    }),
 
-    ("horses pretend to care about your feelings", [(0, 6, 'ANIMAL')]),
+    ("horses pretend to care about your feelings", {
+        'entities': [(0, 6, 'ANIMAL')]
+    }),
 
-    ("they pretend to care about your feelings, those horses",
-     [(48, 54, 'ANIMAL')]),
+    ("they pretend to care about your feelings, those horses", {
+        'entities': [(48, 54, 'ANIMAL')]
+    }),
 
-    ("horses?", [(0, 6, 'ANIMAL')])
+    ("horses?", {
+        'entities': [(0, 6, 'ANIMAL')]
+    })
 ]
 
 
@@ -90,15 +97,13 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=50):
     # get names of other pipes to disable them during training
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
     with nlp.disable_pipes(*other_pipes):  # only train NER
-        random.seed(0)
         optimizer = nlp.begin_training()
         for itn in range(n_iter):
+            random.shuffle(TRAIN_DATA)
             losses = {}
-            gold_parses = get_gold_parses(nlp.make_doc, TRAIN_DATA)
-            for batch in minibatch(gold_parses, size=3):
-                docs, golds = zip(*batch)
-                nlp.update(docs, golds, losses=losses, sgd=optimizer,
-                           drop=0.35)
+            for text, annotations in TRAIN_DATA:
+                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
+                           losses=losses)
             print(losses)
 
     # test the trained model
@@ -125,19 +130,5 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=50):
             print(ent.label_, ent.text)
 
 
-def get_gold_parses(tokenizer, train_data):
-    """Shuffle and create GoldParse objects.
-
-    tokenizer (Tokenizer): Tokenizer to processs the raw text.
-    train_data (list): The training data.
-    YIELDS (tuple): (doc, gold) tuples.
-    """
-    random.shuffle(train_data)
-    for raw_text, entity_offsets in train_data:
-        doc = tokenizer(raw_text)
-        gold = GoldParse(doc, entities=entity_offsets)
-        yield doc, gold
-
-
 if __name__ == '__main__':
     plac.call(main)
diff --git a/examples/training/train_parser.py b/examples/training/train_parser.py
index 9e1d10414..c19ff7ac1 100644
--- a/examples/training/train_parser.py
+++ b/examples/training/train_parser.py
@@ -13,24 +13,19 @@ from __future__ import unicode_literals, print_function
 import plac
 import random
 from pathlib import Path
-
 import spacy
-from spacy.gold import GoldParse
-from spacy.tokens import Doc
 
 
 # training data
 TRAIN_DATA = [
-    (
-        ['They', 'trade', 'mortgage', '-', 'backed', 'securities', '.'],
-        [1, 1, 4, 4, 5, 1, 1],
-        ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct']
-    ),
-    (
-        ['I', 'like', 'London', 'and', 'Berlin', '.'],
-        [1, 1, 1, 2, 2, 1],
-        ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct']
-    )
+    ("They trade mortgage-backed securities.", {
+        'heads': [1, 1, 4, 4, 5, 1, 1],
+        'deps': ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct']
+    }),
+    ("I like London and Berlin.", {
+        'heads': [1, 1, 1, 2, 2, 1],
+        'deps': ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct']
+    })
 ]
 
 
@@ -38,7 +33,7 @@
     model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
Defaults to blank 'en' model.", "option", "m", str), output_dir=("Optional output directory", "option", "o", Path), n_iter=("Number of training iterations", "option", "n", int)) -def main(model=None, output_dir=None, n_iter=1000): +def main(model=None, output_dir=None, n_iter=10): """Load the model, set up the pipeline and train the parser.""" if model is not None: nlp = spacy.load(model) # load existing spaCy model @@ -57,8 +52,8 @@ def main(model=None, output_dir=None, n_iter=1000): parser = nlp.get_pipe('parser') # add labels to the parser - for _, _, deps in TRAIN_DATA: - for dep in deps: + for _, annotations in TRAIN_DATA: + for dep in annotations.get('deps', []): parser.add_label(dep) # get names of other pipes to disable them during training @@ -68,10 +63,8 @@ def main(model=None, output_dir=None, n_iter=1000): for itn in range(n_iter): random.shuffle(TRAIN_DATA) losses = {} - for words, heads, deps in TRAIN_DATA: - doc = Doc(nlp.vocab, words=words) - gold = GoldParse(doc, heads=heads, deps=deps) - nlp.update([doc], [gold], sgd=optimizer, losses=losses) + for text, annotations in TRAIN_DATA: + nlp.update([text], [annotations], sgd=optimizer, losses=losses) print(losses) # test the trained model diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py index 161f7910c..f1ec17663 100644 --- a/examples/training/train_tagger.py +++ b/examples/training/train_tagger.py @@ -9,17 +9,14 @@ the documentation: * POS Tagging: https://alpha.spacy.io/usage/linguistic-features#pos-tagging Developed for: spaCy 2.0.0a18 -Last updated for: spaCy 2.0.0a18 +Last updated for: spaCy 2.0.0a19 """ from __future__ import unicode_literals, print_function import plac import random from pathlib import Path - import spacy -from spacy.tokens import Doc -from spacy.gold import GoldParse # You need to define a mapping from your data's part-of-speech tag names to the @@ -29,16 +26,16 @@ from spacy.gold import GoldParse # You may also specify morphological features for your tags, from the universal # scheme. TAG_MAP = { - 'N': {"pos": "NOUN"}, - 'V': {"pos": "VERB"}, - 'J': {"pos": "ADJ"} + 'N': {'pos': 'NOUN'}, + 'V': {'pos': 'VERB'}, + 'J': {'pos': 'ADJ'} } # Usually you'll read this in, of course. Data formats vary. # Ensure your strings are unicode. 
 TRAIN_DATA = [
-    (["I", "like", "green", "eggs"], ["N", "V", "J", "N"]),
-    (["Eat", "blue", "ham"], ["V", "J", "N"])
+    ("I like green eggs", {'tags': ['N', 'V', 'J', 'N']}),
+    ("Eat blue ham", {'tags': ['V', 'J', 'N']})
 ]
 
 
@@ -64,10 +61,8 @@ def main(lang='en', output_dir=None, n_iter=25):
     for i in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for words, tags in TRAIN_DATA:
-            doc = Doc(nlp.vocab, words=words)
-            gold = GoldParse(doc, tags=tags)
-            nlp.update([doc], [gold], sgd=optimizer, losses=losses)
+        for text, annotations in TRAIN_DATA:
+            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
         print(losses)
 
     # test the trained model
diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py
index d1cf3ab8a..07fba47c6 100644
--- a/examples/training/train_textcat.py
+++ b/examples/training/train_textcat.py
@@ -9,7 +9,7 @@ see the documentation:
 * Text classification: https://alpha.spacy.io/usage/text-classification
 
 Developed for: spaCy 2.0.0a18
-Last updated for: spaCy 2.0.0a18
+Last updated for: spaCy 2.0.0a19
 """
 from __future__ import unicode_literals, print_function
 import plac
@@ -18,9 +18,8 @@ from pathlib import Path
 import thinc.extra.datasets
 
 import spacy
-from spacy.gold import GoldParse, minibatch
+from spacy.gold import minibatch
 from spacy.util import compounding
-from spacy.pipeline import TextCategorizer
 
 
 @plac.annotations(
@@ -52,10 +51,8 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
     print("Loading IMDB data...")
     (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
     print("Using %d training examples" % n_texts)
-    train_docs = [nlp.tokenizer(text) for text in train_texts]
-    train_gold = [GoldParse(doc, cats=cats) for doc, cats in
-                  zip(train_docs, train_cats)]
-    train_data = list(zip(train_docs, train_gold))
+    train_data = list(zip(train_texts,
+                          [{'cats': cats} for cats in train_cats]))
 
     # get names of other pipes to disable them during training
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
@@ -68,8 +65,9 @@
         # batch up the examples using spaCy's minibatch
         batches = minibatch(train_data, size=compounding(4., 32., 1.001))
         for batch in batches:
-            docs, golds = zip(*batch)
-            nlp.update(docs, golds, sgd=optimizer, drop=0.2, losses=losses)
+            texts, annotations = zip(*batch)
+            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
+                       losses=losses)
         with textcat.model.use_params(optimizer.averages):
             # evaluate on the dev data split off in load_data()
             scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
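
Note: the pattern shared by all six examples after this change is the "simple
training style" — each training example becomes a (text, annotations-dict)
pair handed straight to nlp.update, which tokenizes the raw text and builds
the gold standard internally, so the Doc/GoldParse plumbing and the
reformatting helpers could be deleted. A minimal, self-contained sketch of
that loop (not part of the patch; assumes spaCy 2.0.0a19 and reuses the NER
data from train_ner.py above):

    import random
    import spacy

    TRAIN_DATA = [
        ('Who is Shaka Khan?', {'entities': [(7, 17, 'PERSON')]}),
    ]

    nlp = spacy.blank('en')        # blank English pipeline
    ner = nlp.create_pipe('ner')   # add an empty NER component
    nlp.add_pipe(ner, last=True)
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])  # register each entity label up front

    optimizer = nlp.begin_training()
    for itn in range(10):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            # raw text plus annotations dict; no Doc or GoldParse needed
            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
        print(losses)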