From f7ad8e8c83bdf040ea60cfcfaf785f93da406d0a Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 17 Jun 2020 12:05:58 +0200
Subject: [PATCH] various fixes in scripts - needs to be further tested

---
 bin/ud/ud_train.py                           | 40 +++++-------
 examples/training/conllu.py                  | 66 +++++---------------
 examples/training/ner_multitask_objective.py | 32 +++++-----
 spacy/cli/converters/conllu2json.py          | 39 +++---------
 spacy/gold/gold_io.pyx                       |  3 +-
 spacy/language.py                            |  4 +-
 spacy/tests/pipeline/test_sentencizer.py     |  2 +-
 spacy/tests/test_util.py                     |  1 -
 spacy/tokenizer.pyx                          |  2 +-
 9 files changed, 63 insertions(+), 126 deletions(-)

diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py
index aa5050f3a..7bf5dbb5e 100644
--- a/bin/ud/ud_train.py
+++ b/bin/ud/ud_train.py
@@ -14,7 +14,7 @@ import spacy
 import spacy.util
 from bin.ud import conll17_ud_eval
 from spacy.tokens import Token, Doc
-from spacy.gold import GoldParse, Example
+from spacy.gold import Example
 from spacy.util import compounding, minibatch, minibatch_by_words
 from spacy.syntax.nonproj import projectivize
 from spacy.matcher import Matcher
@@ -83,11 +83,11 @@ def read_data(
             sent["heads"].append(head)
             sent["deps"].append("ROOT" if dep == "root" else dep)
             sent["spaces"].append(space_after == "_")
-        sent["entities"] = ["-"] * len(sent["words"])
+        sent["entities"] = ["-"] * len(sent["words"])  # TODO: doc-level format
         sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
         if oracle_segments:
             docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
-            golds.append(GoldParse(docs[-1], **sent))
-            assert golds[-1].morphology is not None
+            golds.append(sent)
+            assert golds[-1]["morphology"] is not None

         sent_annots.append(sent)
@@ -151,28 +151,27 @@ def read_conllu(file_):


 def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
     # Flatten the conll annotations, and adjust the head indices
-    flat = defaultdict(list)
+    gold = defaultdict(list)
     sent_starts = []
     for sent in sent_annots:
-        flat["heads"].extend(len(flat["words"])+head for head in sent["heads"])
+        gold["heads"].extend(len(gold["words"])+head for head in sent["heads"])
         for field in ["words", "tags", "deps", "morphology", "entities", "spaces"]:
-            flat[field].extend(sent[field])
+            gold[field].extend(sent[field])
         sent_starts.append(True)
         sent_starts.extend([False] * (len(sent["words"]) - 1))
     # Construct text if necessary
-    assert len(flat["words"]) == len(flat["spaces"])
+    assert len(gold["words"]) == len(gold["spaces"])
     if text is None:
         text = "".join(
-            word + " " * space for word, space in zip(flat["words"], flat["spaces"])
+            word + " " * space for word, space in zip(gold["words"], gold["spaces"])
         )
     doc = nlp.make_doc(text)
-    flat.pop("spaces")
-    gold = GoldParse(doc, **flat)
-    gold.sent_starts = sent_starts
-    for i in range(len(gold.heads)):
+    gold.pop("spaces")
+    gold["sent_starts"] = sent_starts
+    for i in range(len(gold["heads"])):
         if random.random() < drop_deps:
-            gold.heads[i] = None
-            gold.labels[i] = None
+            gold["heads"][i] = None
+            gold["deps"][i] = None

     return doc, gold
@@ -183,15 +182,10 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):


 def golds_to_gold_data(docs, golds):
-    """Get out the training data format used by begin_training, given the
-    GoldParse objects."""
+    """Get out the training data format used by begin_training."""
     data = []
     for doc, gold in zip(docs, golds):
-        example = Example(doc=doc)
-        example.add_doc_annotation(cats=gold.cats)
-        token_annotation_dict = gold.orig.to_dict()
-        example.add_token_annotation(**token_annotation_dict)
-        example.goldparse = gold
+        example = Example.from_dict(doc, gold)
         data.append(example)
     return data
@@ -359,8 +353,8 @@ def initialize_pipeline(nlp, examples, config, device):
         nlp.parser.add_multitask_objective("tag")
     if config.multitask_sent:
         nlp.parser.add_multitask_objective("sent_start")
-    for ex in examples:
-        gold = ex.gold
+    for eg in examples:
+        gold = eg.gold
         for tag in gold.tags:
             if tag is not None:
                 nlp.tagger.add_label(tag)
@@ -541,7 +535,7 @@ def main(
         else:
             batches = minibatch(examples, size=batch_sizes)
         losses = {}
-        n_train_words = sum(len(ex.doc) for ex in examples)
+        n_train_words = sum(len(eg.doc) for eg in examples)
         with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
             for batch in batches:
                 pbar.update(sum(len(ex.doc) for ex in batch))
diff --git a/examples/training/conllu.py b/examples/training/conllu.py
index bf47be72a..0758775cf 100644
--- a/examples/training/conllu.py
+++ b/examples/training/conllu.py
@@ -12,7 +12,7 @@ import tqdm
 import spacy
 import spacy.util
 from spacy.tokens import Token, Doc
-from spacy.gold import GoldParse, Example
+from spacy.gold import Example
 from spacy.syntax.nonproj import projectivize
 from collections import defaultdict
 from spacy.matcher import Matcher
@@ -33,31 +33,6 @@ random.seed(0)
 numpy.random.seed(0)


-def minibatch_by_words(examples, size=5000):
-    random.shuffle(examples)
-    if isinstance(size, int):
-        size_ = itertools.repeat(size)
-    else:
-        size_ = size
-    examples = iter(examples)
-    while True:
-        batch_size = next(size_)
-        batch = []
-        while batch_size >= 0:
-            try:
-                example = next(examples)
-            except StopIteration:
-                if batch:
-                    yield batch
-                return
-            batch_size -= len(example.doc)
-            batch.append(example)
-        if batch:
-            yield batch
-        else:
-            break
-
-
 ################
 # Data reading #
 ################
@@ -110,7 +85,7 @@ def read_data(
         sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
         if oracle_segments:
             docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
-            golds.append(GoldParse(docs[-1], **sent))
+            golds.append(sent)

         sent_annots.append(sent)
         if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
@@ -159,20 +134,19 @@ def read_conllu(file_):

 def _make_gold(nlp, text, sent_annots):
     # Flatten the conll annotations, and adjust the head indices
-    flat = defaultdict(list)
+    gold = defaultdict(list)
     for sent in sent_annots:
-        flat["heads"].extend(len(flat["words"]) + head for head in sent["heads"])
+        gold["heads"].extend(len(gold["words"]) + head for head in sent["heads"])
         for field in ["words", "tags", "deps", "entities", "spaces"]:
-            flat[field].extend(sent[field])
+            gold[field].extend(sent[field])
     # Construct text if necessary
-    assert len(flat["words"]) == len(flat["spaces"])
+    assert len(gold["words"]) == len(gold["spaces"])
     if text is None:
         text = "".join(
-            word + " " * space for word, space in zip(flat["words"], flat["spaces"])
+            word + " " * space for word, space in zip(gold["words"], gold["spaces"])
         )
     doc = nlp.make_doc(text)
-    flat.pop("spaces")
-    gold = GoldParse(doc, **flat)
+    gold.pop("spaces")
     return doc, gold


@@ -182,15 +156,10 @@ def _make_gold(nlp, text, sent_annots):


 def golds_to_gold_data(docs, golds):
-    """Get out the training data format used by begin_training, given the
-    GoldParse objects."""
+    """Get out the training data format used by begin_training."""
     data = []
     for doc, gold in zip(docs, golds):
-        example = Example(doc=doc)
-        example.add_doc_annotation(cats=gold.cats)
-        token_annotation_dict = gold.orig.to_dict()
-        example.add_token_annotation(**token_annotation_dict)
-        example.goldparse = gold
+        example = Example.from_dict(doc, gold)
         data.append(example)
     return data
@@ -313,15 +282,15 @@ def initialize_pipeline(nlp, examples, config):
         nlp.parser.add_multitask_objective("sent_start")
     nlp.parser.moves.add_action(2, "subtok")
     nlp.add_pipe(nlp.create_pipe("tagger"))
-    for ex in examples:
-        for tag in ex.gold.tags:
+    for eg in examples:
+        for tag in eg.gold.tags:
             if tag is not None:
                 nlp.tagger.add_label(tag)
     # Replace labels that didn't make the frequency cutoff
     actions = set(nlp.parser.labels)
     label_set = set([act.split("-")[1] for act in actions if "-" in act])
-    for ex in examples:
-        gold = ex.gold
+    for eg in examples:
+        gold = eg.gold
         for i, label in enumerate(gold.labels):
             if label is not None and label not in label_set:
                 gold.labels[i] = label.split("||")[0]
@@ -415,13 +384,12 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
     optimizer = initialize_pipeline(nlp, examples, config)

     for i in range(config.nr_epoch):
-        docs = [nlp.make_doc(example.doc.text) for example in examples]
-        batches = minibatch_by_words(examples, size=config.batch_size)
+        batches = spacy.util.minibatch_by_words(examples, size=config.batch_size)
         losses = {}
-        n_train_words = sum(len(doc) for doc in docs)
+        n_train_words = sum(len(eg.reference) for eg in examples)
         with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
             for batch in batches:
-                pbar.update(sum(len(ex.doc) for ex in batch))
+                pbar.update(sum(len(eg.reference) for eg in batch))
                 nlp.update(
                     examples=batch, sgd=optimizer, drop=config.dropout, losses=losses,
                 )
diff --git a/examples/training/ner_multitask_objective.py b/examples/training/ner_multitask_objective.py
index 7561d4877..baa6d7f06 100644
--- a/examples/training/ner_multitask_objective.py
+++ b/examples/training/ner_multitask_objective.py
@@ -24,8 +24,10 @@ import random
 import plac
 import spacy
 import os.path
+
+from spacy.gold.example import Example
 from spacy.tokens import Doc
-from spacy.gold import read_json_file, GoldParse
+from spacy.gold import read_json_file

 random.seed(0)
@@ -59,27 +61,25 @@ def main(n_iter=10):
     print(nlp.pipeline)
     print("Create data", len(TRAIN_DATA))
-    optimizer = nlp.begin_training(get_examples=lambda: TRAIN_DATA)
+    optimizer = nlp.begin_training()
     for itn in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for example in TRAIN_DATA:
-            for token_annotation in example.token_annotations:
-                doc = Doc(nlp.vocab, words=token_annotation.words)
-                gold = GoldParse.from_annotation(doc, example.doc_annotation, token_annotation)
-
-                nlp.update(
-                    examples=[(doc, gold)],  # 1 example
-                    drop=0.2,  # dropout - make it harder to memorise data
-                    sgd=optimizer,  # callable to update weights
-                    losses=losses,
-                )
+        for example_dict in TRAIN_DATA:
+            doc = Doc(nlp.vocab, words=example_dict["words"])
+            example = Example.from_dict(doc, example_dict)
+            nlp.update(
+                examples=[example],  # 1 example
+                drop=0.2,  # dropout - make it harder to memorise data
+                sgd=optimizer,  # callable to update weights
+                losses=losses,
+            )
         print(losses.get("nn_labeller", 0.0), losses["ner"])

     # test the trained model
-    for example in TRAIN_DATA:
-        if example.text is not None:
-            doc = nlp(example.text)
+    for example_dict in TRAIN_DATA:
+        if "text" in example_dict:
+            doc = nlp(example_dict["text"])
             print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
             print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py
index 2cf5f7942..a7d59b9ba 100644
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@@ -1,8 +1,7 @@
 import re

 from ...gold import Example
-from ...gold import iob_to_biluo, spans_from_biluo_tags, biluo_tags_from_offsets
-from ...gold import TokenAnnotation
+from ...gold import iob_to_biluo, spans_from_biluo_tags
 from ...language import Language
 from ...tokens import Doc, Token
 from .conll_ner2json import n_sents_info
@@ -42,10 +41,10 @@ def conllu2json(
     )
     has_ner_tags = has_ner(input_data, MISC_NER_PATTERN)
     for i, example in enumerate(conll_data):
-        raw += example.text
+        raw += example.predicted.text
         sentences.append(
             generate_sentence(
-                example.token_annotation,
+                example,
                 has_ner_tags,
                 MISC_NER_PATTERN,
                 ner_map=ner_map,
@@ -268,36 +267,14 @@ def example_from_conllu_sentence(
         doc = merge_conllu_subtokens(lines, doc)

     # create Example from custom Doc annotation
-    ids, words, tags, heads, deps = [], [], [], [], []
-    pos, lemmas, morphs, spaces = [], [], [], []
+    words, spaces = [], []
     for i, t in enumerate(doc):
-        ids.append(i)
         words.append(t._.merged_orth)
-        if append_morphology and t._.merged_morph:
-            tags.append(t.tag_ + "__" + t._.merged_morph)
-        else:
-            tags.append(t.tag_)
-        pos.append(t.pos_)
-        morphs.append(t._.merged_morph)
-        lemmas.append(t._.merged_lemma)
-        heads.append(t.head.i)
-        deps.append(t.dep_)
         spaces.append(t._.merged_spaceafter)
-    ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
-    ents = biluo_tags_from_offsets(doc, ent_offsets)
-    example = Example(doc=Doc(vocab, words=words, spaces=spaces))
-    example.token_annotation = TokenAnnotation(
-        ids=ids,
-        words=words,
-        tags=tags,
-        pos=pos,
-        morphs=morphs,
-        lemmas=lemmas,
-        heads=heads,
-        deps=deps,
-        entities=ents,
-    )
-    return example
+        if append_morphology and t._.merged_morph:
+            t.tag_ = t.tag_ + "__" + t._.merged_morph
+
+    return Example(predicted=Doc(vocab, words=words, spaces=spaces), reference=doc)


 def merge_conllu_subtokens(lines, doc):
diff --git a/spacy/gold/gold_io.pyx b/spacy/gold/gold_io.pyx
index 967bee060..ea37df9f2 100644
--- a/spacy/gold/gold_io.pyx
+++ b/spacy/gold/gold_io.pyx
@@ -69,6 +69,7 @@ def docs_to_json(docs, id=0, ner_missing_tag="O"):

 def read_json_file(loc, docs_filter=None, limit=None):
+    """Read Example dictionaries from a json file or directory."""
     loc = util.ensure_path(loc)
     if loc.is_dir():
         for filename in loc.iterdir():
@@ -105,7 +106,7 @@ def json_to_annotations(doc):
         sent_start_i = len(words)
         for i, token in enumerate(sent["tokens"]):
             words.append(token["orth"])
-            spaces.append(token["space"])
+            spaces.append(token.get("space", True))
             ids.append(token.get('id', sent_start_i + i))
             tags.append(token.get('tag', "-"))
             pos.append(token.get("pos", ""))
diff --git a/spacy/language.py b/spacy/language.py
index d632bdf02..c168afeea 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -804,7 +804,6 @@ class Language(object):
         cleanup=False,
         component_cfg=None,
         n_process=1,
-        as_example=False,
     ):
         """Process texts as a stream, and yield `Doc` objects in order.
@@ -837,8 +836,7 @@ class Language(object):
             batch_size=batch_size,
             disable=disable,
             n_process=n_process,
-            component_cfg=component_cfg,
-            as_example=as_example,
+            component_cfg=component_cfg,
         )
         for doc, context in zip(docs, contexts):
             yield (doc, context)
diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py
index 5c00b97ce..6dfa0acee 100644
--- a/spacy/tests/pipeline/test_sentencizer.py
+++ b/spacy/tests/pipeline/test_sentencizer.py
@@ -26,7 +26,6 @@ def test_sentencizer_pipe():
     sent_starts = [t.is_sent_start for t in doc]
     assert sent_starts == [True, False, True, False, False, False, False]
     assert len(list(doc.sents)) == 2
-    for ex in nlp.pipe(texts, as_example=True):
-        doc = ex.doc
+    for doc in nlp.pipe(texts):
         assert doc.is_sentenced
         sent_starts = [t.is_sent_start for t in doc]
diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py
index d396dc74d..9d02c6c6a 100644
--- a/spacy/tests/test_util.py
+++ b/spacy/tests/test_util.py
@@ -1,5 +1,4 @@
 import pytest

-from spacy.gold import Example
 from .util import get_random_doc

diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 538bf60e9..764f592cb 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -205,7 +205,7 @@ cdef class Tokenizer:
             doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws
         return doc

-    def pipe(self, texts, batch_size=1000, n_threads=-1, as_example=False):
+    def pipe(self, texts, batch_size=1000, n_threads=-1):
        """Tokenize a stream of texts.

        texts: A sequence of unicode texts.
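
Note: a minimal sketch of the dict-based training loop these scripts are
being moved to, assuming the in-development API shown in this diff
(`Example.from_dict` pairing a tokenized `Doc` with a plain annotation
dict, and `nlp.update` consuming `Example` objects). The blank pipeline
and toy BILUO annotations are illustrative, not taken from the patch:

    import random

    import spacy
    from spacy.gold import Example
    from spacy.tokens import Doc

    nlp = spacy.blank("en")
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    ner.add_label("PERSON")

    # Annotations are plain dicts keyed by field name, replacing GoldParse
    TRAIN_DATA = [
        {
            "words": ["Who", "is", "Shaka", "Khan", "?"],
            "entities": ["O", "O", "B-PERSON", "L-PERSON", "O"],
        },
    ]

    optimizer = nlp.begin_training()
    for itn in range(10):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for annots in TRAIN_DATA:
            # build the predicted doc from the gold tokenization, then let
            # Example.from_dict construct the reference annotations
            doc = Doc(nlp.vocab, words=annots["words"])
            example = Example.from_dict(doc, annots)
            nlp.update(examples=[example], drop=0.2, sgd=optimizer, losses=losses)
        print(itn, losses)

Keeping the annotations as plain dicts until `Example.from_dict` is called
is what lets the deletions above drop the intermediate `GoldParse` and
`TokenAnnotation` objects entirely.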