From fcb4f7a6db10b94a5ae2f2b961009c67382295ef Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 6 Jun 2020 14:15:12 +0200
Subject: [PATCH 01/56] Start breaking down gold.pyx

---
 spacy/_gold/align.py | 81 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 spacy/_gold/align.py

diff --git a/spacy/_gold/align.py b/spacy/_gold/align.py
new file mode 100644
index 000000000..7703232b2
--- /dev/null
+++ b/spacy/_gold/align.py
@@ -0,0 +1,81 @@
+import numpy
+from ..errors import Errors, AlignmentError
+
+
+def align(tokens_a, tokens_b):
+    """Calculate alignment tables between two tokenizations.
+
+    tokens_a (List[str]): The candidate tokenization.
+    tokens_b (List[str]): The reference tokenization.
+    RETURNS (tuple): A 5-tuple consisting of the following information:
+      * cost (int): The number of misaligned tokens.
+      * a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`.
+        For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns
+        to `tokens_b[6]`. If there's no one-to-one alignment for a token,
+        it has the value -1.
+      * b2a (List[int]): The same as `a2b`, but mapping the other direction.
+      * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
+        to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
+        the same token of `tokens_b`.
+      * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
+        direction.
+    """
+    tokens_a = _normalize_for_alignment(tokens_a)
+    tokens_b = _normalize_for_alignment(tokens_b)
+    cost = 0
+    a2b = numpy.empty(len(tokens_a), dtype="i")
+    b2a = numpy.empty(len(tokens_b), dtype="i")
+    a2b.fill(-1)
+    b2a.fill(-1)
+    a2b_multi = {}
+    b2a_multi = {}
+    i = 0
+    j = 0
+    offset_a = 0
+    offset_b = 0
+    while i < len(tokens_a) and j < len(tokens_b):
+        a = tokens_a[i][offset_a:]
+        b = tokens_b[j][offset_b:]
+        if a == b:
+            if offset_a == offset_b == 0:
+                a2b[i] = j
+                b2a[j] = i
+            elif offset_a == 0:
+                cost += 2
+                a2b_multi[i] = j
+            elif offset_b == 0:
+                cost += 2
+                b2a_multi[j] = i
+            offset_a = offset_b = 0
+            i += 1
+            j += 1
+        elif a == "":
+            assert offset_a == 0
+            cost += 1
+            i += 1
+        elif b == "":
+            assert offset_b == 0
+            cost += 1
+            j += 1
+        elif b.startswith(a):
+            cost += 1
+            if offset_a == 0:
+                a2b_multi[i] = j
+            i += 1
+            offset_a = 0
+            offset_b += len(a)
+        elif a.startswith(b):
+            cost += 1
+            if offset_b == 0:
+                b2a_multi[j] = i
+            j += 1
+            offset_b = 0
+            offset_a += len(b)
+        else:
+            assert "".join(tokens_a) != "".join(tokens_b)
+            raise AlignmentError(Errors.E186.format(tok_a=tokens_a, tok_b=tokens_b))
+    return cost, a2b, b2a, a2b_multi, b2a_multi
+
+
+def _normalize_for_alignment(tokens):
+    return [w.replace(" ", "").lower() for w in tokens]

From 6005b94e741eb4125894d9200b185ba5e590b245 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 6 Jun 2020 14:19:06 +0200
Subject: [PATCH 02/56] Add data augmentation

---
 spacy/_gold/augment.py | 126 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)
 create mode 100644 spacy/_gold/augment.py

diff --git a/spacy/_gold/augment.py b/spacy/_gold/augment.py
new file mode 100644
index 000000000..02c812825
--- /dev/null
+++ b/spacy/_gold/augment.py
@@ -0,0 +1,126 @@
+import random
+import itertools
+from .example import Example
+
+
+def make_orth_variants(nlp, example, orth_variant_level=0.0):
+    if random.random() >= orth_variant_level:
+        return example
+    if not example.token_annotation:
+        return example
+    raw = example.text
+    lower = False
+    if random.random() >= 0.5:
+        lower = True
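+    # `single_orth_variants` / `paired_orth_variants` (fetched below) are
+    # per-language lists of dicts; an illustrative sketch based on the
+    # English defaults -- other languages may define different entries:
+    #
+    #     single_orth_variants = [
+    #         {"tags": ["NFP"], "variants": ["…", "..."]},
+    #         {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
+    #     ]
+    #     paired_orth_variants = [
+    #         {"tags": ["``", "''"], "variants": [["'", "'"], ["‘", "’"]]},
+    #     ]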
+ if raw is not None: + raw = raw.lower() + ndsv = nlp.Defaults.single_orth_variants + ndpv = nlp.Defaults.paired_orth_variants + # modify words in paragraph_tuples + variant_example = Example(doc=raw) + token_annotation = example.token_annotation + words = token_annotation.words + tags = token_annotation.tags + if not words or not tags: + # add the unmodified annotation + token_dict = token_annotation.to_dict() + variant_example.set_token_annotation(**token_dict) + else: + if lower: + words = [w.lower() for w in words] + # single variants + punct_choices = [random.choice(x["variants"]) for x in ndsv] + for word_idx in range(len(words)): + for punct_idx in range(len(ndsv)): + if tags[word_idx] in ndsv[punct_idx]["tags"] \ + and words[word_idx] in ndsv[punct_idx]["variants"]: + words[word_idx] = punct_choices[punct_idx] + # paired variants + punct_choices = [random.choice(x["variants"]) for x in ndpv] + for word_idx in range(len(words)): + for punct_idx in range(len(ndpv)): + if tags[word_idx] in ndpv[punct_idx]["tags"] \ + and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]): + # backup option: random left vs. right from pair + pair_idx = random.choice([0, 1]) + # best option: rely on paired POS tags like `` / '' + if len(ndpv[punct_idx]["tags"]) == 2: + pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx]) + # next best option: rely on position in variants + # (may not be unambiguous, so order of variants matters) + else: + for pair in ndpv[punct_idx]["variants"]: + if words[word_idx] in pair: + pair_idx = pair.index(words[word_idx]) + words[word_idx] = punct_choices[punct_idx][pair_idx] + + token_dict = token_annotation.to_dict() + token_dict["words"] = words + token_dict["tags"] = tags + variant_example.set_token_annotation(**token_dict) + # modify raw to match variant_paragraph_tuples + if raw is not None: + variants = [] + for single_variants in ndsv: + variants.extend(single_variants["variants"]) + for paired_variants in ndpv: + variants.extend(list(itertools.chain.from_iterable(paired_variants["variants"]))) + # store variants in reverse length order to be able to prioritize + # longer matches (e.g., "---" before "--") + variants = sorted(variants, key=lambda x: len(x)) + variants.reverse() + variant_raw = "" + raw_idx = 0 + # add initial whitespace + while raw_idx < len(raw) and raw[raw_idx].isspace(): + variant_raw += raw[raw_idx] + raw_idx += 1 + for word in variant_example.token_annotation.words: + match_found = False + # skip whitespace words + if word.isspace(): + match_found = True + # add identical word + elif word not in variants and raw[raw_idx:].startswith(word): + variant_raw += word + raw_idx += len(word) + match_found = True + # add variant word + else: + for variant in variants: + if not match_found and \ + raw[raw_idx:].startswith(variant): + raw_idx += len(variant) + variant_raw += word + match_found = True + # something went wrong, abort + # (add a warning message?) 
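+                # for reference, a successful variant match in this loop --
+                # an illustrative trace, assuming the ":"-tagged token "--"
+                # was swapped for an em dash above: with raw = "1984--2020"
+                # and words = ["1984", "—", "2020"], the scan consumes "--"
+                # from raw but appends the variant word, giving
+                # variant_raw == "1984—2020"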
+ if not match_found: + return example + # add following whitespace + while raw_idx < len(raw) and raw[raw_idx].isspace(): + variant_raw += raw[raw_idx] + raw_idx += 1 + variant_example.doc = variant_raw + return variant_example + return variant_example + + +def add_noise(orig, noise_level): + if random.random() >= noise_level: + return orig + elif type(orig) == list: + corrupted = [_corrupt(word, noise_level) for word in orig] + corrupted = [w for w in corrupted if w] + return corrupted + else: + return "".join(_corrupt(c, noise_level) for c in orig) + + +def _corrupt(c, noise_level): + if random.random() >= noise_level: + return c + elif c in [".", "'", "!", "?", ","]: + return "\n" + else: + return c.lower() From cce6a51a9cacae940f81d78c2408b5cd235209db Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 14:22:27 +0200 Subject: [PATCH 03/56] Add annotation classes --- spacy/_gold/annotation.py | 123 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 spacy/_gold/annotation.py diff --git a/spacy/_gold/annotation.py b/spacy/_gold/annotation.py new file mode 100644 index 000000000..cd8ac0717 --- /dev/null +++ b/spacy/_gold/annotation.py @@ -0,0 +1,123 @@ +class TokenAnnotation: + def __init__( + self, + ids=None, + words=None, + tags=None, + pos=None, + morphs=None, + lemmas=None, + heads=None, + deps=None, + entities=None, + sent_starts=None, + brackets=None, + ): + self.ids = ids if ids else [] + self.words = words if words else [] + self.tags = tags if tags else [] + self.pos = pos if pos else [] + self.morphs = morphs if morphs else [] + self.lemmas = lemmas if lemmas else [] + self.heads = heads if heads else [] + self.deps = deps if deps else [] + self.entities = entities if entities else [] + self.sent_starts = sent_starts if sent_starts else [] + self.brackets_by_start = {} + if brackets: + for b_start, b_end, b_label in brackets: + self.brackets_by_start.setdefault(b_start, []).append((b_end, b_label)) + + @property + def brackets(self): + brackets = [] + for start, ends_labels in self.brackets_by_start.items(): + for end, label in ends_labels: + brackets.append((start, end, label)) + return brackets + + @classmethod + def from_dict(cls, token_dict): + return cls( + ids=token_dict.get("ids", None), + words=token_dict.get("words", None), + tags=token_dict.get("tags", None), + pos=token_dict.get("pos", None), + morphs=token_dict.get("morphs", None), + lemmas=token_dict.get("lemmas", None), + heads=token_dict.get("heads", None), + deps=token_dict.get("deps", None), + entities=token_dict.get("entities", None), + sent_starts=token_dict.get("sent_starts", None), + brackets=token_dict.get("brackets", None), + ) + + def to_dict(self): + return { + "ids": self.ids, + "words": self.words, + "tags": self.tags, + "pos": self.pos, + "morphs": self.morphs, + "lemmas": self.lemmas, + "heads": self.heads, + "deps": self.deps, + "entities": self.entities, + "sent_starts": self.sent_starts, + "brackets": self.brackets, + } + + def get_id(self, i): + return self.ids[i] if i < len(self.ids) else i + + def get_word(self, i): + return self.words[i] if i < len(self.words) else "" + + def get_tag(self, i): + return self.tags[i] if i < len(self.tags) else "-" + + def get_pos(self, i): + return self.pos[i] if i < len(self.pos) else "" + + def get_morph(self, i): + return self.morphs[i] if i < len(self.morphs) else "" + + def get_lemma(self, i): + return self.lemmas[i] if i < len(self.lemmas) else "" + + def get_head(self, i): + return self.heads[i] 
if i < len(self.heads) else i
+
+    def get_dep(self, i):
+        return self.deps[i] if i < len(self.deps) else ""
+
+    def get_entity(self, i):
+        return self.entities[i] if i < len(self.entities) else "-"
+
+    def get_sent_start(self, i):
+        return self.sent_starts[i] if i < len(self.sent_starts) else None
+
+    def __str__(self):
+        return str(self.to_dict())
+
+    def __repr__(self):
+        return self.__str__()
+
+
+class DocAnnotation:
+    def __init__(self, cats=None, links=None):
+        self.cats = cats if cats else {}
+        self.links = links if links else {}
+
+    @classmethod
+    def from_dict(cls, doc_dict):
+        return cls(cats=doc_dict.get("cats", None), links=doc_dict.get("links", None))
+
+    def to_dict(self):
+        return {"cats": self.cats, "links": self.links}
+
+    def __str__(self):
+        return str(self.to_dict())
+
+    def __repr__(self):
+        return self.__str__()

From 1fb8fc6ea9e9f290af12c1ea1b9755757e64c610 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 6 Jun 2020 14:24:35 +0200
Subject: [PATCH 04/56] Add Example class

---
 spacy/_gold/example.py | 201 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 201 insertions(+)
 create mode 100644 spacy/_gold/example.py

diff --git a/spacy/_gold/example.py b/spacy/_gold/example.py
new file mode 100644
index 000000000..db9e10093
--- /dev/null
+++ b/spacy/_gold/example.py
@@ -0,0 +1,201 @@
+from ..tokens import Doc
+from ..errors import Errors, AlignmentError
+from .annotation import TokenAnnotation, DocAnnotation
+from .gold_parse import GoldParse
+
+
+class Example:
+    def __init__(self, doc_annotation=None, token_annotation=None, doc=None,
+                 goldparse=None):
+        """Doc can either be text, or an actual Doc."""
+        self.doc = doc
+        self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation()
+        self.token_annotation = token_annotation if token_annotation else TokenAnnotation()
+        self.goldparse = goldparse
+
+    @classmethod
+    def from_gold(cls, goldparse, doc=None):
+        doc_annotation = DocAnnotation(cats=goldparse.cats, links=goldparse.links)
+        token_annotation = goldparse.get_token_annotation()
+        return cls(doc_annotation, token_annotation, doc)
+
+    @classmethod
+    def from_dict(cls, example_dict, doc=None):
+        token_dict = example_dict.get("token_annotation", {})
+        token_annotation = TokenAnnotation.from_dict(token_dict)
+        doc_dict = example_dict.get("doc_annotation", {})
+        doc_annotation = DocAnnotation.from_dict(doc_dict)
+        return cls(doc_annotation, token_annotation, doc)
+
+    def to_dict(self):
+        """Note that this method does NOT export the doc, only the annotations!
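+
+        RETURNS (dict): The annotations, keyed by "token_annotation" and
+            "doc_annotation".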
""" + token_dict = self.token_annotation.to_dict() + doc_dict = self.doc_annotation.to_dict() + return {"token_annotation": token_dict, "doc_annotation": doc_dict} + + @property + def text(self): + if self.doc is None: + return None + if isinstance(self.doc, Doc): + return self.doc.text + return self.doc + + @property + def gold(self): + if self.goldparse is None: + doc, gold = self.get_gold_parses()[0] + self.goldparse = gold + return self.goldparse + + def set_token_annotation(self, ids=None, words=None, tags=None, pos=None, + morphs=None, lemmas=None, heads=None, deps=None, + entities=None, sent_starts=None, brackets=None): + self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags, + pos=pos, morphs=morphs, lemmas=lemmas, heads=heads, + deps=deps, entities=entities, + sent_starts=sent_starts, brackets=brackets) + + def set_doc_annotation(self, cats=None, links=None): + if cats: + self.doc_annotation.cats = cats + if links: + self.doc_annotation.links = links + + def split_sents(self): + """ Split the token annotations into multiple Examples based on + sent_starts and return a list of the new Examples""" + if not self.token_annotation.words: + return [self] + s_example = Example(doc=None, doc_annotation=self.doc_annotation) + s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], [] + s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], [] + s_brackets = [] + sent_start_i = 0 + t = self.token_annotation + split_examples = [] + for i in range(len(t.words)): + if i > 0 and t.sent_starts[i] == 1: + s_example.set_token_annotation(ids=s_ids, + words=s_words, tags=s_tags, pos=s_pos, morphs=s_morphs, + lemmas=s_lemmas, heads=s_heads, deps=s_deps, + entities=s_ents, sent_starts=s_sent_starts, + brackets=s_brackets) + split_examples.append(s_example) + s_example = Example(doc=None, doc_annotation=self.doc_annotation) + s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], [] + s_deps, s_ents, s_morphs, s_lemmas = [], [], [], [] + s_sent_starts, s_brackets = [], [] + sent_start_i = i + s_ids.append(t.get_id(i)) + s_words.append(t.get_word(i)) + s_tags.append(t.get_tag(i)) + s_pos.append(t.get_pos(i)) + s_morphs.append(t.get_morph(i)) + s_lemmas.append(t.get_lemma(i)) + s_heads.append(t.get_head(i) - sent_start_i) + s_deps.append(t.get_dep(i)) + s_ents.append(t.get_entity(i)) + s_sent_starts.append(t.get_sent_start(i)) + for b_end, b_label in t.brackets_by_start.get(i, []): + s_brackets.append( + (i - sent_start_i, b_end - sent_start_i, b_label) + ) + i += 1 + s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags, + pos=s_pos, morphs=s_morphs, lemmas=s_lemmas, heads=s_heads, + deps=s_deps, entities=s_ents, sent_starts=s_sent_starts, + brackets=s_brackets) + split_examples.append(s_example) + return split_examples + + + def get_gold_parses(self, merge=True, vocab=None, make_projective=False, + ignore_misaligned=False): + """Return a list of (doc, GoldParse) objects. 
+        If merge is set to True, keep all Token annotations as one big list."""
+        d = self.doc_annotation
+        # merge == do not modify Example
+        if merge:
+            t = self.token_annotation
+            doc = self.doc
+            if doc is None or not isinstance(doc, Doc):
+                if not vocab:
+                    raise ValueError(Errors.E998)
+                doc = Doc(vocab, words=t.words)
+            try:
+                gp = GoldParse.from_annotation(doc, d, t,
+                                               make_projective=make_projective)
+            except AlignmentError:
+                if ignore_misaligned:
+                    gp = None
+                else:
+                    raise
+            return [(doc, gp)]
+        # not merging: one GoldParse per sentence, defining docs with the words
+        # from each sentence
+        else:
+            parses = []
+            split_examples = self.split_sents()
+            for split_example in split_examples:
+                if not vocab:
+                    raise ValueError(Errors.E998)
+                split_doc = Doc(vocab, words=split_example.token_annotation.words)
+                try:
+                    gp = GoldParse.from_annotation(split_doc, d,
+                                                   split_example.token_annotation,
+                                                   make_projective=make_projective)
+                except AlignmentError:
+                    if ignore_misaligned:
+                        gp = None
+                    else:
+                        raise
+                if gp is not None:
+                    parses.append((split_doc, gp))
+            return parses
+
+    @classmethod
+    def to_example_objects(cls, examples, make_doc=None, keep_raw_text=False):
+        """
+        Return a list of Example objects, from a variety of input formats.
+        make_doc needs to be provided when the examples contain text strings
+        and keep_raw_text=False.
+        """
+        if isinstance(examples, Example):
+            return [examples]
+        if isinstance(examples, tuple):
+            examples = [examples]
+        converted_examples = []
+        for ex in examples:
+            if isinstance(ex, Example):
+                converted_examples.append(ex)
+            # convert string to Doc to Example
+            elif isinstance(ex, str):
+                if keep_raw_text:
+                    converted_examples.append(Example(doc=ex))
+                else:
+                    doc = make_doc(ex)
+                    converted_examples.append(Example(doc=doc))
+            # convert Doc to Example
+            elif isinstance(ex, Doc):
+                converted_examples.append(Example(doc=ex))
+            # convert tuples to Example
+            elif isinstance(ex, tuple) and len(ex) == 2:
+                doc, gold = ex
+                gold_dict = {}
+                # convert string to Doc
+                if isinstance(doc, str) and not keep_raw_text:
+                    doc = make_doc(doc)
+                # convert dict to GoldParse
+                if isinstance(gold, dict):
+                    gold_dict = gold
+                    if doc is not None or gold.get("words", None) is not None:
+                        gold = GoldParse(doc, **gold)
+                    else:
+                        gold = None
+                if gold is not None:
+                    converted_examples.append(Example.from_gold(goldparse=gold, doc=doc))
+                else:
+                    raise ValueError(Errors.E999.format(gold_dict=gold_dict))
+            else:
+                converted_examples.append(ex)
+        return converted_examples

From a663d44b1b93e34f5247402d551a3045968812bc Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 6 Jun 2020 14:28:37 +0200
Subject: [PATCH 05/56] Add GoldCorpus

---
 spacy/_gold/corpus.py | 277 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 277 insertions(+)
 create mode 100644 spacy/_gold/corpus.py

diff --git a/spacy/_gold/corpus.py b/spacy/_gold/corpus.py
new file mode 100644
index 000000000..2fdfd8d2a
--- /dev/null
+++ b/spacy/_gold/corpus.py
@@ -0,0 +1,277 @@
+import random
+import shutil
+import tempfile
+import srsly
+from pathlib import Path
+import itertools
+from ..tokens import Doc
+from .. import util
+from ..errors import Errors
+from .gold_io import read_json_file, read_json_object
+from .augment import make_orth_variants, add_noise
+from .example import Example
+
+
+class GoldCorpus(object):
+    """An annotated corpus, using the JSON file format. Manages
+    annotations for tagging, dependency parsing and NER.
+ + DOCS: https://spacy.io/api/goldcorpus + """ + + def __init__(self, train, dev, gold_preproc=False, limit=None): + """Create a GoldCorpus. + + train (str / Path): File or directory of training data. + dev (str / Path): File or directory of development data. + RETURNS (GoldCorpus): The newly created object. + """ + self.limit = limit + if isinstance(train, str) or isinstance(train, Path): + train = self.read_examples(self.walk_corpus(train)) + dev = self.read_examples(self.walk_corpus(dev)) + # Write temp directory with one doc per file, so we can shuffle and stream + self.tmp_dir = Path(tempfile.mkdtemp()) + self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit) + self.write_msgpack(self.tmp_dir / "dev", dev, limit=self.limit) + + def __del__(self): + shutil.rmtree(self.tmp_dir) + + @staticmethod + def write_msgpack(directory, examples, limit=0): + if not directory.exists(): + directory.mkdir() + n = 0 + for i, example in enumerate(examples): + ex_dict = example.to_dict() + text = example.text + srsly.write_msgpack(directory / f"{i}.msg", (text, ex_dict)) + n += 1 + if limit and n >= limit: + break + + @staticmethod + def walk_corpus(path): + path = util.ensure_path(path) + if not path.is_dir(): + return [path] + paths = [path] + locs = [] + seen = set() + for path in paths: + if str(path) in seen: + continue + seen.add(str(path)) + if path.parts[-1].startswith("."): + continue + elif path.is_dir(): + paths.extend(path.iterdir()) + elif path.parts[-1].endswith((".json", ".jsonl")): + locs.append(path) + return locs + + @staticmethod + def read_examples(locs, limit=0): + """ Yield training examples """ + i = 0 + for loc in locs: + loc = util.ensure_path(loc) + file_name = loc.parts[-1] + if file_name.endswith("json"): + examples = read_json_file(loc) + elif file_name.endswith("jsonl"): + gold_tuples = srsly.read_jsonl(loc) + first_gold_tuple = next(gold_tuples) + gold_tuples = itertools.chain([first_gold_tuple], gold_tuples) + # TODO: proper format checks with schemas + if isinstance(first_gold_tuple, dict): + if first_gold_tuple.get("paragraphs", None): + examples = read_json_object(gold_tuples) + elif first_gold_tuple.get("doc_annotation", None): + examples = [] + for ex_dict in gold_tuples: + doc = ex_dict.get("doc", None) + if doc is None: + doc = ex_dict.get("text", None) + if not ( + doc is None + or isinstance(doc, Doc) + or isinstance(doc, str) + ): + raise ValueError(Errors.E987.format(type=type(doc))) + examples.append(Example.from_dict(ex_dict, doc=doc)) + + elif file_name.endswith("msg"): + text, ex_dict = srsly.read_msgpack(loc) + examples = [Example.from_dict(ex_dict, doc=text)] + else: + supported = ("json", "jsonl", "msg") + raise ValueError(Errors.E124.format(path=loc, formats=supported)) + try: + for example in examples: + yield example + i += 1 + if limit and i >= limit: + return + except KeyError as e: + msg = "Missing key {}".format(e) + raise KeyError(Errors.E996.format(file=file_name, msg=msg)) + except UnboundLocalError: + msg = "Unexpected document structure" + raise ValueError(Errors.E996.format(file=file_name, msg=msg)) + + @property + def dev_examples(self): + locs = (self.tmp_dir / "dev").iterdir() + yield from self.read_examples(locs, limit=self.limit) + + @property + def train_examples(self): + locs = (self.tmp_dir / "train").iterdir() + yield from self.read_examples(locs, limit=self.limit) + + def count_train(self): + """Returns count of words in train examples""" + n = 0 + i = 0 + for example in self.train_examples: + n += 
len(example.token_annotation.words) + if self.limit and i >= self.limit: + break + i += 1 + return n + + def train_dataset( + self, + nlp, + gold_preproc=False, + max_length=None, + noise_level=0.0, + orth_variant_level=0.0, + ignore_misaligned=False, + ): + locs = list((self.tmp_dir / "train").iterdir()) + random.shuffle(locs) + train_examples = self.read_examples(locs, limit=self.limit) + gold_examples = self.iter_gold_docs( + nlp, + train_examples, + gold_preproc, + max_length=max_length, + noise_level=noise_level, + orth_variant_level=orth_variant_level, + make_projective=True, + ignore_misaligned=ignore_misaligned, + ) + yield from gold_examples + + def train_dataset_without_preprocessing( + self, nlp, gold_preproc=False, ignore_misaligned=False + ): + examples = self.iter_gold_docs( + nlp, + self.train_examples, + gold_preproc=gold_preproc, + ignore_misaligned=ignore_misaligned, + ) + yield from examples + + def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False): + examples = self.iter_gold_docs( + nlp, + self.dev_examples, + gold_preproc=gold_preproc, + ignore_misaligned=ignore_misaligned, + ) + yield from examples + + @classmethod + def iter_gold_docs( + cls, + nlp, + examples, + gold_preproc, + max_length=None, + noise_level=0.0, + orth_variant_level=0.0, + make_projective=False, + ignore_misaligned=False, + ): + """ Setting gold_preproc will result in creating a doc per sentence """ + for example in examples: + if gold_preproc: + split_examples = example.split_sents() + example_golds = [] + for split_example in split_examples: + split_example_docs = cls._make_docs( + nlp, + split_example, + gold_preproc, + noise_level=noise_level, + orth_variant_level=orth_variant_level, + ) + split_example_golds = cls._make_golds( + split_example_docs, + vocab=nlp.vocab, + make_projective=make_projective, + ignore_misaligned=ignore_misaligned, + ) + example_golds.extend(split_example_golds) + else: + example_docs = cls._make_docs( + nlp, + example, + gold_preproc, + noise_level=noise_level, + orth_variant_level=orth_variant_level, + ) + example_golds = cls._make_golds( + example_docs, + vocab=nlp.vocab, + make_projective=make_projective, + ignore_misaligned=ignore_misaligned, + ) + for ex in example_golds: + if ex.goldparse is not None: + if (not max_length) or len(ex.doc) < max_length: + yield ex + + @classmethod + def _make_docs( + cls, nlp, example, gold_preproc, noise_level=0.0, orth_variant_level=0.0 + ): + var_example = make_orth_variants( + nlp, example, orth_variant_level=orth_variant_level + ) + # gold_preproc is not used ?! 
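+        # for reference, `add_noise` (augment.py) corrupts strings character
+        # by character; with noise_level=1.0 every character is hit, e.g.
+        #     add_noise("Hello, world!", 1.0) == "hello\n world\n"
+        # (characters in .,'!? become newlines, letters are lowercased)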
+ if example.text is not None: + var_text = add_noise(var_example.text, noise_level) + var_doc = nlp.make_doc(var_text) + var_example.doc = var_doc + else: + var_doc = Doc( + nlp.vocab, + words=add_noise(var_example.token_annotation.words, noise_level), + ) + var_example.doc = var_doc + return [var_example] + + @classmethod + def _make_golds( + cls, examples, vocab=None, make_projective=False, ignore_misaligned=False + ): + filtered_examples = [] + for example in examples: + gold_parses = example.get_gold_parses( + vocab=vocab, + make_projective=make_projective, + ignore_misaligned=ignore_misaligned, + ) + assert len(gold_parses) == 1 + doc, gold = gold_parses[0] + if doc: + assert doc == example.doc + example.goldparse = gold + filtered_examples.append(example) + return filtered_examples From 53e6473e2466a247a007d2e6c87d22a65bf4bff3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 14:29:06 +0200 Subject: [PATCH 06/56] Add to/from dict helpers --- spacy/util.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index bc6c98a82..e7d4c8697 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -812,16 +812,23 @@ def filter_spans(spans): def to_bytes(getters, exclude): + return srsly.msgpack_dumps(to_dict(getters, exclude)) + + +def from_bytes(bytes_data, setters, exclude): + return from_dict(srsly.msgpack_loads(bytes_data), setters, exclude) + + +def to_dict(getters, exclude): serialized = {} for key, getter in getters.items(): # Split to support file names like meta.json if key.split(".")[0] not in exclude: serialized[key] = getter() - return srsly.msgpack_dumps(serialized) + return serialized -def from_bytes(bytes_data, setters, exclude): - msg = srsly.msgpack_loads(bytes_data) +def from_dict(msg, setters, exclude): for key, setter in setters.items(): # Split to support file names like meta.json if key.split(".")[0] not in exclude and key in msg: From 156466ca69355de61c537541db22bc1632a7daff Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 14:39:14 +0200 Subject: [PATCH 07/56] Add iob_utils --- spacy/_gold/iob_utils.py | 189 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 189 insertions(+) create mode 100644 spacy/_gold/iob_utils.py diff --git a/spacy/_gold/iob_utils.py b/spacy/_gold/iob_utils.py new file mode 100644 index 000000000..2f0f116a1 --- /dev/null +++ b/spacy/_gold/iob_utils.py @@ -0,0 +1,189 @@ +import warnings +from ..errors import Errors, Warnings +from ..tokens import Span + + +def iob_to_biluo(tags): + out = [] + tags = list(tags) + while tags: + out.extend(_consume_os(tags)) + out.extend(_consume_ent(tags)) + return out + + +def biluo_to_iob(tags): + out = [] + for tag in tags: + tag = tag.replace("U-", "B-", 1).replace("L-", "I-", 1) + out.append(tag) + return out + + +def _consume_os(tags): + while tags and tags[0] == "O": + yield tags.pop(0) + + +def _consume_ent(tags): + if not tags: + return [] + tag = tags.pop(0) + target_in = "I" + tag[1:] + target_last = "L" + tag[1:] + length = 1 + while tags and tags[0] in {target_in, target_last}: + length += 1 + tags.pop(0) + label = tag[2:] + if length == 1: + if len(label) == 0: + raise ValueError(Errors.E177.format(tag=tag)) + return ["U-" + label] + else: + start = "B-" + label + end = "L-" + label + middle = [f"I-{label}" for _ in range(1, length - 1)] + return [start] + middle + [end] + + +def biluo_tags_from_offsets(doc, entities, missing="O"): + """Encode labelled spans into per-token tags, using the + 
Begin/In/Last/Unit/Out scheme (BILUO). + + doc (Doc): The document that the entity offsets refer to. The output tags + will refer to the token boundaries within the document. + entities (iterable): A sequence of `(start, end, label)` triples. `start` + and `end` should be character-offset integers denoting the slice into + the original string. + RETURNS (list): A list of unicode strings, describing the tags. Each tag + string will be of the form either "", "O" or "{action}-{label}", where + action is one of "B", "I", "L", "U". The string "-" is used where the + entity offsets don't align with the tokenization in the `Doc` object. + The training algorithm will view these as missing values. "O" denotes a + non-entity token. "B" denotes the beginning of a multi-token entity, + "I" the inside of an entity of three or more tokens, and "L" the end + of an entity of two or more tokens. "U" denotes a single-token entity. + + EXAMPLE: + >>> text = 'I like London.' + >>> entities = [(len('I like '), len('I like London'), 'LOC')] + >>> doc = nlp.tokenizer(text) + >>> tags = biluo_tags_from_offsets(doc, entities) + >>> assert tags == ["O", "O", 'U-LOC', "O"] + """ + # Ensure no overlapping entity labels exist + tokens_in_ents = {} + + starts = {token.idx: token.i for token in doc} + ends = {token.idx + len(token): token.i for token in doc} + biluo = ["-" for _ in doc] + # Handle entity cases + for start_char, end_char, label in entities: + for token_index in range(start_char, end_char): + if token_index in tokens_in_ents.keys(): + raise ValueError( + Errors.E103.format( + span1=( + tokens_in_ents[token_index][0], + tokens_in_ents[token_index][1], + tokens_in_ents[token_index][2], + ), + span2=(start_char, end_char, label), + ) + ) + tokens_in_ents[token_index] = (start_char, end_char, label) + + start_token = starts.get(start_char) + end_token = ends.get(end_char) + # Only interested if the tokenization is correct + if start_token is not None and end_token is not None: + if start_token == end_token: + biluo[start_token] = f"U-{label}" + else: + biluo[start_token] = f"B-{label}" + for i in range(start_token + 1, end_token): + biluo[i] = f"I-{label}" + biluo[end_token] = f"L-{label}" + # Now distinguish the O cases from ones where we miss the tokenization + entity_chars = set() + for start_char, end_char, label in entities: + for i in range(start_char, end_char): + entity_chars.add(i) + for token in doc: + for i in range(token.idx, token.idx + len(token)): + if i in entity_chars: + break + else: + biluo[token.i] = missing + if "-" in biluo: + ent_str = str(entities) + warnings.warn( + Warnings.W030.format( + text=doc.text[:50] + "..." if len(doc.text) > 50 else doc.text, + entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str, + ) + ) + return biluo + + +def spans_from_biluo_tags(doc, tags): + """Encode per-token tags following the BILUO scheme into Span object, e.g. + to overwrite the doc.ents. + + doc (Doc): The document that the BILUO tags refer to. + entities (iterable): A sequence of BILUO tags with each tag describing one + token. Each tags string will be of the form of either "", "O" or + "{action}-{label}", where action is one of "B", "I", "L", "U". + RETURNS (list): A sequence of Span objects. 
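+
+    EXAMPLE (assumes an `nlp` object, as in `biluo_tags_from_offsets` above):
+        >>> doc = nlp.tokenizer("I like London.")
+        >>> tags = ["O", "O", "U-LOC", "O"]
+        >>> spans = spans_from_biluo_tags(doc, tags)
+        >>> [(span.text, span.label_) for span in spans]
+        [('London', 'LOC')]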
+ """ + token_offsets = tags_to_entities(tags) + spans = [] + for label, start_idx, end_idx in token_offsets: + span = Span(doc, start_idx, end_idx + 1, label=label) + spans.append(span) + return spans + + +def offsets_from_biluo_tags(doc, tags): + """Encode per-token tags following the BILUO scheme into entity offsets. + + doc (Doc): The document that the BILUO tags refer to. + entities (iterable): A sequence of BILUO tags with each tag describing one + token. Each tags string will be of the form of either "", "O" or + "{action}-{label}", where action is one of "B", "I", "L", "U". + RETURNS (list): A sequence of `(start, end, label)` triples. `start` and + `end` will be character-offset integers denoting the slice into the + original string. + """ + spans = spans_from_biluo_tags(doc, tags) + return [(span.start_char, span.end_char, span.label_) for span in spans] + + +def tags_to_entities(tags): + entities = [] + start = None + for i, tag in enumerate(tags): + if tag is None: + continue + if tag.startswith("O"): + # TODO: We shouldn't be getting these malformed inputs. Fix this. + if start is not None: + start = None + continue + elif tag == "-": + continue + elif tag.startswith("I"): + if start is None: + raise ValueError(Errors.E067.format(tags=tags[: i + 1])) + continue + if tag.startswith("U"): + entities.append((tag[2:], i, i)) + elif tag.startswith("B"): + start = i + elif tag.startswith("L"): + entities.append((tag[2:], start, i)) + start = None + else: + raise ValueError(Errors.E068.format(tag=tag)) + return entities From 32c8fb1372a8f143d471352192440d5ca2d33740 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 14:41:49 +0200 Subject: [PATCH 08/56] Add gold_io.pyx --- spacy/_gold/gold_io.pyx | 202 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 202 insertions(+) create mode 100644 spacy/_gold/gold_io.pyx diff --git a/spacy/_gold/gold_io.pyx b/spacy/_gold/gold_io.pyx new file mode 100644 index 000000000..15581c151 --- /dev/null +++ b/spacy/_gold/gold_io.pyx @@ -0,0 +1,202 @@ +import warnings +import srsly +from .. import util +from ..errors import Warnings +from ..tokens import Token, Doc +from .example import Example +from .iob_utils import biluo_tags_from_offsets + + +def merge_sents(sents): + m_deps = [[], [], [], [], [], []] + m_cats = {} + m_brackets = [] + i = 0 + for (ids, words, tags, heads, labels, ner), (cats, brackets) in sents: + m_deps[0].extend(id_ + i for id_ in ids) + m_deps[1].extend(words) + m_deps[2].extend(tags) + m_deps[3].extend(head + i for head in heads) + m_deps[4].extend(labels) + m_deps[5].extend(ner) + m_brackets.extend((b["first"] + i, b["last"] + i, b["label"]) + for b in brackets) + m_cats.update(cats) + i += len(ids) + return [(m_deps, (m_cats, m_brackets))] + + +def docs_to_json(docs, id=0, ner_missing_tag="O"): + """Convert a list of Doc objects into the JSON-serializable format used by + the spacy train command. + + docs (iterable / Doc): The Doc object(s) to convert. + id (int): Id for the JSON. 
+ RETURNS (dict): The data in spaCy's JSON format + - each input doc will be treated as a paragraph in the output doc + """ + if isinstance(docs, Doc): + docs = [docs] + json_doc = {"id": id, "paragraphs": []} + for i, doc in enumerate(docs): + json_para = {'raw': doc.text, "sentences": [], "cats": []} + for cat, val in doc.cats.items(): + json_cat = {"label": cat, "value": val} + json_para["cats"].append(json_cat) + ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents] + biluo_tags = biluo_tags_from_offsets(doc, ent_offsets, missing=ner_missing_tag) + for j, sent in enumerate(doc.sents): + json_sent = {"tokens": [], "brackets": []} + for token in sent: + json_token = {"id": token.i, "orth": token.text} + if doc.is_tagged: + json_token["tag"] = token.tag_ + json_token["pos"] = token.pos_ + json_token["morph"] = token.morph_ + json_token["lemma"] = token.lemma_ + if doc.is_parsed: + json_token["head"] = token.head.i-token.i + json_token["dep"] = token.dep_ + json_token["ner"] = biluo_tags[token.i] + json_sent["tokens"].append(json_token) + json_para["sentences"].append(json_sent) + json_doc["paragraphs"].append(json_para) + return json_doc + + +def json_to_examples(doc): + """Convert an item in the JSON-formatted training data to the format + used by GoldParse. + + doc (dict): One entry in the training data. + YIELDS (Example): The reformatted data - one training example per paragraph + """ + for paragraph in doc["paragraphs"]: + example = Example(doc=paragraph.get("raw", None)) + words = [] + ids = [] + tags = [] + pos = [] + morphs = [] + lemmas = [] + heads = [] + labels = [] + ner = [] + sent_starts = [] + brackets = [] + for sent in paragraph["sentences"]: + sent_start_i = len(words) + for i, token in enumerate(sent["tokens"]): + words.append(token["orth"]) + ids.append(token.get('id', sent_start_i + i)) + tags.append(token.get('tag', "-")) + pos.append(token.get("pos", "")) + morphs.append(token.get("morph", "")) + lemmas.append(token.get("lemma", "")) + heads.append(token.get("head", 0) + sent_start_i + i) + labels.append(token.get("dep", "")) + # Ensure ROOT label is case-insensitive + if labels[-1].lower() == "root": + labels[-1] = "ROOT" + ner.append(token.get("ner", "-")) + if i == 0: + sent_starts.append(1) + else: + sent_starts.append(0) + if "brackets" in sent: + brackets.extend((b["first"] + sent_start_i, + b["last"] + sent_start_i, b["label"]) + for b in sent["brackets"]) + cats = {} + for cat in paragraph.get("cats", {}): + cats[cat["label"]] = cat["value"] + example.set_token_annotation(ids=ids, words=words, tags=tags, + pos=pos, morphs=morphs, lemmas=lemmas, heads=heads, + deps=labels, entities=ner, sent_starts=sent_starts, + brackets=brackets) + example.set_doc_annotation(cats=cats) + yield example + + +def read_json_file(loc, docs_filter=None, limit=None): + loc = util.ensure_path(loc) + if loc.is_dir(): + for filename in loc.iterdir(): + yield from read_json_file(loc / filename, limit=limit) + else: + for doc in json_iterate(loc): + if docs_filter is not None and not docs_filter(doc): + continue + for json_data in json_to_examples(doc): + yield json_data + + +def read_json_object(json_corpus_section): + """Take a list of JSON-formatted documents (e.g. from an already loaded + training data file) and yield annotations in the GoldParse format. + + json_corpus_section (list): The data. 
+ YIELDS (Example): The reformatted data - one training example per paragraph + """ + for json_doc in json_corpus_section: + examples = json_to_examples(json_doc) + for ex in examples: + yield ex + + +def json_iterate(loc): + # We should've made these files jsonl...But since we didn't, parse out + # the docs one-by-one to reduce memory usage. + # It's okay to read in the whole file -- just don't parse it into JSON. + cdef bytes py_raw + loc = util.ensure_path(loc) + with loc.open("rb") as file_: + py_raw = file_.read() + cdef long file_length = len(py_raw) + if file_length > 2 ** 30: + warnings.warn(Warnings.W027.format(size=file_length)) + + raw = py_raw + cdef int square_depth = 0 + cdef int curly_depth = 0 + cdef int inside_string = 0 + cdef int escape = 0 + cdef long start = -1 + cdef char c + cdef char quote = ord('"') + cdef char backslash = ord("\\") + cdef char open_square = ord("[") + cdef char close_square = ord("]") + cdef char open_curly = ord("{") + cdef char close_curly = ord("}") + for i in range(file_length): + c = raw[i] + if escape: + escape = False + continue + if c == backslash: + escape = True + continue + if c == quote: + inside_string = not inside_string + continue + if inside_string: + continue + if c == open_square: + square_depth += 1 + elif c == close_square: + square_depth -= 1 + elif c == open_curly: + if square_depth == 1 and curly_depth == 0: + start = i + curly_depth += 1 + elif c == close_curly: + curly_depth -= 1 + if square_depth == 1 and curly_depth == 0: + py_str = py_raw[start : i + 1].decode("utf8") + try: + yield srsly.json_loads(py_str) + except Exception: + print(py_str) + raise + start = -1 From 7b873ce2b15494c4da9ea5919fabe1e360681d5c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 15:09:25 +0200 Subject: [PATCH 09/56] Move GoldParse under spacy.syntax --- spacy/syntax/arc_eager.pxd | 2 +- spacy/syntax/arc_eager.pyx | 2 +- spacy/syntax/gold_parse.pxd | 39 ++++ spacy/syntax/gold_parse.pyx | 311 +++++++++++++++++++++++++++++ spacy/syntax/ner.pxd | 2 +- spacy/syntax/ner.pyx | 2 +- spacy/syntax/nn_parser.pyx | 2 +- spacy/syntax/transition_system.pxd | 4 +- 8 files changed, 357 insertions(+), 7 deletions(-) create mode 100644 spacy/syntax/gold_parse.pxd create mode 100644 spacy/syntax/gold_parse.pyx diff --git a/spacy/syntax/arc_eager.pxd b/spacy/syntax/arc_eager.pxd index 14d706548..96dd37a36 100644 --- a/spacy/syntax/arc_eager.pxd +++ b/spacy/syntax/arc_eager.pxd @@ -3,7 +3,7 @@ from cymem.cymem cimport Pool from .stateclass cimport StateClass from ..typedefs cimport weight_t, attr_t from .transition_system cimport TransitionSystem, Transition -from ..gold cimport GoldParseC +from .gold_parse cimport GoldParseC cdef class ArcEager(TransitionSystem): diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 19be95f3f..df8c7d563 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -8,7 +8,7 @@ import json from ..typedefs cimport hash_t, attr_t from ..strings cimport hash_string -from ..gold cimport GoldParse, GoldParseC +from .gold_parse cimport GoldParse, GoldParseC from ..structs cimport TokenC from ..tokens.doc cimport Doc, set_children_from_heads from .stateclass cimport StateClass diff --git a/spacy/syntax/gold_parse.pxd b/spacy/syntax/gold_parse.pxd new file mode 100644 index 000000000..9815513d0 --- /dev/null +++ b/spacy/syntax/gold_parse.pxd @@ -0,0 +1,39 @@ +from cymem.cymem cimport Pool +from .transition_system cimport Transition +from ..typedefs cimport attr_t + + +cdef struct 
GoldParseC:
+    int* tags
+    int* heads
+    int* has_dep
+    int* sent_start
+    attr_t* labels
+    int** brackets
+    Transition* ner
+
+
+cdef class GoldParse:
+    cdef Pool mem
+
+    cdef GoldParseC c
+    cdef readonly object orig
+
+    cdef int length
+    cdef public int loss
+    cdef public list words
+    cdef public list tags
+    cdef public list pos
+    cdef public list morphs
+    cdef public list lemmas
+    cdef public list sent_starts
+    cdef public list heads
+    cdef public list labels
+    cdef public dict orths
+    cdef public list ner
+    cdef public dict brackets
+    cdef public dict cats
+    cdef public dict links
+
+    cdef readonly list cand_to_gold
+    cdef readonly list gold_to_cand
diff --git a/spacy/syntax/gold_parse.pyx b/spacy/syntax/gold_parse.pyx
new file mode 100644
index 000000000..59e8f4bbb
--- /dev/null
+++ b/spacy/syntax/gold_parse.pyx
@@ -0,0 +1,311 @@
+# cython: profile=True
+import re
+import random
+import numpy
+import tempfile
+import shutil
+import itertools
+from pathlib import Path
+import srsly
+import warnings
+
+from .. import util
+from ..syntax import nonproj
+from ..tokens import Doc, Span
+from ..errors import Errors, AlignmentError, Warnings
+from .._gold.annotation import TokenAnnotation
+from .._gold.iob_utils import offsets_from_biluo_tags, biluo_tags_from_offsets
+from .._gold.align import align
+
+
+punct_re = re.compile(r"\W")
+
+
+def is_punct_label(label):
+    return label == "P" or label.lower() == "punct"
+
+
+cdef class GoldParse:
+    """Collection for training annotations.
+
+    DOCS: https://spacy.io/api/goldparse
+    """
+    @classmethod
+    def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False):
+        return cls(doc, words=token_annotation.words,
+                   tags=token_annotation.tags,
+                   pos=token_annotation.pos,
+                   morphs=token_annotation.morphs,
+                   lemmas=token_annotation.lemmas,
+                   heads=token_annotation.heads,
+                   deps=token_annotation.deps,
+                   entities=token_annotation.entities,
+                   sent_starts=token_annotation.sent_starts,
+                   cats=doc_annotation.cats,
+                   links=doc_annotation.links,
+                   make_projective=make_projective)
+
+    def get_token_annotation(self):
+        ids = None
+        if self.words:
+            ids = list(range(len(self.words)))
+
+        return TokenAnnotation(ids=ids, words=self.words, tags=self.tags,
+                               pos=self.pos, morphs=self.morphs,
+                               lemmas=self.lemmas, heads=self.heads,
+                               deps=self.labels, entities=self.ner,
+                               sent_starts=self.sent_starts)
+
+    def __init__(self, doc, words=None, tags=None, pos=None, morphs=None,
+                 lemmas=None, heads=None, deps=None, entities=None,
+                 sent_starts=None, make_projective=False, cats=None,
+                 links=None):
+        """Create a GoldParse. The fields will not be initialized if len(doc) is zero.
+
+        doc (Doc): The document the annotations refer to.
+        words (iterable): A sequence of unicode word strings.
+        tags (iterable): A sequence of strings, representing tag annotations.
+        pos (iterable): A sequence of strings, representing UPOS annotations.
+        morphs (iterable): A sequence of strings, representing morph
+            annotations.
+        lemmas (iterable): A sequence of strings, representing lemma
+            annotations.
+        heads (iterable): A sequence of integers, representing syntactic
+            head offsets.
+        deps (iterable): A sequence of strings, representing the syntactic
+            relation types.
+        entities (iterable): A sequence of named entity annotations, either as
+            BILUO tag strings, or as `(start_char, end_char, label)` tuples,
+            representing the entity positions.
+        sent_starts (iterable): A sequence of sentence position tags, 1 for
+            the first word in a sentence, 0 for all others.
+        cats (dict): Labels for text classification.
Each key in the dictionary + may be a string or an int, or a `(start_char, end_char, label)` + tuple, indicating that the label is applied to only part of the + document (usually a sentence). Unlike entity annotations, label + annotations can overlap, i.e. a single word can be covered by + multiple labelled spans. The TextCategorizer component expects + true examples of a label to have the value 1.0, and negative + examples of a label to have the value 0.0. Labels not in the + dictionary are treated as missing - the gradient for those labels + will be zero. + links (dict): A dict with `(start_char, end_char)` keys, + and the values being dicts with kb_id:value entries, + representing the external IDs in a knowledge base (KB) + mapped to either 1.0 or 0.0, indicating positive and + negative examples respectively. + RETURNS (GoldParse): The newly constructed object. + """ + self.mem = Pool() + self.loss = 0 + self.length = len(doc) + + self.cats = {} if cats is None else dict(cats) + self.links = {} if links is None else dict(links) + + # temporary doc for aligning entity annotation + entdoc = None + + # avoid allocating memory if the doc does not contain any tokens + if self.length == 0: + self.words = [] + self.tags = [] + self.heads = [] + self.labels = [] + self.ner = [] + self.morphs = [] + # set a minimal orig so that the scorer can score an empty doc + self.orig = TokenAnnotation(ids=[]) + else: + if not words: + words = [token.text for token in doc] + if not tags: + tags = [None for _ in words] + if not pos: + pos = [None for _ in words] + if not morphs: + morphs = [None for _ in words] + if not lemmas: + lemmas = [None for _ in words] + if not heads: + heads = [None for _ in words] + if not deps: + deps = [None for _ in words] + if not sent_starts: + sent_starts = [None for _ in words] + if entities is None: + entities = ["-" for _ in words] + elif len(entities) == 0: + entities = ["O" for _ in words] + else: + # Translate the None values to '-', to make processing easier. + # See Issue #2603 + entities = [(ent if ent is not None else "-") for ent in entities] + if not isinstance(entities[0], str): + # Assume we have entities specified by character offset. + # Create a temporary Doc corresponding to provided words + # (to preserve gold tokenization) and text (to preserve + # character offsets). + entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text) + entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces) + entdoc_entities = biluo_tags_from_offsets(entdoc, entities) + # There may be some additional whitespace tokens in the + # temporary doc, so check that the annotations align with + # the provided words while building a list of BILUO labels. 
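+                # e.g. with words ["I", "like", "London", "."] and
+                # doc.text = "I like  London." (double space), entdoc gains
+                # an extra whitespace token whose BILUO tag is skipped below
+                # so that `entities` stays aligned with the gold words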
+ entities = [] + words_offset = 0 + for i in range(len(entdoc_words)): + if words[i + words_offset] == entdoc_words[i]: + entities.append(entdoc_entities[i]) + else: + words_offset -= 1 + if len(entities) != len(words): + warnings.warn(Warnings.W029.format(text=doc.text)) + entities = ["-" for _ in words] + + # These are filled by the tagger/parser/entity recogniser + self.c.tags = self.mem.alloc(len(doc), sizeof(int)) + self.c.heads = self.mem.alloc(len(doc), sizeof(int)) + self.c.labels = self.mem.alloc(len(doc), sizeof(attr_t)) + self.c.has_dep = self.mem.alloc(len(doc), sizeof(int)) + self.c.sent_start = self.mem.alloc(len(doc), sizeof(int)) + self.c.ner = self.mem.alloc(len(doc), sizeof(Transition)) + + self.words = [None] * len(doc) + self.tags = [None] * len(doc) + self.pos = [None] * len(doc) + self.morphs = [None] * len(doc) + self.lemmas = [None] * len(doc) + self.heads = [None] * len(doc) + self.labels = [None] * len(doc) + self.ner = [None] * len(doc) + self.sent_starts = [None] * len(doc) + + # This needs to be done before we align the words + if make_projective and any(heads) and any(deps) : + heads, deps = nonproj.projectivize(heads, deps) + + # Do many-to-one alignment for misaligned tokens. + # If we over-segment, we'll have one gold word that covers a sequence + # of predicted words + # If we under-segment, we'll have one predicted word that covers a + # sequence of gold words. + # If we "mis-segment", we'll have a sequence of predicted words covering + # a sequence of gold words. That's many-to-many -- we don't do that + # except for NER spans where the start and end can be aligned. + cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words) + + self.cand_to_gold = [(j if j >= 0 else None) for j in i2j] + self.gold_to_cand = [(i if i >= 0 else None) for i in j2i] + + self.orig = TokenAnnotation(ids=list(range(len(words))), + words=words, tags=tags, pos=pos, morphs=morphs, + lemmas=lemmas, heads=heads, deps=deps, entities=entities, + sent_starts=sent_starts, brackets=[]) + + for i, gold_i in enumerate(self.cand_to_gold): + if doc[i].text.isspace(): + self.words[i] = doc[i].text + self.tags[i] = "_SP" + self.pos[i] = "SPACE" + self.morphs[i] = None + self.lemmas[i] = None + self.heads[i] = None + self.labels[i] = None + self.ner[i] = None + self.sent_starts[i] = 0 + if gold_i is None: + if i in i2j_multi: + self.words[i] = words[i2j_multi[i]] + self.tags[i] = tags[i2j_multi[i]] + self.pos[i] = pos[i2j_multi[i]] + self.morphs[i] = morphs[i2j_multi[i]] + self.lemmas[i] = lemmas[i2j_multi[i]] + self.sent_starts[i] = sent_starts[i2j_multi[i]] + is_last = i2j_multi[i] != i2j_multi.get(i+1) + # Set next word in multi-token span as head, until last + if not is_last: + self.heads[i] = i+1 + self.labels[i] = "subtok" + else: + head_i = heads[i2j_multi[i]] + if head_i: + self.heads[i] = self.gold_to_cand[head_i] + self.labels[i] = deps[i2j_multi[i]] + ner_tag = entities[i2j_multi[i]] + # Assign O/- for many-to-one O/- NER tags + if ner_tag in ("O", "-"): + self.ner[i] = ner_tag + else: + self.words[i] = words[gold_i] + self.tags[i] = tags[gold_i] + self.pos[i] = pos[gold_i] + self.morphs[i] = morphs[gold_i] + self.lemmas[i] = lemmas[gold_i] + self.sent_starts[i] = sent_starts[gold_i] + if heads[gold_i] is None: + self.heads[i] = None + else: + self.heads[i] = self.gold_to_cand[heads[gold_i]] + self.labels[i] = deps[gold_i] + self.ner[i] = entities[gold_i] + # Assign O/- for one-to-many O/- NER tags + for j, cand_j in enumerate(self.gold_to_cand): + if cand_j is 
None: + if j in j2i_multi: + i = j2i_multi[j] + ner_tag = entities[j] + if ner_tag in ("O", "-"): + self.ner[i] = ner_tag + + # If there is entity annotation and some tokens remain unaligned, + # align all entities at the character level to account for all + # possible token misalignments within the entity spans + if any([e not in ("O", "-") for e in entities]) and None in self.ner: + # If the temporary entdoc wasn't created above, initialize it + if not entdoc: + entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text) + entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces) + # Get offsets based on gold words and BILUO entities + entdoc_offsets = offsets_from_biluo_tags(entdoc, entities) + aligned_offsets = [] + aligned_spans = [] + # Filter offsets to identify those that align with doc tokens + for offset in entdoc_offsets: + span = doc.char_span(offset[0], offset[1]) + if span and not span.text.isspace(): + aligned_offsets.append(offset) + aligned_spans.append(span) + # Convert back to BILUO for doc tokens and assign NER for all + # aligned spans + biluo_tags = biluo_tags_from_offsets(doc, aligned_offsets, missing=None) + for span in aligned_spans: + for i in range(span.start, span.end): + self.ner[i] = biluo_tags[i] + + # Prevent whitespace that isn't within entities from being tagged as + # an entity. + for i in range(len(self.ner)): + if self.tags[i] == "_SP": + prev_ner = self.ner[i-1] if i >= 1 else None + next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None + if prev_ner == "O" or next_ner == "O": + self.ner[i] = "O" + + cycle = nonproj.contains_cycle(self.heads) + if cycle is not None: + raise ValueError(Errors.E069.format(cycle=cycle, + cycle_tokens=" ".join([f"'{self.words[tok_id]}'" for tok_id in cycle]), + doc_tokens=" ".join(words[:50]))) + + def __len__(self): + """Get the number of gold-standard tokens. + + RETURNS (int): The number of gold-standard tokens. + """ + return self.length + + @property + def is_projective(self): + """Whether the provided syntactic annotations form a projective + dependency tree. 
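+
+        RETURNS (bool): True if the head annotations form a projective
+            tree (no crossing arcs).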
+ """ + return not nonproj.is_nonproj_tree(self.heads) diff --git a/spacy/syntax/ner.pxd b/spacy/syntax/ner.pxd index 647f98fc0..739b8dc1f 100644 --- a/spacy/syntax/ner.pxd +++ b/spacy/syntax/ner.pxd @@ -1,6 +1,6 @@ from .transition_system cimport TransitionSystem from .transition_system cimport Transition -from ..gold cimport GoldParseC +from .gold_parse cimport GoldParseC from ..typedefs cimport attr_t diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index ff74be601..4061304d8 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -7,7 +7,7 @@ from .stateclass cimport StateClass from ._state cimport StateC from .transition_system cimport Transition from .transition_system cimport do_func_t -from ..gold cimport GoldParseC, GoldParse +from .gold_parse cimport GoldParseC, GoldParse from ..lexeme cimport Lexeme from ..attrs cimport IS_SPACE diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index fcaff444e..12f56ba67 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -20,7 +20,7 @@ import numpy import warnings from ..tokens.doc cimport Doc -from ..gold cimport GoldParse +from .gold_parse cimport GoldParse from ..typedefs cimport weight_t, class_t, hash_t from ._parser_model cimport alloc_activations, free_activations from ._parser_model cimport predict_states, arg_max_if_valid diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index 5fd3b5c5f..33f96c331 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -2,8 +2,8 @@ from cymem.cymem cimport Pool from ..typedefs cimport attr_t, weight_t from ..structs cimport TokenC -from ..gold cimport GoldParse -from ..gold cimport GoldParseC +from .gold_parse cimport GoldParse +from .gold_parse cimport GoldParseC from ..strings cimport StringStore from .stateclass cimport StateClass from ._state cimport StateC From 1d2e39d97476ed1f01b3a711db69b1ce9a4917d6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 15:10:10 +0200 Subject: [PATCH 10/56] Support to_dict in Doc --- spacy/tokens/doc.pyx | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index debab6aeb..3aa27e451 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -881,6 +881,32 @@ cdef class Doc: def to_bytes(self, exclude=tuple(), **kwargs): """Serialize, i.e. export the document contents to a binary string. + exclude (list): String names of serialization fields to exclude. + RETURNS (bytes): A losslessly serialized copy of the `Doc`, including + all annotations. + + DOCS: https://spacy.io/api/doc#to_bytes + """ + return srsly.msgpack_dumps(self.to_dict(exclude=exclude, **kwargs)) + + def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): + """Deserialize, i.e. import the document contents from a binary string. + + data (bytes): The string to load from. + exclude (list): String names of serialization fields to exclude. + RETURNS (Doc): Itself. + + DOCS: https://spacy.io/api/doc#from_bytes + """ + return self.from_dict( + srsly.msgpack_loads(bytes_data), + exclude=exclude, + **kwargs + ) + + def to_dict(self, exclude=tuple(), **kwargs): + """Export the document contents to a dictionary for serialization. + exclude (list): String names of serialization fields to exclude. RETURNS (bytes): A losslessly serialized copy of the `Doc`, including all annotations. 
@@ -917,9 +943,9 @@ cdef class Doc: serializers["user_data_keys"] = lambda: srsly.msgpack_dumps(user_data_keys) if "user_data_values" not in exclude: serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values) - return util.to_bytes(serializers, exclude) + return util.to_dict(serializers, exclude) - def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): + def from_dict(self, msg, exclude=tuple(), **kwargs): """Deserialize, i.e. import the document contents from a binary string. data (bytes): The string to load from. @@ -943,7 +969,6 @@ cdef class Doc: for key in kwargs: if key in deserializers or key in ("user_data",): raise ValueError(Errors.E128.format(arg=key)) - msg = util.from_bytes(bytes_data, deserializers, exclude) # Msgpack doesn't distinguish between lists and tuples, which is # vexing for user data. As a best guess, we *know* that within # keys, we must have tuples. In values we just have to hope @@ -975,6 +1000,7 @@ cdef class Doc: self.from_array(msg["array_head"][2:], attrs[:, 2:]) return self + + def extend_tensor(self, tensor): """Concatenate a new tensor onto the doc.tensor object. From 3baa1ada03d4b4746091be74e733bc94984d3f36 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 15:10:33 +0200 Subject: [PATCH 11/56] Refactor spacy.gold --- spacy/gold.pxd | 68 --- spacy/gold.pyx | 1407 ------------------------------------------------ 2 files changed, 1475 deletions(-) delete mode 100644 spacy/gold.pxd delete mode 100644 spacy/gold.pyx diff --git a/spacy/gold.pxd b/spacy/gold.pxd deleted file mode 100644 index bf724868f..000000000 --- a/spacy/gold.pxd +++ /dev/null @@ -1,68 +0,0 @@ -from cymem.cymem cimport Pool - -from .typedefs cimport attr_t -from .syntax.transition_system cimport Transition - -from .tokens import Doc - - -cdef struct GoldParseC: - int* tags - int* heads - int* has_dep - int* sent_start - attr_t* labels - int** brackets - Transition* ner - - -cdef class GoldParse: - cdef Pool mem - - cdef GoldParseC c - cdef readonly TokenAnnotation orig - - cdef int length - cdef public int loss - cdef public list words - cdef public list tags - cdef public list pos - cdef public list morphs - cdef public list lemmas - cdef public list sent_starts - cdef public list heads - cdef public list labels - cdef public dict orths - cdef public list ner - cdef public dict brackets - cdef public dict cats - cdef public dict links - - cdef readonly list cand_to_gold - cdef readonly list gold_to_cand - - -cdef class TokenAnnotation: - cdef public list ids - cdef public list words - cdef public list tags - cdef public list pos - cdef public list morphs - cdef public list lemmas - cdef public list heads - cdef public list deps - cdef public list entities - cdef public list sent_starts - cdef public dict brackets_by_start - - -cdef class DocAnnotation: - cdef public object cats - cdef public object links - - -cdef class Example: - cdef public object doc - cdef public TokenAnnotation token_annotation - cdef public DocAnnotation doc_annotation - cdef public object goldparse diff --git a/spacy/gold.pyx b/spacy/gold.pyx deleted file mode 100644 index 1e58f0635..000000000 --- a/spacy/gold.pyx +++ /dev/null @@ -1,1407 +0,0 @@ -# cython: profile=True -import re -import random -import numpy -import tempfile -import shutil -import itertools -from pathlib import Path -import srsly -import warnings - -from .syntax import nonproj -from .tokens import Doc, Span -from .errors import Errors, AlignmentError, Warnings -from .
import util - - -punct_re = re.compile(r"\W") - - -def tags_to_entities(tags): - entities = [] - start = None - for i, tag in enumerate(tags): - if tag is None: - continue - if tag.startswith("O"): - # TODO: We shouldn't be getting these malformed inputs. Fix this. - if start is not None: - start = None - continue - elif tag == "-": - continue - elif tag.startswith("I"): - if start is None: - raise ValueError(Errors.E067.format(tags=tags[:i + 1])) - continue - if tag.startswith("U"): - entities.append((tag[2:], i, i)) - elif tag.startswith("B"): - start = i - elif tag.startswith("L"): - entities.append((tag[2:], start, i)) - start = None - else: - raise ValueError(Errors.E068.format(tag=tag)) - return entities - - -def merge_sents(sents): - m_deps = [[], [], [], [], [], []] - m_cats = {} - m_brackets = [] - i = 0 - for (ids, words, tags, heads, labels, ner), (cats, brackets) in sents: - m_deps[0].extend(id_ + i for id_ in ids) - m_deps[1].extend(words) - m_deps[2].extend(tags) - m_deps[3].extend(head + i for head in heads) - m_deps[4].extend(labels) - m_deps[5].extend(ner) - m_brackets.extend((b["first"] + i, b["last"] + i, b["label"]) - for b in brackets) - m_cats.update(cats) - i += len(ids) - return [(m_deps, (m_cats, m_brackets))] - - -def _normalize_for_alignment(tokens): - return [w.replace(" ", "").lower() for w in tokens] - - -def align(tokens_a, tokens_b): - """Calculate alignment tables between two tokenizations. - - tokens_a (List[str]): The candidate tokenization. - tokens_b (List[str]): The reference tokenization. - RETURNS: (tuple): A 5-tuple consisting of the following information: - * cost (int): The number of misaligned tokens. - * a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`. - For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns - to `tokens_b[6]`. If there's no one-to-one alignment for a token, - it has the value -1. - * b2a (List[int]): The same as `a2b`, but mapping the other direction. - * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a` - to indices in `tokens_b`, where multiple tokens of `tokens_a` align to - the same token of `tokens_b`. - * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other - direction. - """ - tokens_a = _normalize_for_alignment(tokens_a) - tokens_b = _normalize_for_alignment(tokens_b) - cost = 0 - a2b = numpy.empty(len(tokens_a), dtype="i") - b2a = numpy.empty(len(tokens_b), dtype="i") - a2b.fill(-1) - b2a.fill(-1) - a2b_multi = {} - b2a_multi = {} - i = 0 - j = 0 - offset_a = 0 - offset_b = 0 - while i < len(tokens_a) and j < len(tokens_b): - a = tokens_a[i][offset_a:] - b = tokens_b[j][offset_b:] - if a == b: - if offset_a == offset_b == 0: - a2b[i] = j - b2a[j] = i - elif offset_a == 0: - cost += 2 - a2b_multi[i] = j - elif offset_b == 0: - cost += 2 - b2a_multi[j] = i - offset_a = offset_b = 0 - i += 1 - j += 1 - elif a == "": - assert offset_a == 0 - cost += 1 - i += 1 - elif b == "": - assert offset_b == 0 - cost += 1 - j += 1 - elif b.startswith(a): - cost += 1 - if offset_a == 0: - a2b_multi[i] = j - i += 1 - offset_a = 0 - offset_b += len(a) - elif a.startswith(b): - cost += 1 - if offset_b == 0: - b2a_multi[j] = i - j += 1 - offset_b = 0 - offset_a += len(b) - else: - assert "".join(tokens_a) != "".join(tokens_b) - raise AlignmentError(Errors.E186.format(tok_a=tokens_a, tok_b=tokens_b)) - return cost, a2b, b2a, a2b_multi, b2a_multi - - -class GoldCorpus(object): - """An annotated corpus, using the JSON file format. 
Manages - annotations for tagging, dependency parsing and NER. - - DOCS: https://spacy.io/api/goldcorpus - """ - def __init__(self, train, dev, gold_preproc=False, limit=None): - """Create a GoldCorpus. - - train (str / Path): File or directory of training data. - dev (str / Path): File or directory of development data. - RETURNS (GoldCorpus): The newly created object. - """ - self.limit = limit - if isinstance(train, str) or isinstance(train, Path): - train = self.read_examples(self.walk_corpus(train)) - dev = self.read_examples(self.walk_corpus(dev)) - # Write temp directory with one doc per file, so we can shuffle and stream - self.tmp_dir = Path(tempfile.mkdtemp()) - self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit) - self.write_msgpack(self.tmp_dir / "dev", dev, limit=self.limit) - - def __del__(self): - shutil.rmtree(self.tmp_dir) - - @staticmethod - def write_msgpack(directory, examples, limit=0): - if not directory.exists(): - directory.mkdir() - n = 0 - for i, example in enumerate(examples): - ex_dict = example.to_dict() - text = example.text - srsly.write_msgpack(directory / f"{i}.msg", (text, ex_dict)) - n += 1 - if limit and n >= limit: - break - - @staticmethod - def walk_corpus(path): - path = util.ensure_path(path) - if not path.is_dir(): - return [path] - paths = [path] - locs = [] - seen = set() - for path in paths: - if str(path) in seen: - continue - seen.add(str(path)) - if path.parts[-1].startswith("."): - continue - elif path.is_dir(): - paths.extend(path.iterdir()) - elif path.parts[-1].endswith((".json", ".jsonl")): - locs.append(path) - return locs - - @staticmethod - def read_examples(locs, limit=0): - """ Yield training examples """ - i = 0 - for loc in locs: - loc = util.ensure_path(loc) - file_name = loc.parts[-1] - if file_name.endswith("json"): - examples = read_json_file(loc) - elif file_name.endswith("jsonl"): - gold_tuples = srsly.read_jsonl(loc) - first_gold_tuple = next(gold_tuples) - gold_tuples = itertools.chain([first_gold_tuple], gold_tuples) - # TODO: proper format checks with schemas - if isinstance(first_gold_tuple, dict): - if first_gold_tuple.get("paragraphs", None): - examples = read_json_object(gold_tuples) - elif first_gold_tuple.get("doc_annotation", None): - examples = [] - for ex_dict in gold_tuples: - doc = ex_dict.get("doc", None) - if doc is None: - doc = ex_dict.get("text", None) - if not (doc is None or isinstance(doc, Doc) or isinstance(doc, str)): - raise ValueError(Errors.E987.format(type=type(doc))) - examples.append(Example.from_dict(ex_dict, doc=doc)) - - elif file_name.endswith("msg"): - text, ex_dict = srsly.read_msgpack(loc) - examples = [Example.from_dict(ex_dict, doc=text)] - else: - supported = ("json", "jsonl", "msg") - raise ValueError(Errors.E124.format(path=loc, formats=supported)) - try: - for example in examples: - yield example - i += 1 - if limit and i >= limit: - return - except KeyError as e: - msg = "Missing key {}".format(e) - raise KeyError(Errors.E996.format(file=file_name, msg=msg)) - except UnboundLocalError as e: - msg = "Unexpected document structure" - raise ValueError(Errors.E996.format(file=file_name, msg=msg)) - - @property - def dev_examples(self): - locs = (self.tmp_dir / "dev").iterdir() - yield from self.read_examples(locs, limit=self.limit) - - @property - def train_examples(self): - locs = (self.tmp_dir / "train").iterdir() - yield from self.read_examples(locs, limit=self.limit) - - def count_train(self): - """Returns count of words in train examples""" - n = 0 - i = 0 - for 
example in self.train_examples: - n += len(example.token_annotation.words) - if self.limit and i >= self.limit: - break - i += 1 - return n - - def train_dataset(self, nlp, gold_preproc=False, max_length=None, - noise_level=0.0, orth_variant_level=0.0, - ignore_misaligned=False): - locs = list((self.tmp_dir / 'train').iterdir()) - random.shuffle(locs) - train_examples = self.read_examples(locs, limit=self.limit) - gold_examples = self.iter_gold_docs(nlp, train_examples, gold_preproc, - max_length=max_length, - noise_level=noise_level, - orth_variant_level=orth_variant_level, - make_projective=True, - ignore_misaligned=ignore_misaligned) - yield from gold_examples - - def train_dataset_without_preprocessing(self, nlp, gold_preproc=False, - ignore_misaligned=False): - examples = self.iter_gold_docs(nlp, self.train_examples, - gold_preproc=gold_preproc, - ignore_misaligned=ignore_misaligned) - yield from examples - - def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False): - examples = self.iter_gold_docs(nlp, self.dev_examples, - gold_preproc=gold_preproc, - ignore_misaligned=ignore_misaligned) - yield from examples - - @classmethod - def iter_gold_docs(cls, nlp, examples, gold_preproc, max_length=None, - noise_level=0.0, orth_variant_level=0.0, - make_projective=False, ignore_misaligned=False): - """ Setting gold_preproc will result in creating a doc per sentence """ - for example in examples: - if gold_preproc: - split_examples = example.split_sents() - example_golds = [] - for split_example in split_examples: - split_example_docs = cls._make_docs(nlp, split_example, - gold_preproc, noise_level=noise_level, - orth_variant_level=orth_variant_level) - split_example_golds = cls._make_golds(split_example_docs, - vocab=nlp.vocab, make_projective=make_projective, - ignore_misaligned=ignore_misaligned) - example_golds.extend(split_example_golds) - else: - example_docs = cls._make_docs(nlp, example, - gold_preproc, noise_level=noise_level, - orth_variant_level=orth_variant_level) - example_golds = cls._make_golds(example_docs, vocab=nlp.vocab, - make_projective=make_projective, - ignore_misaligned=ignore_misaligned) - for ex in example_golds: - if ex.goldparse is not None: - if (not max_length) or len(ex.doc) < max_length: - yield ex - - @classmethod - def _make_docs(cls, nlp, example, gold_preproc, noise_level=0.0, orth_variant_level=0.0): - var_example = make_orth_variants(nlp, example, orth_variant_level=orth_variant_level) - # gold_preproc is not used ?! 
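# Usage sketch for the corpus pipeline above (illustrative; assumes an `nlp`
# object and JSON training files on disk):
#
#     corpus = GoldCorpus("train.json", "dev.json", limit=1000)
#     for ex in corpus.train_dataset(nlp, gold_preproc=True, max_length=200):
#         assert ex.goldparse is not None   # attached by _make_golds
#
# With gold_preproc=True, iter_gold_docs runs split_sents() first, so each
# yielded Example wraps one gold-tokenized doc per sentence.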
- if example.text is not None: - var_text = add_noise(var_example.text, noise_level) - var_doc = nlp.make_doc(var_text) - var_example.doc = var_doc - else: - var_doc = Doc(nlp.vocab, words=add_noise(var_example.token_annotation.words, noise_level)) - var_example.doc = var_doc - return [var_example] - - @classmethod - def _make_golds(cls, examples, vocab=None, make_projective=False, - ignore_misaligned=False): - filtered_examples = [] - for example in examples: - gold_parses = example.get_gold_parses(vocab=vocab, - make_projective=make_projective, - ignore_misaligned=ignore_misaligned) - assert len(gold_parses) == 1 - doc, gold = gold_parses[0] - if doc: - assert doc == example.doc - example.goldparse = gold - filtered_examples.append(example) - return filtered_examples - - -def make_orth_variants(nlp, example, orth_variant_level=0.0): - if random.random() >= orth_variant_level: - return example - if not example.token_annotation: - return example - raw = example.text - lower = False - if random.random() >= 0.5: - lower = True - if raw is not None: - raw = raw.lower() - ndsv = nlp.Defaults.single_orth_variants - ndpv = nlp.Defaults.paired_orth_variants - # modify words in paragraph_tuples - variant_example = Example(doc=raw) - token_annotation = example.token_annotation - words = token_annotation.words - tags = token_annotation.tags - if not words or not tags: - # add the unmodified annotation - token_dict = token_annotation.to_dict() - variant_example.set_token_annotation(**token_dict) - else: - if lower: - words = [w.lower() for w in words] - # single variants - punct_choices = [random.choice(x["variants"]) for x in ndsv] - for word_idx in range(len(words)): - for punct_idx in range(len(ndsv)): - if tags[word_idx] in ndsv[punct_idx]["tags"] \ - and words[word_idx] in ndsv[punct_idx]["variants"]: - words[word_idx] = punct_choices[punct_idx] - # paired variants - punct_choices = [random.choice(x["variants"]) for x in ndpv] - for word_idx in range(len(words)): - for punct_idx in range(len(ndpv)): - if tags[word_idx] in ndpv[punct_idx]["tags"] \ - and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]): - # backup option: random left vs. 
right from pair - pair_idx = random.choice([0, 1]) - # best option: rely on paired POS tags like `` / '' - if len(ndpv[punct_idx]["tags"]) == 2: - pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx]) - # next best option: rely on position in variants - # (may not be unambiguous, so order of variants matters) - else: - for pair in ndpv[punct_idx]["variants"]: - if words[word_idx] in pair: - pair_idx = pair.index(words[word_idx]) - words[word_idx] = punct_choices[punct_idx][pair_idx] - - token_dict = token_annotation.to_dict() - token_dict["words"] = words - token_dict["tags"] = tags - variant_example.set_token_annotation(**token_dict) - # modify raw to match variant_paragraph_tuples - if raw is not None: - variants = [] - for single_variants in ndsv: - variants.extend(single_variants["variants"]) - for paired_variants in ndpv: - variants.extend(list(itertools.chain.from_iterable(paired_variants["variants"]))) - # store variants in reverse length order to be able to prioritize - # longer matches (e.g., "---" before "--") - variants = sorted(variants, key=lambda x: len(x)) - variants.reverse() - variant_raw = "" - raw_idx = 0 - # add initial whitespace - while raw_idx < len(raw) and re.match("\s", raw[raw_idx]): - variant_raw += raw[raw_idx] - raw_idx += 1 - for word in variant_example.token_annotation.words: - match_found = False - # skip whitespace words - if word.isspace(): - match_found = True - # add identical word - elif word not in variants and raw[raw_idx:].startswith(word): - variant_raw += word - raw_idx += len(word) - match_found = True - # add variant word - else: - for variant in variants: - if not match_found and \ - raw[raw_idx:].startswith(variant): - raw_idx += len(variant) - variant_raw += word - match_found = True - # something went wrong, abort - # (add a warning message?) - if not match_found: - return example - # add following whitespace - while raw_idx < len(raw) and re.match("\s", raw[raw_idx]): - variant_raw += raw[raw_idx] - raw_idx += 1 - variant_example.doc = variant_raw - return variant_example - return variant_example - - -def add_noise(orig, noise_level): - if random.random() >= noise_level: - return orig - elif type(orig) == list: - corrupted = [_corrupt(word, noise_level) for word in orig] - corrupted = [w for w in corrupted if w] - return corrupted - else: - return "".join(_corrupt(c, noise_level) for c in orig) - - -def _corrupt(c, noise_level): - if random.random() >= noise_level: - return c - elif c in [".", "'", "!", "?", ","]: - return "\n" - else: - return c.lower() - - -def read_json_object(json_corpus_section): - """Take a list of JSON-formatted documents (e.g. from an already loaded - training data file) and yield annotations in the GoldParse format. - - json_corpus_section (list): The data. - YIELDS (Example): The reformatted data - one training example per paragraph - """ - for json_doc in json_corpus_section: - examples = json_to_examples(json_doc) - for ex in examples: - yield ex - - -def json_to_examples(doc): - """Convert an item in the JSON-formatted training data to the format - used by GoldParse. - - doc (dict): One entry in the training data. 
- YIELDS (Example): The reformatted data - one training example per paragraph - """ - paragraphs = [] - for paragraph in doc["paragraphs"]: - example = Example(doc=paragraph.get("raw", None)) - words = [] - ids = [] - tags = [] - pos = [] - morphs = [] - lemmas = [] - heads = [] - labels = [] - ner = [] - sent_starts = [] - brackets = [] - for sent in paragraph["sentences"]: - sent_start_i = len(words) - for i, token in enumerate(sent["tokens"]): - words.append(token["orth"]) - ids.append(token.get('id', sent_start_i + i)) - tags.append(token.get('tag', "-")) - pos.append(token.get("pos", "")) - morphs.append(token.get("morph", "")) - lemmas.append(token.get("lemma", "")) - heads.append(token.get("head", 0) + sent_start_i + i) - labels.append(token.get("dep", "")) - # Ensure ROOT label is case-insensitive - if labels[-1].lower() == "root": - labels[-1] = "ROOT" - ner.append(token.get("ner", "-")) - if i == 0: - sent_starts.append(1) - else: - sent_starts.append(0) - if "brackets" in sent: - brackets.extend((b["first"] + sent_start_i, - b["last"] + sent_start_i, b["label"]) - for b in sent["brackets"]) - cats = {} - for cat in paragraph.get("cats", {}): - cats[cat["label"]] = cat["value"] - example.set_token_annotation(ids=ids, words=words, tags=tags, - pos=pos, morphs=morphs, lemmas=lemmas, heads=heads, - deps=labels, entities=ner, sent_starts=sent_starts, - brackets=brackets) - example.set_doc_annotation(cats=cats) - yield example - - -def read_json_file(loc, docs_filter=None, limit=None): - loc = util.ensure_path(loc) - if loc.is_dir(): - for filename in loc.iterdir(): - yield from read_json_file(loc / filename, limit=limit) - else: - for doc in _json_iterate(loc): - if docs_filter is not None and not docs_filter(doc): - continue - for json_data in json_to_examples(doc): - yield json_data - - -def _json_iterate(loc): - # We should've made these files jsonl...But since we didn't, parse out - # the docs one-by-one to reduce memory usage. - # It's okay to read in the whole file -- just don't parse it into JSON. 
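# The scanner below is a minimal hand-rolled JSON tokenizer: it walks the
# raw bytes once, tracking square/curly bracket depth while skipping string
# contents and backslash escapes, and yields each top-level {...} object of
# the outer [...] array as soon as its closing brace is seen -- so only one
# document at a time is ever parsed into Python objects.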
- cdef bytes py_raw - loc = util.ensure_path(loc) - with loc.open("rb") as file_: - py_raw = file_.read() - cdef long file_length = len(py_raw) - if file_length > 2 ** 30: - warnings.warn(Warnings.W027.format(size=file_length)) - - raw = <char*>py_raw - cdef int square_depth = 0 - cdef int curly_depth = 0 - cdef int inside_string = 0 - cdef int escape = 0 - cdef long start = -1 - cdef char c - cdef char quote = ord('"') - cdef char backslash = ord("\\") - cdef char open_square = ord("[") - cdef char close_square = ord("]") - cdef char open_curly = ord("{") - cdef char close_curly = ord("}") - for i in range(file_length): - c = raw[i] - if escape: - escape = False - continue - if c == backslash: - escape = True - continue - if c == quote: - inside_string = not inside_string - continue - if inside_string: - continue - if c == open_square: - square_depth += 1 - elif c == close_square: - square_depth -= 1 - elif c == open_curly: - if square_depth == 1 and curly_depth == 0: - start = i - curly_depth += 1 - elif c == close_curly: - curly_depth -= 1 - if square_depth == 1 and curly_depth == 0: - py_str = py_raw[start : i + 1].decode("utf8") - try: - yield srsly.json_loads(py_str) - except Exception: - print(py_str) - raise - start = -1 - - -def iob_to_biluo(tags): - out = [] - tags = list(tags) - while tags: - out.extend(_consume_os(tags)) - out.extend(_consume_ent(tags)) - return out - - -def biluo_to_iob(tags): - out = [] - for tag in tags: - tag = tag.replace("U-", "B-", 1).replace("L-", "I-", 1) - out.append(tag) - return out - - -def _consume_os(tags): - while tags and tags[0] == "O": - yield tags.pop(0) - - -def _consume_ent(tags): - if not tags: - return [] - tag = tags.pop(0) - target_in = "I" + tag[1:] - target_last = "L" + tag[1:] - length = 1 - while tags and tags[0] in {target_in, target_last}: - length += 1 - tags.pop(0) - label = tag[2:] - if length == 1: - if len(label) == 0: - raise ValueError(Errors.E177.format(tag=tag)) - return ["U-" + label] - else: - start = "B-" + label - end = "L-" + label - middle = [f"I-{label}" for _ in range(1, length - 1)] - return [start] + middle + [end] - - -cdef class TokenAnnotation: - def __init__(self, ids=None, words=None, tags=None, pos=None, morphs=None, - lemmas=None, heads=None, deps=None, entities=None, sent_starts=None, - brackets=None): - self.ids = ids if ids else [] - self.words = words if words else [] - self.tags = tags if tags else [] - self.pos = pos if pos else [] - self.morphs = morphs if morphs else [] - self.lemmas = lemmas if lemmas else [] - self.heads = heads if heads else [] - self.deps = deps if deps else [] - self.entities = entities if entities else [] - self.sent_starts = sent_starts if sent_starts else [] - self.brackets_by_start = {} - if brackets: - for b_start, b_end, b_label in brackets: - self.brackets_by_start.setdefault(b_start, []).append((b_end, b_label)) - - @property - def brackets(self): - brackets = [] - for start, ends_labels in self.brackets_by_start.items(): - for end, label in ends_labels: - brackets.append((start, end, label)) - return brackets - - @classmethod - def from_dict(cls, token_dict): - return cls(ids=token_dict.get("ids", None), - words=token_dict.get("words", None), - tags=token_dict.get("tags", None), - pos=token_dict.get("pos", None), - morphs=token_dict.get("morphs", None), - lemmas=token_dict.get("lemmas", None), - heads=token_dict.get("heads", None), - deps=token_dict.get("deps", None), - entities=token_dict.get("entities", None), - sent_starts=token_dict.get("sent_starts", None), -
brackets=token_dict.get("brackets", None)) - - def to_dict(self): - return {"ids": self.ids, - "words": self.words, - "tags": self.tags, - "pos": self.pos, - "morphs": self.morphs, - "lemmas": self.lemmas, - "heads": self.heads, - "deps": self.deps, - "entities": self.entities, - "sent_starts": self.sent_starts, - "brackets": self.brackets} - - def get_id(self, i): - return self.ids[i] if i < len(self.ids) else i - - def get_word(self, i): - return self.words[i] if i < len(self.words) else "" - - def get_tag(self, i): - return self.tags[i] if i < len(self.tags) else "-" - - def get_pos(self, i): - return self.pos[i] if i < len(self.pos) else "" - - def get_morph(self, i): - return self.morphs[i] if i < len(self.morphs) else "" - - def get_lemma(self, i): - return self.lemmas[i] if i < len(self.lemmas) else "" - - def get_head(self, i): - return self.heads[i] if i < len(self.heads) else i - - def get_dep(self, i): - return self.deps[i] if i < len(self.deps) else "" - - def get_entity(self, i): - return self.entities[i] if i < len(self.entities) else "-" - - def get_sent_start(self, i): - return self.sent_starts[i] if i < len(self.sent_starts) else None - - def __str__(self): - return str(self.to_dict()) - - def __repr__(self): - return self.__str__() - - -cdef class DocAnnotation: - def __init__(self, cats=None, links=None): - self.cats = cats if cats else {} - self.links = links if links else {} - - @classmethod - def from_dict(cls, doc_dict): - return cls(cats=doc_dict.get("cats", None), links=doc_dict.get("links", None)) - - def to_dict(self): - return {"cats": self.cats, "links": self.links} - - def __str__(self): - return str(self.to_dict()) - - def __repr__(self): - return self.__str__() - - -cdef class Example: - def __init__(self, doc_annotation=None, token_annotation=None, doc=None, - goldparse=None): - """ Doc can either be text, or an actual Doc """ - self.doc = doc - self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation() - self.token_annotation = token_annotation if token_annotation else TokenAnnotation() - self.goldparse = goldparse - - @classmethod - def from_gold(cls, goldparse, doc=None): - doc_annotation = DocAnnotation(cats=goldparse.cats, links=goldparse.links) - token_annotation = goldparse.get_token_annotation() - return cls(doc_annotation, token_annotation, doc) - - @classmethod - def from_dict(cls, example_dict, doc=None): - token_dict = example_dict.get("token_annotation", {}) - token_annotation = TokenAnnotation.from_dict(token_dict) - doc_dict = example_dict.get("doc_annotation", {}) - doc_annotation = DocAnnotation.from_dict(doc_dict) - return cls(doc_annotation, token_annotation, doc) - - def to_dict(self): - """ Note that this method does NOT export the doc, only the annotations ! 
""" - token_dict = self.token_annotation.to_dict() - doc_dict = self.doc_annotation.to_dict() - return {"token_annotation": token_dict, "doc_annotation": doc_dict} - - @property - def text(self): - if self.doc is None: - return None - if isinstance(self.doc, Doc): - return self.doc.text - return self.doc - - @property - def gold(self): - if self.goldparse is None: - doc, gold = self.get_gold_parses()[0] - self.goldparse = gold - return self.goldparse - - def set_token_annotation(self, ids=None, words=None, tags=None, pos=None, - morphs=None, lemmas=None, heads=None, deps=None, - entities=None, sent_starts=None, brackets=None): - self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags, - pos=pos, morphs=morphs, lemmas=lemmas, heads=heads, - deps=deps, entities=entities, - sent_starts=sent_starts, brackets=brackets) - - def set_doc_annotation(self, cats=None, links=None): - if cats: - self.doc_annotation.cats = cats - if links: - self.doc_annotation.links = links - - def split_sents(self): - """ Split the token annotations into multiple Examples based on - sent_starts and return a list of the new Examples""" - if not self.token_annotation.words: - return [self] - s_example = Example(doc=None, doc_annotation=self.doc_annotation) - s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], [] - s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], [] - s_brackets = [] - sent_start_i = 0 - cdef TokenAnnotation t = self.token_annotation - split_examples = [] - cdef int b_start, b_end - cdef unicode b_label - for i in range(len(t.words)): - if i > 0 and t.sent_starts[i] == 1: - s_example.set_token_annotation(ids=s_ids, - words=s_words, tags=s_tags, pos=s_pos, morphs=s_morphs, - lemmas=s_lemmas, heads=s_heads, deps=s_deps, - entities=s_ents, sent_starts=s_sent_starts, - brackets=s_brackets) - split_examples.append(s_example) - s_example = Example(doc=None, doc_annotation=self.doc_annotation) - s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], [] - s_deps, s_ents, s_morphs, s_lemmas = [], [], [], [] - s_sent_starts, s_brackets = [], [] - sent_start_i = i - s_ids.append(t.get_id(i)) - s_words.append(t.get_word(i)) - s_tags.append(t.get_tag(i)) - s_pos.append(t.get_pos(i)) - s_morphs.append(t.get_morph(i)) - s_lemmas.append(t.get_lemma(i)) - s_heads.append(t.get_head(i) - sent_start_i) - s_deps.append(t.get_dep(i)) - s_ents.append(t.get_entity(i)) - s_sent_starts.append(t.get_sent_start(i)) - for b_end, b_label in t.brackets_by_start.get(i, []): - s_brackets.append( - (i - sent_start_i, b_end - sent_start_i, b_label) - ) - i += 1 - s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags, - pos=s_pos, morphs=s_morphs, lemmas=s_lemmas, heads=s_heads, - deps=s_deps, entities=s_ents, sent_starts=s_sent_starts, - brackets=s_brackets) - split_examples.append(s_example) - return split_examples - - - def get_gold_parses(self, merge=True, vocab=None, make_projective=False, - ignore_misaligned=False): - """Return a list of (doc, GoldParse) objects. 
- If merge is set to True, keep all Token annotations as one big list.""" - d = self.doc_annotation - # merge == do not modify Example - if merge: - t = self.token_annotation - doc = self.doc - if doc is None or not isinstance(doc, Doc): - if not vocab: - raise ValueError(Errors.E998) - doc = Doc(vocab, words=t.words) - try: - gp = GoldParse.from_annotation(doc, d, t, - make_projective=make_projective) - except AlignmentError: - if ignore_misaligned: - gp = None - else: - raise - return [(doc, gp)] - # not merging: one GoldParse per sentence, defining docs with the words - # from each sentence - else: - parses = [] - split_examples = self.split_sents() - for split_example in split_examples: - if not vocab: - raise ValueError(Errors.E998) - split_doc = Doc(vocab, words=split_example.token_annotation.words) - try: - gp = GoldParse.from_annotation(split_doc, d, - split_example.token_annotation, - make_projective=make_projective) - except AlignmentError: - if ignore_misaligned: - gp = None - else: - raise - if gp is not None: - parses.append((split_doc, gp)) - return parses - - @classmethod - def to_example_objects(cls, examples, make_doc=None, keep_raw_text=False): - """ - Return a list of Example objects, from a variety of input formats. - make_doc needs to be provided when the examples contain text strings and keep_raw_text=False - """ - if isinstance(examples, Example): - return [examples] - if isinstance(examples, tuple): - examples = [examples] - converted_examples = [] - for ex in examples: - if isinstance(ex, Example): - converted_examples.append(ex) - # convert string to Doc to Example - elif isinstance(ex, str): - if keep_raw_text: - converted_examples.append(Example(doc=ex)) - else: - doc = make_doc(ex) - converted_examples.append(Example(doc=doc)) - # convert Doc to Example - elif isinstance(ex, Doc): - converted_examples.append(Example(doc=ex)) - # convert tuples to Example - elif isinstance(ex, tuple) and len(ex) == 2: - doc, gold = ex - gold_dict = {} - # convert string to Doc - if isinstance(doc, str) and not keep_raw_text: - doc = make_doc(doc) - # convert dict to GoldParse - if isinstance(gold, dict): - gold_dict = gold - if doc is not None or gold.get("words", None) is not None: - gold = GoldParse(doc, **gold) - else: - gold = None - if gold is not None: - converted_examples.append(Example.from_gold(goldparse=gold, doc=doc)) - else: - raise ValueError(Errors.E999.format(gold_dict=gold_dict)) - else: - converted_examples.append(ex) - return converted_examples - - -cdef class GoldParse: - """Collection for training annotations. 
- - DOCS: https://spacy.io/api/goldparse - """ - @classmethod - def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False): - return cls(doc, words=token_annotation.words, - tags=token_annotation.tags, - pos=token_annotation.pos, - morphs=token_annotation.morphs, - lemmas=token_annotation.lemmas, - heads=token_annotation.heads, - deps=token_annotation.deps, - entities=token_annotation.entities, - sent_starts=token_annotation.sent_starts, - cats=doc_annotation.cats, - links=doc_annotation.links, - make_projective=make_projective) - - def get_token_annotation(self): - ids = None - if self.words: - ids = list(range(len(self.words))) - - return TokenAnnotation(ids=ids, words=self.words, tags=self.tags, - pos=self.pos, morphs=self.morphs, - lemmas=self.lemmas, heads=self.heads, - deps=self.labels, entities=self.ner, - sent_starts=self.sent_starts) - - def __init__(self, doc, words=None, tags=None, pos=None, morphs=None, - lemmas=None, heads=None, deps=None, entities=None, - sent_starts=None, make_projective=False, cats=None, - links=None): - """Create a GoldParse. The fields will not be initialized if len(doc) is zero. - - doc (Doc): The document the annotations refer to. - words (iterable): A sequence of unicode word strings. - tags (iterable): A sequence of strings, representing tag annotations. - pos (iterable): A sequence of strings, representing UPOS annotations. - morphs (iterable): A sequence of strings, representing morph - annotations. - lemmas (iterable): A sequence of strings, representing lemma - annotations. - heads (iterable): A sequence of integers, representing syntactic - head offsets. - deps (iterable): A sequence of strings, representing the syntactic - relation types. - entities (iterable): A sequence of named entity annotations, either as - BILUO tag strings, or as `(start_char, end_char, label)` tuples, - representing the entity positions. - sent_starts (iterable): A sequence of sentence position tags, 1 for - the first word in a sentence, 0 for all others. - cats (dict): Labels for text classification. Each key in the dictionary - may be a string or an int, or a `(start_char, end_char, label)` - tuple, indicating that the label is applied to only part of the - document (usually a sentence). Unlike entity annotations, label - annotations can overlap, i.e. a single word can be covered by - multiple labelled spans. The TextCategorizer component expects - true examples of a label to have the value 1.0, and negative - examples of a label to have the value 0.0. Labels not in the - dictionary are treated as missing - the gradient for those labels - will be zero. - links (dict): A dict with `(start_char, end_char)` keys, - and the values being dicts with kb_id:value entries, - representing the external IDs in a knowledge base (KB) - mapped to either 1.0 or 0.0, indicating positive and - negative examples respectively. - RETURNS (GoldParse): The newly constructed object. 
- """ - self.mem = Pool() - self.loss = 0 - self.length = len(doc) - - self.cats = {} if cats is None else dict(cats) - self.links = {} if links is None else dict(links) - - # temporary doc for aligning entity annotation - entdoc = None - - # avoid allocating memory if the doc does not contain any tokens - if self.length == 0: - self.words = [] - self.tags = [] - self.heads = [] - self.labels = [] - self.ner = [] - self.morphs = [] - # set a minimal orig so that the scorer can score an empty doc - self.orig = TokenAnnotation(ids=[]) - else: - if not words: - words = [token.text for token in doc] - if not tags: - tags = [None for _ in words] - if not pos: - pos = [None for _ in words] - if not morphs: - morphs = [None for _ in words] - if not lemmas: - lemmas = [None for _ in words] - if not heads: - heads = [None for _ in words] - if not deps: - deps = [None for _ in words] - if not sent_starts: - sent_starts = [None for _ in words] - if entities is None: - entities = ["-" for _ in words] - elif len(entities) == 0: - entities = ["O" for _ in words] - else: - # Translate the None values to '-', to make processing easier. - # See Issue #2603 - entities = [(ent if ent is not None else "-") for ent in entities] - if not isinstance(entities[0], str): - # Assume we have entities specified by character offset. - # Create a temporary Doc corresponding to provided words - # (to preserve gold tokenization) and text (to preserve - # character offsets). - entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text) - entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces) - entdoc_entities = biluo_tags_from_offsets(entdoc, entities) - # There may be some additional whitespace tokens in the - # temporary doc, so check that the annotations align with - # the provided words while building a list of BILUO labels. - entities = [] - words_offset = 0 - for i in range(len(entdoc_words)): - if words[i + words_offset] == entdoc_words[i]: - entities.append(entdoc_entities[i]) - else: - words_offset -= 1 - if len(entities) != len(words): - warnings.warn(Warnings.W029.format(text=doc.text)) - entities = ["-" for _ in words] - - # These are filled by the tagger/parser/entity recogniser - self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int)) - self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int)) - self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t)) - self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int)) - self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int)) - self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition)) - - self.words = [None] * len(doc) - self.tags = [None] * len(doc) - self.pos = [None] * len(doc) - self.morphs = [None] * len(doc) - self.lemmas = [None] * len(doc) - self.heads = [None] * len(doc) - self.labels = [None] * len(doc) - self.ner = [None] * len(doc) - self.sent_starts = [None] * len(doc) - - # This needs to be done before we align the words - if make_projective and any(heads) and any(deps) : - heads, deps = nonproj.projectivize(heads, deps) - - # Do many-to-one alignment for misaligned tokens. - # If we over-segment, we'll have one gold word that covers a sequence - # of predicted words - # If we under-segment, we'll have one predicted word that covers a - # sequence of gold words. - # If we "mis-segment", we'll have a sequence of predicted words covering - # a sequence of gold words. That's many-to-many -- we don't do that - # except for NER spans where the start and end can be aligned.
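# Worked example for the alignment tables computed below (illustrative):
# aligning candidate ["New", "York"] to gold ["New York"] leaves both i2j
# entries at -1 and records i2j_multi == {0: 0, 1: 0} (two candidate tokens
# covering one gold token); the mirror case, candidate ["NewYork"] vs. gold
# ["New", "York"], yields j2i_multi == {0: 0, 1: 0} instead.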
- cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words) - - self.cand_to_gold = [(j if j >= 0 else None) for j in i2j] - self.gold_to_cand = [(i if i >= 0 else None) for i in j2i] - - self.orig = TokenAnnotation(ids=list(range(len(words))), - words=words, tags=tags, pos=pos, morphs=morphs, - lemmas=lemmas, heads=heads, deps=deps, entities=entities, - sent_starts=sent_starts, brackets=[]) - - for i, gold_i in enumerate(self.cand_to_gold): - if doc[i].text.isspace(): - self.words[i] = doc[i].text - self.tags[i] = "_SP" - self.pos[i] = "SPACE" - self.morphs[i] = None - self.lemmas[i] = None - self.heads[i] = None - self.labels[i] = None - self.ner[i] = None - self.sent_starts[i] = 0 - if gold_i is None: - if i in i2j_multi: - self.words[i] = words[i2j_multi[i]] - self.tags[i] = tags[i2j_multi[i]] - self.pos[i] = pos[i2j_multi[i]] - self.morphs[i] = morphs[i2j_multi[i]] - self.lemmas[i] = lemmas[i2j_multi[i]] - self.sent_starts[i] = sent_starts[i2j_multi[i]] - is_last = i2j_multi[i] != i2j_multi.get(i+1) - # Set next word in multi-token span as head, until last - if not is_last: - self.heads[i] = i+1 - self.labels[i] = "subtok" - else: - head_i = heads[i2j_multi[i]] - if head_i: - self.heads[i] = self.gold_to_cand[head_i] - self.labels[i] = deps[i2j_multi[i]] - ner_tag = entities[i2j_multi[i]] - # Assign O/- for many-to-one O/- NER tags - if ner_tag in ("O", "-"): - self.ner[i] = ner_tag - else: - self.words[i] = words[gold_i] - self.tags[i] = tags[gold_i] - self.pos[i] = pos[gold_i] - self.morphs[i] = morphs[gold_i] - self.lemmas[i] = lemmas[gold_i] - self.sent_starts[i] = sent_starts[gold_i] - if heads[gold_i] is None: - self.heads[i] = None - else: - self.heads[i] = self.gold_to_cand[heads[gold_i]] - self.labels[i] = deps[gold_i] - self.ner[i] = entities[gold_i] - # Assign O/- for one-to-many O/- NER tags - for j, cand_j in enumerate(self.gold_to_cand): - if cand_j is None: - if j in j2i_multi: - i = j2i_multi[j] - ner_tag = entities[j] - if ner_tag in ("O", "-"): - self.ner[i] = ner_tag - - # If there is entity annotation and some tokens remain unaligned, - # align all entities at the character level to account for all - # possible token misalignments within the entity spans - if any([e not in ("O", "-") for e in entities]) and None in self.ner: - # If the temporary entdoc wasn't created above, initialize it - if not entdoc: - entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text) - entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces) - # Get offsets based on gold words and BILUO entities - entdoc_offsets = offsets_from_biluo_tags(entdoc, entities) - aligned_offsets = [] - aligned_spans = [] - # Filter offsets to identify those that align with doc tokens - for offset in entdoc_offsets: - span = doc.char_span(offset[0], offset[1]) - if span and not span.text.isspace(): - aligned_offsets.append(offset) - aligned_spans.append(span) - # Convert back to BILUO for doc tokens and assign NER for all - # aligned spans - biluo_tags = biluo_tags_from_offsets(doc, aligned_offsets, missing=None) - for span in aligned_spans: - for i in range(span.start, span.end): - self.ner[i] = biluo_tags[i] - - # Prevent whitespace that isn't within entities from being tagged as - # an entity. 
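# e.g. a "_SP" token with an "O" neighbour on either side is forced to "O"
# below, while whitespace strictly inside an entity span (both neighbours
# tagged B-/I-/L-) keeps its value and can stay part of the entity.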
- for i in range(len(self.ner)): - if self.tags[i] == "_SP": - prev_ner = self.ner[i-1] if i >= 1 else None - next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None - if prev_ner == "O" or next_ner == "O": - self.ner[i] = "O" - - cycle = nonproj.contains_cycle(self.heads) - if cycle is not None: - raise ValueError(Errors.E069.format(cycle=cycle, - cycle_tokens=" ".join([f"'{self.words[tok_id]}'" for tok_id in cycle]), - doc_tokens=" ".join(words[:50]))) - - def __len__(self): - """Get the number of gold-standard tokens. - - RETURNS (int): The number of gold-standard tokens. - """ - return self.length - - @property - def is_projective(self): - """Whether the provided syntactic annotations form a projective - dependency tree. - """ - return not nonproj.is_nonproj_tree(self.heads) - - -def docs_to_json(docs, id=0, ner_missing_tag="O"): - """Convert a list of Doc objects into the JSON-serializable format used by - the spacy train command. - - docs (iterable / Doc): The Doc object(s) to convert. - id (int): Id for the JSON. - RETURNS (dict): The data in spaCy's JSON format - - each input doc will be treated as a paragraph in the output doc - """ - if isinstance(docs, Doc): - docs = [docs] - json_doc = {"id": id, "paragraphs": []} - for i, doc in enumerate(docs): - json_para = {'raw': doc.text, "sentences": [], "cats": []} - for cat, val in doc.cats.items(): - json_cat = {"label": cat, "value": val} - json_para["cats"].append(json_cat) - ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents] - biluo_tags = biluo_tags_from_offsets(doc, ent_offsets, missing=ner_missing_tag) - for j, sent in enumerate(doc.sents): - json_sent = {"tokens": [], "brackets": []} - for token in sent: - json_token = {"id": token.i, "orth": token.text} - if doc.is_tagged: - json_token["tag"] = token.tag_ - json_token["pos"] = token.pos_ - json_token["morph"] = token.morph_ - json_token["lemma"] = token.lemma_ - if doc.is_parsed: - json_token["head"] = token.head.i-token.i - json_token["dep"] = token.dep_ - json_token["ner"] = biluo_tags[token.i] - json_sent["tokens"].append(json_token) - json_para["sentences"].append(json_sent) - json_doc["paragraphs"].append(json_para) - return json_doc - - -def biluo_tags_from_offsets(doc, entities, missing="O"): - """Encode labelled spans into per-token tags, using the - Begin/In/Last/Unit/Out scheme (BILUO). - - doc (Doc): The document that the entity offsets refer to. The output tags - will refer to the token boundaries within the document. - entities (iterable): A sequence of `(start, end, label)` triples. `start` - and `end` should be character-offset integers denoting the slice into - the original string. - RETURNS (list): A list of unicode strings, describing the tags. Each tag - string will be of the form either "", "O" or "{action}-{label}", where - action is one of "B", "I", "L", "U". The string "-" is used where the - entity offsets don't align with the tokenization in the `Doc` object. - The training algorithm will view these as missing values. "O" denotes a - non-entity token. "B" denotes the beginning of a multi-token entity, - "I" the inside of an entity of three or more tokens, and "L" the end - of an entity of two or more tokens. "U" denotes a single-token entity. - - EXAMPLE: - >>> text = 'I like London.' 
- >>> entities = [(len('I like '), len('I like London'), 'LOC')] - >>> doc = nlp.tokenizer(text) - >>> tags = biluo_tags_from_offsets(doc, entities) - >>> assert tags == ["O", "O", 'U-LOC', "O"] - """ - # Ensure no overlapping entity labels exist - tokens_in_ents = {} - - starts = {token.idx: token.i for token in doc} - ends = {token.idx + len(token): token.i for token in doc} - biluo = ["-" for _ in doc] - # Handle entity cases - for start_char, end_char, label in entities: - for token_index in range(start_char, end_char): - if token_index in tokens_in_ents.keys(): - raise ValueError(Errors.E103.format( - span1=(tokens_in_ents[token_index][0], - tokens_in_ents[token_index][1], - tokens_in_ents[token_index][2]), - span2=(start_char, end_char, label))) - tokens_in_ents[token_index] = (start_char, end_char, label) - - start_token = starts.get(start_char) - end_token = ends.get(end_char) - # Only interested if the tokenization is correct - if start_token is not None and end_token is not None: - if start_token == end_token: - biluo[start_token] = f"U-{label}" - else: - biluo[start_token] = f"B-{label}" - for i in range(start_token+1, end_token): - biluo[i] = f"I-{label}" - biluo[end_token] = f"L-{label}" - # Now distinguish the O cases from ones where we miss the tokenization - entity_chars = set() - for start_char, end_char, label in entities: - for i in range(start_char, end_char): - entity_chars.add(i) - for token in doc: - for i in range(token.idx, token.idx + len(token)): - if i in entity_chars: - break - else: - biluo[token.i] = missing - if "-" in biluo: - ent_str = str(entities) - warnings.warn(Warnings.W030.format( - text=doc.text[:50] + "..." if len(doc.text) > 50 else doc.text, - entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str - )) - return biluo - - -def spans_from_biluo_tags(doc, tags): - """Encode per-token tags following the BILUO scheme into Span object, e.g. - to overwrite the doc.ents. - - doc (Doc): The document that the BILUO tags refer to. - entities (iterable): A sequence of BILUO tags with each tag describing one - token. Each tags string will be of the form of either "", "O" or - "{action}-{label}", where action is one of "B", "I", "L", "U". - RETURNS (list): A sequence of Span objects. - """ - token_offsets = tags_to_entities(tags) - spans = [] - for label, start_idx, end_idx in token_offsets: - span = Span(doc, start_idx, end_idx + 1, label=label) - spans.append(span) - return spans - - -def offsets_from_biluo_tags(doc, tags): - """Encode per-token tags following the BILUO scheme into entity offsets. - - doc (Doc): The document that the BILUO tags refer to. - entities (iterable): A sequence of BILUO tags with each tag describing one - token. Each tags string will be of the form of either "", "O" or - "{action}-{label}", where action is one of "B", "I", "L", "U". - RETURNS (list): A sequence of `(start, end, label)` triples. `start` and - `end` will be character-offset integers denoting the slice into the - original string. 
- """ - spans = spans_from_biluo_tags(doc, tags) - return [(span.start_char, span.end_char, span.label_) for span in spans] - - -def is_punct_label(label): - return label == "P" or label.lower() == "punct" From 866179350be1e86d0f56463a52976007fb07e63d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 15:11:13 +0200 Subject: [PATCH 12/56] Fix import --- spacy/_gold/example.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/_gold/example.py b/spacy/_gold/example.py index db9e10093..7528c360e 100644 --- a/spacy/_gold/example.py +++ b/spacy/_gold/example.py @@ -1,5 +1,6 @@ from .annotation import TokenAnnotation, DocAnnotation -from .gold_parse import GoldParse +# We're hoping to kill this GoldParse dependency but for now match semantics. +from ..syntax.gold_parse import GoldParse class Example: From 0f9b4bbfea2a7eb9d79797108d3e1e776b0f4a25 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 15:12:52 +0200 Subject: [PATCH 13/56] Fix imports --- spacy/_gold/example.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/_gold/example.py b/spacy/_gold/example.py index 7528c360e..969ba0374 100644 --- a/spacy/_gold/example.py +++ b/spacy/_gold/example.py @@ -1,4 +1,6 @@ from .annotation import TokenAnnotation, DocAnnotation +from ..errors import Errors, AlignmentError +from ..tokens import Doc # We're hoping to kill this GoldParse dependency but for now match semantics. from ..syntax.gold_parse import GoldParse From 17533a92863ffc8d7bf34155293e3f74018fe096 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 15:13:07 +0200 Subject: [PATCH 14/56] Format --- spacy/_gold/augment.py | 18 ++++--- spacy/_gold/example.py | 107 +++++++++++++++++++++++++++++------------ 2 files changed, 87 insertions(+), 38 deletions(-) diff --git a/spacy/_gold/augment.py b/spacy/_gold/augment.py index 02c812825..1fffe6187 100644 --- a/spacy/_gold/augment.py +++ b/spacy/_gold/augment.py @@ -32,15 +32,18 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0): punct_choices = [random.choice(x["variants"]) for x in ndsv] for word_idx in range(len(words)): for punct_idx in range(len(ndsv)): - if tags[word_idx] in ndsv[punct_idx]["tags"] \ - and words[word_idx] in ndsv[punct_idx]["variants"]: + if ( + tags[word_idx] in ndsv[punct_idx]["tags"] + and words[word_idx] in ndsv[punct_idx]["variants"] + ): words[word_idx] = punct_choices[punct_idx] # paired variants punct_choices = [random.choice(x["variants"]) for x in ndpv] for word_idx in range(len(words)): for punct_idx in range(len(ndpv)): - if tags[word_idx] in ndpv[punct_idx]["tags"] \ - and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]): + if tags[word_idx] in ndpv[punct_idx]["tags"] and words[ + word_idx + ] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]): # backup option: random left vs. 
right from pair pair_idx = random.choice([0, 1]) # best option: rely on paired POS tags like `` / '' @@ -64,7 +67,9 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0): for single_variants in ndsv: variants.extend(single_variants["variants"]) for paired_variants in ndpv: - variants.extend(list(itertools.chain.from_iterable(paired_variants["variants"]))) + variants.extend( + list(itertools.chain.from_iterable(paired_variants["variants"])) + ) # store variants in reverse length order to be able to prioritize # longer matches (e.g., "---" before "--") variants = sorted(variants, key=lambda x: len(x)) @@ -88,8 +93,7 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0): # add variant word else: for variant in variants: - if not match_found and \ - raw[raw_idx:].startswith(variant): + if not match_found and raw[raw_idx:].startswith(variant): raw_idx += len(variant) variant_raw += word match_found = True diff --git a/spacy/_gold/example.py b/spacy/_gold/example.py index 969ba0374..c637c5540 100644 --- a/spacy/_gold/example.py +++ b/spacy/_gold/example.py @@ -1,17 +1,21 @@ from .annotation import TokenAnnotation, DocAnnotation from ..errors import Errors, AlignmentError from ..tokens import Doc + # We're hoping to kill this GoldParse dependency but for now match semantics. from ..syntax.gold_parse import GoldParse class Example: - def __init__(self, doc_annotation=None, token_annotation=None, doc=None, - goldparse=None): + def __init__( + self, doc_annotation=None, token_annotation=None, doc=None, goldparse=None + ): """ Doc can either be text, or an actual Doc """ self.doc = doc self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation() - self.token_annotation = token_annotation if token_annotation else TokenAnnotation() + self.token_annotation = ( + token_annotation if token_annotation else TokenAnnotation() + ) self.goldparse = goldparse @classmethod @@ -49,13 +53,33 @@ class Example: self.goldparse = gold return self.goldparse - def set_token_annotation(self, ids=None, words=None, tags=None, pos=None, - morphs=None, lemmas=None, heads=None, deps=None, - entities=None, sent_starts=None, brackets=None): - self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags, - pos=pos, morphs=morphs, lemmas=lemmas, heads=heads, - deps=deps, entities=entities, - sent_starts=sent_starts, brackets=brackets) + def set_token_annotation( + self, + ids=None, + words=None, + tags=None, + pos=None, + morphs=None, + lemmas=None, + heads=None, + deps=None, + entities=None, + sent_starts=None, + brackets=None, + ): + self.token_annotation = TokenAnnotation( + ids=ids, + words=words, + tags=tags, + pos=pos, + morphs=morphs, + lemmas=lemmas, + heads=heads, + deps=deps, + entities=entities, + sent_starts=sent_starts, + brackets=brackets, + ) def set_doc_annotation(self, cats=None, links=None): if cats: @@ -77,11 +101,19 @@ class Example: split_examples = [] for i in range(len(t.words)): if i > 0 and t.sent_starts[i] == 1: - s_example.set_token_annotation(ids=s_ids, - words=s_words, tags=s_tags, pos=s_pos, morphs=s_morphs, - lemmas=s_lemmas, heads=s_heads, deps=s_deps, - entities=s_ents, sent_starts=s_sent_starts, - brackets=s_brackets) + s_example.set_token_annotation( + ids=s_ids, + words=s_words, + tags=s_tags, + pos=s_pos, + morphs=s_morphs, + lemmas=s_lemmas, + heads=s_heads, + deps=s_deps, + entities=s_ents, + sent_starts=s_sent_starts, + brackets=s_brackets, + ) split_examples.append(s_example) s_example = Example(doc=None, doc_annotation=self.doc_annotation) 
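# (Context for the hunk being reformatted here: on each flush, split_sents
# re-bases head indices via t.get_head(i) - sent_start_i, so every
# per-sentence Example carries a self-contained, sentence-relative
# dependency tree; bracket offsets are shifted the same way.)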
s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], [] @@ -99,20 +131,27 @@ class Example: s_ents.append(t.get_entity(i)) s_sent_starts.append(t.get_sent_start(i)) for b_end, b_label in t.brackets_by_start.get(i, []): - s_brackets.append( - (i - sent_start_i, b_end - sent_start_i, b_label) - ) + s_brackets.append((i - sent_start_i, b_end - sent_start_i, b_label)) i += 1 - s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags, - pos=s_pos, morphs=s_morphs, lemmas=s_lemmas, heads=s_heads, - deps=s_deps, entities=s_ents, sent_starts=s_sent_starts, - brackets=s_brackets) + s_example.set_token_annotation( + ids=s_ids, + words=s_words, + tags=s_tags, + pos=s_pos, + morphs=s_morphs, + lemmas=s_lemmas, + heads=s_heads, + deps=s_deps, + entities=s_ents, + sent_starts=s_sent_starts, + brackets=s_brackets, + ) split_examples.append(s_example) return split_examples - - def get_gold_parses(self, merge=True, vocab=None, make_projective=False, - ignore_misaligned=False): + def get_gold_parses( + self, merge=True, vocab=None, make_projective=False, ignore_misaligned=False + ): """Return a list of (doc, GoldParse) objects. If merge is set to True, keep all Token annotations as one big list.""" d = self.doc_annotation @@ -125,8 +164,9 @@ class Example: raise ValueError(Errors.E998) doc = Doc(vocab, words=t.words) try: - gp = GoldParse.from_annotation(doc, d, t, - make_projective=make_projective) + gp = GoldParse.from_annotation( + doc, d, t, make_projective=make_projective + ) except AlignmentError: if ignore_misaligned: gp = None @@ -143,9 +183,12 @@ class Example: raise ValueError(Errors.E998) split_doc = Doc(vocab, words=split_example.token_annotation.words) try: - gp = GoldParse.from_annotation(split_doc, d, - split_example.token_annotation, - make_projective=make_projective) + gp = GoldParse.from_annotation( + split_doc, + d, + split_example.token_annotation, + make_projective=make_projective, + ) except AlignmentError: if ignore_misaligned: gp = None @@ -194,7 +237,9 @@ class Example: else: gold = None if gold is not None: - converted_examples.append(Example.from_gold(goldparse=gold, doc=doc)) + converted_examples.append( + Example.from_gold(goldparse=gold, doc=doc) + ) else: raise ValueError(Errors.E999.format(gold_dict=gold_dict)) else: From 7f135736f4528a7b9551dc49d0457e95dcc42deb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 15:28:52 +0200 Subject: [PATCH 15/56] Fix imports --- spacy/_gold/align.py | 2 +- spacy/_gold/augment.py | 2 +- spacy/_gold/corpus.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/_gold/align.py b/spacy/_gold/align.py index 7703232b2..ac2700c1f 100644 --- a/spacy/_gold/align.py +++ b/spacy/_gold/align.py @@ -1,5 +1,5 @@ import numpy -from .errors import Errors, AlignmentError +from ..errors import Errors, AlignmentError def align(tokens_a, tokens_b): diff --git a/spacy/_gold/augment.py b/spacy/_gold/augment.py index 1fffe6187..656308214 100644 --- a/spacy/_gold/augment.py +++ b/spacy/_gold/augment.py @@ -1,6 +1,6 @@ import random import itertools -from .exmaple import Example +from .example import Example def make_orth_variants(nlp, example, orth_variant_level=0.0): diff --git a/spacy/_gold/corpus.py b/spacy/_gold/corpus.py index 2fdfd8d2a..b0b454745 100644 --- a/spacy/_gold/corpus.py +++ b/spacy/_gold/corpus.py @@ -7,9 +7,9 @@ import itertools from ..tokens import Doc from .. 
import util from ..errors import Errors -from .gold_utils import read_json_file, read_json_object +from .gold_io import read_json_file, read_json_object from .augment import make_orth_variants, add_noise -from .exmaple import Example +from .example import Example class GoldCorpus(object): From 74204116a3e83708589a253a120c5941d9b6863e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 15:29:32 +0200 Subject: [PATCH 16/56] Rename _gold -> gold --- spacy/{_gold => gold}/align.py | 0 spacy/{_gold => gold}/annotation.py | 0 spacy/{_gold => gold}/augment.py | 0 spacy/{_gold => gold}/corpus.py | 0 spacy/{_gold => gold}/example.py | 0 spacy/{_gold => gold}/gold_io.pyx | 0 spacy/{_gold => gold}/iob_utils.py | 0 7 files changed, 0 insertions(+), 0 deletions(-) rename spacy/{_gold => gold}/align.py (100%) rename spacy/{_gold => gold}/annotation.py (100%) rename spacy/{_gold => gold}/augment.py (100%) rename spacy/{_gold => gold}/corpus.py (100%) rename spacy/{_gold => gold}/example.py (100%) rename spacy/{_gold => gold}/gold_io.pyx (100%) rename spacy/{_gold => gold}/iob_utils.py (100%) diff --git a/spacy/_gold/align.py b/spacy/gold/align.py similarity index 100% rename from spacy/_gold/align.py rename to spacy/gold/align.py diff --git a/spacy/_gold/annotation.py b/spacy/gold/annotation.py similarity index 100% rename from spacy/_gold/annotation.py rename to spacy/gold/annotation.py diff --git a/spacy/_gold/augment.py b/spacy/gold/augment.py similarity index 100% rename from spacy/_gold/augment.py rename to spacy/gold/augment.py diff --git a/spacy/_gold/corpus.py b/spacy/gold/corpus.py similarity index 100% rename from spacy/_gold/corpus.py rename to spacy/gold/corpus.py diff --git a/spacy/_gold/example.py b/spacy/gold/example.py similarity index 100% rename from spacy/_gold/example.py rename to spacy/gold/example.py diff --git a/spacy/_gold/gold_io.pyx b/spacy/gold/gold_io.pyx similarity index 100% rename from spacy/_gold/gold_io.pyx rename to spacy/gold/gold_io.pyx diff --git a/spacy/_gold/iob_utils.py b/spacy/gold/iob_utils.py similarity index 100% rename from spacy/_gold/iob_utils.py rename to spacy/gold/iob_utils.py From 53b00991fd844892e3efa6090426a99babf91e9d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 15:36:46 +0200 Subject: [PATCH 17/56] Fix imports --- setup.py | 3 ++- spacy/syntax/_beam_utils.pyx | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index d16615f5f..864a4036a 100755 --- a/setup.py +++ b/setup.py @@ -35,13 +35,14 @@ MOD_NAMES = [ "spacy.syntax.stateclass", "spacy.syntax._state", "spacy.tokenizer", + "spacy.syntax.gold_parse", "spacy.syntax.nn_parser", "spacy.syntax._parser_model", "spacy.syntax._beam_utils", "spacy.syntax.nonproj", "spacy.syntax.transition_system", "spacy.syntax.arc_eager", - "spacy.gold", + "spacy.gold.gold_io", "spacy.tokens.doc", "spacy.tokens.span", "spacy.tokens.token", diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx index 03702e54e..46bff1af9 100644 --- a/spacy/syntax/_beam_utils.pyx +++ b/spacy/syntax/_beam_utils.pyx @@ -9,7 +9,7 @@ import numpy from ..typedefs cimport hash_t, class_t from .transition_system cimport TransitionSystem, Transition -from ..gold cimport GoldParse +from .gold_parse cimport GoldParse from .stateclass cimport StateC, StateClass from ..errors import Errors From 6e87ca1f452b3a12aa6100d38a4dd0e1bf8bba4b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 15:36:58 +0200 Subject: [PATCH 18/56] Fix imports --- 
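
The import fixes in patches 15 through 18 are easier to follow with the resulting package layout in mind. A rough sketch, using only paths that appear in the diffs (comments added for orientation):

    spacy/
        errors.py            # Errors, AlignmentError: hence "from ..errors"
        syntax/
            gold_parse.pyx   # GoldParse lives here, compiled via setup.py
            _beam_utils.pyx  # "from .gold_parse cimport GoldParse"
        gold/                # renamed from _gold in patch 16
            align.py         # "from ..errors import Errors, AlignmentError"
            augment.py       # "from .example import Example" (same package)
            corpus.py        # "from .gold_io import read_json_file, ..."
            gold_io.pyx      # listed as "spacy.gold.gold_io" in MOD_NAMES
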
spacy/syntax/_parser_model.pyx | 2 +- spacy/syntax/gold_parse.pyx | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx index 60d22a1ab..7a4eccfc4 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx @@ -16,7 +16,7 @@ from thinc.api import Linear, Model, CupyOps, NumpyOps, use_ops, noop from ..typedefs cimport weight_t, class_t, hash_t from ..tokens.doc cimport Doc -from ..gold cimport GoldParse +from .gold_parse cimport GoldParse from .stateclass cimport StateClass from .transition_system cimport Transition diff --git a/spacy/syntax/gold_parse.pyx b/spacy/syntax/gold_parse.pyx index 59e8f4bbb..df4059a21 100644 --- a/spacy/syntax/gold_parse.pyx +++ b/spacy/syntax/gold_parse.pyx @@ -10,11 +10,12 @@ import srsly import warnings from .. import util -from ..syntax import nonproj +from . import nonproj from ..tokens import Doc, Span from ..errors import Errors, AlignmentError, Warnings -from .iob_utils import offsets_from_biluo_tags -from .align import align +from ..gold.annotation import TokenAnnotation +from ..gold.iob_utils import offsets_from_biluo_tags, biluo_tags_from_offsets +from ..gold.align import align punct_re = re.compile(r"\W") From b69fa77ccc854d69a00654ecbdfb6ec069a794b7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 15:38:46 +0200 Subject: [PATCH 19/56] Add missing inits --- spacy/gold/__init__.pxd | 0 spacy/gold/__init__.py | 13 +++++++++++++ 2 files changed, 13 insertions(+) create mode 100644 spacy/gold/__init__.pxd create mode 100644 spacy/gold/__init__.py diff --git a/spacy/gold/__init__.pxd b/spacy/gold/__init__.pxd new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/gold/__init__.py b/spacy/gold/__init__.py new file mode 100644 index 000000000..b8d35972d --- /dev/null +++ b/spacy/gold/__init__.py @@ -0,0 +1,13 @@ +from .corpus import GoldCorpus +from ..syntax.gold_parse import GoldParse +from .example import Example +from .annotation import TokenAnnotation, DocAnnotation +from .align import align + +from .iob_utils import iob_to_biluo, biluo_to_iob +from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags +from .iob_utils import spans_from_biluo_tags +from .iob_utils import tags_to_entities + +from .gold_io import docs_to_json +from .gold_io import read_json_file, read_json_object From 084271c9e9a3f50095e7c1e55a0218d42e21205e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 8 Jun 2020 22:09:57 +0200 Subject: [PATCH 20/56] Remove GoldParse from public API * Move get_parses_from_example to spacy.syntax * Get GoldParse out of Example * Avoid expecting GoldParse input in parser * Add Alignment to spacy.gold.align * Update Example object * Add comment * Update pipeline * Fix imports * Simplify gold_io * WIP on GoldCorpus * Update test * Xfail some gold tests * Remove ignore_misaligned option from GoldCorpus * Fix Example constructor * Update test * Fix usage of Example * Add deprecated_get_gold method on Example * Patch scorer * Fix test * Fix test * Update tests * Xfail a test * Fix passing of make_projective * Pass make_projective by default * Hack data format in Example.from_dict * Update tests * Fix example.from_dict * Update morphologizer * Fix entity linker * Add get_field to TokenAnnotation * Fix Example.get_aligned * Update test * Fix alignment * Fix corpus * Fix GoldCorpus * Handle misaligned * Format * Fix missing import --- spacy/cli/train_from_config.py | 4 +- spacy/gold/__init__.py | 2 +- 
spacy/gold/align.py | 20 +++ spacy/gold/annotation.py | 24 ++++ spacy/gold/corpus.py | 62 +++------- spacy/gold/example.py | 149 ++++++++++------------- spacy/gold/gold_io.pyx | 60 +++++---- spacy/language.py | 1 + spacy/pipeline/morphologizer.pyx | 2 +- spacy/pipeline/pipes.pyx | 52 ++++---- spacy/scorer.py | 2 +- spacy/syntax/gold_parse.pyx | 51 ++++++++ spacy/syntax/nn_parser.pyx | 15 ++- spacy/tests/parser/test_add_label.py | 12 +- spacy/tests/parser/test_neural_parser.py | 2 +- spacy/tests/parser/test_preset_sbd.py | 3 +- spacy/tests/test_gold.py | 137 ++++++++------------- spacy/tests/test_language.py | 20 +-- 18 files changed, 315 insertions(+), 303 deletions(-) diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index a6d0a0abc..c4db5f6ba 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -11,6 +11,7 @@ from thinc.api import Model, use_pytorch_for_gpu_memory import random from ..gold import GoldCorpus +from ..gold import Example from .. import util from ..errors import Errors from ..ml import models # don't remove - required to load the built-in architectures @@ -243,7 +244,7 @@ def create_train_batches(nlp, corpus, cfg): orth_variant_level=cfg["orth_variant_level"], gold_preproc=cfg["gold_preproc"], max_length=cfg["max_length"], - ignore_misaligned=True, + ignore_misaligned=True )) if len(train_examples) == 0: raise ValueError(Errors.E988) @@ -271,6 +272,7 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg): nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True ) ) + n_words = sum(len(ex.doc) for ex in dev_examples) start_time = timer() diff --git a/spacy/gold/__init__.py b/spacy/gold/__init__.py index b8d35972d..5e41d30cb 100644 --- a/spacy/gold/__init__.py +++ b/spacy/gold/__init__.py @@ -10,4 +10,4 @@ from .iob_utils import spans_from_biluo_tags from .iob_utils import tags_to_entities from .gold_io import docs_to_json -from .gold_io import read_json_file, read_json_object +from .gold_io import read_json_file diff --git a/spacy/gold/align.py b/spacy/gold/align.py index ac2700c1f..49e8aaa98 100644 --- a/spacy/gold/align.py +++ b/spacy/gold/align.py @@ -2,6 +2,26 @@ import numpy from ..errors import Errors, AlignmentError +class Alignment: + def __init__(self, spacy_words, gold_words): + # Do many-to-one alignment for misaligned tokens. + # If we over-segment, we'll have one gold word that covers a sequence + # of predicted words + # If we under-segment, we'll have one predicted word that covers a + # sequence of gold words. + # If we "mis-segment", we'll have a sequence of predicted words covering + # a sequence of gold words. That's many-to-many -- we don't do that + # except for NER spans where the start and end can be aligned. + cost, i2j, j2i, i2j_multi, j2i_multi = align(spacy_words, gold_words) + self.cost = cost + self.i2j = i2j + self.j2i = j2i + self.i2j_multi = i2j_multi + self.j2i_multi = j2i_multi + self.cand_to_gold = [(j if j >= 0 else None) for j in i2j] + self.gold_to_cand = [(i if i >= 0 else None) for i in j2i] + + def align(tokens_a, tokens_b): """Calculate alignment tables between two tokenizations. 
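
As a quick sanity check on the new wrapper, here is what Alignment produces for a small mis-segmented pair; the token lists are invented, and the values follow the cost accounting and table semantics of the align() function it delegates to:

    a = Alignment(
        ["i", "listened", "to", "obama", "'", "s", "podcasts", "."],  # candidate
        ["i", "listened", "to", "obama", "'s", "podcasts", "."],      # reference
    )
    # a.cost == 3
    # a.cand_to_gold == [0, 1, 2, 3, None, None, 5, 6]
    # a.gold_to_cand == [0, 1, 2, 3, None, 6, 7]
    # a.i2j_multi == {4: 4, 5: 4}    both "'" and "s" fold into gold "'s"
    # a.j2i_multi == {}
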
diff --git a/spacy/gold/annotation.py b/spacy/gold/annotation.py index cd8ac0717..6bae679c3 100644 --- a/spacy/gold/annotation.py +++ b/spacy/gold/annotation.py @@ -28,6 +28,30 @@ class TokenAnnotation: for b_start, b_end, b_label in brackets: self.brackets_by_start.setdefault(b_start, []).append((b_end, b_label)) + def get_field(self, field): + if field == "id": + return self.ids + elif field == "word": + return self.words + elif field == "tag": + return self.tags + elif field == "pos": + return self.pos + elif field == "morph": + return self.morphs + elif field == "lemma": + return self.lemmas + elif field == "head": + return self.heads + elif field == "dep": + return self.deps + elif field == "ner": + return self.entities + elif field == "sent_start": + return self.sent_starts + else: + raise ValueError(f"Unknown field: {field}") + @property def brackets(self): brackets = [] diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index b0b454745..9462f0aa4 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -6,8 +6,8 @@ from pathlib import Path import itertools from ..tokens import Doc from .. import util -from ..errors import Errors -from .gold_io import read_json_file, read_json_object +from ..errors import Errors, AlignmentError +from .gold_io import read_json_file, json_to_examples from .augment import make_orth_variants, add_noise from .example import Example @@ -43,9 +43,8 @@ class GoldCorpus(object): if not directory.exists(): directory.mkdir() n = 0 - for i, example in enumerate(examples): - ex_dict = example.to_dict() - text = example.text + for i, ex_dict in enumerate(examples): + text = ex_dict["text"] srsly.write_msgpack(directory / f"{i}.msg", (text, ex_dict)) n += 1 if limit and n >= limit: @@ -87,7 +86,9 @@ class GoldCorpus(object): # TODO: proper format checks with schemas if isinstance(first_gold_tuple, dict): if first_gold_tuple.get("paragraphs", None): - examples = read_json_object(gold_tuples) + examples = [] + for json_doc in gold_tuples: + examples.extend(json_to_examples(json_doc)) elif first_gold_tuple.get("doc_annotation", None): examples = [] for ex_dict in gold_tuples: @@ -117,7 +118,7 @@ class GoldCorpus(object): except KeyError as e: msg = "Missing key {}".format(e) raise KeyError(Errors.E996.format(file=file_name, msg=msg)) - except UnboundLocalError: + except UnboundLocalError as e: msg = "Unexpected document structure" raise ValueError(Errors.E996.format(file=file_name, msg=msg)) @@ -200,9 +201,9 @@ class GoldCorpus(object): ): """ Setting gold_preproc will result in creating a doc per sentence """ for example in examples: + example_docs = [] if gold_preproc: split_examples = example.split_sents() - example_golds = [] for split_example in split_examples: split_example_docs = cls._make_docs( nlp, @@ -211,13 +212,7 @@ class GoldCorpus(object): noise_level=noise_level, orth_variant_level=orth_variant_level, ) - split_example_golds = cls._make_golds( - split_example_docs, - vocab=nlp.vocab, - make_projective=make_projective, - ignore_misaligned=ignore_misaligned, - ) - example_golds.extend(split_example_golds) + example_docs.extend(split_example_docs) else: example_docs = cls._make_docs( nlp, @@ -226,16 +221,14 @@ class GoldCorpus(object): noise_level=noise_level, orth_variant_level=orth_variant_level, ) - example_golds = cls._make_golds( - example_docs, - vocab=nlp.vocab, - make_projective=make_projective, - ignore_misaligned=ignore_misaligned, - ) - for ex in example_golds: - if ex.goldparse is not None: - if (not max_length) or len(ex.doc) < 
max_length: - yield ex + for ex in example_docs: + if (not max_length) or len(ex.doc) < max_length: + if ignore_misaligned: + try: + _ = ex._deprecated_get_gold() + except AlignmentError: + continue + yield ex @classmethod def _make_docs( @@ -256,22 +249,3 @@ class GoldCorpus(object): ) var_example.doc = var_doc return [var_example] - - @classmethod - def _make_golds( - cls, examples, vocab=None, make_projective=False, ignore_misaligned=False - ): - filtered_examples = [] - for example in examples: - gold_parses = example.get_gold_parses( - vocab=vocab, - make_projective=make_projective, - ignore_misaligned=ignore_misaligned, - ) - assert len(gold_parses) == 1 - doc, gold = gold_parses[0] - if doc: - assert doc == example.doc - example.goldparse = gold - filtered_examples.append(example) - return filtered_examples diff --git a/spacy/gold/example.py b/spacy/gold/example.py index c637c5540..1d8665572 100644 --- a/spacy/gold/example.py +++ b/spacy/gold/example.py @@ -1,36 +1,56 @@ from .annotation import TokenAnnotation, DocAnnotation +from .align import Alignment from ..errors import Errors, AlignmentError from ..tokens import Doc -# We're hoping to kill this GoldParse dependency but for now match semantics. -from ..syntax.gold_parse import GoldParse - class Example: - def __init__( - self, doc_annotation=None, token_annotation=None, doc=None, goldparse=None - ): + def __init__(self, doc=None, doc_annotation=None, token_annotation=None): """ Doc can either be text, or an actual Doc """ self.doc = doc self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation() self.token_annotation = ( token_annotation if token_annotation else TokenAnnotation() ) - self.goldparse = goldparse + self._alignment = None - @classmethod - def from_gold(cls, goldparse, doc=None): - doc_annotation = DocAnnotation(cats=goldparse.cats, links=goldparse.links) - token_annotation = goldparse.get_token_annotation() - return cls(doc_annotation, token_annotation, doc) + def _deprecated_get_gold(self, make_projective=False): + from ..syntax.gold_parse import get_parses_from_example + + _, gold = get_parses_from_example(self, make_projective=make_projective)[0] + return gold @classmethod def from_dict(cls, example_dict, doc=None): + if example_dict is None: + raise ValueError("Example.from_dict expected dict, received None") + # TODO: This is ridiculous... token_dict = example_dict.get("token_annotation", {}) - token_annotation = TokenAnnotation.from_dict(token_dict) doc_dict = example_dict.get("doc_annotation", {}) + for key, value in example_dict.items(): + if key in ("token_annotation", "doc_annotation"): + pass + elif key in ("cats", "links"): + doc_dict[key] = value + else: + token_dict[key] = value + token_annotation = TokenAnnotation.from_dict(token_dict) doc_annotation = DocAnnotation.from_dict(doc_dict) - return cls(doc_annotation, token_annotation, doc) + return cls( + doc=doc, doc_annotation=doc_annotation, token_annotation=token_annotation + ) + + @property + def alignment(self): + if self._alignment is None: + if self.doc is None: + return None + spacy_words = [token.orth_ for token in self.doc] + gold_words = self.token_annotation.words + if gold_words == []: + gold_words = spacy_words + self._alignment = Alignment(spacy_words, gold_words) + return self._alignment def to_dict(self): """ Note that this method does NOT export the doc, only the annotations ! 
""" @@ -46,12 +66,31 @@ class Example: return self.doc.text return self.doc - @property - def gold(self): - if self.goldparse is None: - doc, gold = self.get_gold_parses()[0] - self.goldparse = gold - return self.goldparse + def get_aligned(self, field): + """Return an aligned array for a token annotation field.""" + if self.doc is None: + return self.token_annotation.get_field(field) + doc = self.doc + if field == "word": + return [token.orth_ for token in doc] + gold_values = self.token_annotation.get_field(field) + alignment = self.alignment + i2j_multi = alignment.i2j_multi + gold_to_cand = alignment.gold_to_cand + cand_to_gold = alignment.cand_to_gold + + output = [] + for i, gold_i in enumerate(cand_to_gold): + if doc[i].text.isspace(): + output.append(None) + elif gold_i is None: + if i in i2j_multi: + output.append(gold_values[i2j_multi[i]]) + else: + output.append(None) + else: + output.append(gold_values[gold_i]) + return output def set_token_annotation( self, @@ -149,55 +188,6 @@ class Example: split_examples.append(s_example) return split_examples - def get_gold_parses( - self, merge=True, vocab=None, make_projective=False, ignore_misaligned=False - ): - """Return a list of (doc, GoldParse) objects. - If merge is set to True, keep all Token annotations as one big list.""" - d = self.doc_annotation - # merge == do not modify Example - if merge: - t = self.token_annotation - doc = self.doc - if doc is None or not isinstance(doc, Doc): - if not vocab: - raise ValueError(Errors.E998) - doc = Doc(vocab, words=t.words) - try: - gp = GoldParse.from_annotation( - doc, d, t, make_projective=make_projective - ) - except AlignmentError: - if ignore_misaligned: - gp = None - else: - raise - return [(doc, gp)] - # not merging: one GoldParse per sentence, defining docs with the words - # from each sentence - else: - parses = [] - split_examples = self.split_sents() - for split_example in split_examples: - if not vocab: - raise ValueError(Errors.E998) - split_doc = Doc(vocab, words=split_example.token_annotation.words) - try: - gp = GoldParse.from_annotation( - split_doc, - d, - split_example.token_annotation, - make_projective=make_projective, - ) - except AlignmentError: - if ignore_misaligned: - gp = None - else: - raise - if gp is not None: - parses.append((split_doc, gp)) - return parses - @classmethod def to_example_objects(cls, examples, make_doc=None, keep_raw_text=False): """ @@ -219,29 +209,16 @@ class Example: else: doc = make_doc(ex) converted_examples.append(Example(doc=doc)) - # convert Doc to Example - elif isinstance(ex, Doc): - converted_examples.append(Example(doc=ex)) # convert tuples to Example elif isinstance(ex, tuple) and len(ex) == 2: doc, gold = ex - gold_dict = {} # convert string to Doc if isinstance(doc, str) and not keep_raw_text: doc = make_doc(doc) - # convert dict to GoldParse - if isinstance(gold, dict): - gold_dict = gold - if doc is not None or gold.get("words", None) is not None: - gold = GoldParse(doc, **gold) - else: - gold = None - if gold is not None: - converted_examples.append( - Example.from_gold(goldparse=gold, doc=doc) - ) - else: - raise ValueError(Errors.E999.format(gold_dict=gold_dict)) + converted_examples.append(Example.from_dict(gold, doc=doc)) + # convert Doc to Example + elif isinstance(ex, Doc): + converted_examples.append(Example(doc=ex)) else: converted_examples.append(ex) return converted_examples diff --git a/spacy/gold/gold_io.pyx b/spacy/gold/gold_io.pyx index 15581c151..424e44f72 100644 --- a/spacy/gold/gold_io.pyx +++ 
b/spacy/gold/gold_io.pyx @@ -3,7 +3,6 @@ import srsly from .. import util from ..errors import Warnings from ..tokens import Token, Doc -from .example import Example from .iob_utils import biluo_tags_from_offsets @@ -64,6 +63,19 @@ def docs_to_json(docs, id=0, ner_missing_tag="O"): return json_doc +def read_json_file(loc, docs_filter=None, limit=None): + loc = util.ensure_path(loc) + if loc.is_dir(): + for filename in loc.iterdir(): + yield from read_json_file(loc / filename, limit=limit) + else: + for doc in json_iterate(loc): + if docs_filter is not None and not docs_filter(doc): + continue + for json_data in json_to_examples(doc): + yield json_data + + def json_to_examples(doc): """Convert an item in the JSON-formatted training data to the format used by GoldParse. @@ -72,7 +84,7 @@ def json_to_examples(doc): YIELDS (Example): The reformatted data - one training example per paragraph """ for paragraph in doc["paragraphs"]: - example = Example(doc=paragraph.get("raw", None)) + example = {"text": paragraph.get("raw", None)} words = [] ids = [] tags = [] @@ -110,39 +122,23 @@ def json_to_examples(doc): cats = {} for cat in paragraph.get("cats", {}): cats[cat["label"]] = cat["value"] - example.set_token_annotation(ids=ids, words=words, tags=tags, - pos=pos, morphs=morphs, lemmas=lemmas, heads=heads, - deps=labels, entities=ner, sent_starts=sent_starts, - brackets=brackets) - example.set_doc_annotation(cats=cats) + example["token_annotation"] = dict( + ids=ids, + words=words, + tags=tags, + pos=pos, + morphs=morphs, + lemmas=lemmas, + heads=heads, + deps=labels, + entities=ner, + sent_starts=sent_starts, + brackets=brackets + ) + example["doc_annotation"] = dict(cats=cats) yield example -def read_json_file(loc, docs_filter=None, limit=None): - loc = util.ensure_path(loc) - if loc.is_dir(): - for filename in loc.iterdir(): - yield from read_json_file(loc / filename, limit=limit) - else: - for doc in json_iterate(loc): - if docs_filter is not None and not docs_filter(doc): - continue - for json_data in json_to_examples(doc): - yield json_data - - -def read_json_object(json_corpus_section): - """Take a list of JSON-formatted documents (e.g. from an already loaded - training data file) and yield annotations in the GoldParse format. - - json_corpus_section (list): The data. - YIELDS (Example): The reformatted data - one training example per paragraph - """ - for json_doc in json_corpus_section: - examples = json_to_examples(json_doc) - for ex in examples: - yield ex - def json_iterate(loc): # We should've made these files jsonl...But since we didn't, parse out diff --git a/spacy/language.py b/spacy/language.py index 6341dc858..57664ec17 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -636,6 +636,7 @@ class Language(object): examples (iterable): `Example` objects. YIELDS (tuple): `Example` objects. """ + # TODO: This is deprecated right? 
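
With read_json_object gone, json_to_examples is the single conversion point, and it now yields plain dicts that GoldCorpus can serialize as-is. One paragraph comes out roughly as follows (values invented for illustration; pos, morphs and lemmas are filled the same way as tags):

    {
        "text": "I like London.",
        "token_annotation": {
            "ids": [0, 1, 2, 3],
            "words": ["I", "like", "London", "."],
            "tags": ["PRP", "VBP", "NNP", "."],
            "heads": [1, 1, 1, 1],
            "deps": ["nsubj", "ROOT", "dobj", "punct"],
            "entities": ["O", "O", "U-GPE", "O"],
            "sent_starts": [1, 0, 0, 0],
            "brackets": [],
        },
        "doc_annotation": {"cats": {"TRAVEL": 1.0}},
    }
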
         for name, proc in self.pipeline:
             if hasattr(proc, "preprocess_gold"):
                 examples = proc.preprocess_gold(examples)
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index c45a72b25..7116d7afd 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -92,7 +92,7 @@ class Morphologizer(Tagger):
         guesses = scores.argmax(axis=1)
         known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
         for ex in examples:
-            gold = ex.gold
+            gold = ex._deprecated_get_gold()
             for i in range(len(gold.morphs)):
                 pos = gold.pos[i] if i < len(gold.pos) else ""
                 morph = gold.morphs[i]
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index a6edf00d9..2c40738f6 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -373,7 +373,7 @@ class Tagger(Pipe):

     def get_loss(self, examples, scores):
         loss_func = SequenceCategoricalCrossentropy(names=self.labels)
-        truths = [eg.gold.tags for eg in examples]
+        truths = [eg.get_aligned("tag") for eg in examples]
         d_scores, loss = loss_func(scores, truths)
         if self.model.ops.xp.isnan(loss):
             raise ValueError("nan value when computing loss")
@@ -560,9 +560,9 @@ class SentenceRecognizer(Tagger):
         correct = numpy.zeros((scores.shape[0],), dtype="i")
         guesses = scores.argmax(axis=1)
         known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
-        for ex in examples:
-            gold = ex.gold
-            for sent_start in gold.sent_starts:
+        for eg in examples:
+            sent_starts = eg.get_aligned("sent_start")
+            for sent_start in sent_starts:
                 if sent_start is None:
                     correct[idx] = guesses[idx]
                 elif sent_start in tag_index:
@@ -575,7 +575,7 @@ class SentenceRecognizer(Tagger):
         d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
         d_scores *= self.model.ops.asarray(known_labels)
         loss = (d_scores**2).sum()
-        docs = [ex.doc for ex in examples]
+        docs = [eg.doc for eg in examples]
         d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
         return float(loss), d_scores
@@ -706,13 +706,13 @@ class MultitaskObjective(Tagger):
         cdef int idx = 0
         correct = numpy.zeros((scores.shape[0],), dtype="i")
         guesses = scores.argmax(axis=1)
-        golds = [ex.gold for ex in examples]
         docs = [ex.doc for ex in examples]
-        for i, gold in enumerate(golds):
-            for j in range(len(docs[i])):
-                # Handels alignment for tokenization differences
-                token_annotation = gold.get_token_annotation()
-                label = self.make_label(j, token_annotation)
+        for i, eg in enumerate(examples):
+            # Handles alignment for tokenization differences
+            doc_annots = eg.get_aligned()
+            for j in range(len(eg.doc)):
+                tok_annots = {key: values[j] for key, values in doc_annots.items()}
+                label = self.make_label(j, tok_annots)
                 if label is None or label not in self.labels:
                     correct[idx] = guesses[idx]
                 else:
@@ -951,13 +951,12 @@ class TextCategorizer(Pipe):
             losses[self.name] += (gradient**2).sum()

     def _examples_to_truth(self, examples):
-        golds = [ex.gold for ex in examples]
-        truths = numpy.zeros((len(golds), len(self.labels)), dtype="f")
-        not_missing = numpy.ones((len(golds), len(self.labels)), dtype="f")
-        for i, gold in enumerate(golds):
+        truths = numpy.zeros((len(examples), len(self.labels)), dtype="f")
+        not_missing = numpy.ones((len(examples), len(self.labels)), dtype="f")
+        for i, eg in enumerate(examples):
             for j, label in enumerate(self.labels):
-                if label in gold.cats:
-                    truths[i, j] = gold.cats[label]
+                if label in eg.doc_annotation.cats:
+                    truths[i, j] = eg.doc_annotation.cats[label]
                 else:
                     not_missing[i, j] = 0.
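
The switch from eg.gold.tags to eg.get_aligned("tag") is what makes these losses robust to tokenization mismatches. A sketch of the intended behaviour, with invented tokens and tags, based on the get_aligned logic added to Example above:

    # predicted doc tokens: ["Obama", "'", "s", "podcast"]
    # gold words:           ["Obama", "'s", "podcast"]
    # gold tags:            ["PROPN", "PART", "NOUN"]
    eg.get_aligned("tag")
    # == ["PROPN", "PART", "PART", "NOUN"]
    # Both candidate tokens that fold into the gold "'s" pick up its tag via
    # alignment.i2j_multi; a candidate token with no alignment at all (or one
    # that is pure whitespace) comes back as None, i.e. a missing label.
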
truths = self.model.ops.asarray(truths) @@ -1160,14 +1159,14 @@ class EntityLinker(Pipe): # This seems simpler than other ways to get that exact output -- but # it does run the model twice :( predictions = self.model.predict(docs) - golds = [ex.gold for ex in examples] - for doc, gold in zip(docs, golds): + for eg in examples: + doc = eg.doc ents_by_offset = dict() for ent in doc.ents: ents_by_offset[(ent.start_char, ent.end_char)] = ent - for entity, kb_dict in gold.links.items(): + for entity, kb_dict in eg.doc_annotation.links.items(): if isinstance(entity, str): entity = literal_eval(entity) start, end = entity @@ -1188,7 +1187,10 @@ class EntityLinker(Pipe): raise RuntimeError(Errors.E030) set_dropout_rate(self.model, drop) sentence_encodings, bp_context = self.model.begin_update(sentence_docs) - loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds) + loss, d_scores = self.get_similarity_loss( + scores=sentence_encodings, + examples=examples + ) bp_context(d_scores) if sgd is not None: self.model.finish_update(sgd) @@ -1199,10 +1201,10 @@ class EntityLinker(Pipe): self.set_annotations(docs, predictions) return loss - def get_similarity_loss(self, golds, scores): + def get_similarity_loss(self, examples, scores): entity_encodings = [] - for gold in golds: - for entity, kb_dict in gold.links.items(): + for eg in examples: + for entity, kb_dict in eg.doc_annotation.links.items(): for kb_id, value in kb_dict.items(): # this loss function assumes we're only using positive examples if value: @@ -1222,7 +1224,7 @@ class EntityLinker(Pipe): def get_loss(self, examples, scores): cats = [] for ex in examples: - for entity, kb_dict in ex.gold.links.items(): + for entity, kb_dict in ex.doc_annotation.links.items(): for kb_id, value in kb_dict.items(): cats.append([value]) diff --git a/spacy/scorer.py b/spacy/scorer.py index 7e2466be7..5e49a90d2 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -282,7 +282,7 @@ class Scorer(object): if isinstance(example, tuple) and len(example) == 2: doc, gold = example else: - gold = example.gold + gold = example._deprecated_get_gold() doc = example.doc if len(doc) != len(gold): diff --git a/spacy/syntax/gold_parse.pyx b/spacy/syntax/gold_parse.pyx index df4059a21..05361fd82 100644 --- a/spacy/syntax/gold_parse.pyx +++ b/spacy/syntax/gold_parse.pyx @@ -24,6 +24,57 @@ def is_punct_label(label): return label == "P" or label.lower() == "punct" +def get_parses_from_example( + eg, merge=True, vocab=None, make_projective=True, ignore_misaligned=False +): + """Return a list of (doc, GoldParse) objects. 
+ If merge is set to True, keep all Token annotations as one big list.""" + d = eg.doc_annotation + # merge == do not modify Example + if merge: + t = eg.token_annotation + doc = eg.doc + if doc is None or not isinstance(doc, Doc): + if not vocab: + raise ValueError(Errors.E998) + doc = Doc(vocab, words=t.words) + try: + gp = GoldParse.from_annotation( + doc, d, t, make_projective=make_projective + ) + except AlignmentError: + if ignore_misaligned: + gp = None + else: + raise + return [(doc, gp)] + # not merging: one GoldParse per sentence, defining docs with the words + # from each sentence + else: + parses = [] + split_examples = eg.split_sents() + for split_example in split_examples: + if not vocab: + raise ValueError(Errors.E998) + split_doc = Doc(vocab, words=split_example.token_annotation.words) + try: + gp = GoldParse.from_annotation( + split_doc, + d, + split_example.token_annotation, + make_projective=make_projective, + ) + except AlignmentError: + if ignore_misaligned: + gp = None + else: + raise + if gp is not None: + parses.append((split_doc, gp)) + return parses + + + cdef class GoldParse: """Collection for training annotations. diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 12f56ba67..f74f3dd73 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -21,6 +21,7 @@ import warnings from ..tokens.doc cimport Doc from .gold_parse cimport GoldParse +from .gold_parse import get_parses_from_example from ..typedefs cimport weight_t, class_t, hash_t from ._parser_model cimport alloc_activations, free_activations from ._parser_model cimport predict_states, arg_max_if_valid @@ -515,8 +516,8 @@ cdef class Parser: good_golds = [] good_states = [] for i, eg in enumerate(whole_examples): - doc = eg.doc - gold = self.moves.preprocess_gold(eg.gold) + parses = get_parses_from_example(eg) + doc, gold = parses[0] if gold is not None and self.moves.has_gold(gold): good_docs.append(doc) good_golds.append(gold) @@ -535,8 +536,12 @@ cdef class Parser: cdef: StateClass state Transition action - whole_docs = [ex.doc for ex in whole_examples] - whole_golds = [ex.gold for ex in whole_examples] + whole_docs = [] + whole_golds = [] + for eg in whole_examples: + for doc, gold in get_parses_from_example(eg): + whole_docs.append(doc) + whole_golds.append(gold) whole_states = self.moves.init_batch(whole_docs) max_length = max(min_length, min(max_length, min([len(doc) for doc in whole_docs]))) max_moves = 0 @@ -625,7 +630,7 @@ cdef class Parser: doc_sample = [] gold_sample = [] for example in islice(get_examples(), 10): - parses = example.get_gold_parses(merge=False, vocab=self.vocab) + parses = get_parses_from_example(example, merge=False, vocab=self.vocab) for doc, gold in parses: if len(doc): doc_sample.append(doc) diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index ee1bba886..fdab3a2e3 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -34,7 +34,10 @@ def _train_parser(parser): for i in range(5): losses = {} doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) - gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"]) + gold = { + "heads": [1, 1, 3, 3], + "deps": ["left", "ROOT", "left", "ROOT"] + } parser.update((doc, gold), sgd=sgd, losses=losses) return parser @@ -46,9 +49,10 @@ def test_add_label(parser): for i in range(100): losses = {} doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) - gold = GoldParse( - doc, heads=[1, 1, 3, 3], 
deps=["right", "ROOT", "left", "ROOT"] - ) + gold = { + "heads": [1, 1, 3, 3], + "deps": ["right", "ROOT", "left", "ROOT"] + } parser.update((doc, gold), sgd=sgd, losses=losses) doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc = parser(doc) diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index b648e9a00..c07e6aa38 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -46,7 +46,7 @@ def doc(vocab): @pytest.fixture def gold(doc): - return GoldParse(doc, heads=[1, 1, 1], deps=["L", "ROOT", "R"]) + return {"heads": [1, 1, 1], "deps": ["L", "ROOT", "R"]} def test_can_init_nn_parser(parser): diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index dc13fcdf1..3d0726353 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -1,7 +1,6 @@ import pytest from thinc.api import Adam from spacy.attrs import NORM -from spacy.gold import GoldParse from spacy.vocab import Vocab from spacy.pipeline.defaults import default_parser @@ -27,7 +26,7 @@ def parser(vocab): for i in range(10): losses = {} doc = Doc(vocab, words=["a", "b", "c", "d"]) - gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"]) + gold = dict(heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"]) parser.update((doc, gold), sgd=sgd, losses=losses) return parser diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 982c0d910..4b4250179 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -1,9 +1,10 @@ from spacy.errors import AlignmentError from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags -from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo, align +from spacy.gold import spans_from_biluo_tags, iob_to_biluo, align from spacy.gold import GoldCorpus, docs_to_json, Example, DocAnnotation from spacy.lang.en import English from spacy.syntax.nonproj import is_nonproj_tree +from spacy.syntax.gold_parse import GoldParse, get_parses_from_example from spacy.tokens import Doc from spacy.util import get_words_and_spaces, compounding, minibatch import pytest @@ -270,10 +271,9 @@ def test_roundtrip_docs_to_json(doc): srsly.write_json(json_file, [docs_to_json(doc)]) goldcorpus = GoldCorpus(train=str(json_file), dev=str(json_file)) - reloaded_example = next(goldcorpus.dev_dataset(nlp)) - goldparse = reloaded_example.gold - - assert len(doc) == goldcorpus.count_train() + reloaded_example = next(goldcorpus.dev_dataset(nlp=nlp)) + goldparse = reloaded_example._deprecated_get_gold() + assert len(doc) == goldcorpus.count_train() assert text == reloaded_example.text assert tags == goldparse.tags assert pos == goldparse.pos @@ -287,54 +287,6 @@ def test_roundtrip_docs_to_json(doc): assert cats["TRAVEL"] == goldparse.cats["TRAVEL"] assert cats["BAKING"] == goldparse.cats["BAKING"] - # roundtrip to JSONL train dicts - with make_tempdir() as tmpdir: - jsonl_file = tmpdir / "roundtrip.jsonl" - srsly.write_jsonl(jsonl_file, [docs_to_json(doc)]) - goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) - - reloaded_example = next(goldcorpus.dev_dataset(nlp)) - goldparse = reloaded_example.gold - - assert len(doc) == goldcorpus.count_train() - assert text == reloaded_example.text - assert tags == goldparse.tags - assert pos == goldparse.pos - assert morphs == goldparse.morphs - assert lemmas == goldparse.lemmas - assert deps == goldparse.labels - assert heads == goldparse.heads - 
assert biluo_tags == goldparse.ner - assert "TRAVEL" in goldparse.cats - assert "BAKING" in goldparse.cats - assert cats["TRAVEL"] == goldparse.cats["TRAVEL"] - assert cats["BAKING"] == goldparse.cats["BAKING"] - - # roundtrip to JSONL tuples - with make_tempdir() as tmpdir: - jsonl_file = tmpdir / "roundtrip.jsonl" - # write to JSONL train dicts - srsly.write_jsonl(jsonl_file, [docs_to_json(doc)]) - goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) - # load and rewrite as JSONL tuples - srsly.write_jsonl(jsonl_file, goldcorpus.train_examples) - goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) - - reloaded_example = next(goldcorpus.dev_dataset(nlp)) - goldparse = reloaded_example.gold - - assert len(doc) == goldcorpus.count_train() - assert text == reloaded_example.text - assert tags == goldparse.tags - assert deps == goldparse.labels - assert heads == goldparse.heads - assert lemmas == goldparse.lemmas - assert biluo_tags == goldparse.ner - assert "TRAVEL" in goldparse.cats - assert "BAKING" in goldparse.cats - assert cats["TRAVEL"] == goldparse.cats["TRAVEL"] - assert cats["BAKING"] == goldparse.cats["BAKING"] - def test_projective_train_vs_nonprojective_dev(doc): nlp = English() @@ -342,16 +294,16 @@ def test_projective_train_vs_nonprojective_dev(doc): heads = [t.head.i for t in doc] with make_tempdir() as tmpdir: - jsonl_file = tmpdir / "test.jsonl" - # write to JSONL train dicts - srsly.write_jsonl(jsonl_file, [docs_to_json(doc)]) - goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) + json_file = tmpdir / "test.json" + # write to JSON train dicts + srsly.write_json(json_file, [docs_to_json(doc)]) + goldcorpus = GoldCorpus(str(json_file), str(json_file)) - train_reloaded_example = next(goldcorpus.train_dataset(nlp)) - train_goldparse = train_reloaded_example.gold + train_reloaded_example = next(goldcorpus.train_dataset(nlp)) + train_goldparse = get_parses_from_example(train_reloaded_example)[0][1] - dev_reloaded_example = next(goldcorpus.dev_dataset(nlp)) - dev_goldparse = dev_reloaded_example.gold + dev_reloaded_example = next(goldcorpus.dev_dataset(nlp)) + dev_goldparse = dev_reloaded_example._deprecated_get_gold() assert is_nonproj_tree([t.head.i for t in doc]) is True assert is_nonproj_tree(train_goldparse.heads) is False @@ -364,45 +316,49 @@ def test_projective_train_vs_nonprojective_dev(doc): assert deps == dev_goldparse.labels +# Hm, not sure where misalignment check would be handled? In the components too? +# I guess that does make sense. A text categorizer doesn't care if it's +# misaligned... 
+@pytest.mark.xfail # TODO def test_ignore_misaligned(doc): nlp = English() text = doc.text with make_tempdir() as tmpdir: - jsonl_file = tmpdir / "test.jsonl" + json_file = tmpdir / "test.json" data = [docs_to_json(doc)] data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane") - # write to JSONL train dicts - srsly.write_jsonl(jsonl_file, data) - goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) + # write to JSON train dicts + srsly.write_json(json_file, data) + goldcorpus = GoldCorpus(str(json_file), str(json_file)) - with pytest.raises(AlignmentError): - train_reloaded_example = next(goldcorpus.train_dataset(nlp)) + with pytest.raises(AlignmentError): + train_reloaded_example = next(goldcorpus.train_dataset(nlp)) with make_tempdir() as tmpdir: - jsonl_file = tmpdir / "test.jsonl" + json_file = tmpdir / "test.json" data = [docs_to_json(doc)] data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane") - # write to JSONL train dicts - srsly.write_jsonl(jsonl_file, data) - goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) + # write to JSON train dicts + srsly.write_json(json_file, data) + goldcorpus = GoldCorpus(str(json_file), str(json_file)) - # doesn't raise an AlignmentError, but there is nothing to iterate over - # because the only example can't be aligned - train_reloaded_example = list(goldcorpus.train_dataset(nlp, ignore_misaligned=True)) - assert len(train_reloaded_example) == 0 + # doesn't raise an AlignmentError, but there is nothing to iterate over + # because the only example can't be aligned + train_reloaded_example = list(goldcorpus.train_dataset(nlp, ignore_misaligned=True)) + assert len(train_reloaded_example) == 0 def test_make_orth_variants(doc): nlp = English() with make_tempdir() as tmpdir: - jsonl_file = tmpdir / "test.jsonl" - # write to JSONL train dicts - srsly.write_jsonl(jsonl_file, [docs_to_json(doc)]) - goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) + json_file = tmpdir / "test.json" + # write to JSON train dicts + srsly.write_json(json_file, [docs_to_json(doc)]) + goldcorpus = GoldCorpus(str(json_file), str(json_file)) - # due to randomness, test only that this runs with no errors for now - train_reloaded_example = next(goldcorpus.train_dataset(nlp, orth_variant_level=0.2)) - train_goldparse = train_reloaded_example.gold # noqa: F841 + # due to randomness, test only that this runs with no errors for now + train_reloaded_example = next(goldcorpus.train_dataset(nlp, orth_variant_level=0.2)) + train_goldparse = train_reloaded_example._deprecated_get_gold() @pytest.mark.parametrize( @@ -485,6 +441,7 @@ def test_tuple_format_implicit(): _train(train_data) +@pytest.mark.xfail # TODO def test_tuple_format_implicit_invalid(): """Test that an error is thrown for an implicit invalid GoldParse field""" @@ -520,8 +477,18 @@ def test_split_sents(merged_dict): nlp = English() example = Example() example.set_token_annotation(**merged_dict) - assert len(example.get_gold_parses(merge=False, vocab=nlp.vocab)) == 2 - assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1 + assert len(get_parses_from_example( + example, + merge=False, + vocab=nlp.vocab, + make_projective=False) + ) == 2 + assert len(get_parses_from_example( + example, + merge=True, + vocab=nlp.vocab, + make_projective=False + )) == 1 split_examples = example.split_sents() assert len(split_examples) == 2 @@ -557,4 +524,4 @@ def test_empty_example_goldparse(): nlp = English() doc = nlp("") example = Example(doc=doc) - assert len(example.get_gold_parses()) == 1 + 
assert len(get_parses_from_example(example)) == 1 diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 58db0a040..363366eeb 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -19,22 +19,16 @@ def nlp(): return nlp +@pytest.mark.xfail # TODO def test_language_update(nlp): text = "hello world" annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}} wrongkeyannots = {"LABEL": True} doc = Doc(nlp.vocab, words=text.split(" ")) - gold = GoldParse(doc, **annots) - # Update with doc and gold objects - nlp.update((doc, gold)) # Update with text and dict nlp.update((text, annots)) # Update with doc object and dict nlp.update((doc, annots)) - # Update with text and gold object - nlp.update((text, gold)) - # Update with empty doc and gold object - nlp.update((None, gold)) # Update badly with pytest.raises(ValueError): nlp.update((doc, None)) @@ -44,20 +38,16 @@ def test_language_update(nlp): def test_language_evaluate(nlp): text = "hello world" - annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}} + annots = { + "doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}} + } doc = Doc(nlp.vocab, words=text.split(" ")) - gold = GoldParse(doc, **annots) - # Evaluate with doc and gold objects - nlp.evaluate([(doc, gold)]) # Evaluate with text and dict nlp.evaluate([(text, annots)]) # Evaluate with doc object and dict nlp.evaluate([(doc, annots)]) - # Evaluate with text and gold object - nlp.evaluate([(text, gold)]) - # Evaluate badly with pytest.raises(Exception): - nlp.evaluate([text, gold]) + nlp.evaluate([text, annots]) def test_evaluate_no_pipe(nlp): From d9289712ba76d4c67450fe1969642416d0ac57f4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 8 Jun 2020 22:28:50 +0200 Subject: [PATCH 21/56] * Make GoldCorpus return dict, not Example * Make Example require a Doc object (previously optional) Clarify methods in GoldCorpus WIP refactor Example Refactor Example.split_sents Fix test Fix augment Update test Update test Fix import Update test_scorer Update Example --- spacy/cli/converters/conllu2json.py | 10 +- spacy/gold/annotation.py | 3 + spacy/gold/augment.py | 7 +- spacy/gold/corpus.py | 45 ++--- spacy/gold/example.py | 155 +++++++++++------- spacy/gold/gold_io.pyx | 4 +- spacy/syntax/nonproj.pyx | 4 +- spacy/tests/regression/test_issue1501-2000.py | 15 +- spacy/tests/test_gold.py | 24 ++- spacy/tests/test_scorer.py | 18 +- spacy/tokens/doc.pyx | 2 + 11 files changed, 176 insertions(+), 111 deletions(-) diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index 1ece755b8..2cf5f7942 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -2,6 +2,7 @@ import re from ...gold import Example from ...gold import iob_to_biluo, spans_from_biluo_tags, biluo_tags_from_offsets +from ...gold import TokenAnnotation from ...language import Language from ...tokens import Doc, Token from .conll_ner2json import n_sents_info @@ -284,13 +285,8 @@ def example_from_conllu_sentence( spaces.append(t._.merged_spaceafter) ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents] ents = biluo_tags_from_offsets(doc, ent_offsets) - raw = "" - for word, space in zip(words, spaces): - raw += word - if space: - raw += " " - example = Example(doc=raw) - example.set_token_annotation( + example = Example(doc=Doc(vocab, words=words, spaces=spaces)) + example.token_annotation = TokenAnnotation( ids=ids, words=words, tags=tags, diff --git a/spacy/gold/annotation.py 
b/spacy/gold/annotation.py index 6bae679c3..5f78902ab 100644 --- a/spacy/gold/annotation.py +++ b/spacy/gold/annotation.py @@ -1,3 +1,6 @@ +from .iob_utils import biluo_tags_from_offsets + + class TokenAnnotation: def __init__( self, diff --git a/spacy/gold/augment.py b/spacy/gold/augment.py index 656308214..f938f540f 100644 --- a/spacy/gold/augment.py +++ b/spacy/gold/augment.py @@ -1,6 +1,7 @@ import random import itertools from .example import Example +from .annotation import TokenAnnotation def make_orth_variants(nlp, example, orth_variant_level=0.0): @@ -17,14 +18,14 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0): ndsv = nlp.Defaults.single_orth_variants ndpv = nlp.Defaults.paired_orth_variants # modify words in paragraph_tuples - variant_example = Example(doc=raw) + variant_example = Example(doc=nlp.make_doc(raw)) token_annotation = example.token_annotation words = token_annotation.words tags = token_annotation.tags if not words or not tags: # add the unmodified annotation token_dict = token_annotation.to_dict() - variant_example.set_token_annotation(**token_dict) + variant_example.token_annotation = TokenAnnotation(**token_dict) else: if lower: words = [w.lower() for w in words] @@ -60,7 +61,7 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0): token_dict = token_annotation.to_dict() token_dict["words"] = words token_dict["tags"] = tags - variant_example.set_token_annotation(**token_dict) + variant_example.token_annotation = TokenAnnotation(**token_dict) # modify raw to match variant_paragraph_tuples if raw is not None: variants = [] diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index 9462f0aa4..df13ab505 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -28,8 +28,8 @@ class GoldCorpus(object): """ self.limit = limit if isinstance(train, str) or isinstance(train, Path): - train = self.read_examples(self.walk_corpus(train)) - dev = self.read_examples(self.walk_corpus(dev)) + train = self.read_annotations(self.walk_corpus(train)) + dev = self.read_annotations(self.walk_corpus(dev)) # Write temp directory with one doc per file, so we can shuffle and stream self.tmp_dir = Path(tempfile.mkdtemp()) self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit) @@ -71,7 +71,7 @@ class GoldCorpus(object): return locs @staticmethod - def read_examples(locs, limit=0): + def read_annotations(locs, limit=0): """ Yield training examples """ i = 0 for loc in locs: @@ -101,11 +101,11 @@ class GoldCorpus(object): or isinstance(doc, str) ): raise ValueError(Errors.E987.format(type=type(doc))) - examples.append(Example.from_dict(ex_dict, doc=doc)) + examples.append(ex_dict) elif file_name.endswith("msg"): text, ex_dict = srsly.read_msgpack(loc) - examples = [Example.from_dict(ex_dict, doc=text)] + examples = [ex_dict] else: supported = ("json", "jsonl", "msg") raise ValueError(Errors.E124.format(path=loc, formats=supported)) @@ -123,21 +123,21 @@ class GoldCorpus(object): raise ValueError(Errors.E996.format(file=file_name, msg=msg)) @property - def dev_examples(self): + def dev_annotations(self): locs = (self.tmp_dir / "dev").iterdir() - yield from self.read_examples(locs, limit=self.limit) + yield from self.read_annotations(locs, limit=self.limit) @property - def train_examples(self): + def train_annotations(self): locs = (self.tmp_dir / "train").iterdir() - yield from self.read_examples(locs, limit=self.limit) + yield from self.read_annotations(locs, limit=self.limit) def count_train(self): """Returns count of words in train examples""" n 
= 0 i = 0 - for example in self.train_examples: - n += len(example.token_annotation.words) + for eg_dict in self.train_annotations: + n += len(eg_dict["token_annotation"]["words"]) if self.limit and i >= self.limit: break i += 1 @@ -154,10 +154,10 @@ class GoldCorpus(object): ): locs = list((self.tmp_dir / "train").iterdir()) random.shuffle(locs) - train_examples = self.read_examples(locs, limit=self.limit) - gold_examples = self.iter_gold_docs( + train_annotations = self.read_annotations(locs, limit=self.limit) + examples = self.iter_examples( nlp, - train_examples, + train_annotations, gold_preproc, max_length=max_length, noise_level=noise_level, @@ -165,33 +165,33 @@ class GoldCorpus(object): make_projective=True, ignore_misaligned=ignore_misaligned, ) - yield from gold_examples + yield from examples def train_dataset_without_preprocessing( self, nlp, gold_preproc=False, ignore_misaligned=False ): - examples = self.iter_gold_docs( + examples = self.iter_examples( nlp, - self.train_examples, + self.train_annotations, gold_preproc=gold_preproc, ignore_misaligned=ignore_misaligned, ) yield from examples def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False): - examples = self.iter_gold_docs( + examples = self.iter_examples( nlp, - self.dev_examples, + self.dev_annotations, gold_preproc=gold_preproc, ignore_misaligned=ignore_misaligned, ) yield from examples @classmethod - def iter_gold_docs( + def iter_examples( cls, nlp, - examples, + annotations, gold_preproc, max_length=None, noise_level=0.0, @@ -200,7 +200,8 @@ class GoldCorpus(object): ignore_misaligned=False, ): """ Setting gold_preproc will result in creating a doc per sentence """ - for example in examples: + for eg_dict in annotations: + example = Example.from_dict(eg_dict, doc=nlp.make_doc(eg_dict["text"])) example_docs = [] if gold_preproc: split_examples = example.split_sents() diff --git a/spacy/gold/example.py b/spacy/gold/example.py index 1d8665572..c8ad58da7 100644 --- a/spacy/gold/example.py +++ b/spacy/gold/example.py @@ -1,18 +1,69 @@ +import numpy from .annotation import TokenAnnotation, DocAnnotation +from .iob_utils import spans_from_biluo_tags, biluo_tags_from_offsets from .align import Alignment from ..errors import Errors, AlignmentError from ..tokens import Doc +def annotations2doc(doc, doc_annot, tok_annot): + # TODO: Improve and test this + words = tok_annot.words or [tok.text for tok in doc] + fields = { + "tags": "TAG", + "pos": "POS", + "lemmas": "LEMMA", + "deps": "DEP", + } + attrs = [] + values = [] + for field, attr in fields.items(): + value = getattr(tok_annot, field) + # Unset fields will be empty lists. + if value: + attrs.append(attr) + values.append([doc.vocab.strings.add(v) for v in value]) + if tok_annot.heads: + attrs.append("HEAD") + values.append([h - i for i, h in enumerate(tok_annot.heads)]) + output = Doc(doc.vocab, words=words) + if values: + array = numpy.array(values, dtype="uint64") + output = output.from_array(attrs, array.T) + if tok_annot.entities: + output.ents = spans_from_biluo_tags(output, tok_annot.entities) + doc.cats = dict(doc_annot.cats) + # TODO: Calculate token.ent_kb_id from links. + # We need to fix this and the doc.ents thing, both should be doc + # annotations. 
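
The array round trip at the heart of annotations2doc is worth seeing in isolation; a standalone sketch of the same Doc.from_array mechanics, with example values invented:

    import numpy
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    vocab = Vocab()
    doc = Doc(vocab, words=["I", "like", "London"])
    # Attribute values are hash IDs interned in the StringStore: one row per
    # attribute, transposed to one row per token for from_array().
    tags = [vocab.strings.add(t) for t in ["PRP", "VBP", "NNP"]]
    array = numpy.array([tags], dtype="uint64")
    doc = doc.from_array(["TAG"], array.T)
    # HEAD is the exception: as in the code above, it is stored as the offset
    # h - i relative to each token rather than as a string hash.
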
+ return doc + + class Example: - def __init__(self, doc=None, doc_annotation=None, token_annotation=None): + def __init__(self, doc, doc_annotation=None, token_annotation=None): """ Doc can either be text, or an actual Doc """ + if not isinstance(doc, Doc): + raise TypeError("Must pass Doc instance") + self.predicted = doc self.doc = doc self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation() self.token_annotation = ( token_annotation if token_annotation else TokenAnnotation() ) self._alignment = None + self.reference = annotations2doc( + self.doc, + self.doc_annotation, + self.token_annotation + ) + + @property + def x(self): + return self.predicted + + @property + def y(self): + return self.reference def _deprecated_get_gold(self, make_projective=False): from ..syntax.gold_parse import get_parses_from_example @@ -24,6 +75,8 @@ class Example: def from_dict(cls, example_dict, doc=None): if example_dict is None: raise ValueError("Example.from_dict expected dict, received None") + if doc is None: + raise ValueError("Must pass doc") # TODO: This is ridiculous... token_dict = example_dict.get("token_annotation", {}) doc_dict = example_dict.get("doc_annotation", {}) @@ -34,6 +87,10 @@ class Example: doc_dict[key] = value else: token_dict[key] = value + if token_dict.get("entities"): + entities = token_dict["entities"] + if isinstance(entities[0], (list, tuple)): + token_dict["entities"] = biluo_tags_from_offsets(doc, entities) token_annotation = TokenAnnotation.from_dict(token_dict) doc_annotation = DocAnnotation.from_dict(doc_dict) return cls( @@ -45,8 +102,8 @@ class Example: if self._alignment is None: if self.doc is None: return None - spacy_words = [token.orth_ for token in self.doc] - gold_words = self.token_annotation.words + spacy_words = [token.orth_ for token in self.predicted] + gold_words = [token.orth_ for token in self.reference] if gold_words == []: gold_words = spacy_words self._alignment = Alignment(spacy_words, gold_words) @@ -92,34 +149,6 @@ class Example: output.append(gold_values[gold_i]) return output - def set_token_annotation( - self, - ids=None, - words=None, - tags=None, - pos=None, - morphs=None, - lemmas=None, - heads=None, - deps=None, - entities=None, - sent_starts=None, - brackets=None, - ): - self.token_annotation = TokenAnnotation( - ids=ids, - words=words, - tags=tags, - pos=pos, - morphs=morphs, - lemmas=lemmas, - heads=heads, - deps=deps, - entities=entities, - sent_starts=sent_starts, - brackets=brackets, - ) - def set_doc_annotation(self, cats=None, links=None): if cats: self.doc_annotation.cats = cats @@ -131,7 +160,6 @@ class Example: sent_starts and return a list of the new Examples""" if not self.token_annotation.words: return [self] - s_example = Example(doc=None, doc_annotation=self.doc_annotation) s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], [] s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], [] s_brackets = [] @@ -140,21 +168,25 @@ class Example: split_examples = [] for i in range(len(t.words)): if i > 0 and t.sent_starts[i] == 1: - s_example.set_token_annotation( - ids=s_ids, - words=s_words, - tags=s_tags, - pos=s_pos, - morphs=s_morphs, - lemmas=s_lemmas, - heads=s_heads, - deps=s_deps, - entities=s_ents, - sent_starts=s_sent_starts, - brackets=s_brackets, + split_examples.append( + Example( + doc=Doc(self.doc.vocab, words=s_words), + token_annotation=TokenAnnotation( + ids=s_ids, + words=s_words, + tags=s_tags, + pos=s_pos, + morphs=s_morphs, + lemmas=s_lemmas, + heads=s_heads, + deps=s_deps, + 
entities=s_ents, + sent_starts=s_sent_starts, + brackets=s_brackets, + ), + doc_annotation=self.doc_annotation + ) ) - split_examples.append(s_example) - s_example = Example(doc=None, doc_annotation=self.doc_annotation) s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], [] s_deps, s_ents, s_morphs, s_lemmas = [], [], [], [] s_sent_starts, s_brackets = [], [] @@ -172,20 +204,25 @@ class Example: for b_end, b_label in t.brackets_by_start.get(i, []): s_brackets.append((i - sent_start_i, b_end - sent_start_i, b_label)) i += 1 - s_example.set_token_annotation( - ids=s_ids, - words=s_words, - tags=s_tags, - pos=s_pos, - morphs=s_morphs, - lemmas=s_lemmas, - heads=s_heads, - deps=s_deps, - entities=s_ents, - sent_starts=s_sent_starts, - brackets=s_brackets, + split_examples.append( + Example( + doc=Doc(self.doc.vocab, words=s_words), + token_annotation=TokenAnnotation( + ids=s_ids, + words=s_words, + tags=s_tags, + pos=s_pos, + morphs=s_morphs, + lemmas=s_lemmas, + heads=s_heads, + deps=s_deps, + entities=s_ents, + sent_starts=s_sent_starts, + brackets=s_brackets, + ), + doc_annotation=self.doc_annotation + ) ) - split_examples.append(s_example) return split_examples @classmethod diff --git a/spacy/gold/gold_io.pyx b/spacy/gold/gold_io.pyx index 424e44f72..8aa5f4017 100644 --- a/spacy/gold/gold_io.pyx +++ b/spacy/gold/gold_io.pyx @@ -76,12 +76,12 @@ def read_json_file(loc, docs_filter=None, limit=None): yield json_data -def json_to_examples(doc): +def json_to_annotations(doc): """Convert an item in the JSON-formatted training data to the format used by GoldParse. doc (dict): One entry in the training data. - YIELDS (Example): The reformatted data - one training example per paragraph + YIELDS (tuple): The reformatted data - one training example per paragraph """ for paragraph in doc["paragraphs"]: example = {"text": paragraph.get("raw", None)} diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx index 1edb2e65c..a91176f44 100644 --- a/spacy/syntax/nonproj.pyx +++ b/spacy/syntax/nonproj.pyx @@ -108,7 +108,7 @@ def preprocess_training_data(gold_data, label_freq_cutoff=30): proj_token_dict = example.token_annotation.to_dict() proj_token_dict["heads"] = proj_heads proj_token_dict["deps"] = deco_deps - new_example.set_token_annotation(**proj_token_dict) + new_example.token_annotation = TokenAnnotation(**proj_token_dict) preprocessed.append(new_example) if label_freq_cutoff > 0: return _filter_labels(preprocessed, label_freq_cutoff, freqs) @@ -216,6 +216,6 @@ def _filter_labels(examples, cutoff, freqs): filtered_labels.append(label) filtered_token_dict = example.token_annotation.to_dict() filtered_token_dict["deps"] = filtered_labels - new_example.set_token_annotation(**filtered_token_dict) + new_example.token_annotation = TokenAnnotation(**filtered_token_dict) filtered.append(new_example) return filtered diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index 5a76697bc..ed1f33351 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -3,7 +3,7 @@ import gc import numpy import copy -from spacy.gold import Example +from spacy.gold import Example, TokenAnnotation from spacy.lang.en import English from spacy.lang.en.stop_words import STOP_WORDS from spacy.lang.lex_attrs import is_stop @@ -271,9 +271,16 @@ def test_issue1963(en_tokenizer): @pytest.mark.parametrize("label", ["U-JOB-NAME"]) def test_issue1967(label): ner = EntityRecognizer(Vocab(), default_ner()) - example = 
Example(doc=None) - example.set_token_annotation( - ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label] + example = Example( + doc=Doc(ner.vocab, words=["word"]), + token_annotation=TokenAnnotation( + ids=[0], + words=["word"], + tags=["tag"], + heads=[0], + deps=["dep"], + entities=[label] + ) ) ner.moves.get_actions(gold_parses=[example]) diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 4b4250179..29ddc7456 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -95,6 +95,12 @@ def merged_dict(): } +@pytest.fixture +def vocab(): + nlp = English() + return nlp.vocab + + def test_gold_biluo_U(en_vocab): words = ["I", "flew", "to", "London", "."] spaces = [True, True, True, False, True] @@ -475,8 +481,10 @@ def _train(train_data): def test_split_sents(merged_dict): nlp = English() - example = Example() - example.set_token_annotation(**merged_dict) + example = Example.from_dict( + merged_dict, + doc=Doc(nlp.vocab, words=merged_dict["words"]) + ) assert len(get_parses_from_example( example, merge=False, @@ -506,13 +514,15 @@ def test_split_sents(merged_dict): assert token_annotation_2.sent_starts == [1, 0, 0, 0] -def test_tuples_to_example(merged_dict): - ex = Example() - ex.set_token_annotation(**merged_dict) +def test_tuples_to_example(vocab, merged_dict): cats = {"TRAVEL": 1.0, "BAKING": 0.0} - ex.set_doc_annotation(cats=cats) + merged_dict = dict(merged_dict) + merged_dict["cats"] = cats + ex = Example.from_dict( + merged_dict, + doc=Doc(vocab, words=merged_dict["words"]) + ) ex_dict = ex.to_dict() - assert ex_dict["token_annotation"]["ids"] == merged_dict["ids"] assert ex_dict["token_annotation"]["words"] == merged_dict["words"] assert ex_dict["token_annotation"]["tags"] == merged_dict["tags"] diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index d750a8202..5eaf8d5b3 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -1,12 +1,14 @@ from numpy.testing import assert_almost_equal, assert_array_almost_equal import pytest from pytest import approx -from spacy.gold import Example, GoldParse +from spacy.gold import Example, GoldParse, TokenAnnotation +from spacy.gold.iob_utils import biluo_tags_from_offsets from spacy.scorer import Scorer, ROCAUCScore from spacy.scorer import _roc_auc_score, _roc_curve from .util import get_doc from spacy.lang.en import English + test_las_apple = [ [ "Apple is looking at buying U.K. 
startup for $ 1 billion", @@ -134,8 +136,11 @@ def test_ner_per_type(en_vocab): words=input_.split(" "), ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]], ) - ex = Example(doc=doc) - ex.set_token_annotation(entities=annot["entities"]) + entities = biluo_tags_from_offsets(doc, annot["entities"]) + ex = Example( + doc=doc, + token_annotation=TokenAnnotation(entities=entities) + ) scorer.score(ex) results = scorer.scores @@ -155,8 +160,11 @@ def test_ner_per_type(en_vocab): words=input_.split(" "), ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]], ) - ex = Example(doc=doc) - ex.set_token_annotation(entities=annot["entities"]) + entities = biluo_tags_from_offsets(doc, annot["entities"]) + ex = Example( + doc=doc, + token_annotation=TokenAnnotation(entities=entities) + ) scorer.score(ex) results = scorer.scores diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 3aa27e451..81cef4492 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -799,6 +799,8 @@ cdef class Doc: cdef attr_id_t attr_id cdef TokenC* tokens = self.c cdef int length = len(array) + if length != len(self): + raise ValueError("Cannot set array values longer than the document.") # Get set up for fast loading cdef Pool mem = Pool() cdef int n_attrs = len(attrs) From 549164c31cf273339487e97aae4f6d4e84ee7779 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 12:33:14 +0200 Subject: [PATCH 22/56] Fix corpus when no raw text supplied --- spacy/gold/corpus.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index df13ab505..e8bb91359 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -201,7 +201,16 @@ class GoldCorpus(object): ): """ Setting gold_preproc will result in creating a doc per sentence """ for eg_dict in annotations: - example = Example.from_dict(eg_dict, doc=nlp.make_doc(eg_dict["text"])) + if eg_dict["text"]: + example = Example.from_dict( + eg_dict, + doc=nlp.make_doc(eg_dict["text"]) + ) + else: + example = Example.from_dict( + eg_dict, + doc=Doc(nlp.vocab, words=eg_dict["words"]) + ) example_docs = [] if gold_preproc: split_examples = example.split_sents() From 20a1bdb29813f509f2de9b55d30cb775e2225732 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 12:33:29 +0200 Subject: [PATCH 23/56] Fix train --- spacy/cli/train_from_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index c4db5f6ba..4fea39064 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -186,7 +186,7 @@ def train( msg.info("Loading training corpus") corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit) msg.info("Initializing the nlp pipeline") - nlp.begin_training(lambda: corpus.train_examples) + nlp.begin_training(lambda: corpus.train_dataset(nlp)) train_batches = create_train_batches(nlp, corpus, training) evaluate = create_evaluation_callback(nlp, optimizer, corpus, training) From cb08ce39362a30f5d589de5f0d219c75ca269a9e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 12:40:41 +0200 Subject: [PATCH 24/56] Move alignment into Cython --- spacy/gold/{align.py => align.pyx} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename spacy/gold/{align.py => align.pyx} (100%) diff --git a/spacy/gold/align.py b/spacy/gold/align.pyx similarity index 100% rename from spacy/gold/align.py rename to spacy/gold/align.pyx From 449000c23458788eaaab5390c61452d5062d88d5 
Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 12:43:53 +0200 Subject: [PATCH 25/56] Fix gold_io --- spacy/gold/gold_io.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/gold/gold_io.pyx b/spacy/gold/gold_io.pyx index 8aa5f4017..83208ad85 100644 --- a/spacy/gold/gold_io.pyx +++ b/spacy/gold/gold_io.pyx @@ -72,7 +72,7 @@ def read_json_file(loc, docs_filter=None, limit=None): for doc in json_iterate(loc): if docs_filter is not None and not docs_filter(doc): continue - for json_data in json_to_examples(doc): + for json_data in json_to_annotations(doc): yield json_data From 453cfa14d0200e13cf1246406fa7ae8ba58f3987 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 15:28:42 +0200 Subject: [PATCH 26/56] Start drafting new example class --- spacy/gold/new_example.pxd | 8 + spacy/gold/new_example.pyx | 304 +++++++++++++++++++++++++++++++++++++ 2 files changed, 312 insertions(+) create mode 100644 spacy/gold/new_example.pxd create mode 100644 spacy/gold/new_example.pyx diff --git a/spacy/gold/new_example.pxd b/spacy/gold/new_example.pxd new file mode 100644 index 000000000..9e513b033 --- /dev/null +++ b/spacy/gold/new_example.pxd @@ -0,0 +1,8 @@ +from ..tokens.doc cimport Doc +from .align cimport Alignment + + +cdef class NewExample: + cdef readonly Doc x + cdef readonly Doc y + cdef readonly Alignment _alignment diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx new file mode 100644 index 000000000..7f081ffbd --- /dev/null +++ b/spacy/gold/new_example.pyx @@ -0,0 +1,304 @@ +import numpy +from ..tokens.doc cimport Doc +from ..attrs import IDS +from .align cimport Alignment +from .annotation import TokenAnnotation, DocAnnotation +from .iob_utils import biluo_to_iob, biluo_tags_from_offsets +from .align import Alignment +from ..errors import Errors, AlignmentError + + +cpdef Doc annotations2doc(Doc predicted, doc_annot, tok_annot): + # TODO: Improve and test this + words = tok_annot.get("ORTH", [tok.text for tok in predicted]) + attrs, array = _annot2array(predicted.vocab.strings, tok_annot, doc_annot) + output = Doc(predicted.vocab, words=words) + if array.size: + output = output.from_array(attrs, array) + output.cats.update(doc_annot.get("cats", {})) + return output + + +cdef class NewExample: + def __init__(self, Doc predicted, Doc reference, *, Alignment alignment=None): + """ Doc can either be text, or an actual Doc """ + msg = "Example.__init__ got None for '{arg}'. Requires Doc." + if predicted is None: + raise TypeError(msg.format(arg="predicted")) + if reference is None: + raise TypeError(msg.format(arg="reference")) + self.x = predicted + self.y = reference + self._alignment = alignment + + @property + def predicted(self): + return self.x + + @property + def reference(self): + return self.y + + @classmethod + def from_dict(cls, Doc predicted, dict example_dict): + if example_dict is None: + raise ValueError("Example.from_dict expected dict, received None") + if not isinstance(predicted, Doc): + raise TypeError(f"Argument 1 should be Doc. 
Got {type(predicted)}") + example_dict = _fix_legacy_dict_data(predicted, example_dict) + tok_dict, doc_dict = _parse_example_dict_data(example_dict) + return NewExample( + predicted, + annotations2doc(predicted, tok_dict, doc_dict) + ) + + @property + def alignment(self): + if self._alignment is None: + if self.doc is None: + return None + spacy_words = [token.orth_ for token in self.predicted] + gold_words = [token.orth_ for token in self.reference] + if gold_words == []: + gold_words = spacy_words + self._alignment = Alignment(spacy_words, gold_words) + return self._alignment + + def get_aligned(self, field): + raise NotImplementedError + + def to_dict(self): + """ Note that this method does NOT export the doc, only the annotations ! """ + token_dict = self._token_annotation + doc_dict = self._doc_annotation + return {"token_annotation": token_dict, "doc_annotation": doc_dict} + + def text(self): + return self.x.text + + +def _annot2array(strings, tok_annot, doc_annot): + attrs = [] + values = [] + for key, value in tok_annot.items(): + if key not in IDS: + raise ValueError(f"Unknown attr: {key}") + if key == "HEAD": + values.append([h-i for i, h in enumerate(value)]) + else: + values.append([strings.add(v) for v in value]) + attrs.append(key) + # TODO: Calculate token.ent_kb_id from doc_annot["links"]. + # We need to fix this and the doc.ents thing, both should be doc + # annotations. + array = numpy.array(values, dtype="uint64") + return attrs, array + + +def _parse_example_dict_data(example_dict): + return ( + example_dict["token_annotation"], + example_dict["doc_annotation"] + ) + + +def _fix_legacy_dict_data(predicted, example_dict): + token_dict = example_dict.get("token_annotation", {}) + doc_dict = example_dict.get("doc_annotation", {}) + for key, value in example_dict.items(): + if key in ("token_annotation", "doc_annotation"): + pass + elif key in ("cats", "links"): + doc_dict[key] = value + else: + token_dict[key] = value + # Remap keys + remapping = { + "words": "ORTH", + "tags": "TAG", + "pos": "POS", + "lemmas": "LEMMA", + "deps": "DEP", + "heads": "HEAD", + "sent_starts": "SENT_START", + "morphs": "MORPH", + } + old_token_dict = token_dict + token_dict = {} + for key, value in old_token_dict.items(): + if key in remapping: + token_dict[remapping[key]] = value + elif key in ("ner", "entities") and value: + # Arguably it would be smarter to put this in the doc annotation? + words = token_dict.get("words", [t.text for t in predicted]) + ent_iobs, ent_types = _parse_ner_tags(predicted, words, value) + token_dict["ENT_IOB"] = ent_iobs + token_dict["ENT_TYPE"] = ent_types + return { + "token_annotation": token_dict, + "doc_annotation": doc_dict + } + + +def _parse_ner_tags(predicted, words, biluo_or_offsets): + if isinstance(biluo_or_offsets[0], (list, tuple)): + # Convert to biluo if necessary + # This is annoying but to convert the offsets we need a Doc + # that has the target tokenization. 
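+        # For example, with words ["I", "like", "London", "and", "Berlin", "."],
+        # the character offsets [(7, 13, "LOC"), (18, 24, "LOC")] convert to
+        # the BILUO tags ["O", "O", "U-LOC", "O", "U-LOC", "O"].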
+ reference = Doc( + predicted.vocab, + words=words + ) + biluo = biluo_tags_from_offsets(predicted, biluo_or_offsets) + else: + biluo = biluo_or_offsets + ent_iobs = [] + ent_types = [] + for iob_tag in biluo_to_iob(biluo): + ent_iobs.append(iob_tag.split("-")[0]) + if iob_tag.startswith("I") or iob_tag.startswith("B"): + ent_types.append(iob_tag.split("-", 1)[1]) + else: + ent_types.append("") + return ent_iobs, ent_types + + +class Example: + def get_aligned(self, field): + """Return an aligned array for a token annotation field.""" + if self.doc is None: + return self.token_annotation.get_field(field) + doc = self.doc + if field == "word": + return [token.orth_ for token in doc] + gold_values = self.token_annotation.get_field(field) + alignment = self.alignment + i2j_multi = alignment.i2j_multi + gold_to_cand = alignment.gold_to_cand + cand_to_gold = alignment.cand_to_gold + + output = [] + for i, gold_i in enumerate(cand_to_gold): + if doc[i].text.isspace(): + output.append(None) + elif gold_i is None: + if i in i2j_multi: + output.append(gold_values[i2j_multi[i]]) + else: + output.append(None) + else: + output.append(gold_values[gold_i]) + return output + + def split_sents(self): + """ Split the token annotations into multiple Examples based on + sent_starts and return a list of the new Examples""" + if not self.token_annotation.words: + return [self] + s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], [] + s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], [] + s_brackets = [] + sent_start_i = 0 + t = self.token_annotation + split_examples = [] + for i in range(len(t.words)): + if i > 0 and t.sent_starts[i] == 1: + split_examples.append( + Example( + doc=Doc(self.doc.vocab, words=s_words), + token_annotation=TokenAnnotation( + ids=s_ids, + words=s_words, + tags=s_tags, + pos=s_pos, + morphs=s_morphs, + lemmas=s_lemmas, + heads=s_heads, + deps=s_deps, + entities=s_ents, + sent_starts=s_sent_starts, + brackets=s_brackets, + ), + doc_annotation=self.doc_annotation + ) + ) + s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], [] + s_deps, s_ents, s_morphs, s_lemmas = [], [], [], [] + s_sent_starts, s_brackets = [], [] + sent_start_i = i + s_ids.append(t.get_id(i)) + s_words.append(t.get_word(i)) + s_tags.append(t.get_tag(i)) + s_pos.append(t.get_pos(i)) + s_morphs.append(t.get_morph(i)) + s_lemmas.append(t.get_lemma(i)) + s_heads.append(t.get_head(i) - sent_start_i) + s_deps.append(t.get_dep(i)) + s_ents.append(t.get_entity(i)) + s_sent_starts.append(t.get_sent_start(i)) + for b_end, b_label in t.brackets_by_start.get(i, []): + s_brackets.append((i - sent_start_i, b_end - sent_start_i, b_label)) + i += 1 + split_examples.append( + Example( + doc=Doc(self.doc.vocab, words=s_words), + token_annotation=TokenAnnotation( + ids=s_ids, + words=s_words, + tags=s_tags, + pos=s_pos, + morphs=s_morphs, + lemmas=s_lemmas, + heads=s_heads, + deps=s_deps, + entities=s_ents, + sent_starts=s_sent_starts, + brackets=s_brackets, + ), + doc_annotation=self.doc_annotation + ) + ) + return split_examples + + @classmethod + def to_example_objects(cls, examples, make_doc=None, keep_raw_text=False): + """ + Return a list of Example objects, from a variety of input formats. 
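+        Accepted inputs: Example objects, Doc objects, plain text strings, and
+        (doc_or_text, gold_dict) tuples.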
+ make_doc needs to be provided when the examples contain text strings and keep_raw_text=False + """ + if isinstance(examples, Example): + return [examples] + if isinstance(examples, tuple): + examples = [examples] + converted_examples = [] + for ex in examples: + if isinstance(ex, Example): + converted_examples.append(ex) + # convert string to Doc to Example + elif isinstance(ex, str): + if keep_raw_text: + converted_examples.append(Example(doc=ex)) + else: + doc = make_doc(ex) + converted_examples.append(Example(doc=doc)) + # convert tuples to Example + elif isinstance(ex, tuple) and len(ex) == 2: + doc, gold = ex + # convert string to Doc + if isinstance(doc, str) and not keep_raw_text: + doc = make_doc(doc) + converted_examples.append(Example.from_dict(gold, doc=doc)) + # convert Doc to Example + elif isinstance(ex, Doc): + converted_examples.append(Example(doc=ex)) + else: + converted_examples.append(ex) + return converted_examples + + def _deprecated_get_gold(self, make_projective=False): + from ..syntax.gold_parse import get_parses_from_example + + _, gold = get_parses_from_example(self, make_projective=make_projective)[0] + return gold + + From c833ebe1ad72154bbac3213832a50bae0caa84f6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 15:29:05 +0200 Subject: [PATCH 27/56] Start tests for new example class --- spacy/tests/test_new_example.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 spacy/tests/test_new_example.py diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py new file mode 100644 index 000000000..c481ae932 --- /dev/null +++ b/spacy/tests/test_new_example.py @@ -0,0 +1,30 @@ +import pytest +from spacy.gold.new_example import NewExample as Example +from spacy.tokens import Doc +from spacy.vocab import Vocab + + +@pytest.fixture +def vocab(): + return Vocab() + + +def test_Example_init_requires_doc_objects(vocab): + with pytest.raises(TypeError): + eg = Example(None, None) + with pytest.raises(TypeError): + eg = Example(Doc(vocab, words=["hi"]), None) + with pytest.raises(TypeError): + eg = Example(None, Doc(vocab, words=["hi"])) + + + +def test_Example_from_dict(vocab): + eg = Example.from_dict( + Doc(vocab, words=["hello", "world"]), + { + "words": ["hello", "world"] + } + ) + assert isinstance(eg.x, Doc) + assert isinstance(eg.y, Doc) From f1189dc205b76817a8e738463e06f7aad18883a4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 15:43:08 +0200 Subject: [PATCH 28/56] Draft tests for new Example class --- spacy/tests/test_new_example.py | 50 +++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py index c481ae932..fcd02ee91 100644 --- a/spacy/tests/test_new_example.py +++ b/spacy/tests/test_new_example.py @@ -4,12 +4,8 @@ from spacy.tokens import Doc from spacy.vocab import Vocab -@pytest.fixture -def vocab(): - return Vocab() - - -def test_Example_init_requires_doc_objects(vocab): +def test_Example_init_requires_doc_objects(): + vocab = Vocab() with pytest.raises(TypeError): eg = Example(None, None) with pytest.raises(TypeError): @@ -19,12 +15,50 @@ def test_Example_init_requires_doc_objects(vocab): -def test_Example_from_dict(vocab): +def test_Example_from_dict_basic(): eg = Example.from_dict( - Doc(vocab, words=["hello", "world"]), + Doc(Vocab(), words=["hello", "world"]), { "words": ["hello", "world"] } ) assert isinstance(eg.x, Doc) assert isinstance(eg.y, Doc) 
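+
+# A minimal usage sketch (assuming a blank Vocab):
+#
+#     predicted = Doc(Vocab(), words=["hello", "world"])
+#     eg = Example.from_dict(predicted, {"words": ["hello", "world"]})
+#     eg.x is predicted         # the candidate tokenization
+#     [t.text for t in eg.y]    # ["hello", "world"], from the reference Doc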
+ + +@pytest.mark.parametrize("annots", [ + {"words": ["ice", "cream"], "tags": ["NN", "NN"]}, +]) +def test_Example_from_dict_with_tags(annots): + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + eg = Example.from_dict(predicted, annots) + for i, token in enumerate(eg.reference): + assert token.tag_ == annots["tags"][i] + + +""" +def test_Example_from_dict_with_entities(vocab): + # TODO + pass + +def test_Example_from_dict_with_parse(vocab): + # TODO + pass + +def test_Example_from_dict_with_morphology(vocab): + # TODO + pass + +def test_Example_from_dict_with_sent_start(vocab): + # TODO + pass + +def test_Example_from_dict_with_cats(vocab): + # TODO + pass + +def test_Example_from_dict_with_links(vocab): + # TODO + pass +""" From 36d49a0f13e8a17185a8ee821738e57c55c3848d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 15:43:19 +0200 Subject: [PATCH 29/56] Fix NewExample class --- spacy/gold/new_example.pyx | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx index 7f081ffbd..3c42c0bb1 100644 --- a/spacy/gold/new_example.pyx +++ b/spacy/gold/new_example.pyx @@ -8,7 +8,7 @@ from .align import Alignment from ..errors import Errors, AlignmentError -cpdef Doc annotations2doc(Doc predicted, doc_annot, tok_annot): +cpdef Doc annotations2doc(Doc predicted, tok_annot, doc_annot): # TODO: Improve and test this words = tok_annot.get("ORTH", [tok.text for tok in predicted]) attrs, array = _annot2array(predicted.vocab.strings, tok_annot, doc_annot) @@ -83,16 +83,19 @@ def _annot2array(strings, tok_annot, doc_annot): for key, value in tok_annot.items(): if key not in IDS: raise ValueError(f"Unknown attr: {key}") - if key == "HEAD": + elif key == "ORTH": + pass + elif key == "HEAD": + attrs.append(key) values.append([h-i for i, h in enumerate(value)]) else: + attrs.append(key) values.append([strings.add(v) for v in value]) - attrs.append(key) # TODO: Calculate token.ent_kb_id from doc_annot["links"]. # We need to fix this and the doc.ents thing, both should be doc # annotations. array = numpy.array(values, dtype="uint64") - return attrs, array + return attrs, array.T def _parse_example_dict_data(example_dict): From 793092d2d82cdbabc2393fad1ffb3bb19575d76e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 15:43:38 +0200 Subject: [PATCH 30/56] Fix renaming in GoldCorpus --- spacy/gold/corpus.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index e8bb91359..84de01665 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -7,7 +7,7 @@ import itertools from ..tokens import Doc from .. 
import util from ..errors import Errors, AlignmentError -from .gold_io import read_json_file, json_to_examples +from .gold_io import read_json_file, json_to_annotations from .augment import make_orth_variants, add_noise from .example import Example @@ -88,7 +88,7 @@ class GoldCorpus(object): if first_gold_tuple.get("paragraphs", None): examples = [] for json_doc in gold_tuples: - examples.extend(json_to_examples(json_doc)) + examples.extend(json_to_annotations(json_doc)) elif first_gold_tuple.get("doc_annotation", None): examples = [] for ex_dict in gold_tuples: From b5ef39763930f6ad838a260ac064ad74c3f37818 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 15:43:48 +0200 Subject: [PATCH 31/56] Add header for align.pxd --- spacy/gold/align.pxd | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 spacy/gold/align.pxd diff --git a/spacy/gold/align.pxd b/spacy/gold/align.pxd new file mode 100644 index 000000000..ea3615863 --- /dev/null +++ b/spacy/gold/align.pxd @@ -0,0 +1,8 @@ +cdef class Alignment: + cdef public object cost + cdef public object i2j + cdef public object j2i + cdef public object i2j_multi + cdef public object j2i_multi + cdef public object cand_to_gold + cdef public object gold_to_cand From f4caaa8ad9f36a5bb3c9a040859d781eb81c40b5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 15:43:57 +0200 Subject: [PATCH 32/56] Update alignment --- spacy/gold/align.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/gold/align.pyx b/spacy/gold/align.pyx index 49e8aaa98..80ba0346a 100644 --- a/spacy/gold/align.pyx +++ b/spacy/gold/align.pyx @@ -2,7 +2,7 @@ import numpy from ..errors import Errors, AlignmentError -class Alignment: +cdef class Alignment: def __init__(self, spacy_words, gold_words): # Do many-to-one alignment for misaligned tokens. 
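        # For instance, if the candidate is ["New", "York"] and the gold is
        # ["NewYork"], both candidate tokens get gold index 0 via i2j_multi.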
# If we over-segment, we'll have one gold word that covers a sequence From 04569c0b3e6606db70a494f5b5706090bb809646 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 15:44:08 +0200 Subject: [PATCH 33/56] Fix import --- spacy/syntax/nonproj.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx index a91176f44..ee3219392 100644 --- a/spacy/syntax/nonproj.pyx +++ b/spacy/syntax/nonproj.pyx @@ -7,7 +7,7 @@ from copy import copy from ..tokens.doc cimport Doc, set_children_from_heads -from ..gold import Example +from ..gold import Example, TokenAnnotation from ..errors import Errors From a20ac36bb7331cf963d62197d099479539387716 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 15:44:17 +0200 Subject: [PATCH 34/56] Compile new modules --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index 864a4036a..c92761f2a 100755 --- a/setup.py +++ b/setup.py @@ -23,6 +23,8 @@ Options.docstrings = True PACKAGES = find_packages() MOD_NAMES = [ + "spacy.gold.align", + "spacy.gold.new_example", "spacy.parts_of_speech", "spacy.strings", "spacy.lexeme", From ccd332a9fc6290ee0c49dcbfbd6c62349cab1a1f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 15:49:04 +0200 Subject: [PATCH 35/56] Update test stubs --- spacy/tests/test_new_example.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py index fcd02ee91..473666eca 100644 --- a/spacy/tests/test_new_example.py +++ b/spacy/tests/test_new_example.py @@ -38,27 +38,27 @@ def test_Example_from_dict_with_tags(annots): """ -def test_Example_from_dict_with_entities(vocab): +def test_Example_from_dict_with_entities(annots): # TODO pass -def test_Example_from_dict_with_parse(vocab): +def test_Example_from_dict_with_parse(annots): # TODO pass -def test_Example_from_dict_with_morphology(vocab): +def test_Example_from_dict_with_morphology(annots): # TODO pass -def test_Example_from_dict_with_sent_start(vocab): +def test_Example_from_dict_with_sent_start(annots): # TODO pass -def test_Example_from_dict_with_cats(vocab): +def test_Example_from_dict_with_cats(annots): # TODO pass -def test_Example_from_dict_with_links(vocab): +def test_Example_from_dict_with_links(annots): # TODO pass """ From b3868cd1f8d8c1a71d81fbbf16ab8ffaaa3e21d9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 23:06:48 +0200 Subject: [PATCH 36/56] Update NewExample --- spacy/gold/new_example.pyx | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx index 3c42c0bb1..136eca130 100644 --- a/spacy/gold/new_example.pyx +++ b/spacy/gold/new_example.pyx @@ -31,13 +31,19 @@ cdef class NewExample: self.y = reference self._alignment = alignment - @property - def predicted(self): - return self.x + property predicted: + def __get__(self): + return self.x + + def __set__(self, doc): + self.x = doc - @property - def reference(self): - return self.y + property reference: + def __get__(self): + return self.y + + def __set__(self, doc): + self.y = doc @classmethod def from_dict(cls, Doc predicted, dict example_dict): From 0714f1fa5c84da386e4dc771e83d9d639fb9d301 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 23:30:06 +0200 Subject: [PATCH 37/56] Remove the 'pass example into __call__' thing --- spacy/pipeline/pipes.pyx | 237 
+++++++++++++++------------------------ 1 file changed, 88 insertions(+), 149 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 2c40738f6..c6233be90 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -20,7 +20,7 @@ from .defaults import default_nel, default_senter from .functions import merge_subtokens from ..language import Language, component from ..syntax import nonproj -from ..gold import Example +from ..gold.new_example import NewExample as Example from ..attrs import POS, ID from ..util import link_vectors_to_models, create_default_optimizer from ..parts_of_speech import X @@ -48,12 +48,6 @@ class Pipe(object): def from_nlp(cls, nlp, model, **cfg): return cls(nlp.vocab, model, **cfg) - def _get_doc(self, example): - """ Use this method if the `example` can be both a Doc or an Example """ - if isinstance(example, Doc): - return example - return example.doc - def __init__(self, vocab, model, **cfg): """Create a new pipe instance.""" raise NotImplementedError @@ -73,18 +67,17 @@ class Pipe(object): else: self.set_annotations([doc], predictions) if isinstance(example, Example): - example.doc = doc + example.predicted = doc return example return doc - def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): + def pipe(self, stream, batch_size=128, n_threads=-1): """Apply the pipe to a stream of documents. Both __call__ and pipe should delegate to the `predict()` and `set_annotations()` methods. """ for examples in util.minibatch(stream, size=batch_size): - docs = [self._get_doc(ex) for ex in examples] predictions = self.predict(docs) if isinstance(predictions, tuple) and len(tuple) == 2: scores, tensors = predictions @@ -94,7 +87,7 @@ class Pipe(object): if as_example: for ex, doc in zip(examples, docs): - ex.doc = doc + ex.predicted = doc yield ex else: yield from docs @@ -116,7 +109,6 @@ class Pipe(object): Delegates to predict() and get_loss(). """ if set_annotations: - docs = (self._get_doc(ex) for ex in examples) docs = list(self.pipe(docs)) def rehearse(self, examples, sgd=None, losses=None, **config): @@ -256,28 +248,18 @@ class Tagger(Pipe): return tuple(self.vocab.morphology.tag_names) def __call__(self, example): - doc = self._get_doc(example) tags = self.predict([doc]) self.set_annotations([doc], tags) if isinstance(example, Example): - example.doc = doc + example.predicted = doc return example return doc def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): - for examples in util.minibatch(stream, size=batch_size): - docs = [self._get_doc(ex) for ex in examples] + for docs in util.minibatch(stream, size=batch_size): tag_ids = self.predict(docs) - assert len(docs) == len(examples) - assert len(tag_ids) == len(examples) self.set_annotations(docs, tag_ids) - - if as_example: - for ex, doc in zip(examples, docs): - ex.doc = doc - yield ex - else: - yield from docs + yield from docs def predict(self, docs): if not any(len(doc) for doc in docs): @@ -327,15 +309,17 @@ class Tagger(Pipe): doc.is_tagged = True def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False): - examples = Example.to_example_objects(examples) + for eg in examples: + assert isinstance(eg, Example) if losses is not None and self.name not in losses: losses[self.name] = 0. - if not any(len(ex.doc) if ex.doc else 0 for ex in examples): + if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples): # Handle cases where there are no tokens in any docs. 
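            # (Nothing to learn from, so bail out before calling begin_update.)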
return set_dropout_rate(self.model, drop) - tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples]) + tag_scores, bp_tag_scores = self.model.begin_update( + [eg.predicted for eg in examples]) for sc in tag_scores: if self.model.ops.xp.isnan(sc.sum()): raise ValueError("nan value in scores") @@ -347,17 +331,16 @@ class Tagger(Pipe): if losses is not None: losses[self.name] += loss if set_annotations: - docs = [ex.doc for ex in examples] + docs = [eg.predicted for eg in examples] self.set_annotations(docs, self._scores2guesses(tag_scores)) def rehearse(self, examples, drop=0., sgd=None, losses=None): """Perform a 'rehearsal' update, where we try to match the output of an initial model. """ + docs = [eg.predicted for eg in examples] if self._rehearsal_model is None: return - examples = Example.to_example_objects(examples) - docs = [ex.doc for ex in examples] if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. return @@ -387,7 +370,8 @@ class Tagger(Pipe): orig_tag_map = dict(self.vocab.morphology.tag_map) new_tag_map = {} for example in get_examples(): - for tag in example.token_annotation.tags: + for token in example.y: + tag = token.tag_ if tag in orig_tag_map: new_tag_map[tag] = orig_tag_map[tag] else: @@ -575,7 +559,7 @@ class SentenceRecognizer(Tagger): d_scores = scores - to_categorical(correct, n_classes=scores.shape[1]) d_scores *= self.model.ops.asarray(known_labels) loss = (d_scores**2).sum() - docs = [eg.doc for eg in examples] + docs = [eg.predicted for eg in examples] d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) return float(loss), d_scores @@ -687,8 +671,8 @@ class MultitaskObjective(Tagger): gold_examples = nonproj.preprocess_training_data(get_examples()) # for raw_text, doc_annot in gold_tuples: for example in gold_examples: - for i in range(len(example.token_annotation.ids)): - label = self.make_label(i, example.token_annotation) + for token in example.y: + label = self.make_label(token) if label is not None and label not in self.labels: self.labels[label] = len(self.labels) self.model.initialize() @@ -706,11 +690,11 @@ class MultitaskObjective(Tagger): cdef int idx = 0 correct = numpy.zeros((scores.shape[0],), dtype="i") guesses = scores.argmax(axis=1) - docs = [ex.doc for ex in examples] + docs = [eg.predicted for eg in examples] for i, eg in enumerate(examples): # Handles alignment for tokenization differences doc_annots = eg.get_aligned() - for j in range(len(eg.doc)): + for j in range(len(eg.predicted)): tok_annots = {key: values[j] for key, values in tok_annots.items()} label = self.make_label(j, tok_annots) if label is None or label not in self.labels: @@ -724,83 +708,49 @@ class MultitaskObjective(Tagger): return float(loss), d_scores @staticmethod - def make_dep(i, token_annotation): - if token_annotation.deps[i] is None or token_annotation.heads[i] is None: - return None - return token_annotation.deps[i] + def make_dep(token): + return token.dep_ @staticmethod - def make_tag(i, token_annotation): - return token_annotation.tags[i] + def make_tag(token): + return token.tag_ @staticmethod - def make_ent(i, token_annotation): - if token_annotation.entities is None: - return None - return token_annotation.entities[i] + def make_ent(token): + if token.ent_iob_ == "O": + return "O" + else: + return token.ent_iob_ + "-" + token.ent_type_ @staticmethod - def make_dep_tag_offset(i, token_annotation): - if token_annotation.deps[i] is None or token_annotation.heads[i] is None: - return 
None - offset = token_annotation.heads[i] - i + def make_dep_tag_offset(token): + dep = token.dep_ + tag = token.tag_ + offset = token.head.i - token.i offset = min(offset, 2) offset = max(offset, -2) - return f"{token_annotation.deps[i]}-{token_annotation.tags[i]}:{offset}" + return f"{dep}-{tag}:{offset}" @staticmethod - def make_ent_tag(i, token_annotation): - if token_annotation.entities is None or token_annotation.entities[i] is None: - return None + def make_ent_tag(token): + if token.ent_iob_ == "O": + ent = "O" else: - return f"{token_annotation.tags[i]}-{token_annotation.entities[i]}" + ent = token.ent_iob_ + "-" + token.ent_type_ + tag = token.tag_ + return f"{tag}-{ent}" @staticmethod - def make_sent_start(target, token_annotation, cache=True, _cache={}): + def make_sent_start(token): """A multi-task objective for representing sentence boundaries, using BILU scheme. (O is impossible) - - The implementation of this method uses an internal cache that relies - on the identity of the heads array, to avoid requiring a new piece - of gold data. You can pass cache=False if you know the cache will - do the wrong thing. """ - words = token_annotation.words - heads = token_annotation.heads - assert len(words) == len(heads) - assert target < len(words), (target, len(words)) - if cache: - if id(heads) in _cache: - return _cache[id(heads)][target] - else: - for key in list(_cache.keys()): - _cache.pop(key) - sent_tags = ["I-SENT"] * len(words) - _cache[id(heads)] = sent_tags + if token.is_sent_start and token.is_sent_end: + return "U-SENT" + elif token.is_sent_start: + return "B-SENT" else: - sent_tags = ["I-SENT"] * len(words) - - def _find_root(child): - seen = set([child]) - while child is not None and heads[child] != child: - seen.add(child) - child = heads[child] - return child - - sentences = {} - for i in range(len(words)): - root = _find_root(i) - if root is None: - sent_tags[i] = None - else: - sentences.setdefault(root, []).append(i) - for root, span in sorted(sentences.items()): - if len(span) == 1: - sent_tags[span[0]] = "U-SENT" - else: - sent_tags[span[0]] = "B-SENT" - sent_tags[span[-1]] = "L-SENT" - return sent_tags[target] + return "I-SENT" class ClozeMultitask(Pipe): @@ -833,7 +783,7 @@ class ClozeMultitask(Pipe): # token.vector values, but that's a bit inefficient, especially on GPU. # Instead we fetch the index into the vectors table for each of our tokens, # and look them up all at once. This prevents data copying. - ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples]) + ids = self.model.ops.flatten([eg.predicted.to_array(ID).ravel() for eg in examples]) target = vectors[ids] gradient = self.distance.get_grad(prediction, target) loss = self.distance.get_loss(prediction, target) @@ -843,11 +793,12 @@ class ClozeMultitask(Pipe): pass def rehearse(self, examples, drop=0., sgd=None, losses=None): - examples = Example.to_example_objects(examples) if losses is not None and self.name not in losses: losses[self.name] = 0. 
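+        # Run the model over the candidate docs; get_loss() then measures the
+        # distance between the predictions and rows of the static vectors table.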
+ docs = [eg.predicted for eg in examples] set_dropout_rate(self.model, drop) - predictions, bp_predictions = self.model.begin_update([ex.doc for ex in examples]) + predictions, bp_predictions = self.model.begin_update( + [eg.predicted for eg in examples]) loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions) bp_predictions(d_predictions) if sgd is not None: @@ -883,17 +834,10 @@ class TextCategorizer(Pipe): self.cfg["labels"] = tuple(value) def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): - for examples in util.minibatch(stream, size=batch_size): - docs = [self._get_doc(ex) for ex in examples] + for docs in util.minibatch(stream, size=batch_size): scores, tensors = self.predict(docs) self.set_annotations(docs, scores, tensors=tensors) - - if as_example: - for ex, doc in zip(examples, docs): - ex.doc = doc - yield ex - else: - yield from docs + yield from docs def predict(self, docs): tensors = [doc.tensor for doc in docs] @@ -914,12 +858,15 @@ class TextCategorizer(Pipe): doc.cats[label] = float(scores[i, j]) def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None): - examples = Example.to_example_objects(examples) - if not any(len(ex.doc) if ex.doc else 0 for ex in examples): + for eg in examples: + assert isinstance(eg, Example) + if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples): # Handle cases where there are no tokens in any docs. return set_dropout_rate(self.model, drop) - scores, bp_scores = self.model.begin_update([ex.doc for ex in examples]) + scores, bp_scores = self.model.begin_update( + [eg.predicted for eg in examples] + ) loss, d_scores = self.get_loss(examples, scores) bp_scores(d_scores) if sgd is not None: @@ -928,14 +875,15 @@ class TextCategorizer(Pipe): losses.setdefault(self.name, 0.0) losses[self.name] += loss if set_annotations: - docs = [ex.doc for ex in examples] + docs = [eg.predicted for eg in examples] self.set_annotations(docs, scores=scores) def rehearse(self, examples, drop=0., sgd=None, losses=None): if self._rehearsal_model is None: return - examples = Example.to_example_objects(examples) - docs=[ex.doc for ex in examples] + for eg in examples: + assert isinstance(eg, Example) + docs = [eg.predicted for eg in examples] if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. return @@ -955,8 +903,8 @@ class TextCategorizer(Pipe): not_missing = numpy.ones((len(examples), len(self.labels)), dtype="f") for i, eg in enumerate(examples): for j, label in enumerate(self.labels): - if label in eg.doc_annotation.cats: - truths[i, j] = eg.doc_annotation.cats[label] + if label in eg.predicted.cats: + truths[i, j] = eg.reference.cats[label] else: not_missing[i, j] = 0. truths = self.model.ops.asarray(truths) @@ -993,7 +941,7 @@ class TextCategorizer(Pipe): # TODO: begin_training is not guaranteed to see all data / labels ? 
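        # The label set ends up only as complete as whatever get_examples()
        # yields here; a category first seen at update() time would be missed.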
examples = list(get_examples()) for example in examples: - for cat in example.doc_annotation.cats: + for cat in example.y.cats: self.add_label(cat) self.require_labels() docs = [Doc(Vocab(), words=["hello"])] @@ -1152,21 +1100,22 @@ class EntityLinker(Pipe): losses.setdefault(self.name, 0.0) if not examples: return 0 - examples = Example.to_example_objects(examples) + for eg in examples: + assert isinstance(eg, Example) sentence_docs = [] - docs = [ex.doc for ex in examples] + docs = [eg.predicted for eg in examples] if set_annotations: # This seems simpler than other ways to get that exact output -- but # it does run the model twice :( predictions = self.model.predict(docs) for eg in examples: - doc = eg.doc + doc = eg.predicted ents_by_offset = dict() for ent in doc.ents: ents_by_offset[(ent.start_char, ent.end_char)] = ent - - for entity, kb_dict in eg.doc_annotation.links.items(): + links = self._get_links_from_doc(eg.reference) + for entity, kb_dict in links.items(): if isinstance(entity, str): entity = literal_eval(entity) start, end = entity @@ -1204,7 +1153,8 @@ class EntityLinker(Pipe): def get_similarity_loss(self, examples, scores): entity_encodings = [] for eg in examples: - for entity, kb_dict in eg.doc_annotation.links.items(): + links = self._get_links_from_doc(eg.reference) + for entity, kb_dict in links.items(): for kb_id, value in kb_dict.items(): # this loss function assumes we're only using positive examples if value: @@ -1223,8 +1173,9 @@ class EntityLinker(Pipe): def get_loss(self, examples, scores): cats = [] - for ex in examples: - for entity, kb_dict in ex.doc_annotation.links.items(): + for eg in examples: + links = self._get_links_from_doc(eg.reference) + for entity, kb_dict in links.items(): for kb_id, value in kb_dict.items(): cats.append([value]) @@ -1237,27 +1188,22 @@ class EntityLinker(Pipe): loss = loss / len(cats) return loss, d_scores - def __call__(self, example): - doc = self._get_doc(example) + def _get_links_from_doc(self, doc): + return {} + + def __call__(self, doc): kb_ids, tensors = self.predict([doc]) self.set_annotations([doc], kb_ids, tensors=tensors) if isinstance(example, Example): - example.doc = doc + example.x = doc return example return doc def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): - for examples in util.minibatch(stream, size=batch_size): - docs = [self._get_doc(ex) for ex in examples] + for docs in util.minibatch(stream, size=batch_size): kb_ids, tensors = self.predict(docs) self.set_annotations(docs, kb_ids, tensors=tensors) - - if as_example: - for ex, doc in zip(examples, docs): - ex.doc = doc - yield ex - else: - yield from docs + yield from docs def predict(self, docs): """ Return the KB IDs for each entity in each doc, including NIL if there is no prediction """ @@ -1433,7 +1379,7 @@ class Sentencizer(Pipe): ): pass - def __call__(self, example): + def __call__(self, doc): """Apply the sentencizer to a Doc and set Token.is_sent_start. example (Doc or Example): The document to process. 
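        EXAMPLE (a sketch, assuming an `nlp` object is available):
            >>> sentencizer = Sentencizer()
            >>> doc = sentencizer(nlp.make_doc("Hello world. No problem."))
            >>> len(list(doc.sents))
            2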
@@ -1441,7 +1387,6 @@ class Sentencizer(Pipe): DOCS: https://spacy.io/api/sentencizer#call """ - doc = self._get_doc(example) start = 0 seen_period = False for i, token in enumerate(doc): @@ -1460,21 +1405,15 @@ class Sentencizer(Pipe): return example return doc - def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): - for examples in util.minibatch(stream, size=batch_size): - docs = [self._get_doc(ex) for ex in examples] + def pipe(self, stream, batch_size=128, n_threads=-1): + for docs in util.minibatch(stream, size=batch_size): predictions = self.predict(docs) if isinstance(predictions, tuple) and len(tuple) == 2: scores, tensors = predictions self.set_annotations(docs, scores, tensors=tensors) else: self.set_annotations(docs, predictions) - if as_example: - for ex, doc in zip(examples, docs): - ex.doc = doc - yield ex - else: - yield from docs + yield from docs def predict(self, docs): """Apply the pipeline's model to a batch of docs, without From af1b5f129b8653678291dd5f8a226cc8cfe78893 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 23:31:19 +0200 Subject: [PATCH 38/56] Use new example class in GoldCorpus --- spacy/gold/corpus.py | 55 ++++++++------------------------------------ 1 file changed, 10 insertions(+), 45 deletions(-) diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index 84de01665..8dc044639 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -9,7 +9,7 @@ from .. import util from ..errors import Errors, AlignmentError from .gold_io import read_json_file, json_to_annotations from .augment import make_orth_variants, add_noise -from .example import Example +from .new_example import NewExample as Example class GoldCorpus(object): @@ -203,59 +203,24 @@ class GoldCorpus(object): for eg_dict in annotations: if eg_dict["text"]: example = Example.from_dict( - eg_dict, - doc=nlp.make_doc(eg_dict["text"]) + nlp.make_doc(eg_dict["text"]), + eg_dict ) else: example = Example.from_dict( - eg_dict, - doc=Doc(nlp.vocab, words=eg_dict["words"]) + Doc(nlp.vocab, words=eg_dict["words"]), + eg_dict ) - example_docs = [] if gold_preproc: - split_examples = example.split_sents() - for split_example in split_examples: - split_example_docs = cls._make_docs( - nlp, - split_example, - gold_preproc, - noise_level=noise_level, - orth_variant_level=orth_variant_level, - ) - example_docs.extend(split_example_docs) + # TODO: Data augmentation + examples = example.split_sents() else: - example_docs = cls._make_docs( - nlp, - example, - gold_preproc, - noise_level=noise_level, - orth_variant_level=orth_variant_level, - ) - for ex in example_docs: - if (not max_length) or len(ex.doc) < max_length: + examples = [example] + for ex in examples: + if (not max_length) or len(ex.predicted) < max_length: if ignore_misaligned: try: _ = ex._deprecated_get_gold() except AlignmentError: continue yield ex - - @classmethod - def _make_docs( - cls, nlp, example, gold_preproc, noise_level=0.0, orth_variant_level=0.0 - ): - var_example = make_orth_variants( - nlp, example, orth_variant_level=orth_variant_level - ) - # gold_preproc is not used ?! 
- if example.text is not None: - var_text = add_noise(var_example.text, noise_level) - var_doc = nlp.make_doc(var_text) - var_example.doc = var_doc - else: - var_doc = Doc( - nlp.vocab, - words=add_noise(var_example.token_annotation.words, noise_level), - ) - var_example.doc = var_doc - return [var_example] From 82810b98466376daf37602cde75d0ec2b0352577 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 23:32:07 +0200 Subject: [PATCH 39/56] Update morphologizer --- spacy/pipeline/morphologizer.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 7116d7afd..c5d140a4e 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -51,9 +51,9 @@ class Morphologizer(Tagger): def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): for example in get_examples(): - for i, morph in enumerate(example.token_annotation.morphs): - pos = example.token_annotation.get_pos(i) - morph = Morphology.feats_to_dict(morph) + for i, token in enumerate(example.reference): + pos = token.pos_ + morph = token.morph norm_morph = self.vocab.strings[self.vocab.morphology.add(morph)] if pos: morph["POS"] = pos From ad547a4b8fc7d957dc70f6454b1c672a6941b49b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 23:39:46 +0200 Subject: [PATCH 40/56] Refactor towards new Example class --- spacy/pipeline/pipes.pyx | 35 ++++++++--------------------------- 1 file changed, 8 insertions(+), 27 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index c6233be90..58a76a9a1 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -52,23 +52,19 @@ class Pipe(object): """Create a new pipe instance.""" raise NotImplementedError - def __call__(self, example): + def __call__(self, Doc doc): """Apply the pipe to one document. The document is modified in-place, and returned. Both __call__ and pipe should delegate to the `predict()` and `set_annotations()` methods. """ - doc = self._get_doc(example) predictions = self.predict([doc]) if isinstance(predictions, tuple) and len(predictions) == 2: scores, tensors = predictions self.set_annotations([doc], scores, tensors=tensors) else: self.set_annotations([doc], predictions) - if isinstance(example, Example): - example.predicted = doc - return example return doc def pipe(self, stream, batch_size=128, n_threads=-1): @@ -77,20 +73,14 @@ class Pipe(object): Both __call__ and pipe should delegate to the `predict()` and `set_annotations()` methods. """ - for examples in util.minibatch(stream, size=batch_size): + for docs in util.minibatch(stream, size=batch_size): predictions = self.predict(docs) if isinstance(predictions, tuple) and len(tuple) == 2: scores, tensors = predictions self.set_annotations(docs, scores, tensors=tensors) else: self.set_annotations(docs, predictions) - - if as_example: - for ex, doc in zip(examples, docs): - ex.predicted = doc - yield ex - else: - yield from docs + yield from docs def predict(self, docs): """Apply the pipeline's model to a batch of docs, without @@ -102,7 +92,7 @@ class Pipe(object): """Modify a batch of documents, using pre-computed scores.""" raise NotImplementedError - def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None): + def update(self, docs, set_annotations=False, drop=0.0, sgd=None, losses=None): """Learn from a batch of documents and gold-standard information, updating the pipe's model. 
@@ -247,15 +237,12 @@ class Tagger(Pipe): def labels(self): return tuple(self.vocab.morphology.tag_names) - def __call__(self, example): + def __call__(self, doc): tags = self.predict([doc]) self.set_annotations([doc], tags) - if isinstance(example, Example): - example.predicted = doc - return example return doc - def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): + def pipe(self, stream, batch_size=128, n_threads=-1): for docs in util.minibatch(stream, size=batch_size): tag_ids = self.predict(docs) self.set_annotations(docs, tag_ids) @@ -833,7 +820,7 @@ class TextCategorizer(Pipe): def labels(self, value): self.cfg["labels"] = tuple(value) - def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): + def pipe(self, stream, batch_size=128, n_threads=-1): for docs in util.minibatch(stream, size=batch_size): scores, tensors = self.predict(docs) self.set_annotations(docs, scores, tensors=tensors) @@ -1194,12 +1181,9 @@ class EntityLinker(Pipe): def __call__(self, doc): kb_ids, tensors = self.predict([doc]) self.set_annotations([doc], kb_ids, tensors=tensors) - if isinstance(example, Example): - example.x = doc - return example return doc - def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): + def pipe(self, stream, batch_size=128, n_threads=-1): for docs in util.minibatch(stream, size=batch_size): kb_ids, tensors = self.predict(docs) self.set_annotations(docs, kb_ids, tensors=tensors) @@ -1400,9 +1384,6 @@ class Sentencizer(Pipe): seen_period = True if start < len(doc): doc[start].is_sent_start = True - if isinstance(example, Example): - example.doc = doc - return example return doc def pipe(self, stream, batch_size=128, n_threads=-1): From 337d2b5ad65508ce0897d7fea49fa39e33a8d327 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 23:58:16 +0200 Subject: [PATCH 41/56] Fix sent start in NewExample --- spacy/gold/new_example.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx index 136eca130..4247f21b5 100644 --- a/spacy/gold/new_example.pyx +++ b/spacy/gold/new_example.pyx @@ -94,13 +94,16 @@ def _annot2array(strings, tok_annot, doc_annot): elif key == "HEAD": attrs.append(key) values.append([h-i for i, h in enumerate(value)]) + elif key == "SENT_START": + attrs.append(key) + values.append(value) else: attrs.append(key) values.append([strings.add(v) for v in value]) # TODO: Calculate token.ent_kb_id from doc_annot["links"]. # We need to fix this and the doc.ents thing, both should be doc # annotations. 
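    # Each row of `values` holds one attribute column; the transpose below
    # yields the (n_tokens, n_attrs) layout that Doc.from_array expects.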
- array = numpy.array(values, dtype="uint64") + array = numpy.asarray(values, dtype="uint64") return attrs, array.T From 488727aee0ef3bee60113264f9348d9c1ad5e422 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Jun 2020 23:58:28 +0200 Subject: [PATCH 42/56] Start updating test --- spacy/tests/test_gold.py | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 29ddc7456..3c13259ba 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -1,7 +1,8 @@ from spacy.errors import AlignmentError from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags from spacy.gold import spans_from_biluo_tags, iob_to_biluo, align -from spacy.gold import GoldCorpus, docs_to_json, Example, DocAnnotation +from spacy.gold import GoldCorpus, docs_to_json, DocAnnotation +from spacy.gold.new_example import NewExample as Example from spacy.lang.en import English from spacy.syntax.nonproj import is_nonproj_tree from spacy.syntax.gold_parse import GoldParse, get_parses_from_example @@ -91,7 +92,7 @@ def merged_dict(): "ids": [1, 2, 3, 4, 5, 6, 7], "words": ["Hi", "there", "everyone", "It", "is", "just", "me"], "tags": ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"], - "sent_starts": [1, 0, 0, 1, 0, 0, 0, 0], + "sent_starts": [1, 0, 0, 1, 0, 0, 0], } @@ -482,8 +483,8 @@ def _train(train_data): def test_split_sents(merged_dict): nlp = English() example = Example.from_dict( - merged_dict, - doc=Doc(nlp.vocab, words=merged_dict["words"]) + Doc(nlp.vocab, words=merged_dict["words"]), + merged_dict ) assert len(get_parses_from_example( example, @@ -514,24 +515,20 @@ def test_split_sents(merged_dict): assert token_annotation_2.sent_starts == [1, 0, 0, 0] +# This fails on some None value? Need to look into that. 
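+# Token.is_sent_start yields booleans (or None), so the integer sent_starts
+# from the dict are compared via bool() below.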
+@pytest.mark.xfail # TODO def test_tuples_to_example(vocab, merged_dict): cats = {"TRAVEL": 1.0, "BAKING": 0.0} merged_dict = dict(merged_dict) merged_dict["cats"] = cats ex = Example.from_dict( - merged_dict, - doc=Doc(vocab, words=merged_dict["words"]) + Doc(vocab, words=merged_dict["words"]), + merged_dict ) - ex_dict = ex.to_dict() - assert ex_dict["token_annotation"]["ids"] == merged_dict["ids"] - assert ex_dict["token_annotation"]["words"] == merged_dict["words"] - assert ex_dict["token_annotation"]["tags"] == merged_dict["tags"] - assert ex_dict["token_annotation"]["sent_starts"] == merged_dict["sent_starts"] - assert ex_dict["doc_annotation"]["cats"] == cats - - -def test_empty_example_goldparse(): - nlp = English() - doc = nlp("") - example = Example(doc=doc) - assert len(get_parses_from_example(example)) == 1 + words = [token.text for token in ex.reference] + assert words == merged_dict["words"] + tags = [token.tag_ for token in ex.reference] + assert tags == merged_dict["tags"] + sent_starts = [token.is_sent_start for token in ex.reference] + assert sent_starts == [bool(v) for v in merged_dict["sent_starts"]] + example.reference.cats == cats From 6a67a1168235d75eaf7db95f5ee5cec482451990 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 11 Jun 2020 17:43:40 +0200 Subject: [PATCH 43/56] adding tests for new example class (some still failing - WIP) --- spacy/gold/new_example.pyx | 2 + spacy/tests/test_gold.py | 2 +- spacy/tests/test_new_example.py | 137 +++++++++++++++++++++++++++----- 3 files changed, 118 insertions(+), 23 deletions(-) diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx index 4247f21b5..fa50e4369 100644 --- a/spacy/gold/new_example.pyx +++ b/spacy/gold/new_example.pyx @@ -146,6 +146,8 @@ def _fix_legacy_dict_data(predicted, example_dict): ent_iobs, ent_types = _parse_ner_tags(predicted, words, value) token_dict["ENT_IOB"] = ent_iobs token_dict["ENT_TYPE"] = ent_types + else: + raise ValueError(f"Unknown attr: {key}") return { "token_annotation": token_dict, "doc_annotation": doc_dict diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 3c13259ba..f60f52e6e 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -531,4 +531,4 @@ def test_tuples_to_example(vocab, merged_dict): assert tags == merged_dict["tags"] sent_starts = [token.is_sent_start for token in ex.reference] assert sent_starts == [bool(v) for v in merged_dict["sent_starts"]] - example.reference.cats == cats + ex.reference.cats == cats diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py index 473666eca..a8651dfee 100644 --- a/spacy/tests/test_new_example.py +++ b/spacy/tests/test_new_example.py @@ -14,21 +14,25 @@ def test_Example_init_requires_doc_objects(): eg = Example(None, Doc(vocab, words=["hi"])) - def test_Example_from_dict_basic(): eg = Example.from_dict( - Doc(Vocab(), words=["hello", "world"]), - { - "words": ["hello", "world"] - } + Doc(Vocab(), words=["hello", "world"]), {"words": ["hello", "world"]} ) assert isinstance(eg.x, Doc) assert isinstance(eg.y, Doc) -@pytest.mark.parametrize("annots", [ - {"words": ["ice", "cream"], "tags": ["NN", "NN"]}, -]) +@pytest.mark.parametrize( + "annots", [{"words": ["ice", "cream"], "weirdannots": ["something", "such"]}] +) +def test_Example_from_dict_invalid(annots): + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + with pytest.raises(ValueError): + eg = Example.from_dict(predicted, annots) + + +@pytest.mark.parametrize("annots", [{"words": ["ice", 
"cream"], "tags": ["NN", "NN"]}]) def test_Example_from_dict_with_tags(annots): vocab = Vocab() predicted = Doc(vocab, words=annots["words"]) @@ -37,28 +41,117 @@ def test_Example_from_dict_with_tags(annots): assert token.tag_ == annots["tags"][i] -""" +@pytest.mark.xfail(reason="TODO - fix") +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["I", "like", "London", "and", "Berlin", "."], + "entities": [(7, 13, "LOC"), (18, 24, "LOC")], + } + ], +) def test_Example_from_dict_with_entities(annots): - # TODO - pass + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + eg = Example.from_dict(predicted, annots) + assert len(list(eg.reference.ents)) == 2 + +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["I", "like", "London", "and", "Berlin", "."], + "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"], + "heads": [1, 1, 1, 2, 2, 1], + } + ], +) def test_Example_from_dict_with_parse(annots): - # TODO - pass + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + eg = Example.from_dict(predicted, annots) + for i, token in enumerate(eg.reference): + assert token.dep_ == annots["deps"][i] + assert token.head.i == annots["heads"][i] + +@pytest.mark.xfail(reason="TODO - fix") +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["Sarah", "'s", "sister", "flew"], + "morphs": [ + "NounType=prop|Number=sing", + "Poss=yes", + "Number=sing", + "Tense=past|VerbForm=fin", + ], + } + ], +) def test_Example_from_dict_with_morphology(annots): - # TODO - pass + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + eg = Example.from_dict(predicted, annots) + for i, token in enumerate(eg.reference): + assert token.morph_ == annots["morphs"][i] + +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["This", "is", "one", "sentence", "this", "is", "another"], + "sent_starts": [1, 0, 0, 0, 1, 0, 0], + } + ], +) def test_Example_from_dict_with_sent_start(annots): - # TODO - pass + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + eg = Example.from_dict(predicted, annots) + assert len(list(eg.reference.sents)) == 2 + for i, token in enumerate(eg.reference): + assert bool(token.is_sent_start) == bool(annots["sent_starts"][i]) + +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["This", "is", "a", "sentence"], + "cats": {"cat1": 1.0, "cat2": 0.0, "cat3": 0.5}, + } + ], +) def test_Example_from_dict_with_cats(annots): - # TODO - pass + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + eg = Example.from_dict(predicted, annots) + assert len(list(eg.reference.cats)) == 3 + assert eg.reference.cats["cat1"] == 1.0 + assert eg.reference.cats["cat2"] == 0.0 + assert eg.reference.cats["cat3"] == 0.5 + +@pytest.mark.xfail(reason="TODO - fix") +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["Russ", "Cochran", "made", "reprints"], + "links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}, + } + ], +) def test_Example_from_dict_with_links(annots): - # TODO - pass -""" + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + eg = Example.from_dict(predicted, annots) + assert eg.reference[0].ent_kb_id_ == "Q7381115" + assert eg.reference[1].ent_kb_id_ == "Q7381115" + assert eg.reference[2].ent_kb_id_ == "" + assert eg.reference[3].ent_kb_id_ == "" From 3aed177a35ced290cd6eee9773cd73d012202745 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 12 Jun 2020 11:30:24 +0200 Subject: [PATCH 44/56] fix ENT_IOB conversion and enable unit test --- spacy/errors.py | 2 ++ spacy/gold/new_example.pyx | 9 +++++++++ 
spacy/tests/test_new_example.py | 16 +++++++++++++--- spacy/tokens/doc.pyx | 7 +++++++ spacy/tokens/token.pyx | 7 +++++-- 5 files changed, 36 insertions(+), 5 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 94a0218a7..8efef8333 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -581,6 +581,8 @@ class Errors(object): # TODO: fix numbering after merging develop into master + E985 = ("The 'ent_iob' attribute of a Token should be an integer indexing " + "into {values}, but found {value}.") E986 = ("Could not create any training batches: check your input. " "Perhaps discard_oversize should be set to False ?") E987 = ("The text of an example training instance is either a Doc or " diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx index fa50e4369..51007e8c3 100644 --- a/spacy/gold/new_example.pyx +++ b/spacy/gold/new_example.pyx @@ -1,4 +1,6 @@ import numpy + +from ..tokens import Token from ..tokens.doc cimport Doc from ..attrs import IDS from .align cimport Alignment @@ -97,6 +99,13 @@ def _annot2array(strings, tok_annot, doc_annot): elif key == "SENT_START": attrs.append(key) values.append(value) + elif key == "ENT_IOB": + iob_strings = Token.iob_strings() + attrs.append(key) + try: + values.append([iob_strings.index(v) for v in value]) + except ValueError: + raise ValueError(Errors.E985.format(values=iob_strings, value=values)) else: attrs.append(key) values.append([strings.add(v) for v in value]) diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py index a8651dfee..7a43cd9a6 100644 --- a/spacy/tests/test_new_example.py +++ b/spacy/tests/test_new_example.py @@ -29,7 +29,7 @@ def test_Example_from_dict_invalid(annots): vocab = Vocab() predicted = Doc(vocab, words=annots["words"]) with pytest.raises(ValueError): - eg = Example.from_dict(predicted, annots) + Example.from_dict(predicted, annots) @pytest.mark.parametrize("annots", [{"words": ["ice", "cream"], "tags": ["NN", "NN"]}]) @@ -46,8 +46,8 @@ def test_Example_from_dict_with_tags(annots): "annots", [ { - "words": ["I", "like", "London", "and", "Berlin", "."], - "entities": [(7, 13, "LOC"), (18, 24, "LOC")], + "words": ["I", "like", "New", "York", "and", "Berlin", "."], + "entities": [(7, 15, "LOC"), (20, 26, "LOC")], } ], ) @@ -56,6 +56,16 @@ def test_Example_from_dict_with_entities(annots): predicted = Doc(vocab, words=annots["words"]) eg = Example.from_dict(predicted, annots) assert len(list(eg.reference.ents)) == 2 + assert eg.reference[0].ent_iob_ == "O" + assert eg.reference[1].ent_iob_ == "O" + assert eg.reference[2].ent_iob_ == "B" + assert eg.reference[3].ent_iob_ == "I" + assert eg.reference[4].ent_iob_ == "O" + assert eg.reference[5].ent_iob_ == "B" + assert eg.reference[6].ent_iob_ == "O" + assert eg.reference[2].ent_type_ == "LOC" + assert eg.reference[3].ent_type_ == "LOC" + assert eg.reference[5].ent_type_ == "LOC" @pytest.mark.parametrize( diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 81cef4492..c4581d0a8 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -825,6 +825,13 @@ cdef class Doc: for i in range(length): if array[i, col] != 0: self.vocab.morphology.assign_tag(&tokens[i], array[i, col]) + # Verify ENT_IOB are proper integers + if ENT_IOB in attrs: + iob_strings = Token.iob_strings() + col = attrs.index(ENT_IOB) + for i in range(length): + if array[i, col] not in range(0, len(iob_strings)): + raise ValueError(Errors.E985.format(values=iob_strings, value=array[i, col])) # Now load the data for i in range(length): token = 
&self.c[i] diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 320cfaad5..f85a17d69 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -778,6 +778,10 @@ cdef class Token: """ return self.c.ent_iob + @classmethod + def iob_strings(cls): + return ("", "I", "O", "B") + @property def ent_iob_(self): """IOB code of named entity tag. "B" means the token begins an entity, @@ -787,8 +791,7 @@ cdef class Token: RETURNS (str): IOB code of named entity tag. """ - iob_strings = ("", "I", "O", "B") - return iob_strings[self.c.ent_iob] + return self.iob_strings()[self.c.ent_iob] property ent_id: """RETURNS (uint64): ID of the entity the token is an instance of, From 880dccf93e11be93bdd75c660617a551f589a82c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 12 Jun 2020 15:47:20 +0200 Subject: [PATCH 45/56] entities on doc_annotation, parse links and check their offsets against the entities. unit test works --- spacy/errors.py | 4 ++ spacy/gold/new_example.pyx | 76 +++++++++++++++++++++------- spacy/tests/test_new_example.py | 88 ++++++++++++++++++++------------- 3 files changed, 115 insertions(+), 53 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 8efef8333..e4f6610ee 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -581,6 +581,10 @@ class Errors(object): # TODO: fix numbering after merging develop into master + E983 = ("Each link annotation should refer to a dictionary with at most one " + "identifier mapping to 1.0, and all others to 0.0.") + E984 = ("The offsets of the annotations for 'links' need to refer exactly " + "to the offsets of the 'entities' annotations.") E985 = ("The 'ent_iob' attribute of a Token should be an integer indexing " "into {values}, but found {value}.") E986 = ("Could not create any training batches: check your input. " diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx index 51007e8c3..d2492a29f 100644 --- a/spacy/gold/new_example.pyx +++ b/spacy/gold/new_example.pyx @@ -85,12 +85,28 @@ cdef class NewExample: return self.x.text -def _annot2array(strings, tok_annot, doc_annot): +def _annot2array(vocab, tok_annot, doc_annot): attrs = [] values = [] + + for key, value in doc_annot.items(): + if key == "entities": + words = tok_annot["ORTH"] + ent_iobs, ent_types = _parse_ner_tags(vocab, words, value) + tok_annot["ENT_IOB"] = ent_iobs + tok_annot["ENT_TYPE"] = ent_types + elif key == "links": + entities = doc_annot.get("entities", {}) + if value and not entities: + raise ValueError(Errors.E984) + ent_kb_ids = _parse_links(vocab, words, value, entities) + tok_annot["ENT_KB_ID"] = ent_kb_ids + else: + raise ValueError(f"Unknown doc attribute: {key}") + for key, value in tok_annot.items(): if key not in IDS: - raise ValueError(f"Unknown attr: {key}") + raise ValueError(f"Unknown token attribute: {key}") elif key == "ORTH": pass elif key == "HEAD": @@ -108,10 +124,8 @@ def _annot2array(strings, tok_annot, doc_annot): raise ValueError(Errors.E985.format(values=iob_strings, value=values)) else: attrs.append(key) - values.append([strings.add(v) for v in value]) - # TODO: Calculate token.ent_kb_id from doc_annot["links"]. - # We need to fix this and the doc.ents thing, both should be doc - # annotations. 
+ values.append([vocab.strings.add(v) for v in value]) + array = numpy.asarray(values, dtype="uint64") return attrs, array.T @@ -129,8 +143,10 @@ def _fix_legacy_dict_data(predicted, example_dict): for key, value in example_dict.items(): if key in ("token_annotation", "doc_annotation"): pass - elif key in ("cats", "links"): + elif key in ("cats", "links") and value: doc_dict[key] = value + elif key in ("ner", "entities") and value: + doc_dict["entities"] = value else: token_dict[key] = value # Remap keys @@ -149,12 +165,6 @@ def _fix_legacy_dict_data(predicted, example_dict): for key, value in old_token_dict.items(): if key in remapping: token_dict[remapping[key]] = value - elif key in ("ner", "entities") and value: - # Arguably it would be smarter to put this in the doc annotation? - words = token_dict.get("words", [t.text for t in predicted]) - ent_iobs, ent_types = _parse_ner_tags(predicted, words, value) - token_dict["ENT_IOB"] = ent_iobs - token_dict["ENT_TYPE"] = ent_types else: raise ValueError(f"Unknown attr: {key}") return { @@ -163,16 +173,13 @@ def _fix_legacy_dict_data(predicted, example_dict): } -def _parse_ner_tags(predicted, words, biluo_or_offsets): +def _parse_ner_tags(vocab, words, biluo_or_offsets): if isinstance(biluo_or_offsets[0], (list, tuple)): # Convert to biluo if necessary # This is annoying but to convert the offsets we need a Doc # that has the target tokenization. - reference = Doc( - predicted.vocab, - words=words - ) - biluo = biluo_tags_from_offsets(predicted, biluo_or_offsets) + reference = Doc(vocab, words=words) + biluo = biluo_tags_from_offsets(reference, biluo_or_offsets) else: biluo = biluo_or_offsets ent_iobs = [] @@ -185,6 +192,37 @@ def _parse_ner_tags(predicted, words, biluo_or_offsets): ent_types.append("") return ent_iobs, ent_types +def _parse_links(vocab, words, links, entities): + reference = Doc(vocab, words=words) + + starts = {token.idx: token.i for token in reference} + ends = {token.idx + len(token): token.i for token in reference} + ent_kb_ids = ["" for _ in reference] + entity_map = [(ent[0], ent[1]) for ent in entities] + + # links annotations need to refer 1-1 to entity annotations - throw error otherwise + for index, annot_dict in links.items(): + start_char, end_char = index + if (start_char, end_char) not in entity_map: + raise ValueError(Errors.E984) + + for index, annot_dict in links.items(): + true_kb_ids = [] + for key, value in annot_dict.items(): + if value == 1.0: + true_kb_ids.append(key) + if len(true_kb_ids) > 1: + raise ValueError(Errors.E983) + + if len(true_kb_ids) == 1: + start_char, end_char = index + start_token = starts.get(start_char) + end_token = ends.get(end_char) + for i in range(start_token, end_token+1): + ent_kb_ids[i] = true_kb_ids[0] + + return ent_kb_ids + class Example: def get_aligned(self, field): diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py index 7a43cd9a6..4ebafb6bb 100644 --- a/spacy/tests/test_new_example.py +++ b/spacy/tests/test_new_example.py @@ -41,33 +41,6 @@ def test_Example_from_dict_with_tags(annots): assert token.tag_ == annots["tags"][i] -@pytest.mark.xfail(reason="TODO - fix") -@pytest.mark.parametrize( - "annots", - [ - { - "words": ["I", "like", "New", "York", "and", "Berlin", "."], - "entities": [(7, 15, "LOC"), (20, 26, "LOC")], - } - ], -) -def test_Example_from_dict_with_entities(annots): - vocab = Vocab() - predicted = Doc(vocab, words=annots["words"]) - eg = Example.from_dict(predicted, annots) - assert len(list(eg.reference.ents)) == 2 - 
assert eg.reference[0].ent_iob_ == "O" - assert eg.reference[1].ent_iob_ == "O" - assert eg.reference[2].ent_iob_ == "B" - assert eg.reference[3].ent_iob_ == "I" - assert eg.reference[4].ent_iob_ == "O" - assert eg.reference[5].ent_iob_ == "B" - assert eg.reference[6].ent_iob_ == "O" - assert eg.reference[2].ent_type_ == "LOC" - assert eg.reference[3].ent_type_ == "LOC" - assert eg.reference[5].ent_type_ == "LOC" - - @pytest.mark.parametrize( "annots", [ @@ -147,13 +120,39 @@ def test_Example_from_dict_with_cats(annots): assert eg.reference.cats["cat3"] == 0.5 -@pytest.mark.xfail(reason="TODO - fix") @pytest.mark.parametrize( "annots", [ { - "words": ["Russ", "Cochran", "made", "reprints"], - "links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}, + "words": ["I", "like", "New", "York", "and", "Berlin", "."], + "entities": [(7, 15, "LOC"), (20, 26, "LOC")], + } + ], +) +def test_Example_from_dict_with_entities(annots): + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + eg = Example.from_dict(predicted, annots) + assert len(list(eg.reference.ents)) == 2 + assert eg.reference[0].ent_iob_ == "O" + assert eg.reference[1].ent_iob_ == "O" + assert eg.reference[2].ent_iob_ == "B" + assert eg.reference[3].ent_iob_ == "I" + assert eg.reference[4].ent_iob_ == "O" + assert eg.reference[5].ent_iob_ == "B" + assert eg.reference[6].ent_iob_ == "O" + assert eg.reference[2].ent_type_ == "LOC" + assert eg.reference[3].ent_type_ == "LOC" + assert eg.reference[5].ent_type_ == "LOC" + + +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["I", "like", "New", "York", "and", "Berlin", "."], + "entities": [(7, 15, "LOC"), (20, 26, "LOC")], + "links": {(7, 15): {"Q60": 1.0, "Q64": 0.0}, (20, 26): {"Q60": 0.0, "Q64": 1.0}}, } ], ) @@ -161,7 +160,28 @@ def test_Example_from_dict_with_links(annots): vocab = Vocab() predicted = Doc(vocab, words=annots["words"]) eg = Example.from_dict(predicted, annots) - assert eg.reference[0].ent_kb_id_ == "Q7381115" - assert eg.reference[1].ent_kb_id_ == "Q7381115" - assert eg.reference[2].ent_kb_id_ == "" - assert eg.reference[3].ent_kb_id_ == "" + assert eg.reference[0].ent_kb_id_ == "" + assert eg.reference[1].ent_kb_id_ == "" + assert eg.reference[2].ent_kb_id_ == "Q60" + assert eg.reference[3].ent_kb_id_ == "Q60" + assert eg.reference[4].ent_kb_id_ == "" + assert eg.reference[5].ent_kb_id_ == "Q64" + assert eg.reference[6].ent_kb_id_ == "" + + +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["I", "like", "New", "York", "and", "Berlin", "."], + "entities": [(7, 15, "LOC"), (20, 26, "LOC")], + "links": {(0, 1): {"Q7381115": 1.0, "Q2146908": 0.0}}, + } + ], +) +def test_Example_from_dict_with_links_invalid(annots): + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + with pytest.raises(ValueError): + Example.from_dict(predicted, annots) + From a5ee082da1c1f4c01af2dc84d6bfe8195012c5f7 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 12 Jun 2020 15:49:38 +0200 Subject: [PATCH 46/56] cats bugfix --- spacy/gold/new_example.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx index d2492a29f..e7506d697 100644 --- a/spacy/gold/new_example.pyx +++ b/spacy/gold/new_example.pyx @@ -101,6 +101,8 @@ def _annot2array(vocab, tok_annot, doc_annot): raise ValueError(Errors.E984) ent_kb_ids = _parse_links(vocab, words, value, entities) tok_annot["ENT_KB_ID"] = ent_kb_ids + elif key == "cats": + pass else: raise ValueError(f"Unknown doc attribute: {key}") From 
face0de74f716a318be1db4be56f503985025407 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 12 Jun 2020 16:29:09 +0200 Subject: [PATCH 47/56] fix MORPH conversion + enable unit test --- spacy/gold/new_example.pyx | 5 ++++- spacy/tests/test_new_example.py | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx index e7506d697..46b8ed423 100644 --- a/spacy/gold/new_example.pyx +++ b/spacy/gold/new_example.pyx @@ -13,7 +13,7 @@ from ..errors import Errors, AlignmentError cpdef Doc annotations2doc(Doc predicted, tok_annot, doc_annot): # TODO: Improve and test this words = tok_annot.get("ORTH", [tok.text for tok in predicted]) - attrs, array = _annot2array(predicted.vocab.strings, tok_annot, doc_annot) + attrs, array = _annot2array(predicted.vocab, tok_annot, doc_annot) output = Doc(predicted.vocab, words=words) if array.size: output = output.from_array(attrs, array) @@ -117,6 +117,9 @@ def _annot2array(vocab, tok_annot, doc_annot): elif key == "SENT_START": attrs.append(key) values.append(value) + elif key == "MORPH": + attrs.append(key) + values.append([vocab.morphology.add(v) for v in value]) elif key == "ENT_IOB": iob_strings = Token.iob_strings() attrs.append(key) diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py index 4ebafb6bb..0be78624a 100644 --- a/spacy/tests/test_new_example.py +++ b/spacy/tests/test_new_example.py @@ -60,7 +60,6 @@ def test_Example_from_dict_with_parse(annots): assert token.head.i == annots["heads"][i] -@pytest.mark.xfail(reason="TODO - fix") @pytest.mark.parametrize( "annots", [ From b078b05ecd7d1d78a1f67f1f178db18ae6c7280f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 13 Jun 2020 15:30:12 +0200 Subject: [PATCH 48/56] Handle various data better in NewExample --- spacy/gold/new_example.pyx | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx index 46b8ed423..eb796eb83 100644 --- a/spacy/gold/new_example.pyx +++ b/spacy/gold/new_example.pyx @@ -148,6 +148,8 @@ def _fix_legacy_dict_data(predicted, example_dict): for key, value in example_dict.items(): if key in ("token_annotation", "doc_annotation"): pass + elif key == "ids": + pass elif key in ("cats", "links") and value: doc_dict[key] = value elif key in ("ner", "entities") and value: @@ -168,10 +170,15 @@ def _fix_legacy_dict_data(predicted, example_dict): old_token_dict = token_dict token_dict = {} for key, value in old_token_dict.items(): - if key in remapping: + if key in ("text", "ids", "entities", "ner", "brackets"): + pass + elif key in remapping: token_dict[remapping[key]] = value else: raise ValueError(f"Unknown attr: {key}") + if "HEAD" in token_dict and "SENT_START" in token_dict: + # If heads are set, we don't also redundantly specify SENT_START. 
+ token_dict.pop("SENT_START") return { "token_annotation": token_dict, "doc_annotation": doc_dict From 5564314d323f746a180a81888e76166a3687ff11 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 13 Jun 2020 15:43:35 +0200 Subject: [PATCH 49/56] Suggest approach for GoldParse --- spacy/gold/new_example.pyx | 13 +++++++--- spacy/syntax/gold_parse.pyx | 50 +++++++++++-------------------------- 2 files changed, 24 insertions(+), 39 deletions(-) diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx index eb796eb83..d9a712e38 100644 --- a/spacy/gold/new_example.pyx +++ b/spacy/gold/new_example.pyx @@ -76,10 +76,15 @@ cdef class NewExample: raise NotImplementedError def to_dict(self): - """ Note that this method does NOT export the doc, only the annotations ! """ - token_dict = self._token_annotation - doc_dict = self._doc_annotation - return {"token_annotation": token_dict, "doc_annotation": doc_dict} + # We should probably implement this? We could return the + # doc_annotation and token_annotation, and this would allow us to + # easily implement the `get_parses_from_example` in + # spacy.syntax.gold_parse + raise NotImplementedError + + def split_sents(self): + # Unclear whether we should really implement this. I guess? + raise NotImplementedError def text(self): return self.x.text diff --git a/spacy/syntax/gold_parse.pyx b/spacy/syntax/gold_parse.pyx index 05361fd82..9712f6e94 100644 --- a/spacy/syntax/gold_parse.pyx +++ b/spacy/syntax/gold_parse.pyx @@ -25,54 +25,34 @@ def is_punct_label(label): def get_parses_from_example( - eg, merge=True, vocab=None, make_projective=True, ignore_misaligned=False + example, merge=True, vocab=None, make_projective=True, ignore_misaligned=False ): """Return a list of (doc, GoldParse) objects. If merge is set to True, keep all Token annotations as one big list.""" - d = eg.doc_annotation # merge == do not modify Example if merge: - t = eg.token_annotation - doc = eg.doc - if doc is None or not isinstance(doc, Doc): - if not vocab: - raise ValueError(Errors.E998) - doc = Doc(vocab, words=t.words) + examples = [example] + else: + # not merging: one GoldParse per sentence, defining docs with the words + # from each sentence + examples = eg.split_sents() + outputs = [] + for eg in examples: + eg_dict = eg.to_dict() try: gp = GoldParse.from_annotation( - doc, d, t, make_projective=make_projective + eg.predicted, + eg_dict["doc_annotation"], + eg_dict["token_annotation"], + make_projective=make_projective ) except AlignmentError: if ignore_misaligned: gp = None else: raise - return [(doc, gp)] - # not merging: one GoldParse per sentence, defining docs with the words - # from each sentence - else: - parses = [] - split_examples = eg.split_sents() - for split_example in split_examples: - if not vocab: - raise ValueError(Errors.E998) - split_doc = Doc(vocab, words=split_example.token_annotation.words) - try: - gp = GoldParse.from_annotation( - split_doc, - d, - split_example.token_annotation, - make_projective=make_projective, - ) - except AlignmentError: - if ignore_misaligned: - gp = None - else: - raise - if gp is not None: - parses.append((split_doc, gp)) - return parses - + outputs.append((eg.predicted, gp)) + return outputs cdef class GoldParse: From 3eb8f3867e03e6d4c4017c081189f9505c2f7567 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 13 Jun 2020 23:05:16 +0200 Subject: [PATCH 50/56] Update test --- spacy/tests/test_gold.py | 44 ++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 29 deletions(-) diff 
--git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index f60f52e6e..cc9224ae1 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -6,6 +6,7 @@ from spacy.gold.new_example import NewExample as Example from spacy.lang.en import English from spacy.syntax.nonproj import is_nonproj_tree from spacy.syntax.gold_parse import GoldParse, get_parses_from_example +from spacy.syntax.gold_parse import get_parses_from_example from spacy.tokens import Doc from spacy.util import get_words_and_spaces, compounding, minibatch import pytest @@ -279,22 +280,21 @@ def test_roundtrip_docs_to_json(doc): goldcorpus = GoldCorpus(train=str(json_file), dev=str(json_file)) reloaded_example = next(goldcorpus.dev_dataset(nlp=nlp)) - goldparse = reloaded_example._deprecated_get_gold() assert len(doc) == goldcorpus.count_train() - assert text == reloaded_example.text - assert tags == goldparse.tags - assert pos == goldparse.pos - assert morphs == goldparse.morphs - assert lemmas == goldparse.lemmas - assert deps == goldparse.labels - assert heads == goldparse.heads - assert biluo_tags == goldparse.ner - assert "TRAVEL" in goldparse.cats - assert "BAKING" in goldparse.cats - assert cats["TRAVEL"] == goldparse.cats["TRAVEL"] - assert cats["BAKING"] == goldparse.cats["BAKING"] + assert text == reloaded_example.predicted.text + assert tags == [t.tag_ for t in reloaded_example.reference] + assert pos == [t.pos_ for t in reloaded_example.reference] + assert morphs == [t.morph_ for t in reloaded_example.reference] + assert lemmas == [t.lemma_ for t in reloaded_example.reference] + assert deps == [t.dep_ for t in reloaded_example.reference] + assert heads == [t.head.i for t in reloaded_example.reference] + assert "TRAVEL" in reloaded_example.reference.cats + assert "BAKING" in reloaded_example.reference.cats + assert cats["TRAVEL"] == reloaded_example.reference.cats["TRAVEL"] + assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"] +@pytest.mark.xfail # TODO do we need to do the projectivity differently? 
def test_projective_train_vs_nonprojective_dev(doc): nlp = English() deps = [t.dep_ for t in doc] @@ -310,7 +310,7 @@ def test_projective_train_vs_nonprojective_dev(doc): train_goldparse = get_parses_from_example(train_reloaded_example)[0][1] dev_reloaded_example = next(goldcorpus.dev_dataset(nlp)) - dev_goldparse = dev_reloaded_example._deprecated_get_gold() + dev_goldparse = get_parses_from_example(dev_reloaded_example)[0][1] assert is_nonproj_tree([t.head.i for t in doc]) is True assert is_nonproj_tree(train_goldparse.heads) is False @@ -365,7 +365,7 @@ def test_make_orth_variants(doc): # due to randomness, test only that this runs with no errors for now train_reloaded_example = next(goldcorpus.train_dataset(nlp, orth_variant_level=0.2)) - train_goldparse = train_reloaded_example._deprecated_get_gold() + train_goldparse = get_parses_from_example(train_reloaded_example)[0][1] @pytest.mark.parametrize( @@ -419,20 +419,6 @@ def test_gold_constructor(): assert gold.words == ["This", "is", "a", "sentence"] -def test_gold_orig_annot(): - nlp = English() - doc = nlp("This is a sentence") - gold = GoldParse(doc, cats={"cat1": 1.0, "cat2": 0.0}) - - assert gold.orig.words == ["This", "is", "a", "sentence"] - assert gold.cats["cat1"] - - doc_annotation = DocAnnotation(cats={"cat1": 0.0, "cat2": 1.0}) - gold2 = GoldParse.from_annotation(doc, doc_annotation, gold.orig) - assert gold2.orig.words == ["This", "is", "a", "sentence"] - assert not gold2.cats["cat1"] - - def test_tuple_format_implicit(): """Test tuple format with implicit GoldParse creation""" From caa75087252649e527923f31ee88fe01e4694f7d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 13 Jun 2020 23:10:21 +0200 Subject: [PATCH 51/56] Draft missing NewExample stuff --- spacy/gold/new_example.pyx | 70 +++++++++++++++++++++++++++++++++----- 1 file changed, 61 insertions(+), 9 deletions(-) diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx index d9a712e38..5b66d0cae 100644 --- a/spacy/gold/new_example.pyx +++ b/spacy/gold/new_example.pyx @@ -5,7 +5,7 @@ from ..tokens.doc cimport Doc from ..attrs import IDS from .align cimport Alignment from .annotation import TokenAnnotation, DocAnnotation -from .iob_utils import biluo_to_iob, biluo_tags_from_offsets +from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc from .align import Alignment from ..errors import Errors, AlignmentError @@ -73,18 +73,70 @@ cdef class NewExample: return self._alignment def get_aligned(self, field): - raise NotImplementedError + """Return an aligned array for a token attribute.""" + # TODO: This is probably wrong. I just bashed this out and there's probably + # all sorts of edge-cases. + alignment = self.alignment + i2j_multi = alignment.i2j_multi + gold_to_cand = alignment.gold_to_cand + cand_to_gold = alignment.cand_to_gold + + gold_values = self.reference.to_array([field]) + output = [] + for i, gold_i in enumerate(cand_to_gold): + if self.predicted[i].text.isspace(): + output.append(None) + elif gold_i is None: + if i in i2j_multi: + output.append(gold_values[i2j_multi[i]]) + else: + output.append(None) + else: + output.append(gold_values[gold_i]) + return output def to_dict(self): - # We should probably implement this? 
We could return the
-        # doc_annotation and token_annotation, and this would allow us to
-        # easily implement the `get_parses_from_example` in
-        # spacy.syntax.gold_parse
-        raise NotImplementedError
+        return {
+            "doc_annotation": {
+                "cats": dict(self.reference.cats),
+                "links": [], # TODO
+            },
+            "token_annotation": {
+                "ids": [t.i+1 for t in self.reference],
+                "words": [t.text for t in self.reference],
+                "tags": [t.tag_ for t in self.reference],
+                "lemmas": [t.lemma_ for t in self.reference],
+                "pos": [t.pos_ for t in self.reference],
+                "morphs": [t.morph_ for t in self.reference],
+                "heads": [t.head.i for t in self.reference],
+                "deps": [t.dep_ for t in self.reference],
+                "sent_starts": [int(bool(t.is_sent_start)) for t in self.reference],
+                "entities": biluo_tags_from_doc(self.reference)
+            }
+        }

     def split_sents(self):
-        # Unclear whether we should really implement this. I guess?
-        raise NotImplementedError
+        """ Split the token annotations into multiple Examples based on
+        sent_starts and return a list of the new Examples"""
+        if not self.reference.is_sentenced:
+            return [self]
+        # TODO: Do this for misaligned somehow?
+        predicted_words = [t.text for t in self.predicted]
+        reference_words = [t.text for t in self.reference]
+        if predicted_words != reference_words:
+            raise NotImplementedError("TODO: Implement this")
+        # Implement the easy case.
+        output = []
+        cls = self.__class__
+        for sent in self.reference.sents:
+            # I guess for misaligned we just need to use the gold_to_cand?
+            output.append(
+                cls(
+                    self.predicted[sent.start : sent.end].as_doc(),
+                    sent.as_doc()
+                )
+            )
+        return output

     def text(self):
         return self.x.text

From 3a0bbcfb4ca31c89a8235e91d454ae5ceb6da424 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 13 Jun 2020 23:10:54 +0200
Subject: [PATCH 52/56] Add biluo_tags_from_doc function

---
 spacy/gold/iob_utils.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/spacy/gold/iob_utils.py b/spacy/gold/iob_utils.py
index 2f0f116a1..6d16cf1a5 100644
--- a/spacy/gold/iob_utils.py
+++ b/spacy/gold/iob_utils.py
@@ -47,6 +47,14 @@ def _consume_ent(tags):
     return [start] + middle + [end]
 
 
+def biluo_tags_from_doc(doc, missing="O"):
+    return biluo_tags_from_offsets(
+        doc,
+        [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents],
+        missing=missing
+    )
+
+
 def biluo_tags_from_offsets(doc, entities, missing="O"):
     """Encode labelled spans into per-token tags, using the
     Begin/In/Last/Unit/Out scheme (BILUO).
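To make the new helper's behaviour concrete, here is a minimal sketch of `biluo_tags_from_doc` on a hand-built `Doc` with a single entity span. The words, span indices and expected tags are illustrative only; the import path assumes this branch's `spacy/gold/iob_utils.py` layout:

    from spacy.vocab import Vocab
    from spacy.tokens import Doc, Span
    from spacy.gold.iob_utils import biluo_tags_from_doc

    doc = Doc(Vocab(), words=["I", "like", "New", "York", "."])
    # Mark the two-token span "New York" (tokens 2-4) as a LOC entity.
    doc.ents = [Span(doc, 2, 4, label="LOC")]
    # The helper reads (start_char, end_char, label) triples off doc.ents
    # and re-encodes them as per-token tags via biluo_tags_from_offsets:
    assert biluo_tags_from_doc(doc) == ["O", "O", "B-LOC", "L-LOC", "O"]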
From 8f941ef527794ca7b7102b69c73e562731248b4d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 13 Jun 2020 23:11:29 +0200 Subject: [PATCH 53/56] Update GoldParse --- spacy/syntax/gold_parse.pyx | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/spacy/syntax/gold_parse.pyx b/spacy/syntax/gold_parse.pyx index 9712f6e94..d547de821 100644 --- a/spacy/syntax/gold_parse.pyx +++ b/spacy/syntax/gold_parse.pyx @@ -35,7 +35,7 @@ def get_parses_from_example( else: # not merging: one GoldParse per sentence, defining docs with the words # from each sentence - examples = eg.split_sents() + examples = example.split_sents() outputs = [] for eg in examples: eg_dict = eg.to_dict() @@ -62,18 +62,21 @@ cdef class GoldParse: """ @classmethod def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False): - return cls(doc, words=token_annotation.words, - tags=token_annotation.tags, - pos=token_annotation.pos, - morphs=token_annotation.morphs, - lemmas=token_annotation.lemmas, - heads=token_annotation.heads, - deps=token_annotation.deps, - entities=token_annotation.entities, - sent_starts=token_annotation.sent_starts, - cats=doc_annotation.cats, - links=doc_annotation.links, - make_projective=make_projective) + return cls( + doc, + words=token_annotation["words"], + tags=token_annotation["tags"], + pos=token_annotation["pos"], + morphs=token_annotation["morphs"], + lemmas=token_annotation["lemmas"], + heads=token_annotation["heads"], + deps=token_annotation["deps"], + entities=token_annotation["entities"], + sent_starts=token_annotation["sent_starts"], + cats=doc_annotation["cats"], + links=doc_annotation["links"], + make_projective=make_projective + ) def get_token_annotation(self): ids = None From 7de997c0a53adaf7ed8881c44593946d772a5081 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 13 Jun 2020 23:11:45 +0200 Subject: [PATCH 54/56] Update test --- spacy/tests/test_gold.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index cc9224ae1..6e3f7b2ba 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -488,17 +488,15 @@ def test_split_sents(merged_dict): split_examples = example.split_sents() assert len(split_examples) == 2 - token_annotation_1 = split_examples[0].token_annotation - assert token_annotation_1.ids == [1, 2, 3] - assert token_annotation_1.words == ["Hi", "there", "everyone"] - assert token_annotation_1.tags == ["INTJ", "ADV", "PRON"] - assert token_annotation_1.sent_starts == [1, 0, 0] + token_annotation_1 = split_examples[0].to_dict()["token_annotation"] + assert token_annotation_1["words"] == ["Hi", "there", "everyone"] + assert token_annotation_1["tags"] == ["INTJ", "ADV", "PRON"] + assert token_annotation_1["sent_starts"] == [1, 0, 0] - token_annotation_2 = split_examples[1].token_annotation - assert token_annotation_2.ids == [4, 5, 6, 7] - assert token_annotation_2.words == ["It", "is", "just", "me"] - assert token_annotation_2.tags == ["PRON", "AUX", "ADV", "PRON"] - assert token_annotation_2.sent_starts == [1, 0, 0, 0] + token_annotation_2 = split_examples[1].to_dict()["token_annotation"] + assert token_annotation_2["words"] == ["It", "is", "just", "me"] + assert token_annotation_2["tags"] == ["PRON", "AUX", "ADV", "PRON"] + assert token_annotation_2["sent_starts"] == [1, 0, 0, 0] # This fails on some None value? Need to look into that. 
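Taken together, patches 51-54 give the round trip that `test_split_sents` exercises: build an Example from a legacy-style dict, split it on the reference sentence boundaries, and serialize each piece back out with `to_dict()`. A condensed sketch, mirroring the test above; it assumes the draft `NewExample` implementation as it stands in these patches, including the legacy key remapping of `words` and `sent_starts`:

    from spacy.lang.en import English
    from spacy.tokens import Doc
    from spacy.gold.new_example import NewExample as Example

    nlp = English()
    annots = {
        "words": ["Hi", "there", "everyone", "It", "is", "just", "me"],
        "sent_starts": [1, 0, 0, 1, 0, 0, 0],
    }
    eg = Example.from_dict(Doc(nlp.vocab, words=annots["words"]), annots)
    # split_sents slices one new Example out of each reference sentence,
    # and to_dict() re-serializes the per-sentence annotations.
    first, second = eg.split_sents()
    assert first.to_dict()["token_annotation"]["words"] == ["Hi", "there", "everyone"]
    assert second.to_dict()["token_annotation"]["sent_starts"] == [1, 0, 0, 0]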
From 4362ec7084597f90919c5d9e33523c955b96d472 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 13 Jun 2020 23:37:42 +0200
Subject: [PATCH 55/56] Hack Language.evaluate

---
 spacy/language.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 57664ec17..4ab9bed5a 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -723,24 +723,26 @@ class Language(object):
 
         DOCS: https://spacy.io/api/language#evaluate
         """
-        examples = Example.to_example_objects(examples, make_doc=self.make_doc)
+        examples = Example.to_example_objects(examples)
         if scorer is None:
             scorer = Scorer(pipeline=self.pipeline)
         if component_cfg is None:
             component_cfg = {}
+        docs = (eg.predicted for eg in examples)
         for name, pipe in self.pipeline:
             kwargs = component_cfg.get(name, {})
             kwargs.setdefault("batch_size", batch_size)
             if not hasattr(pipe, "pipe"):
-                examples = _pipe(examples, pipe, kwargs)
+                docs = _pipe(docs, pipe, kwargs)
             else:
-                examples = pipe.pipe(examples, as_example=True, **kwargs)
-        for ex in examples:
+                docs = pipe.pipe(docs, **kwargs)
+        for doc, eg in zip(docs, examples):
             if verbose:
-                print(ex.doc)
+                print(doc)
+            eg.predicted = doc
             kwargs = component_cfg.get("scorer", {})
             kwargs.setdefault("verbose", verbose)
-            scorer.score(ex, **kwargs)
+            scorer.score(eg, **kwargs)
         return scorer
 
     @contextmanager

From 380cce9d8b3b1a90c3b25b5187a3d666ee416d71 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 14 Jun 2020 17:40:05 +0200
Subject: [PATCH 56/56] Update errors

---
 spacy/errors.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 459301315..9c7bf9e50 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -580,14 +580,6 @@ class Errors(object):
             "table, which contains {n_rows} vectors.")
 
     # TODO: fix numbering after merging develop into master
-
-    E983 = ("Each link annotation should refer to a dictionary with at most one "
-            "identifier mapping to 1.0, and all others to 0.0.")
-    E984 = ("The offsets of the annotations for 'links' need to refer exactly "
-            "to the offsets of the 'entities' annotations.")
-    E985 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
-            "into {values}, but found {value}.")
-
     E983 = ("Invalid key for '{dict_name}': {key}. Available keys: "
             "{keys}")
     E984 = ("Could not parse the {input} - double check the data is written "
@@ -628,6 +620,14 @@ class Errors(object):
     E999 = ("Encountered an unexpected format for the dictionary holding "
             "gold annotations: {gold_dict}")
 
+    # TODO: These were left over after a merge, but I couldn't find them?
+    #E983 = ("Each link annotation should refer to a dictionary with at most one "
+    #        "identifier mapping to 1.0, and all others to 0.0.")
+    #E984 = ("The offsets of the annotations for 'links' need to refer exactly "
+    #        "to the offsets of the 'entities' annotations.")
+    #E985 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
+    #        "into {values}, but found {value}.")
+
 
 @add_codes
 class TempErrors(object):
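The core of the evaluate hack in patch 55 is worth spelling out: only the predicted side of each example flows through the (lazy) pipeline, and the processed docs are zipped back onto their examples before scoring, so the gold reference docs are never touched. A self-contained sketch of that data flow, using plain-Python stand-ins for Doc, Example and a pipeline component (stand-ins only, not spaCy's actual classes):

    class Example:
        # Stand-in: pairs a predicted doc with an untouched reference doc.
        def __init__(self, predicted, reference):
            self.predicted = predicted
            self.reference = reference

    def pipe(docs):
        # Stand-in for a pipeline component: annotates docs lazily.
        return (doc + " [annotated]" for doc in docs)

    examples = [Example("I like New York", "I like New York"),
                Example("Berlin is big", "Berlin is big")]
    # Only the predicted side flows through the pipeline...
    docs = (eg.predicted for eg in examples)
    docs = pipe(docs)
    # ...and each processed doc is re-attached before scoring, so a scorer
    # can compare eg.predicted against the unchanged gold eg.reference.
    for doc, eg in zip(docs, examples):
        eg.predicted = doc
    assert all(eg.predicted.endswith("[annotated]") for eg in examples)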