Mirror of https://github.com/explosion/spaCy.git
Restructure Example with merged sents as default (#4632)
* Switch to train_dataset() function in train CLI

* Fixes for pipe() methods in pipeline components
  * Don't clobber `examples` variable with `as_example` in pipe() methods
  * Remove unnecessary traversals of `examples`

* Update Parser.pipe() for Examples
  * Add `as_examples` kwarg to `pipe()` with implementation to return `Example`s
  * Accept `Doc` or `Example` in `pipe()` with `_get_doc()` (copied from `Pipe`)

* Fixes to Example implementation in spacy.gold
  * Move `make_projective` from an attribute of `Example` to an argument of `Example.get_gold_parses()`
  * Heads of 0 are not treated as unset
  * Unset heads are set to self rather than `None` (which causes problems while projectivizing)
  * Check for `Doc` (not just not `None`) when creating GoldParses for a pre-merged example
  * Don't clobber `examples` variable in `iter_gold_docs()`

* Add/modify gold tests for handling projectivity
  * In the JSON roundtrip, compare results from `dev_dataset` rather than `train_dataset` to avoid projectivization (and other potential modifications)
  * Add test for projective train vs. nonprojective dev versions of the same `Doc`

* Handle ignore_misaligned as arg rather than attr

  Move `ignore_misaligned` from an attribute of `Example` to an argument to `Example.get_gold_parses()`, which makes it parallel to `make_projective`. Add a test with old and new align that checks whether `ignore_misaligned` errors are raised as expected (only for new align).

* Remove unused attrs from gold.pxd

  Remove `ignore_misaligned` and `make_projective` from `gold.pxd`.

* Restructure Example with merged sents as default

  An `Example` now includes a single `TokenAnnotation` that includes all the information from one `Doc` (= JSON `paragraph`). If required, the individual sentences can be returned as a list of examples with `Example.split_sents()`, with no raw text available.

  * Input/output a single `Example.token_annotation`
  * Add `sent_starts` to `TokenAnnotation` to handle sentence boundaries
  * Replace `Example.merge_sents()` with `Example.split_sents()`
  * Modify components (pipeline components and the conllu2json converter) to use a single `Example.token_annotation`
  * Rework/rename `add_token_annotation()` and `add_doc_annotation()` to `set_token_annotation()` and `set_doc_annotation()`, functions that set rather than append/extend
  * Rename `morphology` to `morphs` in `TokenAnnotation` and `GoldParse`
  * Add getters to `TokenAnnotation` to supply default values when a given attribute is not available
  * `Example.get_gold_parses()` in `spacy.gold._make_golds()` is only applied to single examples, so the `GoldParse` is saved in the provided `Example` rather than creating a new `Example` with no other internal annotation
  * Update tests for API changes and `merge_sents()` vs. `split_sents()`

* Refer to Example.goldparse in iter_gold_docs()

  Use `Example.goldparse` in `iter_gold_docs()` instead of `Example.gold`, because a `None` `GoldParse` is generated with `ignore_misaligned`, and generating it on the fly can raise an unwanted AlignmentError.

* Fix make_orth_variants()

  Fix a bug in make_orth_variants() related to the conversion from multiple to one TokenAnnotation per Example.

* Add basic test for make_orth_variants()

* Replace try/except with conditionals

* Replace default morph value with set
This commit is contained in:
parent 44829950ba
commit 392c4880d9
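To illustrate the reworked API before the diffs, here is a minimal usage sketch (not part of the commit; it mirrors the `merged_dict` fixture and `test_split_sents` test added below, with the fixture's extra trailing `sent_starts` value dropped; import paths assume spacy.gold at this commit):

    from spacy.gold import Example
    from spacy.lang.en import English

    nlp = English()
    example = Example()
    # One TokenAnnotation now carries a whole paragraph; sentence
    # boundaries live in sent_starts instead of separate annotations.
    example.set_token_annotation(
        ids=[1, 2, 3, 4, 5, 6, 7],
        words=["Hi", "there", "everyone", "It", "is", "just", "me"],
        tags=["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"],
        sent_starts=[1, 0, 0, 1, 0, 0, 0],
    )
    # Split back into one Example per sentence (no raw text retained)
    assert len(example.split_sents()) == 2
    # merge=True (the new default) yields a single (doc, GoldParse) pair
    assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1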
@@ -24,17 +24,16 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
     checked_for_ner = False
     has_ner_tags = False
     for i, example in enumerate(conll_data):
-        for token_annotation in example.token_annotations:
-            if not checked_for_ner:
-                has_ner_tags = is_ner(token_annotation.entities[0])
-                checked_for_ner = True
-            sentences.append(generate_sentence(token_annotation, has_ner_tags))
+        if not checked_for_ner:
+            has_ner_tags = is_ner(example.token_annotation.entities[0])
+            checked_for_ner = True
+        sentences.append(generate_sentence(example.token_annotation, has_ner_tags))
         # Real-sized documents could be extracted using the comments on the
-        # conluu document
+        # conllu document
         if len(sentences) % n_sents == 0:
             doc = create_doc(sentences, i)
             docs.append(doc)
             sentences = []
     return docs

@@ -84,7 +83,7 @@ def read_conllx(input_data, use_morphology=False, n=0):
                 print(line)
                 raise
         example = Example(doc=None)
-        example.add_token_annotation(ids=ids, words=words, tags=tags,
+        example.set_token_annotation(ids=ids, words=words, tags=tags,
                                      heads=heads, deps=deps, entities=ents)
         yield example
         i += 1
@@ -25,7 +25,7 @@ cdef class GoldParse:
     cdef public int loss
     cdef public list words
     cdef public list tags
-    cdef public list morphology
+    cdef public list morphs
     cdef public list heads
     cdef public list labels
     cdef public dict orths

@@ -45,7 +45,8 @@ cdef class TokenAnnotation:
     cdef public list heads
     cdef public list deps
     cdef public list entities
-    cdef public list morphology
+    cdef public list morphs
+    cdef public list sent_starts
     cdef public list brackets

@@ -56,7 +57,7 @@ cdef class DocAnnotation:

 cdef class Example:
     cdef public object doc
-    cdef public list token_annotations
+    cdef public TokenAnnotation token_annotation
     cdef public DocAnnotation doc_annotation
     cdef public object goldparse
spacy/gold.pyx
@@ -215,7 +215,7 @@ class GoldCorpus(object):
                 ex_dict = example.to_dict()
                 text = example.text
                 srsly.write_msgpack(directory / "{}.msg".format(i), (text, ex_dict))
-                n += len(example.token_annotations)
+                n += 1
                 if limit and n >= limit:
                     break
@@ -271,7 +271,7 @@ class GoldCorpus(object):
             raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported))
         for example in examples:
             yield example
-            i += len(example.token_annotations)
+            i += 1
             if limit and i >= limit:
                 return
@@ -286,15 +286,14 @@ class GoldCorpus(object):
         yield from self.read_examples(locs, limit=self.limit)

     def count_train(self):
-        # TODO: should this count words or sentences ?
+        """Returns count of words in train examples"""
         n = 0
         i = 0
         for example in self.train_examples:
-            for token_annotation in example.token_annotations:
-                n += len(token_annotation.words)
-                if self.limit and i >= self.limit:
-                    break
-                i += 1
+            n += len(example.token_annotation.words)
+            if self.limit and i >= self.limit:
+                break
+            i += 1
         return n

     def train_dataset(self, nlp, gold_preproc=False, max_length=None,
@@ -328,18 +327,27 @@ class GoldCorpus(object):
     def iter_gold_docs(cls, nlp, examples, gold_preproc, max_length=None,
                        noise_level=0.0, orth_variant_level=0.0,
                        make_projective=False, ignore_misaligned=False):
-        """ Setting gold_preproc will result in creating a doc per 'sentence' """
+        """ Setting gold_preproc will result in creating a doc per sentence """
         for example in examples:
             if gold_preproc:
                 example.doc = None
+                split_examples = example.split_sents()
+                example_golds = []
+                for split_example in split_examples:
+                    split_example_docs = cls._make_docs(nlp, split_example,
+                            gold_preproc, noise_level=noise_level,
+                            orth_variant_level=orth_variant_level)
+                    split_example_golds = cls._make_golds(split_example_docs,
+                            vocab=nlp.vocab, make_projective=make_projective,
+                            ignore_misaligned=ignore_misaligned)
+                    example_golds.extend(split_example_golds)
             else:
-                example = example.merge_sents()
-            example_docs = cls._make_docs(nlp, example,
-                    gold_preproc, noise_level=noise_level,
-                    orth_variant_level=orth_variant_level)
-            example_golds = cls._make_golds(example_docs, vocab=nlp.vocab,
-                    make_projective=make_projective,
-                    ignore_misaligned=ignore_misaligned)
+                example_docs = cls._make_docs(nlp, example,
+                        gold_preproc, noise_level=noise_level,
+                        orth_variant_level=orth_variant_level)
+                example_golds = cls._make_golds(example_docs, vocab=nlp.vocab,
+                        make_projective=make_projective,
+                        ignore_misaligned=ignore_misaligned)
             for ex in example_golds:
                 if ex.goldparse is not None:
                     if (not max_length) or len(ex.doc) < max_length:
@@ -353,35 +361,28 @@ class GoldCorpus(object):
             var_text = add_noise(var_example.text, noise_level)
             var_doc = nlp.make_doc(var_text)
             var_example.doc = var_doc
-            return [var_example]
         else:
-            doc_examples = []
-            for token_annotation in var_example.token_annotations:
-                t_doc = Doc(nlp.vocab, words=add_noise(token_annotation.words, noise_level))
-                doc_example = Example(doc_annotation=example.doc_annotation,
-                                      token_annotations=[token_annotation],
-                                      doc=t_doc)
-                doc_examples.append(doc_example)
-            return doc_examples
+            var_doc = Doc(nlp.vocab, words=add_noise(var_example.token_annotation.words, noise_level))
+            var_example.doc = var_doc
+        return [var_example]

     @classmethod
     def _make_golds(cls, examples, vocab=None, make_projective=False,
                     ignore_misaligned=False):
-        gold_examples = []
         for example in examples:
             gold_parses = example.get_gold_parses(vocab=vocab,
                     make_projective=make_projective,
                     ignore_misaligned=ignore_misaligned)
-            for (doc, gold) in gold_parses:
-                ex = Example(doc=doc)
-                ex.goldparse = gold
-                gold_examples.append(ex)
-        return gold_examples
+            assert len(gold_parses) == 1
+            assert gold_parses[0][0] == example.doc
+            example.goldparse = gold_parses[0][1]
+        return examples


 def make_orth_variants(nlp, example, orth_variant_level=0.0):
     if random.random() >= orth_variant_level:
         return example
-    if not example.token_annotations:
+    if not example.token_annotation:
         return example
     raw = example.text
     if random.random() >= 0.5:
@@ -392,46 +393,46 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
     ndpv = nlp.Defaults.paired_orth_variants
     # modify words in paragraph_tuples
     variant_example = Example(doc=raw)
-    for token_annotation in example.token_annotations:
+    token_annotation = example.token_annotation
     words = token_annotation.words
     tags = token_annotation.tags
     if not words or not tags:
         # add the unmodified annotation
         token_dict = token_annotation.to_dict()
-        variant_example.add_token_annotation(**token_dict)
+        variant_example.set_token_annotation(**token_dict)
     else:
         if lower:
             words = [w.lower() for w in words]
         # single variants
         punct_choices = [random.choice(x["variants"]) for x in ndsv]
         for word_idx in range(len(words)):
             for punct_idx in range(len(ndsv)):
                 if tags[word_idx] in ndsv[punct_idx]["tags"] \
                         and words[word_idx] in ndsv[punct_idx]["variants"]:
                     words[word_idx] = punct_choices[punct_idx]
         # paired variants
         punct_choices = [random.choice(x["variants"]) for x in ndpv]
         for word_idx in range(len(words)):
             for punct_idx in range(len(ndpv)):
                 if tags[word_idx] in ndpv[punct_idx]["tags"] \
                         and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
                     # backup option: random left vs. right from pair
                     pair_idx = random.choice([0, 1])
                     # best option: rely on paired POS tags like `` / ''
                     if len(ndpv[punct_idx]["tags"]) == 2:
                         pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx])
                     # next best option: rely on position in variants
                     # (may not be unambiguous, so order of variants matters)
                     else:
                         for pair in ndpv[punct_idx]["variants"]:
                             if words[word_idx] in pair:
                                 pair_idx = pair.index(words[word_idx])
                     words[word_idx] = punct_choices[punct_idx][pair_idx]

         token_dict = token_annotation.to_dict()
         token_dict["words"] = words
         token_dict["tags"] = tags
-        variant_example.add_token_annotation(**token_dict)
+        variant_example.set_token_annotation(**token_dict)
     # modify raw to match variant_paragraph_tuples
     if raw is not None:
         variants = []
@@ -449,30 +450,29 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
         while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
             variant_raw += raw[raw_idx]
             raw_idx += 1
-        for token_annotation in variant_example.token_annotations:
-            for word in token_annotation.words:
+        for word in variant_example.token_annotation.words:
             match_found = False
             # add identical word
             if word not in variants and raw[raw_idx:].startswith(word):
                 variant_raw += word
                 raw_idx += len(word)
                 match_found = True
             # add variant word
             else:
                 for variant in variants:
                     if not match_found and \
                             raw[raw_idx:].startswith(variant):
                         raw_idx += len(variant)
                         variant_raw += word
                         match_found = True
             # something went wrong, abort
             # (add a warning message?)
             if not match_found:
                 return example
             # add following whitespace
             while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
                 variant_raw += raw[raw_idx]
                 raw_idx += 1

         variant_example.doc = variant_raw
         return variant_example
     return variant_example
@@ -521,30 +521,43 @@ def json_to_examples(doc):
     paragraphs = []
     for paragraph in doc["paragraphs"]:
         example = Example(doc=paragraph.get("raw", None))
+        words = []
+        ids = []
+        tags = []
+        heads = []
+        labels = []
+        ner = []
+        morphs = []
+        sent_starts = []
+        brackets = []
         for sent in paragraph["sentences"]:
-            words = []
-            ids = []
-            tags = []
-            heads = []
-            labels = []
-            ner = []
+            sent_start_i = len(words)
             for i, token in enumerate(sent["tokens"]):
                 words.append(token["orth"])
-                ids.append(i)
+                ids.append(token.get('id', sent_start_i + i))
                 tags.append(token.get('tag', "-"))
-                heads.append(token.get("head", 0) + i)
+                heads.append(token.get("head", 0) + sent_start_i + i)
                 labels.append(token.get("dep", ""))
                 # Ensure ROOT label is case-insensitive
                 if labels[-1].lower() == "root":
                     labels[-1] = "ROOT"
                 ner.append(token.get("ner", "-"))
-            example.add_token_annotation(ids=ids, words=words, tags=tags,
-                                         heads=heads, deps=labels, entities=ner,
-                                         brackets=sent.get("brackets", []))
+                morphs.append(token.get("morph", {}))
+                if i == 0:
+                    sent_starts.append(True)
+                else:
+                    sent_starts.append(False)
+            if "brackets" in sent:
+                brackets.extend((b["first"] + sent_start_i,
+                                 b["last"] + sent_start_i, b["label"])
+                                 for b in sent["brackets"])
         cats = {}
         for cat in paragraph.get("cats", {}):
             cats[cat["label"]] = cat["value"]
-        example.add_doc_annotation(cats=cats)
+        example.set_token_annotation(ids=ids, words=words, tags=tags,
+                                     heads=heads, deps=labels, entities=ner, morphs=morphs,
+                                     sent_starts=sent_starts, brackets=brackets)
+        example.set_doc_annotation(cats=cats)
         yield example
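The `sent_start_i` offset above converts per-sentence token indices into paragraph-level ones; a small worked sketch (hypothetical values, not from the commit):

    sent_start_i = 3   # second sentence starts at paragraph token 3
    i = 1              # second token within that sentence
    head = -1          # JSON heads are stored relative to the token
    # paragraph-level head index, as computed in json_to_examples()
    assert head + sent_start_i + i == 3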
@@ -652,15 +665,16 @@ def _consume_ent(tags):


 cdef class TokenAnnotation:
-    def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None, entities=None, morphology=None, brackets=None):
+    def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None, entities=None, morphs=None, sent_starts=None, brackets=None):
         self.ids = ids if ids else []
         self.words = words if words else []
         self.tags = tags if tags else []
         self.heads = heads if heads else []
         self.deps = deps if deps else []
         self.entities = entities if entities else []
+        self.morphs = morphs if morphs else []
+        self.sent_starts = sent_starts if sent_starts else []
         self.brackets = brackets if brackets else []
-        self.morphology = morphology if morphology else []

     @classmethod
     def from_dict(cls, token_dict):
|
||||||
heads=token_dict.get("heads", None),
|
heads=token_dict.get("heads", None),
|
||||||
deps=token_dict.get("deps", None),
|
deps=token_dict.get("deps", None),
|
||||||
entities=token_dict.get("entities", None),
|
entities=token_dict.get("entities", None),
|
||||||
morphology=token_dict.get("morphology", None),
|
morphs=token_dict.get("morphs", None),
|
||||||
|
sent_starts=token_dict.get("sent_starts", None),
|
||||||
brackets=token_dict.get("brackets", None))
|
brackets=token_dict.get("brackets", None))
|
||||||
|
|
||||||
def to_dict(self):
|
def to_dict(self):
|
||||||
|
@@ -680,9 +695,34 @@ cdef class TokenAnnotation:
                 "heads": self.heads,
                 "deps": self.deps,
                 "entities": self.entities,
-                "morphology": self.morphology,
+                "morphs": self.morphs,
+                "sent_starts": self.sent_starts,
                 "brackets": self.brackets}

+    def get_id(self, i):
+        return self.ids[i] if i < len(self.ids) else i
+
+    def get_word(self, i):
+        return self.words[i] if i < len(self.words) else ""
+
+    def get_tag(self, i):
+        return self.tags[i] if i < len(self.tags) else "-"
+
+    def get_head(self, i):
+        return self.heads[i] if i < len(self.heads) else i
+
+    def get_dep(self, i):
+        return self.deps[i] if i < len(self.deps) else ""
+
+    def get_entity(self, i):
+        return self.entities[i] if i < len(self.entities) else "-"
+
+    def get_morph(self, i):
+        return self.morphs[i] if i < len(self.morphs) else set()
+
+    def get_sent_start(self, i):
+        return self.sent_starts[i] if i < len(self.sent_starts) else None
+

 cdef class DocAnnotation:
     def __init__(self, cats=None, links=None):
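A quick sketch of the new default-value getters in action (illustrative only, based on the constructor and getters shown above):

    from spacy.gold import TokenAnnotation

    t = TokenAnnotation(words=["Hi", "there"], tags=["INTJ"])
    assert t.get_word(1) == "there"
    assert t.get_tag(1) == "-"       # default when no tag is available
    assert t.get_morph(0) == set()   # default morph value is now a set
    assert t.get_head(1) == 1        # missing heads default to self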
@@ -698,33 +738,33 @@ cdef class DocAnnotation:


 cdef class Example:
-    def __init__(self, doc_annotation=None, token_annotations=None, doc=None,
+    def __init__(self, doc_annotation=None, token_annotation=None, doc=None,
                  goldparse=None):
         """ Doc can either be text, or an actual Doc """
         self.doc = doc
         self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation()
-        self.token_annotations = token_annotations if token_annotations else []
+        self.token_annotation = token_annotation if token_annotation else TokenAnnotation()
         self.goldparse = goldparse

     @classmethod
     def from_gold(cls, goldparse, doc=None):
         doc_annotation = DocAnnotation(cats=goldparse.cats, links=goldparse.links)
         token_annotation = goldparse.get_token_annotation()
-        return cls(doc_annotation, [token_annotation], doc)
+        return cls(doc_annotation, token_annotation, doc)

     @classmethod
     def from_dict(cls, example_dict, doc=None):
-        token_dicts = example_dict["token_annotations"]
-        token_annotations = [TokenAnnotation.from_dict(t) for t in token_dicts]
+        token_dict = example_dict["token_annotation"]
+        token_annotation = TokenAnnotation.from_dict(token_dict)
         doc_dict = example_dict["doc_annotation"]
         doc_annotation = DocAnnotation.from_dict(doc_dict)
-        return cls(doc_annotation, token_annotations, doc)
+        return cls(doc_annotation, token_annotation, doc)

     def to_dict(self):
         """ Note that this method does NOT export the doc, only the annotations ! """
-        token_dicts = [t.to_dict() for t in self.token_annotations]
+        token_dict = self.token_annotation.to_dict()
         doc_dict = self.doc_annotation.to_dict()
-        return {"token_annotations": token_dicts, "doc_annotation": doc_dict}
+        return {"token_annotation": token_dict, "doc_annotation": doc_dict}

     @property
     def text(self):
|
||||||
@property
|
@property
|
||||||
def gold(self):
|
def gold(self):
|
||||||
if self.goldparse is None:
|
if self.goldparse is None:
|
||||||
doc, gold = self.get_gold_parses(merge=True)[0]
|
doc, gold = self.get_gold_parses()[0]
|
||||||
self.goldparse = gold
|
self.goldparse = gold
|
||||||
return self.goldparse
|
return self.goldparse
|
||||||
|
|
||||||
def add_token_annotation(self, ids=None, words=None, tags=None, heads=None,
|
def set_token_annotation(self, ids=None, words=None, tags=None, heads=None,
|
||||||
deps=None, entities=None, morphology=None, brackets=None):
|
deps=None, entities=None, morphs=None,
|
||||||
t = TokenAnnotation(ids=ids, words=words, tags=tags,
|
sent_starts=None, brackets=None):
|
||||||
|
self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags,
|
||||||
heads=heads, deps=deps, entities=entities,
|
heads=heads, deps=deps, entities=entities,
|
||||||
morphology=morphology, brackets=brackets)
|
morphs=morphs, sent_starts=sent_starts,
|
||||||
self.token_annotations.append(t)
|
brackets=brackets)
|
||||||
|
|
||||||
def add_doc_annotation(self, cats=None, links=None):
|
def set_doc_annotation(self, cats=None, links=None):
|
||||||
if cats:
|
if cats:
|
||||||
self.doc_annotation.cats.update(cats)
|
self.doc_annotation.cats = cats
|
||||||
if links:
|
if links:
|
||||||
self.doc_annotation.links.update(links)
|
self.doc_annotation.links = links
|
||||||
|
|
||||||
def merge_sents(self):
|
def split_sents(self):
|
||||||
""" Merge the list of token annotations into one object and return this new object """
|
""" Split the token annotations into multiple Examples based on
|
||||||
m_example = Example(doc=self.doc, doc_annotation=self.doc_annotation)
|
sent_starts and return a list of the new Examples"""
|
||||||
m_ids, m_words, m_tags, m_heads, m_deps, m_ents, m_morph = [], [], [], [], [], [], []
|
s_example = Example(doc=None, doc_annotation=self.doc_annotation)
|
||||||
m_brackets = []
|
s_ids, s_words, s_tags, s_heads = [], [], [], []
|
||||||
i = 0
|
s_deps, s_ents, s_morphs, s_sent_starts = [], [], [], []
|
||||||
for t in self.token_annotations:
|
s_brackets = []
|
||||||
m_ids.extend(id_ + i for id_ in t.ids)
|
sent_start_i = 0
|
||||||
m_words.extend(t.words)
|
t = self.token_annotation
|
||||||
m_tags.extend(t.tags)
|
split_examples = []
|
||||||
m_heads.extend(head + i if head is not None and head >= 0 else head_i + i for head_i, head in enumerate(t.heads))
|
for i in range(len(t.words)):
|
||||||
m_deps.extend(t.deps)
|
if i > 0 and t.sent_starts[i] == True:
|
||||||
m_ents.extend(t.entities)
|
s_example.set_token_annotation(ids=s_ids,
|
||||||
m_morph.extend(t.morphology)
|
words=s_words, tags=s_tags, heads=s_heads, deps=s_deps,
|
||||||
m_brackets.extend((b["first"] + i, b["last"] + i, b["label"])
|
entities=s_ents, morphs=s_morphs,
|
||||||
for b in t.brackets)
|
sent_starts=s_sent_starts, brackets=s_brackets)
|
||||||
i += len(t.ids)
|
split_examples.append(s_example)
|
||||||
m_example.add_token_annotation(ids=m_ids, words=m_words, tags=m_tags,
|
s_example = Example(doc=None, doc_annotation=self.doc_annotation)
|
||||||
heads=m_heads, deps=m_deps, entities=m_ents,
|
s_ids, s_words, s_tags, s_heads = [], [], [], []
|
||||||
morphology=m_morph, brackets=m_brackets)
|
s_deps, s_ents, s_morphs, s_sent_starts = [], [], [], []
|
||||||
return m_example
|
s_brackets = []
|
||||||
|
sent_start_i = i
|
||||||
|
s_ids.append(t.get_id(i))
|
||||||
|
s_words.append(t.get_word(i))
|
||||||
|
s_tags.append(t.get_tag(i))
|
||||||
|
s_heads.append(t.get_head(i) - sent_start_i)
|
||||||
|
s_deps.append(t.get_dep(i))
|
||||||
|
s_ents.append(t.get_entity(i))
|
||||||
|
s_morphs.append(t.get_morph(i))
|
||||||
|
s_sent_starts.append(t.get_sent_start(i))
|
||||||
|
s_brackets.extend((b[0] - sent_start_i,
|
||||||
|
b[1] - sent_start_i, b[2])
|
||||||
|
for b in t.brackets if b[0] == i)
|
||||||
|
i += 1
|
||||||
|
s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags,
|
||||||
|
heads=s_heads, deps=s_deps, entities=s_ents,
|
||||||
|
morphs=s_morphs, sent_starts=s_sent_starts,
|
||||||
|
brackets=s_brackets)
|
||||||
|
split_examples.append(s_example)
|
||||||
|
return split_examples
|
||||||
|
|
||||||
|
|
||||||
def get_gold_parses(self, merge=False, vocab=None, make_projective=False,
|
def get_gold_parses(self, merge=True, vocab=None, make_projective=False,
|
||||||
ignore_misaligned=False):
|
ignore_misaligned=False):
|
||||||
"""Return a list of (doc, GoldParse) objects.
|
"""Return a list of (doc, GoldParse) objects.
|
||||||
If merge is set to True, add all Token annotations to one big list."""
|
If merge is set to True, keep all Token annotations as one big list."""
|
||||||
d = self.doc_annotation
|
d = self.doc_annotation
|
||||||
# merging different sentences
|
# merge == do not modify Example
|
||||||
if merge:
|
if merge:
|
||||||
merged_example = self.merge_sents()
|
t = self.token_annotation
|
||||||
assert(len(merged_example.token_annotations)) == 1
|
doc = self.doc
|
||||||
t = merged_example.token_annotations[0]
|
if not self.doc:
|
||||||
m_doc = merged_example.doc
|
|
||||||
if not m_doc:
|
|
||||||
if not vocab:
|
if not vocab:
|
||||||
raise ValueError(Errors.E998)
|
raise ValueError(Errors.E998)
|
||||||
m_doc = Doc(vocab, words=t.words)
|
doc = Doc(vocab, words=t.words)
|
||||||
try:
|
try:
|
||||||
gp = GoldParse.from_annotation(m_doc, d, t, make_projective=make_projective)
|
gp = GoldParse.from_annotation(doc, d, t,
|
||||||
|
make_projective=make_projective)
|
||||||
except AlignmentError:
|
except AlignmentError:
|
||||||
if ignore_misaligned:
|
if ignore_misaligned:
|
||||||
gp = None
|
gp = None
|
||||||
else:
|
else:
|
||||||
raise
|
raise
|
||||||
return [(self.doc, gp)]
|
return [(doc, gp)]
|
||||||
# we only have one sentence and an appropriate doc
|
# not merging: one GoldParse per sentence, defining docs with the words
|
||||||
elif len(self.token_annotations) == 1 and isinstance(self.doc, Doc):
|
# from each sentence
|
||||||
t = self.token_annotations[0]
|
|
||||||
try:
|
|
||||||
gp = GoldParse.from_annotation(self.doc, d, t, make_projective=make_projective)
|
|
||||||
except AlignmentError:
|
|
||||||
if ignore_misaligned:
|
|
||||||
gp = None
|
|
||||||
else:
|
|
||||||
raise
|
|
||||||
return [(self.doc, gp)]
|
|
||||||
# not merging: one GoldParse per 'sentence', defining docs with the words from each sentence
|
|
||||||
else:
|
else:
|
||||||
parses = []
|
parses = []
|
||||||
for t in self.token_annotations:
|
split_examples = self.split_sents()
|
||||||
|
for split_example in split_examples:
|
||||||
if not vocab:
|
if not vocab:
|
||||||
raise ValueError(Errors.E998)
|
raise ValueError(Errors.E998)
|
||||||
t_doc = Doc(vocab, words=t.words)
|
split_doc = Doc(vocab, words=split_example.token_annotation.words)
|
||||||
try:
|
try:
|
||||||
gp = GoldParse.from_annotation(t_doc, d, t, make_projective=make_projective)
|
gp = GoldParse.from_annotation(split_doc, d,
|
||||||
|
split_example.token_annotation,
|
||||||
|
make_projective=make_projective)
|
||||||
except AlignmentError:
|
except AlignmentError:
|
||||||
if ignore_misaligned:
|
if ignore_misaligned:
|
||||||
gp = None
|
gp = None
|
||||||
else:
|
else:
|
||||||
raise
|
raise
|
||||||
if gp is not None:
|
if gp is not None:
|
||||||
parses.append((t_doc, gp))
|
parses.append((split_doc, gp))
|
||||||
return parses
|
return parses
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
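A note on the head arithmetic in split_sents() above: heads are stored relative to the merged paragraph, and subtracting sent_start_i re-bases each sentence so it is self-contained. A small sketch with hypothetical values:

    heads = [1, 1, 1, 4, 4]   # paragraph-level heads; sentence 2 starts at token 3
    sent_start_i = 3
    rebased = [h - sent_start_i for h in heads[3:]]
    assert rebased == [1, 1]  # both point to local token 1, the sentence root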
@@ -881,9 +933,14 @@ cdef class GoldParse:
     """
     @classmethod
     def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False):
-        return cls(doc, words=token_annotation.words, tags=token_annotation.tags,
-                   heads=token_annotation.heads, deps=token_annotation.deps, entities=token_annotation.entities,
-                   morphology=token_annotation.morphology, cats=doc_annotation.cats, links=doc_annotation.links,
+        return cls(doc, words=token_annotation.words,
+                   tags=token_annotation.tags,
+                   heads=token_annotation.heads,
+                   deps=token_annotation.deps,
+                   entities=token_annotation.entities,
+                   morphs=token_annotation.morphs,
+                   cats=doc_annotation.cats,
+                   links=doc_annotation.links,
                    make_projective=make_projective)

     def get_token_annotation(self):
@@ -893,9 +950,9 @@ cdef class GoldParse:

         return TokenAnnotation(ids=ids, words=self.words, tags=self.tags,
                                heads=self.heads, deps=self.labels, entities=self.ner,
-                               morphology=self.morphology)
+                               morphs=self.morphs)

-    def __init__(self, doc, words=None, tags=None, morphology=None,
+    def __init__(self, doc, words=None, tags=None, morphs=None,
                  heads=None, deps=None, entities=None, make_projective=False,
                  cats=None, links=None):
         """Create a GoldParse. The fields will not be initialized if len(doc) is zero.
@@ -944,8 +1001,8 @@ cdef class GoldParse:
             heads = [None for _ in words]
         if not deps:
             deps = [None for _ in words]
-        if not morphology:
-            morphology = [None for _ in words]
+        if not morphs:
+            morphs = [None for _ in words]
         if entities is None:
             entities = ["-" for _ in words]
         elif len(entities) == 0:
@@ -971,7 +1028,7 @@ cdef class GoldParse:
             self.heads = [None] * len(doc)
             self.labels = [None] * len(doc)
             self.ner = [None] * len(doc)
-            self.morphology = [None] * len(doc)
+            self.morphs = [None] * len(doc)

             # This needs to be done before we align the words
             if make_projective and heads is not None and deps is not None:
@@ -990,7 +1047,7 @@ cdef class GoldParse:
             self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]

             self.orig = TokenAnnotation(ids=list(range(len(words))), words=words, tags=tags,
-                    heads=heads, deps=deps, entities=entities, morphology=morphology,
+                    heads=heads, deps=deps, entities=entities, morphs=morphs,
                     brackets=[])

             for i, gold_i in enumerate(self.cand_to_gold):
@@ -1000,12 +1057,12 @@ cdef class GoldParse:
                     self.heads[i] = None
                     self.labels[i] = None
                     self.ner[i] = None
-                    self.morphology[i] = set()
+                    self.morphs[i] = set()
                 if gold_i is None:
                     if i in i2j_multi:
                         self.words[i] = words[i2j_multi[i]]
                         self.tags[i] = tags[i2j_multi[i]]
-                        self.morphology[i] = morphology[i2j_multi[i]]
+                        self.morphs[i] = morphs[i2j_multi[i]]
                         is_last = i2j_multi[i] != i2j_multi.get(i+1)
                         is_first = i2j_multi[i] != i2j_multi.get(i-1)
                         # Set next word in multi-token span as head, until last
@@ -1044,7 +1101,7 @@ cdef class GoldParse:
                 else:
                     self.words[i] = words[gold_i]
                     self.tags[i] = tags[gold_i]
-                    self.morphology[i] = morphology[gold_i]
+                    self.morphs[i] = morphs[gold_i]
                     if heads[gold_i] is None:
                         self.heads[i] = None
                     else:
@@ -574,9 +574,8 @@ class Language(object):
         # Populate vocab
         else:
             for example in get_examples():
-                for token_annotation in example.token_annotations:
-                    for word in token_annotation.words:
-                        _ = self.vocab[word]  # noqa: F841
+                for word in example.token_annotation.words:
+                    _ = self.vocab[word]  # noqa: F841

         if cfg.get("device", -1) >= 0:
             util.use_gpu(cfg["device"])
@@ -565,12 +565,11 @@ class Tagger(Pipe):
         orig_tag_map = dict(self.vocab.morphology.tag_map)
         new_tag_map = OrderedDict()
         for example in get_examples():
-            for token_annotation in example.token_annotations:
-                for tag in token_annotation.tags:
-                    if tag in orig_tag_map:
-                        new_tag_map[tag] = orig_tag_map[tag]
-                    else:
-                        new_tag_map[tag] = {POS: X}
+            for tag in example.token_annotation.tags:
+                if tag in orig_tag_map:
+                    new_tag_map[tag] = orig_tag_map[tag]
+                else:
+                    new_tag_map[tag] = {POS: X}

         cdef Vocab vocab = self.vocab
         if new_tag_map:
             vocab.morphology = Morphology(vocab.strings, new_tag_map,
@@ -750,11 +749,10 @@ class MultitaskObjective(Tagger):
         gold_examples = nonproj.preprocess_training_data(get_examples())
         # for raw_text, doc_annot in gold_tuples:
         for example in gold_examples:
-            for token_annotation in example.token_annotations:
-                for i in range(len(token_annotation.ids)):
-                    label = self.make_label(i, token_annotation)
-                    if label is not None and label not in self.labels:
-                        self.labels[label] = len(self.labels)
+            for i in range(len(example.token_annotation.ids)):
+                label = self.make_label(i, example.token_annotation)
+                if label is not None and label not in self.labels:
+                    self.labels[label] = len(self.labels)

         if self.model is True:
             token_vector_width = util.env_opt("token_vector_width")
             self.model = self.Model(len(self.labels), tok2vec=tok2vec)
@@ -237,7 +237,7 @@ class Scorer(object):
         if len(doc) != len(gold):
             doc_annotation = DocAnnotation(cats=gold.cats)
             token_annotation = gold.orig
-            gold = GoldParse.from_annotation(doc, doc_annotation, [token_annotation])
+            gold = GoldParse.from_annotation(doc, doc_annotation, token_annotation)
         orig = gold.orig
         gold_deps = set()
         gold_deps_per_dep = {}
@@ -342,19 +342,19 @@ cdef class ArcEager(TransitionSystem):
             actions[RIGHT][label] = 1
             actions[REDUCE][label] = 1
         for example in kwargs.get('gold_parses', []):
-            for token_annotation in example.token_annotations:
-                heads, labels = nonproj.projectivize(token_annotation.heads, token_annotation.deps)
-                for child, head, label in zip(token_annotation.ids, heads, labels):
-                    if label.upper() == 'ROOT' :
-                        label = 'ROOT'
-                    if head == child:
-                        actions[BREAK][label] += 1
-                    elif head < child:
-                        actions[RIGHT][label] += 1
-                        actions[REDUCE][''] += 1
-                    elif head > child:
-                        actions[LEFT][label] += 1
-                        actions[SHIFT][''] += 1
+            heads, labels = nonproj.projectivize(example.token_annotation.heads,
+                                                 example.token_annotation.deps)
+            for child, head, label in zip(example.token_annotation.ids, heads, labels):
+                if label.upper() == 'ROOT' :
+                    label = 'ROOT'
+                if head == child:
+                    actions[BREAK][label] += 1
+                elif head < child:
+                    actions[RIGHT][label] += 1
+                    actions[REDUCE][''] += 1
+                elif head > child:
+                    actions[LEFT][label] += 1
+                    actions[SHIFT][''] += 1
         if min_freq is not None:
             for action, label_freqs in actions.items():
                 for label, freq in list(label_freqs.items()):
@@ -73,12 +73,11 @@ cdef class BiluoPushDown(TransitionSystem):
                 actions[action][entity_type] = 1
         moves = ('M', 'B', 'I', 'L', 'U')
         for example in kwargs.get('gold_parses', []):
-            for token_annotation in example.token_annotations:
-                for i, ner_tag in enumerate(token_annotation.entities):
-                    if ner_tag != 'O' and ner_tag != '-':
-                        _, label = ner_tag.split('-', 1)
-                        for action in (BEGIN, IN, LAST, UNIT):
-                            actions[action][label] += 1
+            for i, ner_tag in enumerate(example.token_annotation.entities):
+                if ner_tag != 'O' and ner_tag != '-':
+                    _, label = ner_tag.split('-', 1)
+                    for action in (BEGIN, IN, LAST, UNIT):
+                        actions[action][label] += 1
         return actions

     @property
@@ -81,15 +81,15 @@ def is_decorated(label):
 def count_decorated_labels(gold_data):
     freqs = {}
     for example in gold_data:
-        for token_annotation in example.token_annotations:
-            proj_heads, deco_deps = projectivize(token_annotation.heads, token_annotation.deps)
-            # set the label to ROOT for each root dependent
-            deco_deps = ['ROOT' if head == i else deco_deps[i]
-                         for i, head in enumerate(proj_heads)]
-            # count label frequencies
-            for label in deco_deps:
-                if is_decorated(label):
-                    freqs[label] = freqs.get(label, 0) + 1
+        proj_heads, deco_deps = projectivize(example.token_annotation.heads,
+                                             example.token_annotation.deps)
+        # set the label to ROOT for each root dependent
+        deco_deps = ['ROOT' if head == i else deco_deps[i]
+                     for i, head in enumerate(proj_heads)]
+        # count label frequencies
+        for label in deco_deps:
+            if is_decorated(label):
+                freqs[label] = freqs.get(label, 0) + 1
     return freqs
@@ -98,21 +98,20 @@ def preprocess_training_data(gold_data, label_freq_cutoff=30):
     freqs = {}
     for example in gold_data:
         new_example = Example(doc=example.doc)
-        for token_annotation in example.token_annotations:
-            proj_heads, deco_deps = projectivize(token_annotation.heads, token_annotation.deps)
-            # set the label to ROOT for each root dependent
-            deco_deps = ['ROOT' if head == i else deco_deps[i]
-                         for i, head in enumerate(proj_heads)]
-            # count label frequencies
-            if label_freq_cutoff > 0:
-                for label in deco_deps:
-                    if is_decorated(label):
-                        freqs[label] = freqs.get(label, 0) + 1
-            # TODO: the code would be less ugly when changing heads and deps in-place, but is this OK upstream ?
-            proj_token_dict = token_annotation.to_dict()
-            proj_token_dict["heads"] = proj_heads
-            proj_token_dict["deps"] = deco_deps
-            new_example.add_token_annotation(**proj_token_dict)
+        proj_heads, deco_deps = projectivize(example.token_annotation.heads,
+                                             example.token_annotation.deps)
+        # set the label to ROOT for each root dependent
+        deco_deps = ['ROOT' if head == i else deco_deps[i]
+                     for i, head in enumerate(proj_heads)]
+        # count label frequencies
+        if label_freq_cutoff > 0:
+            for label in deco_deps:
+                if is_decorated(label):
+                    freqs[label] = freqs.get(label, 0) + 1
+        proj_token_dict = example.token_annotation.to_dict()
+        proj_token_dict["heads"] = proj_heads
+        proj_token_dict["deps"] = deco_deps
+        new_example.set_token_annotation(**proj_token_dict)
         preprocessed.append(new_example)
     if label_freq_cutoff > 0:
         return _filter_labels(preprocessed, label_freq_cutoff, freqs)
@@ -213,15 +212,14 @@ def _filter_labels(examples, cutoff, freqs):
     filtered = []
     for example in examples:
         new_example = Example(doc=example.doc)
-        for token_annotation in example.token_annotations:
-            filtered_labels = []
-            for label in token_annotation.deps:
-                if is_decorated(label) and freqs.get(label, 0) < cutoff:
-                    filtered_labels.append(decompose(label)[0])
-                else:
-                    filtered_labels.append(label)
-            filtered_token_dict = token_annotation.to_dict()
-            filtered_token_dict["deps"] = filtered_labels
-            new_example.add_token_annotation(**filtered_token_dict)
+        filtered_labels = []
+        for label in example.token_annotation.deps:
+            if is_decorated(label) and freqs.get(label, 0) < cutoff:
+                filtered_labels.append(decompose(label)[0])
+            else:
+                filtered_labels.append(label)
+        filtered_token_dict = example.token_annotation.to_dict()
+        filtered_token_dict["deps"] = filtered_labels
+        new_example.set_token_annotation(**filtered_token_dict)
         filtered.append(new_example)
     return filtered
@@ -273,7 +273,7 @@ def test_issue1963(en_tokenizer):
 def test_issue1967(label):
     ner = EntityRecognizer(Vocab())
     example = Example(doc=None)
-    example.add_token_annotation(ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label])
+    example.set_token_annotation(ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label])
     ner.moves.get_actions(gold_parses=[example])
@@ -36,6 +36,16 @@ def doc():
     return doc


+@pytest.fixture()
+def merged_dict():
+    return {
+        "ids": [1, 2, 3, 4, 5, 6, 7],
+        "words": ["Hi", "there", "everyone", "It", "is", "just", "me"],
+        "tags": ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"],
+        "sent_starts": [1, 0, 0, 1, 0, 0, 0, 0],
+    }
+
+
 def test_gold_biluo_U(en_vocab):
     words = ["I", "flew", "to", "London", "."]
     spaces = [True, True, True, False, True]
@@ -231,7 +241,7 @@ def test_ignore_misaligned(doc):
     deps = [t.dep_ for t in doc]
     heads = [t.head.i for t in doc]

-    use_new_align = spacy.gold.USE_NEW_ALIGN
+    saved_use_new_align = spacy.gold.USE_NEW_ALIGN

     spacy.gold.USE_NEW_ALIGN = False
     with make_tempdir() as tmpdir:
|
||||||
ignore_misaligned=True))
|
ignore_misaligned=True))
|
||||||
assert len(train_reloaded_example) == 0
|
assert len(train_reloaded_example) == 0
|
||||||
|
|
||||||
spacy.gold.USE_NEW_ALIGN = use_new_align
|
spacy.gold.USE_NEW_ALIGN = saved_use_new_align
|
||||||
|
|
||||||
|
|
||||||
|
def test_make_orth_variants(doc):
|
||||||
|
nlp = English()
|
||||||
|
text = doc.text
|
||||||
|
deps = [t.dep_ for t in doc]
|
||||||
|
heads = [t.head.i for t in doc]
|
||||||
|
|
||||||
|
with make_tempdir() as tmpdir:
|
||||||
|
jsonl_file = tmpdir / "test.jsonl"
|
||||||
|
# write to JSONL train dicts
|
||||||
|
srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
|
||||||
|
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
|
||||||
|
|
||||||
|
# due to randomness, test only that this runs with no errors for now
|
||||||
|
train_reloaded_example = next(goldcorpus.train_dataset(nlp,
|
||||||
|
orth_variant_level=0.2))
|
||||||
|
train_goldparse = train_reloaded_example.gold
|
||||||
|
|
||||||
|
|
||||||
# xfail while we have backwards-compatible alignment
|
# xfail while we have backwards-compatible alignment
|
||||||
|
@@ -386,71 +414,38 @@ def _train(train_data):
         nlp.update(batch, sgd=optimizer, losses=losses)


-tokens_1 = {
-    "ids": [1, 2, 3],
-    "words": ["Hi", "there", "everyone"],
-    "tags": ["INTJ", "ADV", "PRON"],
-}
-
-tokens_2 = {
-    "ids": [1, 2, 3, 4],
-    "words": ["It", "is", "just", "me"],
-    "tags": ["PRON", "AUX", "ADV", "PRON"],
-}
-
-text0 = "Hi there everyone It is just me"
-
-
-def test_merge_sents():
+def test_split_sents(merged_dict):
     nlp = English()
     example = Example()
-    example.add_token_annotation(**tokens_1)
-    example.add_token_annotation(**tokens_2)
+    example.set_token_annotation(**merged_dict)
     assert len(example.get_gold_parses(merge=False, vocab=nlp.vocab)) == 2
-    assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1  # this shouldn't change the original object
+    assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1

-    merged_example = example.merge_sents()
+    split_examples = example.split_sents()
+    assert len(split_examples) == 2

-    token_annotation_1 = example.token_annotations[0]
+    token_annotation_1 = split_examples[0].token_annotation
     assert token_annotation_1.ids == [1, 2, 3]
     assert token_annotation_1.words == ["Hi", "there", "everyone"]
     assert token_annotation_1.tags == ["INTJ", "ADV", "PRON"]
+    assert token_annotation_1.sent_starts == [1, 0, 0]

-    token_annotation_m = merged_example.token_annotations[0]
-    assert token_annotation_m.ids == [1, 2, 3, 4, 5, 6, 7]
-    assert token_annotation_m.words == ["Hi", "there", "everyone", "It", "is", "just", "me"]
-    assert token_annotation_m.tags == ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"]
+    token_annotation_2 = split_examples[1].token_annotation
+    assert token_annotation_2.ids == [4, 5, 6, 7]
+    assert token_annotation_2.words == ["It", "is", "just", "me"]
+    assert token_annotation_2.tags == ["PRON", "AUX", "ADV", "PRON"]
+    assert token_annotation_2.sent_starts == [1, 0, 0, 0]


-def test_tuples_to_example():
+def test_tuples_to_example(merged_dict):
     ex = Example()
-    ex.add_token_annotation(**tokens_1)
-    ex.add_token_annotation(**tokens_2)
-    ex.add_doc_annotation(cats={"TRAVEL": 1.0, "BAKING": 0.0})
+    ex.set_token_annotation(**merged_dict)
+    cats = {"TRAVEL": 1.0, "BAKING": 0.0}
+    ex.set_doc_annotation(cats=cats)
     ex_dict = ex.to_dict()

-    token_dicts = [
-        {
-            "ids": [1, 2, 3],
-            "words": ["Hi", "there", "everyone"],
-            "tags": ["INTJ", "ADV", "PRON"],
-            "heads": [],
-            "deps": [],
-            "entities": [],
-            "morphology": [],
-            "brackets": [],
-        },
-        {
-            "ids": [1, 2, 3, 4],
-            "words": ["It", "is", "just", "me"],
-            "tags": ["PRON", "AUX", "ADV", "PRON"],
-            "heads": [],
-            "deps": [],
-            "entities": [],
-            "morphology": [],
-            "brackets": [],
-        },
-    ]
-    doc_dict = {"cats": {"TRAVEL": 1.0, "BAKING": 0.0}, "links": {}}
-
-    assert ex_dict == {"token_annotations": token_dicts, "doc_annotation": doc_dict}
+    assert ex_dict["token_annotation"]["ids"] == merged_dict["ids"]
+    assert ex_dict["token_annotation"]["words"] == merged_dict["words"]
+    assert ex_dict["token_annotation"]["tags"] == merged_dict["tags"]
+    assert ex_dict["token_annotation"]["sent_starts"] == merged_dict["sent_starts"]
+    assert ex_dict["doc_annotation"]["cats"] == cats
@@ -86,7 +86,7 @@ def test_ner_per_type(en_vocab):
         ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]],
     )
     ex = Example(doc=doc)
-    ex.add_token_annotation(entities=annot["entities"])
+    ex.set_token_annotation(entities=annot["entities"])
     scorer.score(ex)
     results = scorer.scores

@@ -107,7 +107,7 @@ def test_ner_per_type(en_vocab):
         ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]],
    )
     ex = Example(doc=doc)
-    ex.add_token_annotation(entities=annot["entities"])
+    ex.set_token_annotation(entities=annot["entities"])
     scorer.score(ex)
     results = scorer.scores