From d9289712ba76d4c67450fe1969642416d0ac57f4 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 8 Jun 2020 22:28:50 +0200
Subject: [PATCH] * Make GoldCorpus return dict, not Example

* Make Example require a Doc object (previously optional)

Clarify methods in GoldCorpus

WIP refactor Example

Refactor Example.split_sents

Fix test

Fix augment

Update test

Update test

Fix import

Update test_scorer

Update Example
---
 spacy/cli/converters/conllu2json.py           |  10 +-
 spacy/gold/annotation.py                      |   3 +
 spacy/gold/augment.py                         |   7 +-
 spacy/gold/corpus.py                          |  45 ++---
 spacy/gold/example.py                         | 155 +++++++++++-------
 spacy/gold/gold_io.pyx                        |   4 +-
 spacy/syntax/nonproj.pyx                      |   4 +-
 spacy/tests/regression/test_issue1501-2000.py |  15 +-
 spacy/tests/test_gold.py                      |  24 ++-
 spacy/tests/test_scorer.py                    |  18 +-
 spacy/tokens/doc.pyx                          |   2 +
 11 files changed, 176 insertions(+), 111 deletions(-)

diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py
index 1ece755b8..2cf5f7942 100644
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@@ -2,6 +2,7 @@ import re
 
 from ...gold import Example
 from ...gold import iob_to_biluo, spans_from_biluo_tags, biluo_tags_from_offsets
+from ...gold import TokenAnnotation
 from ...language import Language
 from ...tokens import Doc, Token
 from .conll_ner2json import n_sents_info
@@ -284,13 +285,8 @@ def example_from_conllu_sentence(
         spaces.append(t._.merged_spaceafter)
     ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
     ents = biluo_tags_from_offsets(doc, ent_offsets)
-    raw = ""
-    for word, space in zip(words, spaces):
-        raw += word
-        if space:
-            raw += " "
-    example = Example(doc=raw)
-    example.set_token_annotation(
+    example = Example(doc=Doc(vocab, words=words, spaces=spaces))
+    example.token_annotation = TokenAnnotation(
         ids=ids,
         words=words,
         tags=tags,
diff --git a/spacy/gold/annotation.py b/spacy/gold/annotation.py
index 6bae679c3..5f78902ab 100644
--- a/spacy/gold/annotation.py
+++ b/spacy/gold/annotation.py
@@ -1,3 +1,6 @@
+from .iob_utils import biluo_tags_from_offsets
+
+
 class TokenAnnotation:
     def __init__(
         self,
diff --git a/spacy/gold/augment.py b/spacy/gold/augment.py
index 656308214..f938f540f 100644
--- a/spacy/gold/augment.py
+++ b/spacy/gold/augment.py
@@ -1,6 +1,7 @@
 import random
 import itertools
 from .example import Example
+from .annotation import TokenAnnotation
 
 
 def make_orth_variants(nlp, example, orth_variant_level=0.0):
@@ -17,14 +18,14 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
     ndsv = nlp.Defaults.single_orth_variants
     ndpv = nlp.Defaults.paired_orth_variants
     # modify words in paragraph_tuples
-    variant_example = Example(doc=raw)
+    variant_example = Example(doc=nlp.make_doc(raw))
     token_annotation = example.token_annotation
     words = token_annotation.words
     tags = token_annotation.tags
     if not words or not tags:
         # add the unmodified annotation
         token_dict = token_annotation.to_dict()
-        variant_example.set_token_annotation(**token_dict)
+        variant_example.token_annotation = TokenAnnotation(**token_dict)
     else:
         if lower:
             words = [w.lower() for w in words]
@@ -60,7 +61,7 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
         token_dict = token_annotation.to_dict()
         token_dict["words"] = words
         token_dict["tags"] = tags
-        variant_example.set_token_annotation(**token_dict)
+        variant_example.token_annotation = TokenAnnotation(**token_dict)
     # modify raw to match variant_paragraph_tuples
     if raw is not None:
         variants = []
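The call-site change in a nutshell: Example no longer accepts raw text, and set_token_annotation() is gone in favour of assigning a TokenAnnotation directly, as the two hunks above do. A minimal sketch of the new pattern (not part of the diff; assumes an English() pipeline and the import paths used on this branch):

    from spacy.gold import Example, TokenAnnotation
    from spacy.lang.en import English

    nlp = English()
    # Before: Example(doc="London is big"); example.set_token_annotation(...)
    # After: the Doc is required up front, the annotation is assigned.
    example = Example(doc=nlp.make_doc("London is big"))
    example.token_annotation = TokenAnnotation(
        ids=[0, 1, 2],
        words=["London", "is", "big"],
        tags=["NNP", "VBZ", "JJ"],
    )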
diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py
index 9462f0aa4..df13ab505 100644
--- a/spacy/gold/corpus.py
+++ b/spacy/gold/corpus.py
@@ -28,8 +28,8 @@ class GoldCorpus(object):
         """
         self.limit = limit
         if isinstance(train, str) or isinstance(train, Path):
-            train = self.read_examples(self.walk_corpus(train))
-            dev = self.read_examples(self.walk_corpus(dev))
+            train = self.read_annotations(self.walk_corpus(train))
+            dev = self.read_annotations(self.walk_corpus(dev))
         # Write temp directory with one doc per file, so we can shuffle and stream
         self.tmp_dir = Path(tempfile.mkdtemp())
         self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit)
@@ -71,7 +71,7 @@ class GoldCorpus(object):
         return locs
 
     @staticmethod
-    def read_examples(locs, limit=0):
+    def read_annotations(locs, limit=0):
        """ Yield training examples """
        i = 0
        for loc in locs:
@@ -101,11 +101,11 @@
                         or isinstance(doc, str)
                     ):
                         raise ValueError(Errors.E987.format(type=type(doc)))
-                    examples.append(Example.from_dict(ex_dict, doc=doc))
+                    examples.append(ex_dict)
             elif file_name.endswith("msg"):
                 text, ex_dict = srsly.read_msgpack(loc)
-                examples = [Example.from_dict(ex_dict, doc=text)]
+                examples = [ex_dict]
             else:
                 supported = ("json", "jsonl", "msg")
                 raise ValueError(Errors.E124.format(path=loc, formats=supported))
@@ -123,21 +123,21 @@
             raise ValueError(Errors.E996.format(file=file_name, msg=msg))
 
     @property
-    def dev_examples(self):
+    def dev_annotations(self):
         locs = (self.tmp_dir / "dev").iterdir()
-        yield from self.read_examples(locs, limit=self.limit)
+        yield from self.read_annotations(locs, limit=self.limit)
 
     @property
-    def train_examples(self):
+    def train_annotations(self):
         locs = (self.tmp_dir / "train").iterdir()
-        yield from self.read_examples(locs, limit=self.limit)
+        yield from self.read_annotations(locs, limit=self.limit)
 
     def count_train(self):
         """Returns count of words in train examples"""
         n = 0
         i = 0
-        for example in self.train_examples:
-            n += len(example.token_annotation.words)
+        for eg_dict in self.train_annotations:
+            n += len(eg_dict["token_annotation"]["words"])
             if self.limit and i >= self.limit:
                 break
             i += 1
@@ -154,10 +154,10 @@
     ):
         locs = list((self.tmp_dir / "train").iterdir())
         random.shuffle(locs)
-        train_examples = self.read_examples(locs, limit=self.limit)
-        gold_examples = self.iter_gold_docs(
+        train_annotations = self.read_annotations(locs, limit=self.limit)
+        examples = self.iter_examples(
             nlp,
-            train_examples,
+            train_annotations,
             gold_preproc,
             max_length=max_length,
             noise_level=noise_level,
@@ -165,33 +165,33 @@
             make_projective=True,
             ignore_misaligned=ignore_misaligned,
         )
-        yield from gold_examples
+        yield from examples
 
     def train_dataset_without_preprocessing(
         self, nlp, gold_preproc=False, ignore_misaligned=False
     ):
-        examples = self.iter_gold_docs(
+        examples = self.iter_examples(
             nlp,
-            self.train_examples,
+            self.train_annotations,
             gold_preproc=gold_preproc,
             ignore_misaligned=ignore_misaligned,
         )
         yield from examples
 
     def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False):
-        examples = self.iter_gold_docs(
+        examples = self.iter_examples(
             nlp,
-            self.dev_examples,
+            self.dev_annotations,
             gold_preproc=gold_preproc,
             ignore_misaligned=ignore_misaligned,
         )
         yield from examples
 
     @classmethod
-    def iter_gold_docs(
+    def iter_examples(
         cls,
         nlp,
-        examples,
+        annotations,
         gold_preproc,
         max_length=None,
         noise_level=0.0,
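The corpus side of the refactor, sketched end to end: read_annotations() now yields plain dicts that can be written to disk and shuffled cheaply, and Example objects are only materialised inside iter_examples(), where an nlp object is available to build the Doc. Illustrative usage with hypothetical file names, assuming training data in spaCy's JSON format:

    from spacy.gold import GoldCorpus
    from spacy.lang.en import English

    nlp = English()
    corpus = GoldCorpus("train.json", "dev.json")  # hypothetical paths
    # dev_dataset() drives iter_examples() under the hood and yields
    # Example objects built from the stored annotation dicts.
    for example in corpus.dev_dataset(nlp):
        print(example.doc, example.token_annotation.words)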
@@ -200,7 +200,8 @@
         ignore_misaligned=False,
     ):
         """ Setting gold_preproc will result in creating a doc per sentence """
-        for example in examples:
+        for eg_dict in annotations:
+            example = Example.from_dict(eg_dict, doc=nlp.make_doc(eg_dict["text"]))
             example_docs = []
             if gold_preproc:
                 split_examples = example.split_sents()
diff --git a/spacy/gold/example.py b/spacy/gold/example.py
index 1d8665572..c8ad58da7 100644
--- a/spacy/gold/example.py
+++ b/spacy/gold/example.py
@@ -1,18 +1,69 @@
+import numpy
 from .annotation import TokenAnnotation, DocAnnotation
+from .iob_utils import spans_from_biluo_tags, biluo_tags_from_offsets
 from .align import Alignment
 from ..errors import Errors, AlignmentError
 from ..tokens import Doc
 
 
+def annotations2doc(doc, doc_annot, tok_annot):
+    # TODO: Improve and test this
+    words = tok_annot.words or [tok.text for tok in doc]
+    fields = {
+        "tags": "TAG",
+        "pos": "POS",
+        "lemmas": "LEMMA",
+        "deps": "DEP",
+    }
+    attrs = []
+    values = []
+    for field, attr in fields.items():
+        value = getattr(tok_annot, field)
+        # Unset fields will be empty lists.
+        if value:
+            attrs.append(attr)
+            values.append([doc.vocab.strings.add(v) for v in value])
+    if tok_annot.heads:
+        attrs.append("HEAD")
+        values.append([h - i for i, h in enumerate(tok_annot.heads)])
+    output = Doc(doc.vocab, words=words)
+    if values:
+        array = numpy.array(values, dtype="uint64")
+        output = output.from_array(attrs, array.T)
+    if tok_annot.entities:
+        output.ents = spans_from_biluo_tags(output, tok_annot.entities)
+    output.cats = dict(doc_annot.cats)
+    # TODO: Calculate token.ent_kb_id from links.
+    # We need to fix this and the doc.ents thing, both should be doc
+    # annotations.
+    return output
+
+
 class Example:
-    def __init__(self, doc=None, doc_annotation=None, token_annotation=None):
+    def __init__(self, doc, doc_annotation=None, token_annotation=None):
         """ Doc can either be text, or an actual Doc """
+        if not isinstance(doc, Doc):
+            raise TypeError("Must pass Doc instance")
+        self.predicted = doc
         self.doc = doc
         self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation()
         self.token_annotation = (
             token_annotation if token_annotation else TokenAnnotation()
         )
         self._alignment = None
+        self.reference = annotations2doc(
+            self.doc,
+            self.doc_annotation,
+            self.token_annotation
+        )
+
+    @property
+    def x(self):
+        return self.predicted
+
+    @property
+    def y(self):
+        return self.reference
 
     def _deprecated_get_gold(self, make_projective=False):
         from ..syntax.gold_parse import get_parses_from_example
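The new predicted/reference pairing, sketched (assuming the constructor wiring above; x and y are thin aliases): the Doc you pass in becomes `predicted`, while annotations2doc() builds a second Doc carrying the gold-standard attributes:

    from spacy.gold import Example, TokenAnnotation
    from spacy.lang.en import English

    nlp = English()
    example = Example(
        doc=nlp.make_doc("I like London"),
        token_annotation=TokenAnnotation(
            words=["I", "like", "London"], tags=["PRP", "VBP", "NNP"]
        ),
    )
    print(example.x is example.predicted)   # True
    print([t.tag_ for t in example.y])      # ['PRP', 'VBP', 'NNP']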
@@ -24,6 +75,8 @@ class Example:
     def from_dict(cls, example_dict, doc=None):
         if example_dict is None:
             raise ValueError("Example.from_dict expected dict, received None")
+        if doc is None:
+            raise ValueError("Must pass doc")
         # TODO: This is ridiculous...
         token_dict = example_dict.get("token_annotation", {})
         doc_dict = example_dict.get("doc_annotation", {})
@@ -34,6 +87,10 @@
                 doc_dict[key] = value
             else:
                 token_dict[key] = value
+        if token_dict.get("entities"):
+            entities = token_dict["entities"]
+            if isinstance(entities[0], (list, tuple)):
+                token_dict["entities"] = biluo_tags_from_offsets(doc, entities)
         token_annotation = TokenAnnotation.from_dict(token_dict)
         doc_annotation = DocAnnotation.from_dict(doc_dict)
         return cls(
@@ -45,8 +102,8 @@
         if self._alignment is None:
             if self.doc is None:
                 return None
-            spacy_words = [token.orth_ for token in self.doc]
-            gold_words = self.token_annotation.words
+            spacy_words = [token.orth_ for token in self.predicted]
+            gold_words = [token.orth_ for token in self.reference]
             if gold_words == []:
                 gold_words = spacy_words
             self._alignment = Alignment(spacy_words, gold_words)
@@ -92,34 +149,6 @@
                 output.append(gold_values[gold_i])
         return output
 
-    def set_token_annotation(
-        self,
-        ids=None,
-        words=None,
-        tags=None,
-        pos=None,
-        morphs=None,
-        lemmas=None,
-        heads=None,
-        deps=None,
-        entities=None,
-        sent_starts=None,
-        brackets=None,
-    ):
-        self.token_annotation = TokenAnnotation(
-            ids=ids,
-            words=words,
-            tags=tags,
-            pos=pos,
-            morphs=morphs,
-            lemmas=lemmas,
-            heads=heads,
-            deps=deps,
-            entities=entities,
-            sent_starts=sent_starts,
-            brackets=brackets,
-        )
-
     def set_doc_annotation(self, cats=None, links=None):
         if cats:
             self.doc_annotation.cats = cats
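Worth noting for from_dict(): entities may now be given either as BILUO tags or as (start_char, end_char, label) offsets, and offsets are converted against the supplied Doc. A sketch of the round trip, with made-up character offsets that line up with the example text:

    from spacy.gold import Example
    from spacy.lang.en import English

    nlp = English()
    doc = nlp.make_doc("I flew to London")
    example = Example.from_dict(
        {"words": ["I", "flew", "to", "London"], "entities": [(10, 16, "LOC")]},
        doc=doc,
    )
    print(example.token_annotation.entities)  # ['O', 'O', 'O', 'U-LOC']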
@@ -131,7 +160,6 @@
         sent_starts and return a list of the new Examples"""
         if not self.token_annotation.words:
             return [self]
-        s_example = Example(doc=None, doc_annotation=self.doc_annotation)
         s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], []
         s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], []
         s_brackets = []
@@ -140,21 +168,25 @@
         split_examples = []
         for i in range(len(t.words)):
             if i > 0 and t.sent_starts[i] == 1:
-                s_example.set_token_annotation(
-                    ids=s_ids,
-                    words=s_words,
-                    tags=s_tags,
-                    pos=s_pos,
-                    morphs=s_morphs,
-                    lemmas=s_lemmas,
-                    heads=s_heads,
-                    deps=s_deps,
-                    entities=s_ents,
-                    sent_starts=s_sent_starts,
-                    brackets=s_brackets,
+                split_examples.append(
+                    Example(
+                        doc=Doc(self.doc.vocab, words=s_words),
+                        token_annotation=TokenAnnotation(
+                            ids=s_ids,
+                            words=s_words,
+                            tags=s_tags,
+                            pos=s_pos,
+                            morphs=s_morphs,
+                            lemmas=s_lemmas,
+                            heads=s_heads,
+                            deps=s_deps,
+                            entities=s_ents,
+                            sent_starts=s_sent_starts,
+                            brackets=s_brackets,
+                        ),
+                        doc_annotation=self.doc_annotation
+                    )
                 )
-                split_examples.append(s_example)
-                s_example = Example(doc=None, doc_annotation=self.doc_annotation)
                 s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], []
                 s_deps, s_ents, s_morphs, s_lemmas = [], [], [], []
                 s_sent_starts, s_brackets = [], []
@@ -172,20 +204,25 @@
             for b_end, b_label in t.brackets_by_start.get(i, []):
                 s_brackets.append((i - sent_start_i, b_end - sent_start_i, b_label))
             i += 1
-        s_example.set_token_annotation(
-            ids=s_ids,
-            words=s_words,
-            tags=s_tags,
-            pos=s_pos,
-            morphs=s_morphs,
-            lemmas=s_lemmas,
-            heads=s_heads,
-            deps=s_deps,
-            entities=s_ents,
-            sent_starts=s_sent_starts,
-            brackets=s_brackets,
+        split_examples.append(
+            Example(
+                doc=Doc(self.doc.vocab, words=s_words),
+                token_annotation=TokenAnnotation(
+                    ids=s_ids,
+                    words=s_words,
+                    tags=s_tags,
+                    pos=s_pos,
+                    morphs=s_morphs,
+                    lemmas=s_lemmas,
+                    heads=s_heads,
+                    deps=s_deps,
+                    entities=s_ents,
+                    sent_starts=s_sent_starts,
+                    brackets=s_brackets,
+                ),
+                doc_annotation=self.doc_annotation
+            )
         )
-        split_examples.append(s_example)
         return split_examples
 
     @classmethod
diff --git a/spacy/gold/gold_io.pyx b/spacy/gold/gold_io.pyx
index 424e44f72..8aa5f4017 100644
--- a/spacy/gold/gold_io.pyx
+++ b/spacy/gold/gold_io.pyx
@@ -76,12 +76,12 @@ def read_json_file(loc, docs_filter=None, limit=None):
                 yield json_data
 
 
-def json_to_examples(doc):
+def json_to_annotations(doc):
     """Convert an item in the JSON-formatted training data to the format
     used by GoldParse.
 
     doc (dict): One entry in the training data.
-    YIELDS (Example): The reformatted data - one training example per paragraph
+    YIELDS (dict): The reformatted data - one training example per paragraph
     """
     for paragraph in doc["paragraphs"]:
         example = {"text": paragraph.get("raw", None)}
diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx
index 1edb2e65c..a91176f44 100644
--- a/spacy/syntax/nonproj.pyx
+++ b/spacy/syntax/nonproj.pyx
@@ -108,7 +108,7 @@ def preprocess_training_data(gold_data, label_freq_cutoff=30):
             proj_token_dict = example.token_annotation.to_dict()
             proj_token_dict["heads"] = proj_heads
             proj_token_dict["deps"] = deco_deps
-            new_example.set_token_annotation(**proj_token_dict)
+            new_example.token_annotation = TokenAnnotation(**proj_token_dict)
             preprocessed.append(new_example)
     if label_freq_cutoff > 0:
         return _filter_labels(preprocessed, label_freq_cutoff, freqs)
@@ -216,6 +216,6 @@ def _filter_labels(examples, cutoff, freqs):
                 filtered_labels.append(label)
             filtered_token_dict = example.token_annotation.to_dict()
             filtered_token_dict["deps"] = filtered_labels
-            new_example.set_token_annotation(**filtered_token_dict)
+            new_example.token_annotation = TokenAnnotation(**filtered_token_dict)
             filtered.append(new_example)
     return filtered
diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py
index 5a76697bc..ed1f33351 100644
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@@ -3,7 +3,7 @@ import gc
 import numpy
 import copy
 
-from spacy.gold import Example
+from spacy.gold import Example, TokenAnnotation
 from spacy.lang.en import English
 from spacy.lang.en.stop_words import STOP_WORDS
 from spacy.lang.lex_attrs import is_stop
@@ -271,9 +271,16 @@ def test_issue1963(en_tokenizer):
 
 @pytest.mark.parametrize("label", ["U-JOB-NAME"])
 def test_issue1967(label):
     ner = EntityRecognizer(Vocab(), default_ner())
-    example = Example(doc=None)
-    example.set_token_annotation(
-        ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label]
+    example = Example(
+        doc=Doc(ner.vocab, words=["word"]),
+        token_annotation=TokenAnnotation(
+            ids=[0],
+            words=["word"],
+            tags=["tag"],
+            heads=[0],
+            deps=["dep"],
+            entities=[label]
+        )
     )
     ner.moves.get_actions(gold_parses=[example])
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index 4b4250179..29ddc7456 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -95,6 +95,12 @@ def merged_dict():
     }
 
 
+@pytest.fixture
+def vocab():
+    nlp = English()
+    return nlp.vocab
+
+
 def test_gold_biluo_U(en_vocab):
     words = ["I", "flew", "to", "London", "."]
     spaces = [True, True, True, False, True]
@@ -475,8 +481,10 @@ def _train(train_data):
 
 def test_split_sents(merged_dict):
     nlp = English()
-    example = Example()
-    example.set_token_annotation(**merged_dict)
+    example = Example.from_dict(
+        merged_dict,
+        doc=Doc(nlp.vocab, words=merged_dict["words"])
+    )
     assert len(get_parses_from_example(
         example,
         merge=False,
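split_sents() now returns fully formed Examples, each with its own Doc built from the sentence's words, instead of mutating a shared s_example. A sketch with made-up values in the same shape as the merged_dict fixture (ids, words, tags, sent_starts):

    from spacy.gold import Example
    from spacy.lang.en import English
    from spacy.tokens import Doc

    nlp = English()
    merged = {
        "ids": [1, 2, 3, 4, 5, 6, 7],
        "words": ["Hi", "there", "everyone", "It", "is", "just", "me"],
        "tags": ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"],
        "sent_starts": [1, 0, 0, 1, 0, 0, 0],
    }
    example = Example.from_dict(merged, doc=Doc(nlp.vocab, words=merged["words"]))
    for sent in example.split_sents():
        print(sent.token_annotation.words)
    # ['Hi', 'there', 'everyone'] then ['It', 'is', 'just', 'me']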
@@ -506,13 +514,15 @@
     assert token_annotation_2.sent_starts == [1, 0, 0, 0]
 
 
-def test_tuples_to_example(merged_dict):
-    ex = Example()
-    ex.set_token_annotation(**merged_dict)
+def test_tuples_to_example(vocab, merged_dict):
     cats = {"TRAVEL": 1.0, "BAKING": 0.0}
-    ex.set_doc_annotation(cats=cats)
+    merged_dict = dict(merged_dict)
+    merged_dict["cats"] = cats
+    ex = Example.from_dict(
+        merged_dict,
+        doc=Doc(vocab, words=merged_dict["words"])
+    )
     ex_dict = ex.to_dict()
-
     assert ex_dict["token_annotation"]["ids"] == merged_dict["ids"]
     assert ex_dict["token_annotation"]["words"] == merged_dict["words"]
     assert ex_dict["token_annotation"]["tags"] == merged_dict["tags"]
diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py
index d750a8202..5eaf8d5b3 100644
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@@ -1,12 +1,14 @@
 from numpy.testing import assert_almost_equal, assert_array_almost_equal
 import pytest
 from pytest import approx
-from spacy.gold import Example, GoldParse
+from spacy.gold import Example, GoldParse, TokenAnnotation
+from spacy.gold.iob_utils import biluo_tags_from_offsets
 from spacy.scorer import Scorer, ROCAUCScore
 from spacy.scorer import _roc_auc_score, _roc_curve
 from .util import get_doc
 from spacy.lang.en import English
 
+
 test_las_apple = [
     [
         "Apple is looking at buying U.K. startup for $ 1 billion",
@@ -134,8 +136,11 @@ def test_ner_per_type(en_vocab):
             words=input_.split(" "),
             ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]],
         )
-        ex = Example(doc=doc)
-        ex.set_token_annotation(entities=annot["entities"])
+        entities = biluo_tags_from_offsets(doc, annot["entities"])
+        ex = Example(
+            doc=doc,
+            token_annotation=TokenAnnotation(entities=entities)
+        )
         scorer.score(ex)
     results = scorer.scores
 
@@ -155,8 +160,11 @@ def test_ner_per_type(en_vocab):
             words=input_.split(" "),
             ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]],
         )
-        ex = Example(doc=doc)
-        ex.set_token_annotation(entities=annot["entities"])
+        entities = biluo_tags_from_offsets(doc, annot["entities"])
+        ex = Example(
+            doc=doc,
+            token_annotation=TokenAnnotation(entities=entities)
+        )
         scorer.score(ex)
     results = scorer.scores
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 3aa27e451..81cef4492 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -799,6 +799,8 @@ cdef class Doc:
         cdef attr_id_t attr_id
         cdef TokenC* tokens = self.c
         cdef int length = len(array)
+        if length != len(self):
+            raise ValueError("Cannot set array values: length must match the number of tokens in the document.")
         # Get set up for fast loading
         cdef Pool mem = Pool()
         cdef int n_attrs = len(attrs)
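Finally, the new guard in Doc.from_array in action (illustrative only; the zero TAG values just stand in for real attribute hashes): an array whose row count differs from the number of tokens is now rejected up front instead of silently corrupting token data:

    import numpy
    from spacy.attrs import TAG
    from spacy.lang.en import English

    nlp = English()
    doc = nlp.make_doc("London is big")
    doc.from_array([TAG], numpy.zeros((3, 1), dtype="uint64"))  # 3 rows, 3 tokens: OK
    try:
        doc.from_array([TAG], numpy.zeros((2, 1), dtype="uint64"))  # mismatch
    except ValueError as err:
        print(err)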