From 34873c4911163401329ca9cb0e72fb3b13d34ac0 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Tue, 4 Aug 2020 22:22:26 +0200
Subject: [PATCH] Example Dict format consistency (#5858)

* consistently use upper-case IDs in token_annotation format and for
  get_aligned
* remove ID from to_dict (not used in from_dict either)
* fix test

Co-authored-by: Matthew Honnibal
---
 spacy/gold/example.pyx          | 24 ++++++++++++++----------
 spacy/pipeline/senter.pyx       |  2 +-
 spacy/pipeline/tagger.pyx       |  2 +-
 spacy/tests/test_gold.py        | 14 +++++++-------
 spacy/tests/test_new_example.py | 14 +++++++++-----
 5 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx
index 8d320ce93..f90d98603 100644
--- a/spacy/gold/example.pyx
+++ b/spacy/gold/example.pyx
@@ -183,15 +183,15 @@ cdef class Example:
                 "links": self._links_to_dict()
             },
             "token_annotation": {
-                "ids": [t.i+1 for t in self.reference],
-                "words": [t.text for t in self.reference],
-                "tags": [t.tag_ for t in self.reference],
-                "lemmas": [t.lemma_ for t in self.reference],
-                "pos": [t.pos_ for t in self.reference],
-                "morphs": [t.morph_ for t in self.reference],
-                "heads": [t.head.i for t in self.reference],
-                "deps": [t.dep_ for t in self.reference],
-                "sent_starts": [int(bool(t.is_sent_start)) for t in self.reference]
+                "ORTH": [t.text for t in self.reference],
+                "SPACY": [bool(t.whitespace_) for t in self.reference],
+                "TAG": [t.tag_ for t in self.reference],
+                "LEMMA": [t.lemma_ for t in self.reference],
+                "POS": [t.pos_ for t in self.reference],
+                "MORPH": [t.morph_ for t in self.reference],
+                "HEAD": [t.head.i for t in self.reference],
+                "DEP": [t.dep_ for t in self.reference],
+                "SENT_START": [int(bool(t.is_sent_start)) for t in self.reference]
             }
         }
 
@@ -335,10 +335,14 @@ def _fix_legacy_dict_data(example_dict):
     for key, value in old_token_dict.items():
         if key in ("text", "ids", "brackets"):
             pass
+        elif key in remapping.values():
+            token_dict[key] = value
         elif key.lower() in remapping:
             token_dict[remapping[key.lower()]] = value
         else:
-            raise KeyError(Errors.E983.format(key=key, dict="token_annotation", keys=remapping.keys()))
+            all_keys = set(remapping.values())
+            all_keys.update(remapping.keys())
+            raise KeyError(Errors.E983.format(key=key, dict="token_annotation", keys=all_keys))
     text = example_dict.get("text", example_dict.get("raw"))
     if _has_field(token_dict, "ORTH") and not _has_field(token_dict, "SPACY"):
         token_dict["SPACY"] = _guess_spaces(text, token_dict["ORTH"])
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index f826f21de..620a8557e 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -108,7 +108,7 @@ class SentenceRecognizer(Tagger):
         truths = []
         for eg in examples:
             eg_truth = []
-            for x in eg.get_aligned("sent_start"):
+            for x in eg.get_aligned("SENT_START"):
                 if x is None:
                     eg_truth.append(None)
                 elif x == 1:
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index f2e06efed..43f5b02cb 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -259,7 +259,7 @@ class Tagger(Pipe):
         DOCS: https://spacy.io/api/tagger#get_loss
         """
         loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
-        truths = [eg.get_aligned("tag", as_string=True) for eg in examples]
+        truths = [eg.get_aligned("TAG", as_string=True) for eg in examples]
         d_scores, loss = loss_func(scores, truths)
         if self.model.ops.xp.isnan(loss):
             raise ValueError("nan value when computing loss")
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index 81b71aaea..82965acbc 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -646,14 +646,14 @@ def test_split_sents(merged_dict):
     assert split_examples[1].text == "It is just me"
 
     token_annotation_1 = split_examples[0].to_dict()["token_annotation"]
-    assert token_annotation_1["words"] == ["Hi", "there", "everyone"]
-    assert token_annotation_1["tags"] == ["INTJ", "ADV", "PRON"]
-    assert token_annotation_1["sent_starts"] == [1, 0, 0]
+    assert token_annotation_1["ORTH"] == ["Hi", "there", "everyone"]
+    assert token_annotation_1["TAG"] == ["INTJ", "ADV", "PRON"]
+    assert token_annotation_1["SENT_START"] == [1, 0, 0]
 
     token_annotation_2 = split_examples[1].to_dict()["token_annotation"]
-    assert token_annotation_2["words"] == ["It", "is", "just", "me"]
-    assert token_annotation_2["tags"] == ["PRON", "AUX", "ADV", "PRON"]
-    assert token_annotation_2["sent_starts"] == [1, 0, 0, 0]
+    assert token_annotation_2["ORTH"] == ["It", "is", "just", "me"]
+    assert token_annotation_2["TAG"] == ["PRON", "AUX", "ADV", "PRON"]
+    assert token_annotation_2["SENT_START"] == [1, 0, 0, 0]
 
 
 def test_alignment():
@@ -723,4 +723,4 @@ def test_retokenized_docs(doc):
         retokenizer.merge(doc1[0:2])
         retokenizer.merge(doc1[5:7])
 
-    assert example.get_aligned("ORTH", as_string=True) == [None, 'sister', 'flew', 'to', None, 'via', 'London', '.']
+    assert example.get_aligned("ORTH", as_string=True) == [None, 'sister', 'flew', 'to', None, 'via', 'London', '.']
\ No newline at end of file
diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py
index 886a24a8e..df6489aa8 100644
--- a/spacy/tests/test_new_example.py
+++ b/spacy/tests/test_new_example.py
@@ -42,7 +42,7 @@ def test_Example_from_dict_with_tags(pred_words, annots):
     example = Example.from_dict(predicted, annots)
     for i, token in enumerate(example.reference):
         assert token.tag_ == annots["tags"][i]
-    aligned_tags = example.get_aligned("tag", as_string=True)
+    aligned_tags = example.get_aligned("TAG", as_string=True)
     assert aligned_tags == ["NN" for _ in predicted]
 
 
@@ -53,9 +53,13 @@ def test_aligned_tags():
    annots = {"words": gold_words, "tags": gold_tags}
     vocab = Vocab()
     predicted = Doc(vocab, words=pred_words)
-    example = Example.from_dict(predicted, annots)
-    aligned_tags = example.get_aligned("tag", as_string=True)
-    assert aligned_tags == ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB", "VERB"]
+    example1 = Example.from_dict(predicted, annots)
+    aligned_tags1 = example1.get_aligned("TAG", as_string=True)
+    assert aligned_tags1 == ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB", "VERB"]
+    # ensure that to_dict works correctly
+    example2 = Example.from_dict(predicted, example1.to_dict())
+    aligned_tags2 = example2.get_aligned("TAG", as_string=True)
+    assert aligned_tags2 == ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB", "VERB"]
 
 
 def test_aligned_tags_multi():
@@ -66,7 +70,7 @@ def test_aligned_tags_multi():
     vocab = Vocab()
     predicted = Doc(vocab, words=pred_words)
     example = Example.from_dict(predicted, annots)
-    aligned_tags = example.get_aligned("tag", as_string=True)
+    aligned_tags = example.get_aligned("TAG", as_string=True)
     assert aligned_tags == [None, None, "SCONJ", "PRON", "VERB", "VERB"]
 
 
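
A minimal sketch of the round-trip this patch establishes, mirroring the
tests above. It assumes a build of this branch, where Example is importable
from spacy.gold; the snippet itself is not part of the patch:

    from spacy.gold import Example
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    predicted = Doc(Vocab(), words=["Hi", "there"])

    # Legacy lower-case keys are still accepted; _fix_legacy_dict_data
    # remaps them to the upper-case attribute names.
    example = Example.from_dict(
        predicted, {"words": ["Hi", "there"], "tags": ["INTJ", "ADV"]}
    )

    # to_dict() now emits the upper-case keys ...
    assert example.to_dict()["token_annotation"]["TAG"] == ["INTJ", "ADV"]

    # ... and get_aligned() expects upper-case attribute names too.
    assert example.get_aligned("TAG", as_string=True) == ["INTJ", "ADV"]

    # Round-trip: to_dict() output is itself valid from_dict() input,
    # handled by the new `elif key in remapping.values()` branch.
    example2 = Example.from_dict(predicted, example.to_dict())
    assert example2.get_aligned("TAG", as_string=True) == ["INTJ", "ADV"]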