From 34873c4911163401329ca9cb0e72fb3b13d34ac0 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Tue, 4 Aug 2020 22:22:26 +0200
Subject: [PATCH] Example Dict format consistency (#5858)

* consistently use upper-case IDs in token_annotation format and for
  get_aligned
* remove ID from to_dict (not used in from_dict either)
* fix test

Co-authored-by: Matthew Honnibal
---
 spacy/gold/example.pyx          | 24 ++++++++++++++----------
 spacy/pipeline/senter.pyx       |  2 +-
 spacy/pipeline/tagger.pyx       |  2 +-
 spacy/tests/test_gold.py        | 14 +++++++-------
 spacy/tests/test_new_example.py | 14 +++++++++-----
 5 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx
index 8d320ce93..f90d98603 100644
--- a/spacy/gold/example.pyx
+++ b/spacy/gold/example.pyx
@@ -183,15 +183,15 @@ cdef class Example:
                 "links": self._links_to_dict()
             },
             "token_annotation": {
-                "ids": [t.i+1 for t in self.reference],
-                "words": [t.text for t in self.reference],
-                "tags": [t.tag_ for t in self.reference],
-                "lemmas": [t.lemma_ for t in self.reference],
-                "pos": [t.pos_ for t in self.reference],
-                "morphs": [t.morph_ for t in self.reference],
-                "heads": [t.head.i for t in self.reference],
-                "deps": [t.dep_ for t in self.reference],
-                "sent_starts": [int(bool(t.is_sent_start)) for t in self.reference]
+                "ORTH": [t.text for t in self.reference],
+                "SPACY": [bool(t.whitespace_) for t in self.reference],
+                "TAG": [t.tag_ for t in self.reference],
+                "LEMMA": [t.lemma_ for t in self.reference],
+                "POS": [t.pos_ for t in self.reference],
+                "MORPH": [t.morph_ for t in self.reference],
+                "HEAD": [t.head.i for t in self.reference],
+                "DEP": [t.dep_ for t in self.reference],
+                "SENT_START": [int(bool(t.is_sent_start)) for t in self.reference]
             }
         }
 
@@ -335,10 +335,14 @@ def _fix_legacy_dict_data(example_dict):
     for key, value in old_token_dict.items():
         if key in ("text", "ids", "brackets"):
             pass
+        elif key in remapping.values():
+            token_dict[key] = value
         elif key.lower() in remapping:
             token_dict[remapping[key.lower()]] = value
         else:
-            raise KeyError(Errors.E983.format(key=key, dict="token_annotation", keys=remapping.keys()))
+            all_keys = set(remapping.values())
+            all_keys.update(remapping.keys())
+            raise KeyError(Errors.E983.format(key=key, dict="token_annotation", keys=all_keys))
     text = example_dict.get("text", example_dict.get("raw"))
     if _has_field(token_dict, "ORTH") and not _has_field(token_dict, "SPACY"):
         token_dict["SPACY"] = _guess_spaces(text, token_dict["ORTH"])
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index f826f21de..620a8557e 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -108,7 +108,7 @@ class SentenceRecognizer(Tagger):
         truths = []
         for eg in examples:
             eg_truth = []
-            for x in eg.get_aligned("sent_start"):
+            for x in eg.get_aligned("SENT_START"):
                 if x is None:
                     eg_truth.append(None)
                 elif x == 1:
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index f2e06efed..43f5b02cb 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -259,7 +259,7 @@ class Tagger(Pipe):
         DOCS: https://spacy.io/api/tagger#get_loss
         """
         loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
-        truths = [eg.get_aligned("tag", as_string=True) for eg in examples]
+        truths = [eg.get_aligned("TAG", as_string=True) for eg in examples]
         d_scores, loss = loss_func(scores, truths)
         if self.model.ops.xp.isnan(loss):
             raise ValueError("nan value when computing loss")
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index 81b71aaea..82965acbc 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -646,14 +646,14 @@ def test_split_sents(merged_dict):
     assert split_examples[1].text == "It is just me"
 
     token_annotation_1 = split_examples[0].to_dict()["token_annotation"]
-    assert token_annotation_1["words"] == ["Hi", "there", "everyone"]
-    assert token_annotation_1["tags"] == ["INTJ", "ADV", "PRON"]
-    assert token_annotation_1["sent_starts"] == [1, 0, 0]
+    assert token_annotation_1["ORTH"] == ["Hi", "there", "everyone"]
+    assert token_annotation_1["TAG"] == ["INTJ", "ADV", "PRON"]
+    assert token_annotation_1["SENT_START"] == [1, 0, 0]
 
     token_annotation_2 = split_examples[1].to_dict()["token_annotation"]
-    assert token_annotation_2["words"] == ["It", "is", "just", "me"]
-    assert token_annotation_2["tags"] == ["PRON", "AUX", "ADV", "PRON"]
-    assert token_annotation_2["sent_starts"] == [1, 0, 0, 0]
+    assert token_annotation_2["ORTH"] == ["It", "is", "just", "me"]
+    assert token_annotation_2["TAG"] == ["PRON", "AUX", "ADV", "PRON"]
+    assert token_annotation_2["SENT_START"] == [1, 0, 0, 0]
 
 
 def test_alignment():
@@ -723,4 +723,4 @@ def test_retokenized_docs(doc):
         retokenizer.merge(doc1[0:2])
         retokenizer.merge(doc1[5:7])
 
-    assert example.get_aligned("ORTH", as_string=True) == [None, 'sister', 'flew', 'to', None, 'via', 'London', '.']
+    assert example.get_aligned("ORTH", as_string=True) == [None, 'sister', 'flew', 'to', None, 'via', 'London', '.']
\ No newline at end of file
diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py
index 886a24a8e..df6489aa8 100644
--- a/spacy/tests/test_new_example.py
+++ b/spacy/tests/test_new_example.py
@@ -42,7 +42,7 @@ def test_Example_from_dict_with_tags(pred_words, annots):
     example = Example.from_dict(predicted, annots)
     for i, token in enumerate(example.reference):
         assert token.tag_ == annots["tags"][i]
-    aligned_tags = example.get_aligned("tag", as_string=True)
+    aligned_tags = example.get_aligned("TAG", as_string=True)
     assert aligned_tags == ["NN" for _ in predicted]
 
 
@@ -53,9 +53,13 @@ def test_aligned_tags():
    annots = {"words": gold_words, "tags": gold_tags}
     vocab = Vocab()
     predicted = Doc(vocab, words=pred_words)
-    example = Example.from_dict(predicted, annots)
-    aligned_tags = example.get_aligned("tag", as_string=True)
-    assert aligned_tags == ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB", "VERB"]
+    example1 = Example.from_dict(predicted, annots)
+    aligned_tags1 = example1.get_aligned("TAG", as_string=True)
+    assert aligned_tags1 == ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB", "VERB"]
+    # ensure that to_dict works correctly
+    example2 = Example.from_dict(predicted, example1.to_dict())
+    aligned_tags2 = example2.get_aligned("TAG", as_string=True)
+    assert aligned_tags2 == ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB", "VERB"]
 
 
 def test_aligned_tags_multi():
@@ -66,7 +70,7 @@ def test_aligned_tags_multi():
     vocab = Vocab()
     predicted = Doc(vocab, words=pred_words)
     example = Example.from_dict(predicted, annots)
-    aligned_tags = example.get_aligned("tag", as_string=True)
+    aligned_tags = example.get_aligned("TAG", as_string=True)
     assert aligned_tags == [None, None, "SCONJ", "PRON", "VERB", "VERB"]
 
 
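
A minimal sketch of the round-trip this patch establishes, mirroring the
tests above. It assumes a build of this branch, where Example is importable
from spacy.gold; the snippet itself is not part of the patch:

    from spacy.gold import Example
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    predicted = Doc(Vocab(), words=["Hi", "there"])

    # Legacy lower-case keys are still accepted; _fix_legacy_dict_data
    # remaps them to the upper-case attribute names.
    example = Example.from_dict(
        predicted, {"words": ["Hi", "there"], "tags": ["INTJ", "ADV"]}
    )

    # to_dict() now emits the upper-case keys ...
    assert example.to_dict()["token_annotation"]["TAG"] == ["INTJ", "ADV"]

    # ... and get_aligned() expects upper-case attribute names too.
    assert example.get_aligned("TAG", as_string=True) == ["INTJ", "ADV"]

    # Round-trip: to_dict() output is itself valid from_dict() input,
    # handled by the new `elif key in remapping.values()` branch.
    example2 = Example.from_dict(predicted, example.to_dict())
    assert example2.get_aligned("TAG", as_string=True) == ["INTJ", "ADV"]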