From 492d1ec5de3610160651b3c812c08ad76b3f7bad Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 4 Aug 2020 16:29:18 +0200 Subject: [PATCH] Prevent alignment when texts don't match (#5867) * remove empty gold.pyx * add alignment unit test (to be used in docs) * ensure that Alignment is only used on equal texts * additional test using example.alignment * formatting Co-authored-by: Matthew Honnibal --- spacy/errors.py | 1 + spacy/gold.pyx | 0 spacy/gold/align.py | 4 +++ spacy/tests/test_gold.py | 57 +++++++++++++++++++++++++++++++++++++++- 4 files changed, 61 insertions(+), 1 deletion(-) delete mode 100644 spacy/gold.pyx diff --git a/spacy/errors.py b/spacy/errors.py index 6e595fe33..418d682ad 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -488,6 +488,7 @@ class Errors: "a string value from {expected} but got: '{arg}'") E948 = ("Matcher.add received invalid 'patterns' argument: expected " "a List, but got: {arg_type}") + E949 = ("Can only create an alignment when the texts are the same.") E952 = ("The section '{name}' is not a valid section in the provided config.") E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}") E954 = ("The Tok2Vec listener did not receive a valid input.") diff --git a/spacy/gold.pyx b/spacy/gold.pyx deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/gold/align.py b/spacy/gold/align.py index af70ee5b7..e8f17a667 100644 --- a/spacy/gold/align.py +++ b/spacy/gold/align.py @@ -4,6 +4,8 @@ from thinc.types import Ragged from dataclasses import dataclass import tokenizations +from ..errors import Errors + @dataclass class Alignment: @@ -18,6 +20,8 @@ class Alignment: @classmethod def from_strings(cls, A: List[str], B: List[str]) -> "Alignment": + if "".join(A).replace(" ", "").lower() != "".join(B).replace(" ", "").lower(): + raise ValueError(Errors.E949) x2y, y2x = tokenizations.get_alignments(A, B) return Alignment.from_indices(x2y=x2y, y2x=y2x) diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 349c64836..81b71aaea 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -1,5 +1,5 @@ import numpy -from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags +from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, Alignment from spacy.gold import spans_from_biluo_tags, iob_to_biluo from spacy.gold import Corpus, docs_to_json from spacy.gold.example import Example @@ -656,6 +656,61 @@ def test_split_sents(merged_dict): assert token_annotation_2["sent_starts"] == [1, 0, 0, 0] +def test_alignment(): + other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."] + spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [1, 1, 1, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 6] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 1, 1] + assert list(align.y2x.dataXd) == [0, 1, 2, 3, 4, 5, 6, 7] + + +def test_alignment_case_insensitive(): + other_tokens = ["I", "listened", "to", "obama", "'", "s", "podcasts", "."] + spacy_tokens = ["i", "listened", "to", "Obama", "'s", "PODCASTS", "."] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [1, 1, 1, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 6] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 1, 1] + assert list(align.y2x.dataXd) == [0, 1, 2, 3, 4, 5, 6, 7] + + +def test_alignment_complex(): + other_tokens = ["i listened to", "obama", "'", "s", "podcasts", "."] + spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2] + assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5] + + +def test_alignment_complex_example(en_vocab): + other_tokens = ["i listened to", "obama", "'", "s", "podcasts", "."] + spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."] + predicted = Doc( + en_vocab, words=other_tokens, spaces=[True, False, False, True, False, False] + ) + reference = Doc( + en_vocab, words=spacy_tokens, spaces=[True, True, True, False, True, False] + ) + assert predicted.text == "i listened to obama's podcasts." + assert reference.text == "i listened to obama's podcasts." + example = Example(predicted, reference) + align = example.alignment + assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2] + assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5] + + +def test_alignment_different_texts(): + other_tokens = ["she", "listened", "to", "obama", "'s", "podcasts", "."] + spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."] + with pytest.raises(ValueError): + Alignment.from_strings(other_tokens, spacy_tokens) + def test_retokenized_docs(doc): a = doc.to_array(["TAG"]) doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)