From 492d1ec5de3610160651b3c812c08ad76b3f7bad Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Tue, 4 Aug 2020 16:29:18 +0200
Subject: [PATCH] Prevent alignment when texts don't match (#5867)

* remove empty gold.pyx

* add alignment unit test (to be used in docs)

* ensure that Alignment is only used on equal texts

* additional test using example.alignment

* formatting

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
---
 spacy/errors.py          |  1 +
 spacy/gold.pyx           |  0
 spacy/gold/align.py      |  4 +++
 spacy/tests/test_gold.py | 57 +++++++++++++++++++++++++++++++++++++++-
 4 files changed, 61 insertions(+), 1 deletion(-)
 delete mode 100644 spacy/gold.pyx

diff --git a/spacy/errors.py b/spacy/errors.py
index 6e595fe33..418d682ad 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -488,6 +488,7 @@ class Errors:
             "a string value from {expected} but got: '{arg}'")
     E948 = ("Matcher.add received invalid 'patterns' argument: expected "
             "a List, but got: {arg_type}")
+    E949 = ("Can only create an alignment when the texts are the same.")  # raised by Alignment.from_strings
     E952 = ("The section '{name}' is not a valid section in the provided config.")
     E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
     E954 = ("The Tok2Vec listener did not receive a valid input.")
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
deleted file mode 100644
index e69de29bb..000000000
diff --git a/spacy/gold/align.py b/spacy/gold/align.py
index af70ee5b7..e8f17a667 100644
--- a/spacy/gold/align.py
+++ b/spacy/gold/align.py
@@ -4,6 +4,8 @@ from thinc.types import Ragged
 from dataclasses import dataclass
 import tokenizations
 
+from ..errors import Errors
+
 
 @dataclass
 class Alignment:
@@ -18,6 +20,8 @@ class Alignment:
 
     @classmethod
     def from_strings(cls, A: List[str], B: List[str]) -> "Alignment":
+        if "".join("".join(A).split()).lower() != "".join("".join(B).split()).lower():
+            raise ValueError(Errors.E949)  # texts differ beyond whitespace/case
         x2y, y2x = tokenizations.get_alignments(A, B)
         return Alignment.from_indices(x2y=x2y, y2x=y2x)
 
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index 349c64836..81b71aaea 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -1,5 +1,5 @@
 import numpy
-from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
+from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, Alignment
 from spacy.gold import spans_from_biluo_tags, iob_to_biluo
 from spacy.gold import Corpus, docs_to_json
 from spacy.gold.example import Example
@@ -656,6 +656,61 @@ def test_split_sents(merged_dict):
     assert token_annotation_2["sent_starts"] == [1, 0, 0, 0]
 
 
+def test_alignment():  # the two lists differ only in how "'s" is split
+    other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."]
+    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [1, 1, 1, 1, 1, 1, 1, 1]  # one target per token
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 6]  # "'" and "s" -> "'s"
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 1, 1]  # "'s" spans 2 tokens
+    assert list(align.y2x.dataXd) == [0, 1, 2, 3, 4, 5, 6, 7]  # other_tokens indices
+
+
+def test_alignment_case_insensitive():  # casing differs between the two lists
+    other_tokens = ["I", "listened", "to", "obama", "'", "s", "podcasts", "."]
+    spacy_tokens = ["i", "listened", "to", "Obama", "'s", "PODCASTS", "."]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)  # must not raise
+    assert list(align.x2y.lengths) == [1, 1, 1, 1, 1, 1, 1, 1]  # one target per token
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 6]  # "'" and "s" -> "'s"
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 1, 1]  # "'s" spans 2 tokens
+    assert list(align.y2x.dataXd) == [0, 1, 2, 3, 4, 5, 6, 7]  # other_tokens indices
+
+
+def test_alignment_complex():  # tokens span several counterparts in both directions
+    other_tokens = ["i listened to", "obama", "'", "s", "podcasts", "."]
+    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1]  # "i listened to" -> 3 tokens
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]  # spacy_tokens indices
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]  # "'s", "podcasts." -> 2 each
+    assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5]  # other_tokens indices
+
+
+def test_alignment_complex_example(en_vocab):  # alignment taken from an Example
+    other_tokens = ["i listened to", "obama", "'", "s", "podcasts", "."]
+    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
+    predicted = Doc(
+        en_vocab, words=other_tokens, spaces=[True, False, False, True, False, False]
+    )
+    reference = Doc(
+        en_vocab, words=spacy_tokens, spaces=[True, True, True, False, True, False]
+    )
+    assert predicted.text == "i listened to obama's podcasts."  # both Docs render
+    assert reference.text == "i listened to obama's podcasts."  # the same text
+    example = Example(predicted, reference)
+    align = example.alignment  # x is predicted, y is reference (see lengths below)
+    assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1]  # "i listened to" -> 3 tokens
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]  # reference indices
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]  # "'s", "podcasts." -> 2 each
+    assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5]  # predicted indices
+
+
+def test_alignment_different_texts():  # first token differs: "she" vs. "i"
+    other_tokens = ["she", "listened", "to", "obama", "'s", "podcasts", "."]
+    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
+    with pytest.raises(ValueError):  # raised by the text-equality check
+        Alignment.from_strings(other_tokens, spacy_tokens)
+
 def test_retokenized_docs(doc):
     a = doc.to_array(["TAG"])
     doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)