Prevent alignment when texts don't match (#5867)

* remove empty gold.pyx
* add alignment unit test (to be used in docs)
* ensure that Alignment is only used on equal texts
* additional test using example.alignment
* formatting

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
2025-07-18 04:02:20 +03:00 · 2020-08-04 16:29:18 +02:00 · 2020-08-04 16:29:18 +02:00 · 492d1ec5de
commit 492d1ec5de
parent ecb3c4e8f4
4 changed files with 61 additions and 1 deletion
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -488,6 +488,7 @@ class Errors:
            "a string value from {expected} but got: '{arg}'")
    E948 = ("Matcher.add received invalid 'patterns' argument: expected "
            "a List, but got: {arg_type}")
+    E949 = ("Can only create an alignment when the texts are the same.")
    E952 = ("The section '{name}' is not a valid section in the provided config.")
    E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
    E954 = ("The Tok2Vec listener did not receive a valid input.")
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
--- a/spacy/gold/align.py
+++ b/spacy/gold/align.py
@@ -4,6 +4,8 @@ from thinc.types import Ragged
 from dataclasses import dataclass
 import tokenizations

+from ..errors import Errors
+

@dataclass
 class Alignment:
@@ -18,6 +20,8 @@ class Alignment:

    @classmethod
    def from_strings(cls, A: List[str], B: List[str]) -> "Alignment":
+        if "".join(A).replace(" ", "").lower() != "".join(B).replace(" ", "").lower():
+            raise ValueError(Errors.E949)
        x2y, y2x = tokenizations.get_alignments(A, B)
        return Alignment.from_indices(x2y=x2y, y2x=y2x)

--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -1,5 +1,5 @@
 import numpy
-from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
+from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, Alignment
 from spacy.gold import spans_from_biluo_tags, iob_to_biluo
 from spacy.gold import Corpus, docs_to_json
 from spacy.gold.example import Example
@@ -656,6 +656,61 @@ def test_split_sents(merged_dict):
    assert token_annotation_2["sent_starts"] == [1, 0, 0, 0]


+def test_alignment():
+    other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."]
+    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [1, 1, 1, 1, 1, 1, 1, 1]
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 6]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 1, 1]
+    assert list(align.y2x.dataXd) == [0, 1, 2, 3, 4, 5, 6, 7]
+
+
+def test_alignment_case_insensitive():
+    other_tokens = ["I", "listened", "to", "obama", "'", "s", "podcasts", "."]
+    spacy_tokens = ["i", "listened", "to", "Obama", "'s", "PODCASTS", "."]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [1, 1, 1, 1, 1, 1, 1, 1]
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 6]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 1, 1]
+    assert list(align.y2x.dataXd) == [0, 1, 2, 3, 4, 5, 6, 7]
+
+
+def test_alignment_complex():
+    other_tokens = ["i listened to", "obama", "'", "s", "podcasts", "."]
+    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1]
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
+    assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5]
+
+
+def test_alignment_complex_example(en_vocab):
+    other_tokens = ["i listened to", "obama", "'", "s", "podcasts", "."]
+    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
+    predicted = Doc(
+        en_vocab, words=other_tokens, spaces=[True, False, False, True, False, False]
+    )
+    reference = Doc(
+        en_vocab, words=spacy_tokens, spaces=[True, True, True, False, True, False]
+    )
+    assert predicted.text == "i listened to obama's podcasts."
+    assert reference.text == "i listened to obama's podcasts."
+    example = Example(predicted, reference)
+    align = example.alignment
+    assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1]
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
+    assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5]
+
+
+def test_alignment_different_texts():
+    other_tokens = ["she", "listened", "to", "obama", "'s", "podcasts", "."]
+    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
+    with pytest.raises(ValueError):
+        Alignment.from_strings(other_tokens, spacy_tokens)
+
 def test_retokenized_docs(doc):
    a = doc.to_array(["TAG"])
    doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)