Prevent alignment when texts don't match (#5867)

* remove empty gold.pyx

* add alignment unit test (to be used in docs)

* ensure that Alignment is only used on equal texts

* additional test using example.alignment

* formatting

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
This commit is contained in:
Sofie Van Landeghem 2020-08-04 16:29:18 +02:00 committed by GitHub
parent ecb3c4e8f4
commit 492d1ec5de
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 61 additions and 1 deletions

View File

@ -488,6 +488,7 @@ class Errors:
"a string value from {expected} but got: '{arg}'") "a string value from {expected} but got: '{arg}'")
E948 = ("Matcher.add received invalid 'patterns' argument: expected " E948 = ("Matcher.add received invalid 'patterns' argument: expected "
"a List, but got: {arg_type}") "a List, but got: {arg_type}")
E949 = ("Can only create an alignment when the texts are the same.")
E952 = ("The section '{name}' is not a valid section in the provided config.") E952 = ("The section '{name}' is not a valid section in the provided config.")
E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}") E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
E954 = ("The Tok2Vec listener did not receive a valid input.") E954 = ("The Tok2Vec listener did not receive a valid input.")

View File

View File

@ -4,6 +4,8 @@ from thinc.types import Ragged
from dataclasses import dataclass from dataclasses import dataclass
import tokenizations import tokenizations
from ..errors import Errors
@dataclass @dataclass
class Alignment: class Alignment:
@ -18,6 +20,8 @@ class Alignment:
@classmethod
def from_strings(cls, A: List[str], B: List[str]) -> "Alignment":
    """Create an Alignment between two tokenizations of the same text.

    A (List[str]): The first list of token strings.
    B (List[str]): The second list of token strings.
    RETURNS (Alignment): The alignment built from the token index mappings.
    RAISES (ValueError): Error E949, if the two token lists do not spell
        the same text once joined, stripped of spaces and lowercased.
    """
    # Compare the texts while ignoring spaces and casing, so that only
    # genuinely different texts are rejected.
    text_a = "".join(A).replace(" ", "").lower()
    text_b = "".join(B).replace(" ", "").lower()
    if text_a != text_b:
        raise ValueError(Errors.E949)
    a2b, b2a = tokenizations.get_alignments(A, B)
    return Alignment.from_indices(x2y=a2b, y2x=b2a)

View File

@ -1,5 +1,5 @@
import numpy import numpy
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, Alignment
from spacy.gold import spans_from_biluo_tags, iob_to_biluo from spacy.gold import spans_from_biluo_tags, iob_to_biluo
from spacy.gold import Corpus, docs_to_json from spacy.gold import Corpus, docs_to_json
from spacy.gold.example import Example from spacy.gold.example import Example
@ -656,6 +656,61 @@ def test_split_sents(merged_dict):
assert token_annotation_2["sent_starts"] == [1, 0, 0, 0] assert token_annotation_2["sent_starts"] == [1, 0, 0, 0]
def test_alignment():
    """Align two tokenizations that differ only in how "'s" is split."""
    # One side splits the possessive into "'" and "s"; the other keeps "'s".
    split_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."]
    merged_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
    alignment = Alignment.from_strings(split_tokens, merged_tokens)
    # Each of the 8 split-side tokens maps to exactly one merged-side token;
    # "'" and "s" both point at index 4.
    x2y = alignment.x2y
    assert list(x2y.lengths) == [1, 1, 1, 1, 1, 1, 1, 1]
    assert list(x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 6]
    # In the other direction, "'s" (index 4) covers two split-side tokens.
    y2x = alignment.y2x
    assert list(y2x.lengths) == [1, 1, 1, 1, 2, 1, 1]
    assert list(y2x.dataXd) == [0, 1, 2, 3, 4, 5, 6, 7]
def test_alignment_case_insensitive():
    """Align two tokenizations whose tokens differ in casing."""
    # Same tokens as the lowercase test, but with mixed casing on both sides
    # ("I"/"i", "obama"/"Obama", "podcasts"/"PODCASTS").
    split_tokens = ["I", "listened", "to", "obama", "'", "s", "podcasts", "."]
    merged_tokens = ["i", "listened", "to", "Obama", "'s", "PODCASTS", "."]
    alignment = Alignment.from_strings(split_tokens, merged_tokens)
    # The expected mappings are the same as for the all-lowercase tokens.
    x2y = alignment.x2y
    assert list(x2y.lengths) == [1, 1, 1, 1, 1, 1, 1, 1]
    assert list(x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 6]
    y2x = alignment.y2x
    assert list(y2x.lengths) == [1, 1, 1, 1, 2, 1, 1]
    assert list(y2x.dataXd) == [0, 1, 2, 3, 4, 5, 6, 7]
def test_alignment_complex():
    """Align tokenizations with many-to-one mappings in both directions."""
    # "i listened to" is a single token containing spaces on one side, while
    # "podcasts." is a single token on the other side.
    coarse_tokens = ["i listened to", "obama", "'", "s", "podcasts", "."]
    fine_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
    alignment = Alignment.from_strings(coarse_tokens, fine_tokens)
    # The first coarse token spans three fine tokens (0, 1, 2).
    x2y = alignment.x2y
    assert list(x2y.lengths) == [3, 1, 1, 1, 1, 1]
    assert list(x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
    # "'s" and "podcasts." each span two coarse tokens.
    y2x = alignment.y2x
    assert list(y2x.lengths) == [1, 1, 1, 1, 2, 2]
    assert list(y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5]
def test_alignment_complex_example(en_vocab):
    """Check the alignment exposed by an Example built from two Docs."""
    coarse_tokens = ["i listened to", "obama", "'", "s", "podcasts", "."]
    fine_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
    # The trailing-space flags are chosen so that both Docs render the
    # same text despite being tokenized differently.
    coarse_spaces = [True, False, False, True, False, False]
    fine_spaces = [True, True, True, False, True, False]
    predicted = Doc(en_vocab, words=coarse_tokens, spaces=coarse_spaces)
    reference = Doc(en_vocab, words=fine_tokens, spaces=fine_spaces)
    assert predicted.text == "i listened to obama's podcasts."
    assert reference.text == "i listened to obama's podcasts."
    alignment = Example(predicted, reference).alignment
    # Same expected mappings as when aligning the raw token lists directly.
    x2y = alignment.x2y
    assert list(x2y.lengths) == [3, 1, 1, 1, 1, 1]
    assert list(x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
    y2x = alignment.y2x
    assert list(y2x.lengths) == [1, 1, 1, 1, 2, 2]
    assert list(y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5]
def test_alignment_different_texts():
    """Expect a ValueError when the two token lists spell different texts."""
    # The two lists differ only in their first token ("she" vs. "i").
    mismatched_tokens = ["she", "listened", "to", "obama", "'s", "podcasts", "."]
    reference_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
    with pytest.raises(ValueError):
        Alignment.from_strings(mismatched_tokens, reference_tokens)
def test_retokenized_docs(doc): def test_retokenized_docs(doc):
a = doc.to_array(["TAG"]) a = doc.to_array(["TAG"])
doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a) doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)