mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
Prevent alignment when texts don't match (#5867)
* remove empty gold.pyx
* add alignment unit test (to be used in docs)
* ensure that Alignment is only used on equal texts
* additional test using example.alignment
* formatting

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
This commit is contained in:
parent
ecb3c4e8f4
commit
492d1ec5de
|
@ -488,6 +488,7 @@ class Errors:
|
||||||
"a string value from {expected} but got: '{arg}'")
|
"a string value from {expected} but got: '{arg}'")
|
||||||
E948 = ("Matcher.add received invalid 'patterns' argument: expected "
|
E948 = ("Matcher.add received invalid 'patterns' argument: expected "
|
||||||
"a List, but got: {arg_type}")
|
"a List, but got: {arg_type}")
|
||||||
|
E949 = ("Can only create an alignment when the texts are the same.")
|
||||||
E952 = ("The section '{name}' is not a valid section in the provided config.")
|
E952 = ("The section '{name}' is not a valid section in the provided config.")
|
||||||
E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
|
E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
|
||||||
E954 = ("The Tok2Vec listener did not receive a valid input.")
|
E954 = ("The Tok2Vec listener did not receive a valid input.")
|
||||||
|
|
|
@ -4,6 +4,8 @@ from thinc.types import Ragged
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
import tokenizations
|
import tokenizations
|
||||||
|
|
||||||
|
from ..errors import Errors
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Alignment:
|
class Alignment:
|
||||||
|
@ -18,6 +20,8 @@ class Alignment:
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_strings(cls, A: List[str], B: List[str]) -> "Alignment":
|
def from_strings(cls, A: List[str], B: List[str]) -> "Alignment":
|
||||||
|
if "".join(A).replace(" ", "").lower() != "".join(B).replace(" ", "").lower():
|
||||||
|
raise ValueError(Errors.E949)
|
||||||
x2y, y2x = tokenizations.get_alignments(A, B)
|
x2y, y2x = tokenizations.get_alignments(A, B)
|
||||||
return Alignment.from_indices(x2y=x2y, y2x=y2x)
|
return Alignment.from_indices(x2y=x2y, y2x=y2x)
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
import numpy
|
import numpy
|
||||||
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
|
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, Alignment
|
||||||
from spacy.gold import spans_from_biluo_tags, iob_to_biluo
|
from spacy.gold import spans_from_biluo_tags, iob_to_biluo
|
||||||
from spacy.gold import Corpus, docs_to_json
|
from spacy.gold import Corpus, docs_to_json
|
||||||
from spacy.gold.example import Example
|
from spacy.gold.example import Example
|
||||||
|
@ -656,6 +656,61 @@ def test_split_sents(merged_dict):
|
||||||
assert token_annotation_2["sent_starts"] == [1, 0, 0, 0]
|
assert token_annotation_2["sent_starts"] == [1, 0, 0, 0]
|
||||||
|
|
||||||
|
|
||||||
|
def test_alignment():
|
||||||
|
other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."]
|
||||||
|
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
|
||||||
|
align = Alignment.from_strings(other_tokens, spacy_tokens)
|
||||||
|
assert list(align.x2y.lengths) == [1, 1, 1, 1, 1, 1, 1, 1]
|
||||||
|
assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 6]
|
||||||
|
assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 1, 1]
|
||||||
|
assert list(align.y2x.dataXd) == [0, 1, 2, 3, 4, 5, 6, 7]
|
||||||
|
|
||||||
|
|
||||||
|
def test_alignment_case_insensitive():
|
||||||
|
other_tokens = ["I", "listened", "to", "obama", "'", "s", "podcasts", "."]
|
||||||
|
spacy_tokens = ["i", "listened", "to", "Obama", "'s", "PODCASTS", "."]
|
||||||
|
align = Alignment.from_strings(other_tokens, spacy_tokens)
|
||||||
|
assert list(align.x2y.lengths) == [1, 1, 1, 1, 1, 1, 1, 1]
|
||||||
|
assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 6]
|
||||||
|
assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 1, 1]
|
||||||
|
assert list(align.y2x.dataXd) == [0, 1, 2, 3, 4, 5, 6, 7]
|
||||||
|
|
||||||
|
|
||||||
|
def test_alignment_complex():
|
||||||
|
other_tokens = ["i listened to", "obama", "'", "s", "podcasts", "."]
|
||||||
|
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
|
||||||
|
align = Alignment.from_strings(other_tokens, spacy_tokens)
|
||||||
|
assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1]
|
||||||
|
assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
|
||||||
|
assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
|
||||||
|
assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5]
|
||||||
|
|
||||||
|
|
||||||
|
def test_alignment_complex_example(en_vocab):
|
||||||
|
other_tokens = ["i listened to", "obama", "'", "s", "podcasts", "."]
|
||||||
|
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
|
||||||
|
predicted = Doc(
|
||||||
|
en_vocab, words=other_tokens, spaces=[True, False, False, True, False, False]
|
||||||
|
)
|
||||||
|
reference = Doc(
|
||||||
|
en_vocab, words=spacy_tokens, spaces=[True, True, True, False, True, False]
|
||||||
|
)
|
||||||
|
assert predicted.text == "i listened to obama's podcasts."
|
||||||
|
assert reference.text == "i listened to obama's podcasts."
|
||||||
|
example = Example(predicted, reference)
|
||||||
|
align = example.alignment
|
||||||
|
assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1]
|
||||||
|
assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
|
||||||
|
assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
|
||||||
|
assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5]
|
||||||
|
|
||||||
|
|
||||||
|
def test_alignment_different_texts():
|
||||||
|
other_tokens = ["she", "listened", "to", "obama", "'s", "podcasts", "."]
|
||||||
|
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
Alignment.from_strings(other_tokens, spacy_tokens)
|
||||||
|
|
||||||
def test_retokenized_docs(doc):
|
def test_retokenized_docs(doc):
|
||||||
a = doc.to_array(["TAG"])
|
a = doc.to_array(["TAG"])
|
||||||
doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)
|
doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user