Fix alignment for 1-to-1 tokens and lowercasing (#6476)
* When checking for token alignments, check not only that the tokens are identical but also that both character positions are at the start of a token. Two tokens can be identical even though they are not aligned one-to-one, as in `["a'", "''"]` vs. `["a", "''", "'"]`: the middle tokens are identical, but they should not be aligned at the token level, because character position 2 is the start of one token and the middle of the other.

* Use the lowercased version of the token texts to create the character-to-token alignment, because lowercasing can change the string length (e.g. for `İ`; see the not-a-bug bug report: https://bugs.python.org/issue34723).
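For illustration, a small plain-Python sketch of the two problems described above (not part of the commit):

    # Lowercasing can change a string's length, so character offsets computed on
    # the original text can drift from offsets into the lowercased text:
    assert len("İ") == 1
    assert len("İ".lower()) == 2  # "İ".lower() is "i" plus a combining dot above

    # Identical token texts are not enough for a token-level alignment: both
    # tokenizations below cover the same text and both contain an "''" token,
    # but those "''" tokens start at different character positions (2 vs. 1).
    tokens_a = ["a'", "''"]
    tokens_b = ["a", "''", "'"]
    assert "".join(tokens_a) == "".join(tokens_b)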
parent ee2ec52f48
commit 4448680750
@@ -514,6 +514,11 @@ def test_roundtrip_docs_to_docbin(doc):
             ([[0], [1], [2, 3]], [[0], [1], [2], [2]]),
         ),
         ([" ", "a"], ["a"], ([[], [0]], [[1]])),
+        (
+            ["a", "''", "'", ","],
+            ["a'", "''", ","],
+            ([[0], [0, 1], [1], [2]], [[0, 1], [1, 2], [3]]),
+        ),
     ],
 )
 def test_align(tokens_a, tokens_b, expected): # noqa
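To make the new test case easier to read: tokens_a and tokens_b spell the same text, and each token aligns to every token on the other side that it shares characters with. A plain-Python sketch (not the library code) deriving the a-to-b half of the expected value by character overlap:

    tokens_a = ["a", "''", "'", ","]
    tokens_b = ["a'", "''", ","]
    # Map every character position to the b-token that owns it: [0, 0, 1, 1, 2]
    char_to_b = [i for i, t in enumerate(tokens_b) for _ in t]
    a2b, start = [], 0
    for t in tokens_a:
        a2b.append(sorted(set(char_to_b[start:start + len(t)])))
        start += len(t)
    print(a2b)  # [[0], [0, 1], [1], [2]], matching the first half of the expected value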
@@ -698,7 +703,7 @@ def test_alignment_spaces(en_vocab):
     align = Alignment.from_strings(other_tokens, spacy_tokens)
     assert list(align.x2y.lengths) == [0, 3, 1, 1, 1, 1, 1]
     assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
-    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
     assert list(align.y2x.dataXd) == [1, 1, 1, 2, 3, 4, 5, 6]

     # multiple leading whitespace tokens
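For context, the lengths/dataXd pair is a ragged encoding: lengths[i] says how many aligned token indices belong to token i, and dataXd is the flat concatenation of those indices. A small sketch decoding the x2y pair asserted above (unflatten is a hypothetical helper, not a spaCy function):

    def unflatten(lengths, dataXd):
        out, start = [], 0
        for n in lengths:
            out.append(list(dataXd[start:start + n]))
            start += n
        return out

    print(unflatten([0, 3, 1, 1, 1, 1, 1], [0, 1, 2, 3, 4, 4, 5, 5]))
    # [[], [0, 1, 2], [3], [4], [4], [5], [5]] -- the leading whitespace token aligns to nothing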
@@ -707,7 +712,7 @@ def test_alignment_spaces(en_vocab):
     align = Alignment.from_strings(other_tokens, spacy_tokens)
     assert list(align.x2y.lengths) == [0, 0, 3, 1, 1, 1, 1, 1]
     assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
-    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
     assert list(align.y2x.dataXd) == [2, 2, 2, 3, 4, 5, 6, 7]

     # both with leading whitespace, not identical
@@ -7,8 +7,8 @@ from ..errors import Errors

 def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[List[int]]]:
     # Create character-to-token mappings
-    char_to_token_a = tuple(chain(*((i,) * len(x) for i, x in enumerate(A))))
-    char_to_token_b = tuple(chain(*((i,) * len(x) for i, x in enumerate(B))))
+    char_to_token_a = tuple(chain(*((i,) * len(x.lower()) for i, x in enumerate(A))))
+    char_to_token_b = tuple(chain(*((i,) * len(x.lower()) for i, x in enumerate(B))))
     str_a = "".join(A).lower()
     str_b = "".join(B).lower()
     cdef int len_str_a = len(str_a)
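Why len(x.lower()) matters here, in a plain-Python sketch (the example tokens are made up): char_to_token_a has to stay in step with str_a, which is built from the lowercased text, and lowercasing can change the length.

    from itertools import chain

    A = ["İ", "x"]
    char_to_token_a = tuple(chain(*((i,) * len(x.lower()) for i, x in enumerate(A))))
    str_a = "".join(A).lower()
    assert len(str_a) == 3                # "İ".lower() is two characters long
    assert char_to_token_a == (0, 0, 1)   # with len(x) this would be (0, 1) and fall out of sync
    assert len(char_to_token_a) == len(str_a)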
@@ -36,8 +36,14 @@ def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[Li
         if prev_token_idx_b != token_idx_b:
             b2a.append(set())
         # Process the alignment at the current position
-        if A[token_idx_a] == B[token_idx_b]:
-            # Current tokens are identical
+        if A[token_idx_a] == B[token_idx_b] and \
+                (char_idx_a == 0 or \
+                char_to_token_a[char_idx_a - 1] < token_idx_a) and \
+                (char_idx_b == 0 or \
+                char_to_token_b[char_idx_b - 1] < token_idx_b):
+            # Current tokens are identical and both character offsets are the
+            # start of a token (either at the beginning of the document or the
+            # previous character belongs to a different token)
             a2b[-1].add(token_idx_b)
             b2a[-1].add(token_idx_a)
             char_idx_a += len(A[token_idx_a])
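The commit-message example expressed against the new condition (at_token_start is a hypothetical helper for illustration; the real code inlines the check):

    def at_token_start(char_to_token, char_idx, token_idx):
        # A character offset starts a token if it is offset 0 or the previous
        # character belongs to an earlier token.
        return char_idx == 0 or char_to_token[char_idx - 1] < token_idx

    char_to_token_a = (0, 0, 1, 1)  # ["a'", "''"]
    char_to_token_b = (0, 1, 1, 2)  # ["a", "''", "'"]
    # At character position 2 both sides see an identical "''" token...
    print(at_token_start(char_to_token_a, 2, 1))  # True: "''" starts at char 2
    print(at_token_start(char_to_token_b, 2, 1))  # False: char 2 is the middle of "''"
    # ...so the new check correctly refuses to align them one-to-one.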