mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
Fix `token.spacy` when retokenizing special cases with affixes (#6475)
Preserve the `token.spacy` value of the span's end token as it appears in the original doc, rather than looking it up at a position adjusted by the current offset.

* If not modifying in place, this checks the original document (`doc.c` rather than `tokens`).
* If modifying in place, the document has not yet been modified past the current span start position, so the value at the current span end position is still valid.
This commit is contained in:
parent
4448680750
commit
29b058ebdc
|
@ -2,6 +2,7 @@ import pytest
|
|||
from spacy.vocab import Vocab
|
||||
from spacy.tokenizer import Tokenizer
|
||||
from spacy.util import ensure_path
|
||||
from spacy.lang.en import English
|
||||
|
||||
|
||||
def test_tokenizer_handles_no_word(tokenizer):
|
||||
|
@ -150,6 +151,22 @@ def test_tokenizer_special_cases_with_affixes(tokenizer):
|
|||
]
|
||||
|
||||
|
||||
def test_tokenizer_special_cases_with_affixes_preserve_spacy():
    """Check that tokenizing with special cases reproduces the input text.

    Each assertion compares ``tokenizer(text).text`` with ``text`` itself,
    so any whitespace lost or gained while special cases are applied makes
    the comparison fail.
    """
    tokenizer = English().tokenizer
    # Start from a clean slate: drop every predefined special case.
    tokenizer.rules = {}

    # Case 1: in-place modification (only merges).
    tokenizer.add_special_case("''", [{"ORTH": "''"}])
    merged_only = "''a'' "
    assert tokenizer(merged_only).text == merged_only

    # Case 2: not in-place (splits and merges).
    tokenizer.add_special_case("ab", [{"ORTH": "a"}, {"ORTH": "b"}])
    split_and_merged = "ab ab ab ''ab ab'' ab'' ''ab"
    assert tokenizer(split_and_merged).text == split_and_merged
|
||||
|
||||
|
||||
def test_tokenizer_special_cases_with_period(tokenizer):
|
||||
text = "_SPECIAL_."
|
||||
tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])
|
||||
|
|
|
@ -338,7 +338,7 @@ cdef class Tokenizer:
|
|||
# Copy special case tokens into doc and adjust token and
|
||||
# character offsets
|
||||
idx_offset = 0
|
||||
orig_final_spacy = doc.c[span_end + offset - 1].spacy
|
||||
orig_final_spacy = doc.c[span_end - 1].spacy
|
||||
orig_idx = doc.c[i].idx
|
||||
for j in range(cached.length):
|
||||
tokens[i + offset + j] = cached.data.tokens[j]
|
||||
|
|
Loading…
Reference in New Issue
Block a user