Fix token.idx for special cases with affixes (#6035)

Adriane Boyd 2020-09-13 14:05:36 +02:00 committed by GitHub
parent 54c40223a1
commit c7bd631b5f
2 changed files with 11 additions and 2 deletions
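
The underlying bug is an operator-precedence slip in how idx_offset is advanced for each token of a special case: in the old line of the Tokenizer hunk below, the conditional expression binds as (lex.length + 1) if spacy else 0, so a token's length is dropped entirely whenever it has no trailing space, shifting token.idx for every following token. A minimal pure-Python sketch of that pitfall; the function names and arguments are illustrative stand-ins, not spaCy internals:

def old_offset(length, trailing_space):
    idx_offset = 0
    # Parses as: idx_offset += (length + 1) if trailing_space else 0,
    # so the length is silently skipped when there is no trailing space.
    idx_offset += length + 1 if trailing_space else 0
    return idx_offset

def new_offset(length, trailing_space):
    # The corrected form adds the length unconditionally, then the space.
    idx_offset = 0
    idx_offset += length
    if trailing_space:
        idx_offset += 1
    return idx_offset

assert old_offset(3, True) == new_offset(3, True) == 4
assert old_offset(3, False) == 0   # buggy: the 3-character length is lost
assert new_offset(3, False) == 3   # fixed: length always counted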


@@ -155,3 +155,11 @@ def test_tokenizer_special_cases_with_period(tokenizer):
     tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])
     doc = tokenizer(text)
     assert [token.text for token in doc] == ["_SPECIAL_", "."]
+
+
+def test_tokenizer_special_cases_idx(tokenizer):
+    text = "the _ID'X_"
+    tokenizer.add_special_case("_ID'X_", [{"orth": "_ID"}, {"orth": "'X_"}])
+    doc = tokenizer(text)
+    assert doc[1].idx == 4
+    assert doc[2].idx == 7
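
The same behaviour is visible from the public API. A hedged end-to-end sketch mirroring the new test; spacy.blank("en") is my assumption for getting a default English tokenizer and is not part of the commit:

import spacy

nlp = spacy.blank("en")
nlp.tokenizer.add_special_case("_ID'X_", [{"orth": "_ID"}, {"orth": "'X_"}])
doc = nlp("the _ID'X_")
# With the fix, character offsets line up with the original string:
# "the" at 0, "_ID" at 4, "'X_" at 7.
print([(t.text, t.idx) for t in doc])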


@@ -343,8 +343,9 @@ cdef class Tokenizer:
                 for j in range(cached.length):
                     tokens[i + offset + j] = cached.data.tokens[j]
                     tokens[i + offset + j].idx = orig_idx + idx_offset
-                    idx_offset += cached.data.tokens[j].lex.length + \
-                            1 if cached.data.tokens[j].spacy else 0
+                    idx_offset += cached.data.tokens[j].lex.length
+                    if cached.data.tokens[j].spacy:
+                        idx_offset += 1
                 tokens[i + offset + cached.length - 1].spacy = orig_final_spacy
             i += span_end - span_start
             offset += span[3]
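
To make the arithmetic concrete, here is a small pure-Python walk-through of what the corrected loop computes for the test case above; SimpleNamespace is just a stand-in for the cached C token structs, and the lengths and trailing-space flags are read off the _ID / 'X_ split (an illustration, not the actual implementation):

from types import SimpleNamespace

# "_ID" and "'X_" from the special case; neither is followed by a space
# inside "the _ID'X_", so both spacy flags are False.
pieces = [SimpleNamespace(length=3, spacy=False),
          SimpleNamespace(length=3, spacy=False)]

orig_idx = 4      # where the unsplit "_ID'X_" starts in "the _ID'X_"
idx_offset = 0
starts = []
for piece in pieces:
    starts.append(orig_idx + idx_offset)   # token.idx assigned here
    idx_offset += piece.length
    if piece.spacy:
        idx_offset += 1

assert starts == [4, 7]   # matches doc[1].idx and doc[2].idx in the new test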