mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 10:46:29 +03:00
Fix token.idx for special cases with affixes (#6035)
This commit is contained in:
parent
54c40223a1
commit
c7bd631b5f
|
@@ -155,3 +155,11 @@ def test_tokenizer_special_cases_with_period(tokenizer):
|
||||||
tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])
|
tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])
|
||||||
doc = tokenizer(text)
|
doc = tokenizer(text)
|
||||||
assert [token.text for token in doc] == ["_SPECIAL_", "."]
|
assert [token.text for token in doc] == ["_SPECIAL_", "."]
|
||||||
|
|
||||||
|
|
||||||
|
def test_tokenizer_special_cases_idx(tokenizer):
    """Check ``Token.idx`` for a special case that expands into two tokens.

    The special case ``_ID'X_`` is registered as the two orths ``_ID`` and
    ``'X_``. After tokenizing ``"the _ID'X_"``, each of those tokens must
    report the character offset at which it starts in the input string.
    """
    pieces = [{"orth": "_ID"}, {"orth": "'X_"}]
    tokenizer.add_special_case("_ID'X_", pieces)
    tokens = tokenizer("the _ID'X_")
    # "the " occupies offsets 0-3, so "_ID" begins at offset 4.
    assert tokens[1].idx == 4
    # "_ID" is three characters long, so "'X_" begins at 4 + 3 = 7.
    assert tokens[2].idx == 7
|
||||||
|
|
|
@@ -343,8 +343,9 @@ cdef class Tokenizer:
|
||||||
for j in range(cached.length):
|
for j in range(cached.length):
|
||||||
tokens[i + offset + j] = cached.data.tokens[j]
|
tokens[i + offset + j] = cached.data.tokens[j]
|
||||||
tokens[i + offset + j].idx = orig_idx + idx_offset
|
tokens[i + offset + j].idx = orig_idx + idx_offset
|
||||||
idx_offset += cached.data.tokens[j].lex.length + \
|
idx_offset += cached.data.tokens[j].lex.length
|
||||||
1 if cached.data.tokens[j].spacy else 0
|
if cached.data.tokens[j].spacy:
|
||||||
|
idx_offset += 1
|
||||||
tokens[i + offset + cached.length - 1].spacy = orig_final_spacy
|
tokens[i + offset + cached.length - 1].spacy = orig_final_spacy
|
||||||
i += span_end - span_start
|
i += span_end - span_start
|
||||||
offset += span[3]
|
offset += span[3]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user