From c7bd631b5f260d60dabe0ad65b609dcf8ddc298a Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Sun, 13 Sep 2020 14:05:36 +0200
Subject: [PATCH] Fix token.idx for special cases with affixes (#6035)

---
 spacy/tests/tokenizer/test_tokenizer.py | 8 ++++++++
 spacy/tokenizer.pyx                     | 5 +++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index ff31ae8a9..23c2d5c47 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -155,3 +155,11 @@ def test_tokenizer_special_cases_with_period(tokenizer):
     tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])
     doc = tokenizer(text)
     assert [token.text for token in doc] == ["_SPECIAL_", "."]
+
+
+def test_tokenizer_special_cases_idx(tokenizer):
+    text = "the _ID'X_"
+    tokenizer.add_special_case("_ID'X_", [{"orth": "_ID"}, {"orth": "'X_"}])
+    doc = tokenizer(text)
+    assert doc[1].idx == 4
+    assert doc[2].idx == 7
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 787cca652..17714940d 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -343,8 +343,9 @@ cdef class Tokenizer:
                 for j in range(cached.length):
                     tokens[i + offset + j] = cached.data.tokens[j]
                     tokens[i + offset + j].idx = orig_idx + idx_offset
-                    idx_offset += cached.data.tokens[j].lex.length + \
-                        1 if cached.data.tokens[j].spacy else 0
+                    idx_offset += cached.data.tokens[j].lex.length
+                    if cached.data.tokens[j].spacy:
+                        idx_offset += 1
                 tokens[i + offset + cached.length - 1].spacy = orig_final_spacy
             i += span_end - span_start
             offset += span[3]
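
Note: the underlying bug in the removed lines is a Python/Cython operator-precedence pitfall. A conditional expression binds more loosely than `+`, so `length + 1 if spacy else 0` parses as `(length + 1) if spacy else 0`; whenever the token has no trailing space, `idx_offset` is not advanced at all, throwing off `token.idx` for the remaining special-case tokens. The patch avoids this by splitting the update into two statements. A minimal sketch of the pitfall in plain Python (variable names here are illustrative stand-ins, not spaCy's actual fields):

    # Illustrative only: reproduces the precedence bug fixed in the patch above.
    length = 3          # stand-in for cached.data.tokens[j].lex.length
    has_space = False   # stand-in for cached.data.tokens[j].spacy

    idx_offset = 0
    idx_offset += length + 1 if has_space else 0    # parsed as (length + 1) if has_space else 0
    print(idx_offset)   # 0 -- the token's length is silently dropped

    idx_offset = 0
    idx_offset += length                            # the patched form: always add the length,
    if has_space:                                   # then add 1 only for a trailing space
        idx_offset += 1
    print(idx_offset)   # 3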