From 73b38c33e4401542b7e35696777dd6decaa516f7 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Thu, 22 Aug 2019 12:23:54 +0200
Subject: [PATCH] Small retokenizer fix (#4174)

---
 spacy/tokens/_retokenize.pyx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index 7da718349..a692c9188 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -388,6 +388,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
     cdef const LexemeC* lex
     cdef TokenC* token
     cdef TokenC orig_token = doc.c[token_index]
+    cdef int orig_length = len(doc)
     if(len(heads) != nb_subtokens):
         raise ValueError(Errors.E115)
@@ -408,7 +409,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
     if to_process_tensor:
         xp = get_array_module(doc.tensor)
         doc.tensor = xp.append(doc.tensor, xp.zeros((nb_subtokens,doc.tensor.shape[1]), dtype="float32"), axis=0)
-    for token_to_move in range(doc.length - 1, token_index, -1):
+    for token_to_move in range(orig_length - 1, token_index, -1):
         doc.c[token_to_move + nb_subtokens - 1] = doc.c[token_to_move]
         if to_process_tensor:
             doc.tensor[token_to_move + nb_subtokens - 1] = doc.tensor[token_to_move]
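
Note: a minimal sketch of the user-facing API this patch affects. The
sentence, heads, and assertion below are illustrative and not taken from
the patch; `retokenizer.split` is the documented spaCy API that calls
`_split` internally. The fixed loop shifts the tokens that follow the
split point to the right, and now iterates from the last *original*
token (`orig_length - 1`) rather than from the already-extended
`doc.length`.

    import spacy

    nlp = spacy.blank("en")
    doc = nlp("I live in NewYork right now")

    # Split the fused token "NewYork" (doc[3]) into two subtokens.
    # Because "NewYork" is not the last token, "right" and "now" must be
    # shifted right -- the loop this patch fixes.
    with doc.retokenize() as retokenizer:
        # (doc[3], 1) attaches "New" to the subtoken at index 1 ("York");
        # doc[2] ("in") becomes the head of "York".
        heads = [(doc[3], 1), doc[2]]
        retokenizer.split(doc[3], ["New", "York"], heads=heads)

    assert [t.text for t in doc] == ["I", "live", "in", "New", "York", "right", "now"]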