Small retokenizer fix (#4174)

2025-09-18 01:52:37 +03:00 · 2019-08-22 12:23:54 +02:00 · 2019-08-22 12:23:54 +02:00 · 73b38c33e4
commit 73b38c33e4
parent a8752a569d
1 changed file with 2 additions and 1 deletion
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -388,6 +388,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
    cdef const LexemeC* lex
    cdef TokenC* token
    cdef TokenC orig_token = doc.c[token_index]
+    cdef int orig_length = len(doc)
    if(len(heads) != nb_subtokens):
        raise ValueError(Errors.E115)
@@ -408,7 +409,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
    if to_process_tensor:
        xp = get_array_module(doc.tensor)
        doc.tensor = xp.append(doc.tensor, xp.zeros((nb_subtokens,doc.tensor.shape[1]), dtype="float32"), axis=0)
-    for token_to_move in range(doc.length - 1, token_index, -1):
+    for token_to_move in range(orig_length - 1, token_index, -1):
        doc.c[token_to_move + nb_subtokens - 1] = doc.c[token_to_move]
        if to_process_tensor:
            doc.tensor[token_to_move + nb_subtokens - 1] = doc.tensor[token_to_move]