Small retokenizer fix (#4174)

Sofie Van Landeghem, 2019-08-22 12:23:54 +02:00 (committed by Ines Montani)
parent a8752a569d
commit 73b38c33e4

@@ -388,6 +388,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
     cdef const LexemeC* lex
     cdef TokenC* token
     cdef TokenC orig_token = doc.c[token_index]
+    cdef int orig_length = len(doc)
 
     if(len(heads) != nb_subtokens):
         raise ValueError(Errors.E115)
@@ -408,7 +409,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
     if to_process_tensor:
         xp = get_array_module(doc.tensor)
         doc.tensor = xp.append(doc.tensor, xp.zeros((nb_subtokens,doc.tensor.shape[1]), dtype="float32"), axis=0)
-    for token_to_move in range(doc.length - 1, token_index, -1):
+    for token_to_move in range(orig_length - 1, token_index, -1):
         doc.c[token_to_move + nb_subtokens - 1] = doc.c[token_to_move]
         if to_process_tensor:
             doc.tensor[token_to_move + nb_subtokens - 1] = doc.tensor[token_to_move]
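
The one-line change above is the whole fix: the shift loop must count tokens from the document's length as it was before the new subtoken slots were appended, since doc.length is updated (and doc.tensor grown) before the loop runs. As a quick illustration that is not part of the commit, here is a minimal plain-Python sketch of the same shift logic; split_token is a hypothetical helper and a list stands in for the TokenC array:

    # Minimal sketch, not the spaCy implementation: split the token at `index`
    # into `subtokens`, shifting the tail right to make room, as the diff does.
    def split_token(tokens, index, subtokens):
        n_sub = len(subtokens)
        orig_length = len(tokens)            # the fix: capture length BEFORE growing
        tokens.extend([None] * (n_sub - 1))  # the split token reuses its own slot
        # range(len(tokens) - 1, ...) here would start inside the empty tail;
        # range(orig_length - 1, ...) starts at the last pre-existing token.
        for i in range(orig_length - 1, index, -1):
            tokens[i + n_sub - 1] = tokens[i]
        tokens[index:index + n_sub] = subtokens
        return tokens

    assert split_token(["New", "York-based", "firm"], 1, ["York", "-", "based"]) \
        == ["New", "York", "-", "based", "firm"]

Starting the loop from the post-extend length would begin copying out of the freshly appended empty slots; caching the original length restricts the shift to the tokens that actually existed, which is exactly what the cached orig_length does in the commit.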