diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py
index d074fddc6..d84c846de 100644
--- a/spacy/tests/doc/test_retokenize_split.py
+++ b/spacy/tests/doc/test_retokenize_split.py
@@ -198,3 +198,22 @@ def test_doc_retokenizer_realloc(en_vocab):
         token = doc[0]
         heads = [(token, 0)] * len(token)
         retokenizer.split(doc[token.i], list(token.text), heads=heads)
+
+
+def test_doc_retokenizer_split_norm(en_vocab):
+    """#6060: reset norm in split"""
+    text = "The quick brownfoxjumpsoverthe lazy dog w/ white spots"
+    doc = Doc(en_vocab, words=text.split())
+
+    # Set custom norm on the w/ token.
+    doc[5].norm_ = "with"
+
+    # Retokenize to split out the words in the token at doc[2].
+    token = doc[2]
+    with doc.retokenize() as retokenizer:
+        retokenizer.split(token, ["brown", "fox", "jumps", "over", "the"], heads=[(token, idx) for idx in range(5)])
+
+    assert doc[9].text == "w/"
+    assert doc[9].norm_ == "with"
+    assert doc[5].text == "over"
+    assert doc[5].norm_ == "over"
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index abc9b731b..4a030bef6 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -355,6 +355,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
         lex = doc.vocab.get(doc.mem, orth)
         token.lex = lex
         token.lemma = 0 # reset lemma
+        token.norm = 0 # reset norm
         if to_process_tensor:
             # setting the tensors of the split tokens to array of zeros
             doc.tensor[token_index + i] = xp.zeros((1,doc.tensor.shape[1]), dtype="float32")