Fix norm in retokenizer split (#6111)

Parallel to behavior in merge, reset norm on original token in
retokenizer split.
Adriane Boyd 2020-09-22 21:53:33 +02:00 committed by GitHub
parent 9b4979407d
commit e4acb28658
2 changed files with 20 additions and 0 deletions
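To see the fix end to end, here is a minimal sketch of the intended behavior after this commit, using a blank English pipeline; the token texts are illustrative and not taken from the diff below:

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.blank("en")
    doc = Doc(nlp.vocab, words=["a", "brownfox", "jumps"])
    doc[1].norm_ = "brown fox"  # custom norm on the token we are about to split

    with doc.retokenize() as retokenizer:
        # Split "brownfox" into "brown" + "fox"; both heads point at the
        # first new subtoken, mirroring the tests in this commit.
        retokenizer.split(doc[1], ["brown", "fox"], heads=[(doc[1], 0), (doc[1], 0)])

    # Before this fix, the stale "brown fox" norm stayed on the first split
    # token; with the reset, both pieces fall back to their lexemes' defaults.
    assert doc[1].norm_ == "brown"
    assert doc[2].norm_ == "fox"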

spacy/tests/doc/test_retokenize_split.py

@@ -198,3 +198,22 @@ def test_doc_retokenizer_realloc(en_vocab):
        token = doc[0]
        heads = [(token, 0)] * len(token)
        retokenizer.split(doc[token.i], list(token.text), heads=heads)


def test_doc_retokenizer_split_norm(en_vocab):
    """#6060: reset norm in split"""
    text = "The quick brownfoxjumpsoverthe lazy dog w/ white spots"
    doc = Doc(en_vocab, words=text.split())
    # Set custom norm on the w/ token.
    doc[5].norm_ = "with"
    # Retokenize to split out the words in the token at doc[2].
    token = doc[2]
    with doc.retokenize() as retokenizer:
        retokenizer.split(token, ["brown", "fox", "jumps", "over", "the"], heads=[(token, idx) for idx in range(5)])
    assert doc[9].text == "w/"
    assert doc[9].norm_ == "with"
    assert doc[5].text == "over"
    assert doc[5].norm_ == "over"
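For comparison, this is the merge behavior the commit message refers to: a merged token does not keep a stale norm from the tokens it replaces, and an explicit value can be supplied through attrs. A small usage sketch, illustrative and not part of the diff:

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.blank("en")
    doc = Doc(nlp.vocab, words=["New", "York"])
    with doc.retokenize() as retokenizer:
        # Supplying NORM in attrs sets it on the merged token; without it,
        # the norm falls back to the default for the new text "New York".
        retokenizer.merge(doc[0:2], attrs={"NORM": "new york"})

    assert doc[0].text == "New York"
    assert doc[0].norm_ == "new york"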

spacy/tokens/_retokenize.pyx

@@ -355,6 +355,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
        lex = doc.vocab.get(doc.mem, orth)
        token.lex = lex
        token.lemma = 0 # reset lemma
        token.norm = 0 # reset norm
        if to_process_tensor:
            # setting the tensors of the split tokens to array of zeros
            doc.tensor[token_index + i] = xp.zeros((1,doc.tensor.shape[1]), dtype="float32")
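Setting the struct field to 0 mirrors the lemma reset on the line above: 0 means "no token-level value", and the Python-level norm_ property then falls back to the lexeme's norm. A quick illustration of that fallback, independent of this diff:

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.blank("en")
    doc = Doc(nlp.vocab, words=["Quick"])
    print(doc[0].norm_)   # no token-level norm stored: falls back to the
                          # lexeme's default, typically the lowercase "quick"
    doc[0].norm_ = "fast"
    print(doc[0].norm_)   # "fast": a token-level override is now set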