Fix norm in retokenizer split (#6111)

Parallel to behavior in merge, reset norm on original token in
retokenizer split.
Adriane Boyd 2020-09-22 21:53:33 +02:00 committed by GitHub
parent 9b4979407d
commit e4acb28658
2 changed files with 20 additions and 0 deletions
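To see the fix end to end, here is a minimal sketch of the intended behavior after this commit, using a blank English pipeline; the token texts are illustrative and not taken from the diff below:

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.blank("en")
    doc = Doc(nlp.vocab, words=["a", "brownfox", "jumps"])
    doc[1].norm_ = "brown fox"  # custom norm on the token we are about to split

    with doc.retokenize() as retokenizer:
        # Split "brownfox" into "brown" + "fox"; both heads point at the
        # first new subtoken, mirroring the tests in this commit.
        retokenizer.split(doc[1], ["brown", "fox"], heads=[(doc[1], 0), (doc[1], 0)])

    # Before this fix, the stale "brown fox" norm stayed on the first split
    # token; with the reset, both pieces fall back to their lexemes' defaults.
    assert doc[1].norm_ == "brown"
    assert doc[2].norm_ == "fox"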

spacy/tests/doc/test_retokenize_split.py

@@ -198,3 +198,22 @@ def test_doc_retokenizer_realloc(en_vocab):
        token = doc[0]
        heads = [(token, 0)] * len(token)
        retokenizer.split(doc[token.i], list(token.text), heads=heads)


def test_doc_retokenizer_split_norm(en_vocab):
    """#6060: reset norm in split"""
    text = "The quick brownfoxjumpsoverthe lazy dog w/ white spots"
    doc = Doc(en_vocab, words=text.split())
    # Set custom norm on the w/ token.
    doc[5].norm_ = "with"
    # Retokenize to split out the words in the token at doc[2].
    token = doc[2]
    with doc.retokenize() as retokenizer:
        retokenizer.split(token, ["brown", "fox", "jumps", "over", "the"], heads=[(token, idx) for idx in range(5)])
    assert doc[9].text == "w/"
    assert doc[9].norm_ == "with"
    assert doc[5].text == "over"
    assert doc[5].norm_ == "over"
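For comparison, this is the merge behavior the commit message refers to: a merged token does not keep a stale norm from the tokens it replaces, and an explicit value can be supplied through attrs. A small usage sketch, illustrative and not part of the diff:

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.blank("en")
    doc = Doc(nlp.vocab, words=["New", "York"])
    with doc.retokenize() as retokenizer:
        # Supplying NORM in attrs sets it on the merged token; without it,
        # the norm falls back to the default for the new text "New York".
        retokenizer.merge(doc[0:2], attrs={"NORM": "new york"})

    assert doc[0].text == "New York"
    assert doc[0].norm_ == "new york"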

spacy/tokens/_retokenize.pyx

@@ -355,6 +355,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
        lex = doc.vocab.get(doc.mem, orth)
        token.lex = lex
        token.lemma = 0 # reset lemma
        token.norm = 0 # reset norm
        if to_process_tensor:
            # setting the tensors of the split tokens to array of zeros
            doc.tensor[token_index + i] = xp.zeros((1,doc.tensor.shape[1]), dtype="float32")
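Setting the struct field to 0 mirrors the lemma reset on the line above: 0 means "no token-level value", and the Python-level norm_ property then falls back to the lexeme's norm. A quick illustration of that fallback, independent of this diff:

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.blank("en")
    doc = Doc(nlp.vocab, words=["Quick"])
    print(doc[0].norm_)   # no token-level norm stored: falls back to the
                          # lexeme's default, typically the lowercase "quick"
    doc[0].norm_ = "fast"
    print(doc[0].norm_)   # "fast": a token-level override is now set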