Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-24 17:06:29 +03:00
Fix norm in retokenizer split (#6111)
Parallel to behavior in merge, reset norm on original token in retokenizer split.
This commit is contained in:
parent 9b4979407d
commit e4acb28658
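As context for the diff below, here is a minimal sketch of the behavior being fixed, using the public retokenizer API. The document and token choices are illustrative, not taken from the commit: before this change, the subtokens produced by a split inherited the original token's NORM, so a custom norm leaked onto every piece; with the reset, each piece falls back to its own lexeme norm.

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.blank("en")
    doc = Doc(nlp.vocab, words=["I", "cannot", "go"])
    doc[1].norm_ = "can not"  # custom norm on the token about to be split

    with doc.retokenize() as retokenizer:
        # Attach each subtoken to itself, as the regression test below does.
        heads = [(doc[1], idx) for idx in range(2)]
        retokenizer.split(doc[1], ["can", "not"], heads=heads)

    # With the fix, neither piece keeps the stale "can not" norm.
    assert doc[1].norm_ != "can not"
    assert doc[2].norm_ != "can not"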
@@ -198,3 +198,22 @@ def test_doc_retokenizer_realloc(en_vocab):
         token = doc[0]
         heads = [(token, 0)] * len(token)
         retokenizer.split(doc[token.i], list(token.text), heads=heads)
+
+
+def test_doc_retokenizer_split_norm(en_vocab):
+    """#6060: reset norm in split"""
+    text = "The quick brownfoxjumpsoverthe lazy dog w/ white spots"
+    doc = Doc(en_vocab, words=text.split())
+
+    # Set custom norm on the w/ token.
+    doc[5].norm_ = "with"
+
+    # Retokenize to split out the words in the token at doc[2].
+    token = doc[2]
+    with doc.retokenize() as retokenizer:
+        retokenizer.split(token, ["brown", "fox", "jumps", "over", "the"], heads=[(token, idx) for idx in range(5)])
+
+    assert doc[9].text == "w/"
+    assert doc[9].norm_ == "with"
+    assert doc[5].text == "over"
+    assert doc[5].norm_ == "over"
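A note on the heads argument in the test above: per the retokenizer API, each entry may be a token or a (token, subtoken_index) tuple, where the tuple form attaches the new piece to one of the subtokens created by the split. heads=[(token, idx) for idx in range(5)] therefore makes each of the five pieces its own head, keeping the dependency structure trivial for this regression test.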
@@ -355,6 +355,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
         lex = doc.vocab.get(doc.mem, orth)
         token.lex = lex
         token.lemma = 0 # reset lemma
+        token.norm = 0 # reset norm
         if to_process_tensor:
             # setting the tensors of the split tokens to array of zeros
             doc.tensor[token_index + i] = xp.zeros((1,doc.tensor.shape[1]), dtype="float32")
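The one-line change above is the core of the fix: like the lemma reset on the previous line, setting the token-level norm field back to 0 clears any value copied over from the original token, so Token.norm falls back to the lexeme's norm for each new subtoken. Per the commit message, this mirrors how the merge path already resets the norm.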