From 53c0fb7431a00243224288a058f3d1e7cae88f6f Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 30 Nov 2020 02:35:42 +0100
Subject: [PATCH] Only set NORM on Token in retokenizer (#6464)

* Only set NORM on Token in retokenizer

Instead of setting `NORM` on both the token and lexeme, set `NORM` only
on the token. The retokenizer tries to set all possible attributes with
`Token/Lexeme.set_struct_attr` so that it doesn't have to enumerate
which attributes are available for each. `NORM` is the only attribute
that's stored on both, and in most cases it doesn't make sense to set
the global norms based on an individual retokenization. For lexeme-only
attributes like `IS_STOP` there's no way to avoid the global side
effects, but I think that `NORM` is better set only on the token.

* Fix test
---
 spacy/tests/doc/test_retokenize_merge.py |  7 +++++++
 spacy/tokens/_retokenize.pyx             | 12 +++++++-----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py
index 636b7bb14..6d4c436d6 100644
--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@@ -414,6 +414,13 @@ def test_doc_retokenizer_merge_lex_attrs(en_vocab):
     assert doc[1].is_stop
     assert not doc[0].is_stop
     assert not doc[1].like_num
+    # Test that norm is only set on tokens
+    doc = Doc(en_vocab, words=["eins", "zwei", "!", "!"])
+    assert doc[0].norm_ == "eins"
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[0:1], attrs={"norm": "1"})
+    assert doc[0].norm_ == "1"
+    assert en_vocab["eins"].norm_ == "eins"
 
 
 def test_retokenize_skip_duplicates(en_vocab):
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index 4a030bef6..485e52304 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -16,7 +16,7 @@ from .span cimport Span
 from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..structs cimport LexemeC, TokenC
-from ..attrs cimport TAG
+from ..attrs cimport TAG, NORM
 
 from .underscore import is_writable_attr
 from ..attrs import intify_attrs
@@ -238,9 +238,10 @@ def _merge(Doc doc, merges):
                 # Set attributes on both token and lexeme to take care of token
                 # attribute vs. lexical attribute without having to enumerate
                 # them. If an attribute name is not valid, set_struct_attr will
-                # ignore it.
+                # ignore it. Exception: set NORM only on tokens.
                 Token.set_struct_attr(token, attr_name, attr_value)
-                Lexeme.set_struct_attr(lex, attr_name, attr_value)
+                if attr_name != NORM:
+                    Lexeme.set_struct_attr(lex, attr_name, attr_value)
     # Begin by setting all the head indices to absolute token positions
     # This is easier to work with for now than the offsets
     # Before thinking of something simpler, beware the case where a
@@ -393,9 +394,10 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
                 # Set attributes on both token and lexeme to take care of token
                 # attribute vs. lexical attribute without having to enumerate
                 # them. If an attribute name is not valid, set_struct_attr will
-                # ignore it.
+                # ignore it. Exception: set NORM only on tokens.
                 Token.set_struct_attr(token, attr_name, get_string_id(attr_value))
-                Lexeme.set_struct_attr(token.lex, attr_name, get_string_id(attr_value))
+                if attr_name != NORM:
+                    Lexeme.set_struct_attr(token.lex, attr_name, get_string_id(attr_value))
     # Assign correct dependencies to the inner token
     for i, head in enumerate(heads):
         doc.c[token_index + i].head = head
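Note (not part of the patch): a minimal sketch of the user-visible
behavior after this change, assuming a spaCy build with the patch
applied; the blank English pipeline, the example words, and the norm
value "1" are illustrative only:

    import spacy

    # Build a Doc without running any pipeline components.
    nlp = spacy.blank("en")
    doc = nlp.make_doc("eins zwei ! !")
    assert doc[0].norm_ == "eins"

    with doc.retokenize() as retokenizer:
        # Setting "norm" in attrs now updates only the merged token.
        retokenizer.merge(doc[0:1], attrs={"norm": "1"})

    assert doc[0].norm_ == "1"
    # The global lexeme norm in the vocab is left untouched.
    assert nlp.vocab["eins"].norm_ == "eins"

This mirrors the new assertions in test_retokenize_merge.py: lexical
attributes such as IS_STOP still apply vocab-wide, but NORM now stays
local to the retokenized token.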