From 271923eaea508b6640e1fe75b755a9a071ee24a3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 9 Dec 2020 11:29:55 +1100 Subject: [PATCH] Fix retokenizer --- spacy/tokens/_retokenize.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index ea65c464a..0069e36bf 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -11,7 +11,7 @@ from .span cimport Span from .token cimport Token from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..structs cimport LexemeC, TokenC -from ..attrs cimport MORPH +from ..attrs cimport MORPH, NORM from ..vocab cimport Vocab from .underscore import is_writable_attr @@ -436,6 +436,7 @@ def set_token_attrs(Token py_token, attrs): # Set attributes on both token and lexeme to take care of token # attribute vs. lexical attribute without having to enumerate # them. If an attribute name is not valid, set_struct_attr will - # ignore it. + # ignore it. Exception: set NORM only on tokens. Token.set_struct_attr(token, attr_name, attr_value) - Lexeme.set_struct_attr(lex, attr_name, attr_value) + if attr_name != NORM: + Lexeme.set_struct_attr(lex, attr_name, attr_value)