Fix retokenizer

This commit is contained in:
Ines Montani 2020-12-09 11:29:55 +11:00
parent 04b3068747
commit 271923eaea

View File

@@ -11,7 +11,7 @@ from .span cimport Span
 from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..structs cimport LexemeC, TokenC
-from ..attrs cimport MORPH
+from ..attrs cimport MORPH, NORM
 from ..vocab cimport Vocab
 from .underscore import is_writable_attr
@@ -436,6 +436,7 @@ def set_token_attrs(Token py_token, attrs):
     # Set attributes on both token and lexeme to take care of token
     # attribute vs. lexical attribute without having to enumerate
     # them. If an attribute name is not valid, set_struct_attr will
-    # ignore it.
+    # ignore it. Exception: set NORM only on tokens.
     Token.set_struct_attr(token, attr_name, attr_value)
-    Lexeme.set_struct_attr(<LexemeC*>lex, attr_name, attr_value)
+    if attr_name != NORM:
+        Lexeme.set_struct_attr(<LexemeC*>lex, attr_name, attr_value)