mirror of https://github.com/explosion/spaCy.git
synced 2025-07-06 12:53:19 +03:00
Only set NORM on Token in retokenizer (#6464)

* Only set NORM on Token in retokenizer

Instead of setting `NORM` on both the token and lexeme, set `NORM` only on the token. The retokenizer tries to set all possible attributes with `Token/Lexeme.set_struct_attr` so that it doesn't have to enumerate which attributes are available for each. `NORM` is the only attribute stored on both, and in most cases it doesn't make sense to set the global norms based on an individual retokenization. For lexeme-only attributes like `IS_STOP` there's no way to avoid the global side effects, but I think `NORM` is better set only on the token.

* Fix test
This commit is contained in:
parent
03ae77e603
commit
53c0fb7431
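In practice, the change keeps `norm` token-local during retokenization. A minimal sketch of the new behavior (assuming a blank `Vocab`; it mirrors the updated test in the diff below):

    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    doc = Doc(Vocab(), words=["eins", "zwei", "!", "!"])
    with doc.retokenize() as retokenizer:
        # Setting "norm" here now affects only the merged token...
        retokenizer.merge(doc[0:1], attrs={"norm": "1"})
    assert doc[0].norm_ == "1"
    # ...while the vocab-wide lexeme norm is left untouched.
    assert doc.vocab["eins"].norm_ == "eins"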
@@ -414,6 +414,13 @@ def test_doc_retokenizer_merge_lex_attrs(en_vocab):
     assert doc[1].is_stop
     assert not doc[0].is_stop
     assert not doc[1].like_num
+    # Test that norm is only set on tokens
+    doc = Doc(en_vocab, words=["eins", "zwei", "!", "!"])
+    assert doc[0].norm_ == "eins"
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[0:1], attrs={"norm": "1"})
+    assert doc[0].norm_ == "1"
+    assert en_vocab["eins"].norm_ == "eins"
 
 
 def test_retokenize_skip_duplicates(en_vocab):
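For contrast, as the commit message notes, lexeme-only attributes such as `IS_STOP` still take effect vocab-wide. An illustrative sketch of that side effect (not part of this diff; assumes a blank `Vocab`):

    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    vocab = Vocab()
    doc = Doc(vocab, words=["foo", "bar"], spaces=[False, False])
    with doc.retokenize() as retokenizer:
        # IS_STOP lives only on the lexeme, so this is a global change.
        retokenizer.merge(doc[0:2], attrs={"is_stop": True})
    # Every doc sharing this vocab now treats "foobar" as a stop word.
    assert vocab["foobar"].is_stop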
@@ -16,7 +16,7 @@ from .span cimport Span
 from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..structs cimport LexemeC, TokenC
-from ..attrs cimport TAG
+from ..attrs cimport TAG, NORM
 
 from .underscore import is_writable_attr
 from ..attrs import intify_attrs
@@ -238,9 +238,10 @@ def _merge(Doc doc, merges):
                 # Set attributes on both token and lexeme to take care of token
                 # attribute vs. lexical attribute without having to enumerate
                 # them. If an attribute name is not valid, set_struct_attr will
-                # ignore it.
+                # ignore it. Exception: set NORM only on tokens.
                 Token.set_struct_attr(token, attr_name, attr_value)
-                Lexeme.set_struct_attr(<LexemeC*>lex, attr_name, attr_value)
+                if attr_name != NORM:
+                    Lexeme.set_struct_attr(<LexemeC*>lex, attr_name, attr_value)
     # Begin by setting all the head indices to absolute token positions
     # This is easier to work with for now than the offsets
     # Before thinking of something simpler, beware the case where a
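The changed logic in `_merge`, restated as a pure-Python sketch (illustrative only: the real code is Cython operating on `TokenC`/`LexemeC` structs, and `set_struct_attr` below is a hypothetical dict-based stand-in):

    NORM = "norm"

    def set_struct_attr(struct, name, value):
        # Ignore attribute names the struct doesn't have, mirroring how
        # Token.set_struct_attr / Lexeme.set_struct_attr skip invalid names.
        if name in struct:
            struct[name] = value

    def apply_merge_attrs(token_struct, lexeme_struct, attributes):
        for name, value in attributes.items():
            set_struct_attr(token_struct, name, value)
            if name != NORM:  # NORM stays token-local
                set_struct_attr(lexeme_struct, name, value)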
@@ -393,9 +394,10 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
                 # Set attributes on both token and lexeme to take care of token
                 # attribute vs. lexical attribute without having to enumerate
                 # them. If an attribute name is not valid, set_struct_attr will
-                # ignore it.
+                # ignore it. Exception: set NORM only on tokens.
                 Token.set_struct_attr(token, attr_name, get_string_id(attr_value))
-                Lexeme.set_struct_attr(<LexemeC*>token.lex, attr_name, get_string_id(attr_value))
+                if attr_name != NORM:
+                    Lexeme.set_struct_attr(<LexemeC*>token.lex, attr_name, get_string_id(attr_value))
     # Assign correct dependencies to the inner token
     for i, head in enumerate(heads):
         doc.c[token_index + i].head = head
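`_split` applies the same guard per subtoken. A hedged usage sketch of the split path (assuming a blank `Vocab`), where per-subtoken norms land on the new tokens but not on the shared lexemes:

    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    doc = Doc(Vocab(), words=["zweiundvierzig", "!"])
    with doc.retokenize() as retokenizer:
        # heads: first subtoken attaches to the second, second to "!"
        retokenizer.split(doc[0], ["zweiund", "vierzig"],
                          heads=[(doc[0], 1), doc[1]],
                          attrs={"norm": ["2", "40"]})
    assert [t.norm_ for t in doc[:2]] == ["2", "40"]
    # Lexeme norms keep their defaults; nothing leaked into the vocab.
    assert doc.vocab["zweiund"].norm_ == "zweiund"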