mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Make NORM a token attribute (#3029)
See #3028. The solution in this patch is pretty debatable. What we do is give the TokenC struct a .norm field, by repurposing the previously idle .sense attribute. It's nice to repurpose a previous field because it means the TokenC doesn't change size, so even if someone's using the internals very deeply, nothing will break. The weird thing here is that the TokenC and the LexemeC both have an attribute named NORM. This arguably assists in backwards compatibility. On the other hand, maybe it's really bad! We're changing the semantics of the attribute subtly, so maybe it's better if someone calling lex.norm gets a breakage, and instead is told to write lex.default_norm? Overall I believe this patch makes the NORM feature work the way we sort of expected it to work. Certainly it's much more like how the docs describe it, and more in line with how we've been directing people to use the norm attribute. We'll also be able to use token.norm to do stuff like spelling correction, which is pretty cool.
This commit is contained in:
parent
a338c6f8f6
commit
8aa7882762
|
@ -58,7 +58,7 @@ cdef struct TokenC:
|
||||||
attr_t tag
|
attr_t tag
|
||||||
int idx
|
int idx
|
||||||
attr_t lemma
|
attr_t lemma
|
||||||
attr_t sense
|
attr_t norm
|
||||||
int head
|
int head
|
||||||
attr_t dep
|
attr_t dep
|
||||||
|
|
||||||
|
|
14
spacy/tests/regression/test_issue2754.py
Normal file
14
spacy/tests/regression/test_issue2754.py
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from spacy.lang.en import English
|
||||||
|
|
||||||
|
def test_issue2754():
|
||||||
|
"""Test that words like 'a' and 'a.m.' don't get exceptional norm values."""
|
||||||
|
nlp = English()
|
||||||
|
a = nlp('a')
|
||||||
|
assert a[0].norm_ == 'a'
|
||||||
|
am = nlp('am')
|
||||||
|
assert am[0].norm_ == 'am'
|
||||||
|
|
|
@ -34,6 +34,11 @@ cdef class Token:
|
||||||
return Lexeme.c_check_flag(token.lex, feat_name)
|
return Lexeme.c_check_flag(token.lex, feat_name)
|
||||||
elif feat_name == LEMMA:
|
elif feat_name == LEMMA:
|
||||||
return token.lemma
|
return token.lemma
|
||||||
|
elif feat_name == NORM:
|
||||||
|
if token.norm == 0:
|
||||||
|
return token.lex.norm
|
||||||
|
else:
|
||||||
|
return token.norm
|
||||||
elif feat_name == POS:
|
elif feat_name == POS:
|
||||||
return token.pos
|
return token.pos
|
||||||
elif feat_name == TAG:
|
elif feat_name == TAG:
|
||||||
|
@ -58,6 +63,8 @@ cdef class Token:
|
||||||
attr_t value) nogil:
|
attr_t value) nogil:
|
||||||
if feat_name == LEMMA:
|
if feat_name == LEMMA:
|
||||||
token.lemma = value
|
token.lemma = value
|
||||||
|
elif feat_name == NORM:
|
||||||
|
token.norm = value
|
||||||
elif feat_name == POS:
|
elif feat_name == POS:
|
||||||
token.pos = <univ_pos_t>value
|
token.pos = <univ_pos_t>value
|
||||||
elif feat_name == TAG:
|
elif feat_name == TAG:
|
||||||
|
|
|
@ -249,7 +249,10 @@ cdef class Token:
|
||||||
or norm exceptions.
|
or norm exceptions.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.lex.norm
|
if self.c.norm == 0:
|
||||||
|
return self.c.lex.norm
|
||||||
|
else:
|
||||||
|
return self.c.norm
|
||||||
|
|
||||||
property shape:
|
property shape:
|
||||||
"""RETURNS (uint64): ID of the token's shape, a transform of the
|
"""RETURNS (uint64): ID of the token's shape, a transform of the
|
||||||
|
@ -711,7 +714,10 @@ cdef class Token:
|
||||||
norm exceptions.
|
norm exceptions.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.lex.norm]
|
return self.vocab.strings[self.norm]
|
||||||
|
|
||||||
|
def __set__(self, unicode norm_):
|
||||||
|
self.c.norm = self.vocab.strings.add(norm_)
|
||||||
|
|
||||||
property shape_:
|
property shape_:
|
||||||
"""RETURNS (unicode): Transform of the tokens's string, to show
|
"""RETURNS (unicode): Transform of the tokens's string, to show
|
||||||
|
|
|
@ -17,7 +17,7 @@ from .structs cimport SerializedLexemeC
|
||||||
from .compat import copy_reg, basestring_
|
from .compat import copy_reg, basestring_
|
||||||
from .errors import Errors
|
from .errors import Errors
|
||||||
from .lemmatizer import Lemmatizer
|
from .lemmatizer import Lemmatizer
|
||||||
from .attrs import intify_attrs
|
from .attrs import intify_attrs, NORM
|
||||||
from .vectors import Vectors
|
from .vectors import Vectors
|
||||||
from ._ml import link_vectors_to_models
|
from ._ml import link_vectors_to_models
|
||||||
from . import util
|
from . import util
|
||||||
|
@ -234,7 +234,10 @@ cdef class Vocab:
|
||||||
self.morphology.assign_tag(token, props[TAG])
|
self.morphology.assign_tag(token, props[TAG])
|
||||||
for attr_id, value in props.items():
|
for attr_id, value in props.items():
|
||||||
Token.set_struct_attr(token, attr_id, value)
|
Token.set_struct_attr(token, attr_id, value)
|
||||||
Lexeme.set_struct_attr(lex, attr_id, value)
|
# NORM is the only one that overlaps between the two
|
||||||
|
# (which is maybe not great?)
|
||||||
|
if attr_id != NORM:
|
||||||
|
Lexeme.set_struct_attr(lex, attr_id, value)
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
Loading…
Reference in New Issue
Block a user