diff --git a/spacy/structs.pxd b/spacy/structs.pxd index cfcadc3d0..fa282cae7 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -58,7 +58,7 @@ cdef struct TokenC: attr_t tag int idx attr_t lemma - attr_t sense + attr_t norm int head attr_t dep diff --git a/spacy/tests/regression/test_issue2754.py b/spacy/tests/regression/test_issue2754.py new file mode 100644 index 000000000..5f76727f8 --- /dev/null +++ b/spacy/tests/regression/test_issue2754.py @@ -0,0 +1,14 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest +from spacy.lang.en import English + +def test_issue2754(): + """Test that words like 'a' and 'a.m.' don't get exceptional norm values.""" + nlp = English() + a = nlp('a') + assert a[0].norm_ == 'a' + am = nlp('am') + assert am[0].norm_ == 'am' + diff --git a/spacy/tokens/token.pxd b/spacy/tokens/token.pxd index 9b02d07fb..bb9f7d070 100644 --- a/spacy/tokens/token.pxd +++ b/spacy/tokens/token.pxd @@ -34,6 +34,11 @@ cdef class Token: return Lexeme.c_check_flag(token.lex, feat_name) elif feat_name == LEMMA: return token.lemma + elif feat_name == NORM: + if token.norm == 0: + return token.lex.norm + else: + return token.norm elif feat_name == POS: return token.pos elif feat_name == TAG: @@ -58,6 +63,8 @@ cdef class Token: attr_t value) nogil: if feat_name == LEMMA: token.lemma = value + elif feat_name == NORM: + token.norm = value elif feat_name == POS: token.pos = value elif feat_name == TAG: diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 5c8af1333..0266004b5 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -249,7 +249,10 @@ cdef class Token: or norm exceptions. """ def __get__(self): - return self.c.lex.norm + if self.c.norm == 0: + return self.c.lex.norm + else: + return self.c.norm property shape: """RETURNS (uint64): ID of the token's shape, a transform of the @@ -711,7 +714,10 @@ cdef class Token: norm exceptions. """ def __get__(self): - return self.vocab.strings[self.c.lex.norm] + return self.vocab.strings[self.norm] + + def __set__(self, unicode norm_): + self.c.norm = self.vocab.strings.add(norm_) property shape_: """RETURNS (unicode): Transform of the tokens's string, to show diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 42fd2f46e..e28aa0b86 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -17,7 +17,7 @@ from .structs cimport SerializedLexemeC from .compat import copy_reg, basestring_ from .errors import Errors from .lemmatizer import Lemmatizer -from .attrs import intify_attrs +from .attrs import intify_attrs, NORM from .vectors import Vectors from ._ml import link_vectors_to_models from . import util @@ -234,7 +234,10 @@ cdef class Vocab: self.morphology.assign_tag(token, props[TAG]) for attr_id, value in props.items(): Token.set_struct_attr(token, attr_id, value) - Lexeme.set_struct_attr(lex, attr_id, value) + # NORM is the only one that overlaps between the two + # (which is maybe not great?) + if attr_id != NORM: + Lexeme.set_struct_attr(lex, attr_id, value) return tokens @property