spaCy/spacy/tokens/token.pxd
Matthew Honnibal 8aa7882762
Make NORM a token attribute (#3029)
See #3028. The solution in this patch is pretty debatable.

What we do is give the TokenC struct a .norm field, by repurposing the previously idle .sense attribute. It's nice to repurpose a previous field because it means the TokenC doesn't change size, so even if someone's using the internals very deeply, nothing will break.

The weird thing here is that the TokenC and the LexemeC both have an attribute named NORM. This arguably assists in backwards compatibility. On the other hand, maybe it's really bad! We're changing the semantics of the attribute subtly, so maybe it's better if someone calling lex.norm gets a breakage, and instead is told to write lex.default_norm?

Overall I believe this patch makes the NORM feature work the way we sort of expected it to work. Certainly it's much more like how the docs describe it, and more in line with how we've been directing people to use the norm attribute. We'll also be able to use token.norm to do stuff like spelling correction, which is pretty cool.
2018-12-08 10:49:10 +01:00

84 lines
2.7 KiB
Cython

from numpy cimport ndarray
from ..vocab cimport Vocab
from ..structs cimport TokenC
from ..attrs cimport *
from ..typedefs cimport attr_t, flags_t
from ..parts_of_speech cimport univ_pos_t
from .doc cimport Doc
from ..lexeme cimport Lexeme
from ..errors import Errors
cdef class Token:
    # C-level declaration of the Token extension type, together with inline
    # helpers that read and write attributes on a TokenC struct.

    # `vocab`, `i` and `doc` are declared `readonly`; `c` is a raw TokenC
    # pointer declared without a visibility modifier.
    cdef readonly Vocab vocab
    cdef TokenC* c
    cdef readonly int i
    cdef readonly Doc doc

    @staticmethod
    cdef inline Token cinit(Vocab vocab, const TokenC* token, int offset, Doc doc):
        # Create the Token object for position `offset` of `doc`.
        #
        # Raises IndexError (formatted from Errors.E040) when `offset` lies
        # outside the half-open range [0, doc.length).
        # NOTE: the `token` pointer is accepted but not read in this body.
        if not (0 <= offset < doc.length):
            raise IndexError(Errors.E040.format(i=offset, max_length=doc.length))
        cdef Token wrapper = Token.__new__(Token, vocab, doc, offset)
        return wrapper

    # Disabled draft of a helper, kept as found:
    #cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):
    #    cdef TokenC token
    #    attrs = normalize_attrs(attrs)

    cpdef bint check_flag(self, attr_id_t flag_id) except -1

    @staticmethod
    cdef inline attr_t get_struct_attr(const TokenC* token, attr_id_t feat_name) nogil:
        # Return the value of attribute `feat_name` for `token`.
        #
        # Lookup order:
        #   1. IDs smaller than the bit width of flags_t are forwarded to
        #      Lexeme.c_check_flag on the token's lexeme.
        #   2. Attributes stored directly on the TokenC struct.
        #   3. Everything else is delegated to Lexeme.get_struct_attr.
        if feat_name < (sizeof(flags_t) * 8):
            return Lexeme.c_check_flag(token.lex, feat_name)

        if feat_name == LEMMA:
            return token.lemma
        if feat_name == NORM:
            # When the token's own norm is 0, fall back to the lexeme's norm.
            if token.norm != 0:
                return token.norm
            return token.lex.norm
        if feat_name == POS:
            return token.pos
        if feat_name == TAG:
            return token.tag
        if feat_name == DEP:
            return token.dep
        if feat_name == HEAD:
            return token.head
        if feat_name == SPACY:
            return token.spacy
        if feat_name == ENT_IOB:
            return token.ent_iob
        if feat_name == ENT_TYPE:
            return token.ent_type
        if feat_name == SENT_START:
            return token.sent_start

        # Not a token-level attribute: ask the lexeme.
        return Lexeme.get_struct_attr(token.lex, feat_name)

    @staticmethod
    cdef inline attr_t set_struct_attr(TokenC* token, attr_id_t feat_name, attr_t value) nogil:
        # Store `value` into the TokenC field selected by `feat_name`.
        #
        # Only the attributes listed below are handled; any other ID leaves
        # `token` untouched. The body contains no explicit return statement.
        if feat_name == LEMMA:
            token.lemma = value
        elif feat_name == NORM:
            token.norm = value
        elif feat_name == POS:
            # `pos` is the only field written through an explicit cast.
            token.pos = <univ_pos_t>value
        elif feat_name == TAG:
            token.tag = value
        elif feat_name == DEP:
            token.dep = value
        elif feat_name == HEAD:
            token.head = value
        elif feat_name == SPACY:
            token.spacy = value
        elif feat_name == ENT_IOB:
            token.ent_iob = value
        elif feat_name == ENT_TYPE:
            token.ent_type = value
        elif feat_name == SENT_START:
            token.sent_start = value