mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-08 00:06:37 +03:00
8aa7882762
See #3028. The solution in this patch is pretty debatable. What we do is give the TokenC struct a .norm field, by repurposing the previously idle .sense attribute. It's nice to repurpose a previous field because it means the TokenC doesn't change size, so even if someone's using the internals very deeply, nothing will break. The weird thing here is that the TokenC and the LexemeC both have an attribute named NORM. This arguably assists in backwards compatibility. On the other hand, maybe it's really bad! We're changing the semantics of the attribute subtly, so maybe it's better if someone calling lex.norm gets a breakage, and instead is told to write lex.default_norm? Overall I believe this patch makes the NORM feature work the way we sort of expected it to work. Certainly it's much more like how the docs describe it, and more in line with how we've been directing people to use the norm attribute. We'll also be able to use token.norm to do stuff like spelling correction, which is pretty cool.
84 lines
2.7 KiB
Cython
84 lines
2.7 KiB
Cython
from numpy cimport ndarray
|
|
from ..vocab cimport Vocab
|
|
from ..structs cimport TokenC
|
|
from ..attrs cimport *
|
|
from ..typedefs cimport attr_t, flags_t
|
|
from ..parts_of_speech cimport univ_pos_t
|
|
from .doc cimport Doc
|
|
from ..lexeme cimport Lexeme
|
|
from ..errors import Errors
|
|
|
|
|
|
cdef class Token:
    # C-level declarations for the Token extension type, together with inline
    # helpers that read and write individual attributes of a TokenC struct.
    cdef readonly Vocab vocab
    cdef TokenC* c
    cdef readonly int i
    cdef readonly Doc doc

    @staticmethod
    cdef inline Token cinit(Vocab vocab, const TokenC* token, int offset, Doc doc):
        # Construct the Token for position `offset` of `doc`.
        #
        # Raises IndexError (message Errors.E040) when `offset` is outside
        # the half-open range [0, doc.length). The `token` pointer is accepted
        # as part of the signature but is not read in this function.
        cdef Token py_token
        if not (0 <= offset < doc.length):
            raise IndexError(Errors.E040.format(i=offset, max_length=doc.length))
        py_token = Token.__new__(Token, vocab, doc, offset)
        return py_token

    # Disabled, unfinished helper kept for reference:
    # cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):
    #     cdef TokenC token
    #     attrs = normalize_attrs(attrs)

    # Declaration only; the body is not part of this file. The `except -1`
    # clause reserves -1 as the error-signalling return value.
    cpdef bint check_flag(self, attr_id_t flag_id) except -1

    @staticmethod
    cdef inline attr_t get_struct_attr(const TokenC* token, attr_id_t feat_name) nogil:
        # Return the value of attribute `feat_name` for `token`.
        #
        # IDs smaller than the number of bits in flags_t are answered by
        # Lexeme.c_check_flag on the token's lexeme. IDs that have no
        # token-level field below are delegated to Lexeme.get_struct_attr.
        if feat_name < (sizeof(flags_t) * 8):
            return Lexeme.c_check_flag(token.lex, feat_name)
        if feat_name == LEMMA:
            return token.lemma
        if feat_name == NORM:
            # A token-level norm of 0 is treated as "not set", in which case
            # the lexeme's norm is used instead.
            if token.norm != 0:
                return token.norm
            return token.lex.norm
        if feat_name == POS:
            return token.pos
        if feat_name == TAG:
            return token.tag
        if feat_name == DEP:
            return token.dep
        if feat_name == HEAD:
            return token.head
        if feat_name == SPACY:
            return token.spacy
        if feat_name == ENT_IOB:
            return token.ent_iob
        if feat_name == ENT_TYPE:
            return token.ent_type
        if feat_name == SENT_START:
            return token.sent_start
        # Everything else is read from the lexeme.
        return Lexeme.get_struct_attr(token.lex, feat_name)

    @staticmethod
    cdef inline attr_t set_struct_attr(TokenC* token, attr_id_t feat_name, attr_t value) nogil:
        # Store `value` in the field of `token` selected by `feat_name`.
        #
        # Only the token-level attributes listed below are writable here; any
        # other ID leaves `token` untouched. There is no explicit return
        # statement in this function.
        if feat_name == LEMMA:
            token.lemma = value
        elif feat_name == NORM:
            # Writes the token-level norm only; the lexeme is not modified.
            token.norm = value
        elif feat_name == POS:
            # The POS field has its own enum type, hence the explicit cast.
            token.pos = <univ_pos_t>value
        elif feat_name == TAG:
            token.tag = value
        elif feat_name == DEP:
            token.dep = value
        elif feat_name == HEAD:
            token.head = value
        elif feat_name == SPACY:
            token.spacy = value
        elif feat_name == ENT_IOB:
            token.ent_iob = value
        elif feat_name == ENT_TYPE:
            token.ent_type = value
        elif feat_name == SENT_START:
            token.sent_start = value