spaCy/spacy/lexeme.pxd

58 lines
1.3 KiB
Cython
Raw Normal View History

2014-08-03 00:51:52 +04:00
from libc.stdint cimport uint32_t
from libc.stdint cimport uint64_t
# Shared typedefs. Put these above import to avoid circular import problem.

# Identifier type used for Distribution.cluster.
ctypedef int ClusterID

# 32-bit hash value; used for every hashed string field and accessor result.
ctypedef uint32_t StringHash

# Pointer-sized unsigned integer.
# NOTE(review): presumably carries a Lexeme* cast to an integer — confirm in the .pyx.
ctypedef size_t Lexeme_addr

# 8-bit flag field.
# NOTE(review): plain `char` has implementation-defined signedness in C; confirm
# the top bit is never used as a flag, or consider `unsigned char`.
ctypedef char Bits8

# 64-bit flag field.
ctypedef uint64_t Bits64
# Orthographic data attached to a Lexeme through its `orth` pointer.
# Field order is part of the C layout; do not reorder.
cdef struct Orthography:
    # Hashed string views.
    # NOTE(review): names suggest last-three-characters, word shape and
    # normalised form — confirm against the code that fills them in.
    StringHash last3
    StringHash shape
    StringHash norm

    size_t length         # NOTE(review): presumably the string length — unit (bytes/chars) not shown here
    unsigned char first   # NOTE(review): presumably the first character of the string — confirm
    Bits8 flags           # 8-bit flag field; bit meanings are not defined in this file
# Distributional data attached to a Lexeme through its `dist` pointer.
# Field order is part of the C layout; do not reorder.
cdef struct Distribution:
    double prob         # NOTE(review): probability or log-probability? not determinable from this file
    ClusterID cluster   # cluster identifier (see ClusterID typedef)
    Bits64 tagdict      # 64-bit flag field; NOTE(review): presumably one bit per tag — confirm
    Bits8 flags         # 8-bit flag field; bit meanings are not defined in this file
# A lexeme record. Strings are held only as hashes; the heavier data sits
# behind the `dist` and `orth` pointers. Field order matters: BLANK_WORD is
# declared from these same five fields, and the C layout depends on it.
cdef struct Lexeme:
    StringHash sic       # Hash of the original string
    StringHash lex       # Hash of the word, with punctuation and clitics split off

    Distribution* dist   # Distribution info, lazy loaded
    Orthography* orth    # Extra orthographic views
    Lexeme* tail         # Lexemes are linked lists, to deal with sub-tokens
# Lexeme value with both hashes zero and every pointer NULL.
cdef Lexeme BLANK_WORD = Lexeme(sic=0, lex=0, dist=NULL, orth=NULL, tail=NULL)
2014-07-07 22:27:02 +04:00
# Selector passed to attr_of() to choose which attribute to return.
# Each enumerator has a matching `<name>_of` accessor declared in this file.
# Values are assigned implicitly by declaration order, so append new members
# at the end and do not reorder.
cdef enum StringAttr:
    SIC
    LEX
    NORM
    SHAPE
    LAST3
    LENGTH
2014-07-07 22:27:02 +04:00
# Attribute accessors. Implementations live in the matching .pyx, and these
# signatures must stay identical to the definitions there.
#
# `lex_id` is a size_t handle.
# NOTE(review): presumably a Lexeme address (cf. the Lexeme_addr typedef) — confirm.
#
# `except 0` declares 0 as the error return value: a result of 0 tells Cython
# callers that a Python exception was raised.

# Generic accessor: `attr` selects the attribute (see StringAttr).
cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0

# One accessor per StringAttr member.
cpdef StringHash sic_of(size_t lex_id) except 0
cpdef StringHash lex_of(size_t lex_id) except 0
cpdef StringHash norm_of(size_t lex_id) except 0
cpdef StringHash shape_of(size_t lex_id) except 0
cpdef StringHash last3_of(size_t lex_id) except 0

# NOTE(review): unlike its siblings, length_of declares no exception value, so
# exceptions raised inside it cannot propagate to C-level callers. Confirm this
# is intentional (e.g. because 0 must remain a valid result). It also returns
# StringHash although Orthography.length is a size_t — confirm the narrowing
# is intended.
cpdef StringHash length_of(size_t lex_id)