2014-08-03 00:51:52 +04:00
|
|
|
from libc.stdint cimport uint32_t
|
2014-07-05 22:51:42 +04:00
|
|
|
from libc.stdint cimport uint64_t
|
|
|
|
|
2014-07-07 06:21:06 +04:00
|
|
|
# Put these above import to avoid circular import problem
|
2014-07-05 22:51:42 +04:00
|
|
|
# Plain C int alias. In this file it is the type of Distribution.cluster and
# the return type of cluster_of.
ctypedef int ClusterID
|
2014-08-03 00:51:52 +04:00
|
|
|
# 32-bit unsigned alias. In this file it is the type of Lexeme.lex and of the
# Orthography fields shape, norm and last3, and the return type of the
# attr_of / lex_of / norm_of / shape_of / last3_of accessors.
ctypedef uint32_t StringHash
|
2014-07-07 06:21:06 +04:00
|
|
|
# size_t alias. NOTE(review): the name suggests it carries the address of a
# Lexeme struct as an integer -- confirm; the accessor declarations in this
# file take a plain `size_t` rather than this alias.
ctypedef size_t Lexeme_addr
|
2014-07-07 18:58:48 +04:00
|
|
|
# One-byte alias used for the `flags` field of both Orthography and
# Distribution.
# NOTE(review): plain `char` has platform-dependent signedness; if the eighth
# flag of OrthFlag/DistFlag is stored in the top bit, an unsigned type may be
# safer -- confirm against the code that sets and tests these bits.
ctypedef char Bits8
|
|
|
|
# 64-bit unsigned alias. In this file it is the type of Distribution.tagdict.
ctypedef uint64_t Bits64
|
|
|
|
|
2014-07-05 22:51:42 +04:00
|
|
|
|
2014-08-19 04:40:37 +04:00
|
|
|
# Orthographic flag identifiers. No explicit values are given, so the members
# take consecutive values 0..7 in declaration order; reordering them changes
# their values.
# NOTE(review): presumably each value is a bit position inside
# Orthography.flags (a Bits8) and is queried through check_orth_flag --
# confirm against the implementation.
cdef enum OrthFlag:
    IS_ALPHA
    IS_DIGIT
    IS_PUNCT
    IS_WHITE
    IS_LOWER
    IS_UPPER
    IS_TITLE
    IS_ASCII
|
|
|
|
|
|
|
|
|
|
|
|
# Distributional flag identifiers. No explicit values are given, so the
# members take consecutive values 0..7 in declaration order; reordering them
# changes their values. DIST_FLAG3..DIST_FLAG8 carry only generic names here.
# NOTE(review): presumably each value is a bit position inside
# Distribution.flags (a Bits8) and is queried through check_dist_flag --
# confirm against the implementation.
cdef enum DistFlag:
    OFT_UPPER
    OFT_TITLE
    DIST_FLAG3
    DIST_FLAG4
    DIST_FLAG5
    DIST_FLAG6
    DIST_FLAG7
    DIST_FLAG8
|
|
|
|
|
|
|
|
|
2014-07-07 18:58:48 +04:00
|
|
|
# Orthographic views of a lexeme; embedded in Lexeme as the `orth` field.
# Field order matters: BLANK_WORD initialises this struct positionally.
cdef struct Orthography:
    StringHash shape
    StringHash norm
    StringHash last3
    # One byte of flags; presumably indexed by OrthFlag -- TODO confirm.
    Bits8 flags
|
|
|
|
|
|
|
|
|
|
|
|
# Distributional information for a lexeme; embedded in Lexeme as the `dist`
# field. Field order matters: BLANK_WORD initialises this struct positionally.
cdef struct Distribution:
    double prob
    ClusterID cluster
    Bits64 tagdict
    # One byte of flags; presumably indexed by DistFlag -- TODO confirm.
    Bits8 flags
|
|
|
|
|
|
|
|
|
2014-07-05 22:51:42 +04:00
|
|
|
# Record for one lexical entry: a C string with its length, a StringHash for
# the entry itself, and the nested orthographic and distributional structs.
# Field order matters: BLANK_WORD initialises this struct positionally.
cdef struct Lexeme:
    char* string
    size_t length
    StringHash lex
    Orthography orth  # Extra orthographic views
    Distribution dist  # Distribution info
|
2014-07-05 22:51:42 +04:00
|
|
|
|
|
|
|
|
2014-08-19 06:21:20 +04:00
|
|
|
# All-zero Lexeme value: NULL string, length 0, lex hash 0, and zeroed
# Orthography and Distribution members. Arguments are positional and follow
# the field order of the three struct declarations.
cdef Lexeme BLANK_WORD = Lexeme(
    NULL, 0, 0,
    Orthography(0, 0, 0, 0),
    Distribution(0.0, 0, 0, 0)
)
|
2014-07-07 06:21:06 +04:00
|
|
|
|
2014-07-07 18:58:48 +04:00
|
|
|
|
2014-07-07 22:27:02 +04:00
|
|
|
# Selector passed as the `attr` argument of attr_of. No explicit values are
# given, so the members take consecutive values 0..4 in declaration order;
# reordering them changes their values.
cdef enum StringAttr:
    LEX
    NORM
    SHAPE
    LAST3
    LENGTH
|
2014-07-07 22:27:02 +04:00
|
|
|
|
|
|
|
|
|
|
|
# Declaration only; no body is visible here. Takes a lexeme id and a
# StringAttr selector and returns a StringHash -- presumably the attribute of
# that lexeme chosen by `attr` (TODO confirm against the implementation).
# `except 0` makes a return value of 0 the signal that an exception was raised.
# NOTE(review): StringAttr includes LENGTH, and BLANK_WORD has length 0 and
# zero hashes; a genuine 0 result would collide with the error value --
# confirm this cannot happen, or consider `except? 0`.
cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0
|
|
|
|
|
2014-07-07 18:58:48 +04:00
|
|
|
# Declaration only; no body is visible here. Takes a lexeme id and returns a
# StringHash -- presumably Lexeme.lex (TODO confirm).
# `except 0` makes a return value of 0 the signal that an exception was raised.
# NOTE(review): BLANK_WORD is initialised with lex == 0, which coincides with
# the error value -- confirm callers never pass it.
cpdef StringHash lex_of(size_t lex_id) except 0
|
|
|
|
# Declaration only; no body is visible here. Takes a lexeme id and returns a
# StringHash -- presumably Orthography.norm (TODO confirm).
# `except 0` makes a return value of 0 the signal that an exception was raised.
# NOTE(review): BLANK_WORD is initialised with norm == 0, which coincides with
# the error value -- confirm callers never pass it.
cpdef StringHash norm_of(size_t lex_id) except 0
|
2014-07-07 21:12:19 +04:00
|
|
|
# Declaration only; no body is visible here. Takes a lexeme id and returns a
# StringHash -- presumably Orthography.shape (TODO confirm).
# `except 0` makes a return value of 0 the signal that an exception was raised.
# NOTE(review): BLANK_WORD is initialised with shape == 0, which coincides
# with the error value -- confirm callers never pass it.
cpdef StringHash shape_of(size_t lex_id) except 0
|
2014-07-07 22:27:02 +04:00
|
|
|
# Declaration only; no body is visible here. Takes a lexeme id and returns a
# StringHash -- presumably Orthography.last3 (TODO confirm).
# `except 0` makes a return value of 0 the signal that an exception was raised.
# NOTE(review): BLANK_WORD is initialised with last3 == 0, which coincides
# with the error value -- confirm callers never pass it.
cpdef StringHash last3_of(size_t lex_id) except 0
|
2014-08-19 04:40:37 +04:00
|
|
|
|
|
|
|
# Declaration only; no body is visible here. Takes a lexeme id and returns a
# size_t -- presumably Lexeme.length (TODO confirm).
# `except *` reserves no return value as an error signal; the caller checks
# for a pending exception after every call, so a result of 0 is legitimate.
cpdef size_t length_of(size_t lex_id) except *
|
|
|
|
|
|
|
|
# Declaration only; no body is visible here. Takes a lexeme id and returns a
# double -- presumably Distribution.prob (TODO confirm).
# `except 0` makes a return value of 0 the signal that an exception was raised.
# NOTE(review): BLANK_WORD is initialised with prob == 0.0, which coincides
# with the error value -- confirm callers never pass it.
cpdef double prob_of(size_t lex_id) except 0
|
|
|
|
# Declaration only; no body is visible here. Takes a lexeme id and returns a
# ClusterID -- presumably Distribution.cluster (TODO confirm).
# `except 0` makes a return value of 0 the signal that an exception was raised.
# NOTE(review): BLANK_WORD is initialised with cluster == 0, which coincides
# with the error value -- confirm that 0 is never a valid cluster id.
cpdef ClusterID cluster_of(size_t lex_id) except 0
|
|
|
|
|
|
|
|
# Declaration only; no body is visible here. Takes a lexeme id and an OrthFlag
# and returns a boolean -- presumably whether that flag is set in the lexeme's
# Orthography.flags (TODO confirm).
# `except *` reserves no return value as an error signal; the caller checks
# for a pending exception after every call, so False is a legitimate result.
cpdef bint check_orth_flag(size_t lex, OrthFlag flag) except *
|
|
|
|
# Declaration only; no body is visible here. Takes a lexeme id and a DistFlag
# and returns a boolean -- presumably whether that flag is set in the lexeme's
# Distribution.flags (TODO confirm).
# `except *` reserves no return value as an error signal; the caller checks
# for a pending exception after every call, so False is a legitimate result.
cpdef bint check_dist_flag(size_t lex, DistFlag flag) except *
|