2014-10-30 07:42:15 +03:00
|
|
|
from .typedefs cimport hash_t, utf8_t, flag_t, id_t, len_t, tag_t
|
2014-09-10 22:41:37 +04:00
|
|
|
|
2014-10-23 17:59:17 +04:00
|
|
|
from .utf8string cimport StringStore
|
2014-10-30 07:42:15 +03:00
|
|
|
from libc.stdint cimport uint16_t
|
2014-10-09 12:53:30 +04:00
|
|
|
|
2014-10-23 17:59:17 +04:00
|
|
|
# Module-level flag_t value shared with modules that cimport this file.
# Declared with `cdef` rather than `cpdef`: Cython never generated a Python
# wrapper for `cpdef` variables (the keyword was a no-op there) and newer
# Cython versions reject it, so `cdef` is the equivalent, portable spelling.
# NOTE(review): presumably the flag bits assigned to out-of-vocabulary
# lexemes; the value is set outside this file, so confirm there.
cdef flag_t OOV_DIST_FLAGS
|
2014-10-09 12:53:30 +04:00
|
|
|
|
2014-10-23 17:59:17 +04:00
|
|
|
# Flags
|
|
|
|
# Flag identifiers. Members take consecutive values starting at 0 in the
# order listed, so the order must not change. check_flag() tests bit
# `1 << flag_id` of Lexeme.flags; these members are presumably the flag_id
# values passed to it (confirm against callers).
cpdef enum:
    # IS_*: presumably orthographic properties of the string itself.
    IS_ALPHA
    IS_ASCII
    IS_DIGIT
    IS_LOWER
    IS_PUNCT
    IS_SPACE
    IS_TITLE
    IS_UPPER

    # LIKE_*: presumably "looks like" pattern checks.
    LIKE_URL
    LIKE_NUMBER

    # OFT_*: presumably "often seen in this casing" statistics.
    OFT_LOWER
    OFT_TITLE
    OFT_UPPER

    # IN_*: presumably membership in external word lists.
    IN_MALES
    IN_FEMALES
    IN_SURNAMES
    IN_PLACES
    IN_GAMES
    IN_CELEBS
    IN_NAMES
|
|
|
|
|
2014-10-09 12:53:30 +04:00
|
|
|
|
2014-10-23 17:59:17 +04:00
|
|
|
# Plain C record holding the per-word-type attributes. Field order defines
# the memory layout, so keep it stable.
cdef struct Lexeme:
    # Bit set of boolean features; individual bits are read by check_flag().
    flag_t flags

    # id_t-valued attributes.
    # NOTE(review): presumably `id` identifies the lexeme and the rest are
    # StringStore ids of string views of the word; confirm in the code that
    # fills this struct.
    id_t id
    id_t sic
    id_t norm
    id_t shape
    id_t asciied
    id_t prefix
    id_t suffix

    # NOTE(review): presumably a (log-)probability estimate; confirm.
    float prob

    # Length plus tag_t-valued categorical attributes.
    len_t length
    tag_t cluster
    tag_t postype
    tag_t supersense
|
|
|
|
|
2014-10-22 18:57:59 +04:00
|
|
|
|
2014-10-23 17:59:17 +04:00
|
|
|
# Module-level Lexeme value, visible to modules that cimport this file.
# NOTE(review): its initialisation is not visible here; presumably a
# blank/placeholder record -- confirm in the implementation file.
cdef Lexeme EMPTY_LEXEME
|
2014-09-10 22:41:37 +04:00
|
|
|
|
2014-10-31 09:43:00 +03:00
|
|
|
# Prototype of the Lexeme constructor function; the body is defined in the
# implementation file. Returns a Lexeme struct by value.
# `except *` makes Cython check for a pending Python exception after every
# call, because a struct return value leaves no room for an error sentinel.
# NOTE(review): presumably `i` becomes the lexeme id, `hashed` is the hash
# of `string`, `store` interns derived strings and `props` carries extra
# attribute values -- confirm against the implementation.
cpdef Lexeme init(
    id_t i,
    unicode string,
    hash_t hashed,
    StringStore store,
    dict props
) except *
|
|
|
|
|
2014-10-09 07:10:46 +04:00
|
|
|
|
2014-10-23 17:59:17 +04:00
|
|
|
# Return True when bit number `flag_id` is set in `lexeme.flags`.
#
# lexeme:  pointer to the Lexeme to inspect; must not be NULL (not checked).
# flag_id: zero-based bit position; must be smaller than the bit width of
#          flag_t.
#
# Needs no GIL (`nogil`) and touches no Python objects.
cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil:
    # Build the mask in flag_t width. A bare `1` is a C int, so
    # `1 << flag_id` overflows as soon as flag_id reaches the width of int.
    cdef flag_t mask = (<flag_t>1) << flag_id
    # Compare against zero explicitly so that a set high bit cannot be lost
    # when the flag_t-wide result is narrowed to bint.
    return (lexeme.flags & mask) != 0
|