2014-12-03 03:04:00 +03:00
|
|
|
from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t
|
2014-09-10 22:41:37 +04:00
|
|
|
|
2014-10-23 17:59:17 +04:00
|
|
|
from .utf8string cimport StringStore
|
2014-10-09 12:53:30 +04:00
|
|
|
|
|
|
|
|
2014-12-03 03:04:00 +03:00
|
|
|
# Attribute identifiers accepted by get_attr().
#
# The first 64 members (FLAG0..FLAG63) are reserved for boolean flag
# features; get_attr() treats any id below sizeof(flags_t) * 8 as a bit index
# into Lexeme.flags.  The members after FLAG63 name non-flag attributes.
#
# Member order defines the numeric values, so new members must only ever be
# appended after the flag range, never inserted into it.
cpdef enum attr_id_t:
    # --- flag range: bit positions within Lexeme.flags ---
    FLAG0
    FLAG1
    FLAG2
    FLAG3
    FLAG4
    FLAG5
    FLAG6
    FLAG7
    FLAG8
    FLAG9
    FLAG10
    FLAG11
    FLAG12
    FLAG13
    FLAG14
    FLAG15
    FLAG16
    FLAG17
    FLAG18
    FLAG19
    FLAG20
    FLAG21
    FLAG22
    FLAG23
    FLAG24
    FLAG25
    FLAG26
    FLAG27
    FLAG28
    FLAG29
    FLAG30
    FLAG31
    FLAG32
    FLAG33
    FLAG34
    FLAG35
    FLAG36
    FLAG37
    FLAG38
    FLAG39
    FLAG40
    FLAG41
    FLAG42
    FLAG43
    FLAG44
    FLAG45
    FLAG46
    FLAG47
    FLAG48
    FLAG49
    FLAG50
    FLAG51
    FLAG52
    FLAG53
    FLAG54
    FLAG55
    FLAG56
    FLAG57
    FLAG58
    FLAG59
    FLAG60
    FLAG61
    FLAG62
    FLAG63

    # --- non-flag attributes ---
    ID
    SIC
    DENSE
    SHAPE
    PREFIX
    SUFFIX
    LENGTH
    CLUSTER
    POS_TYPE
    # NOTE(review): LEMMA has no matching Lexeme field and no branch in
    # get_attr(), which therefore returns 0 for it.
    LEMMA
|
2014-11-02 16:13:51 +03:00
|
|
|
|
2014-10-09 12:53:30 +04:00
|
|
|
|
2014-10-23 17:59:17 +04:00
|
|
|
# Plain C record holding the per-lexeme values read by check_flag() and
# get_attr().  Field order is part of the struct layout; do not reorder.
cdef struct Lexeme:
    # Bit set of boolean features; bit i is tested by check_flag() for flag
    # id i (FLAG0..FLAG63 in attr_id_t).
    flags_t flags

    # Attribute slots.  get_attr() returns each of these for the attr_id_t
    # member with the same name in upper case (ID, SIC, DENSE, ...).
    attr_t id
    attr_t sic
    attr_t dense
    attr_t shape
    attr_t prefix
    attr_t suffix
    attr_t length
    attr_t cluster
    attr_t pos_type

    # Float-valued fields; these are not exposed through get_attr().
    float prob
    float sentiment
|
2014-10-30 07:42:15 +03:00
|
|
|
|
2014-10-22 18:57:59 +04:00
|
|
|
|
2014-10-23 17:59:17 +04:00
|
|
|
# Module-level Lexeme value shared by cimporting modules.
# NOTE(review): presumably an all-zero sentinel for "no lexeme"; its
# initialisation is not visible in this file -- confirm in the implementation.
cdef Lexeme EMPTY_LEXEME
|
2014-09-10 22:41:37 +04:00
|
|
|
|
2014-12-03 03:04:00 +03:00
|
|
|
|
|
|
|
# Declaration only: builds and returns a Lexeme by value from the given id,
# string, hash, string store and property dict.  The body lives in the
# implementation module.
# `except *` makes callers check for a pending Python exception after every
# call, since a struct return has no spare value to signal an error.
cpdef Lexeme init(
    id_t i,
    unicode string,
    hash_t hashed,
    StringStore store,
    dict props
) except *
|
2014-10-29 15:19:38 +03:00
|
|
|
|
2014-10-09 07:10:46 +04:00
|
|
|
|
2014-12-03 07:44:25 +03:00
|
|
|
# Return whether the boolean feature `flag_id` is set on `lexeme`.
#
# lexeme:  record whose `flags` bit set is inspected (must not be NULL).
# flag_id: bit position to test; expected to be one of FLAG0..FLAG63.
#
# Returns 1 if the bit is set, 0 otherwise.
cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil:
    # Shift a flags_t-typed one rather than the literal `1`: a bare literal is
    # a C `int`, so shifting it by 32 or more (flag ids go up to 63) is
    # undefined behaviour and would never reach the upper bits of `flags`.
    cdef flags_t one = 1
    # Compare against zero explicitly so the wide mask result is not narrowed
    # when converted to bint (which could drop a set high bit).
    return (lexeme.flags & (one << flag_id)) != 0
|
2014-12-03 03:04:00 +03:00
|
|
|
|
|
|
|
|
2014-12-04 12:46:20 +03:00
|
|
|
# Look up one attribute of `lex` by its attr_id_t identifier.
#
# lex:       record to read from (must not be NULL).
# feat_name: which attribute to fetch.
#
# Ids below the bit width of flags_t are treated as flag indices and yield the
# result of check_flag() (0 or 1).  The named ids return the matching Lexeme
# field.  Any other id -- including LEMMA, which has no Lexeme field -- gives 0.
cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
    # Flag range first: every id that fits in the flag bit set is a flag.
    if feat_name < (sizeof(flags_t) * 8):
        return check_flag(lex, feat_name)

    # Non-flag attributes, one guard per field.
    if feat_name == ID:
        return lex.id
    if feat_name == SIC:
        return lex.sic
    if feat_name == DENSE:
        return lex.dense
    if feat_name == SHAPE:
        return lex.shape
    if feat_name == PREFIX:
        return lex.prefix
    if feat_name == SUFFIX:
        return lex.suffix
    if feat_name == LENGTH:
        return lex.length
    if feat_name == CLUSTER:
        return lex.cluster
    if feat_name == POS_TYPE:
        return lex.pos_type

    # Unknown or unsupported attribute id.
    return 0
|