2014-09-15 05:22:40 +04:00
|
|
|
from cpython.ref cimport Py_INCREF
|
2014-09-18 01:09:24 +04:00
|
|
|
from cymem.cymem cimport Pool
|
2014-09-15 05:22:40 +04:00
|
|
|
|
2014-10-09 12:53:30 +04:00
|
|
|
import orth
|
2014-09-10 22:41:37 +04:00
|
|
|
|
2014-10-09 12:53:30 +04:00
|
|
|
# Value that get_lexeme_dict stores under the 'dist_flags' key.
# NOTE(review): "OOV" presumably stands for out-of-vocabulary -- confirm.
OOV_DIST_FLAGS = 0
|
2014-09-10 22:41:37 +04:00
|
|
|
|
|
|
|
|
2014-10-10 01:11:31 +04:00
|
|
|
cpdef dict get_lexeme_dict(size_t i, unicode string):
    """Build the default property dict for the lexeme `string` with ID `i`.

    The result has five keys:

    * 'ints': list of LexInt_N entries; the ID, the length of `string`, and
      zeroes for cluster, pos and supersense.
    * 'floats': list of LexFloat_N entries; prob and sentiment are 0.
    * 'strings': list of LexStr_N entries; the original string (also used
      for casefix), its word shape (also used for unsparse) and its asciied
      form.
    * 'orth_flags': the result of get_orth_flags(string).
    * 'dist_flags': OOV_DIST_FLAGS.

    Any list slot that is not assigned below is left as None.
    """
    int_fields = [None] * <int>LexInt_N
    int_fields[<int>LexInt_i] = i
    int_fields[<int>LexInt_length] = len(string)
    int_fields[<int>LexInt_cluster] = 0
    int_fields[<int>LexInt_pos] = 0
    int_fields[<int>LexInt_supersense] = 0

    float_fields = [None] * <int>LexFloat_N
    float_fields[<int>LexFloat_prob] = 0
    float_fields[<int>LexFloat_sentiment] = 0

    string_fields = [None] * <int>LexStr_N
    string_fields[<int>LexStr_orig] = string
    # casefix starts out identical to the original form.
    string_fields[<int>LexStr_casefix] = string_fields[<int>LexStr_orig]
    string_fields[<int>LexStr_shape] = orth.word_shape(string)
    # unsparse starts out identical to the word shape.
    string_fields[<int>LexStr_unsparse] = string_fields[<int>LexStr_shape]
    string_fields[<int>LexStr_asciied] = orth.asciied(string)

    props = {}
    props['ints'] = int_fields
    props['floats'] = float_fields
    props['strings'] = string_fields
    props['orth_flags'] = get_orth_flags(string)
    props['dist_flags'] = OOV_DIST_FLAGS
    return props
|
|
|
|
|
|
|
|
def get_orth_flags(unicode string):
    """Return the orthographic flags of `string` packed into one integer.

    Each `orth.is_*` predicate is evaluated on `string` and its result is
    shifted to the bit position named by the matching `LexOrth_*` constant;
    the shifted results are OR-ed together.
    """
    cdef flag_t flags = (
        (orth.is_ascii(string) << LexOrth_ascii)
        | (orth.is_alpha(string) << LexOrth_alpha)
        | (orth.is_digit(string) << LexOrth_digit)
        | (orth.is_lower(string) << LexOrth_lower)
        | (orth.is_punct(string) << LexOrth_punct)
        | (orth.is_space(string) << LexOrth_space)
        | (orth.is_title(string) << LexOrth_title)
        | (orth.is_upper(string) << LexOrth_upper)
    )
    return flags
|
2014-10-09 12:53:30 +04:00
|
|
|
|
2014-10-10 12:17:22 +04:00
|
|
|
|
2014-10-09 12:53:30 +04:00
|
|
|
def get_dist_flags(unicode string):
    """Return the distributional flags for `string`.

    Currently a stub: the argument is ignored and the result is always 0.
    """
    return 0
|
|
|
|
|
|
|
|
|
|
|
|
cdef char* intern_and_encode(unicode string, size_t* length) except NULL:
    """Encode `string` as UTF-8, intern the bytes, and return their buffer.

    The number of encoded bytes is written to `length[0]`. The returned
    pointer refers to the internal buffer of the interned bytes object.
    """
    cdef bytes encoded = string.encode('utf8')
    cdef bytes interned = intern(encoded)
    # Take an extra reference that this function never releases.
    # NOTE(review): presumably this keeps the buffer behind the returned
    # pointer alive after the local variables go away -- confirm.
    Py_INCREF(interned)
    length[0] = len(interned)
    return <char*>interned
|
|
|
|
|
|
|
|
|
2014-10-09 12:53:30 +04:00
|
|
|
cdef int lexeme_get_int(LexemeC* lexeme, size_t i) except *:
    """Return entry `i` of the lexeme's `ints` array (no bounds check)."""
    cdef int value = lexeme.ints[i]
    return value
|
2014-09-10 22:41:37 +04:00
|
|
|
|
|
|
|
|
2014-10-09 12:53:30 +04:00
|
|
|
cdef float lexeme_get_float(LexemeC* lexeme, size_t i) except *:
    """Return entry `i` of the lexeme's `floats` array (no bounds check)."""
    cdef float value = lexeme.floats[i]
    return value
|
|
|
|
|
|
|
|
|
|
|
|
cdef unicode lexeme_get_string(LexemeC* lexeme, size_t i):
    """Return entry `i` of the lexeme's `strings` array decoded from UTF-8."""
    cdef bytes encoded = lexeme.strings[i]
    cdef unicode decoded = encoded.decode('utf8')
    return decoded
|
2014-10-09 07:10:46 +04:00
|
|
|
|
|
|
|
|
2014-10-09 12:53:30 +04:00
|
|
|
cdef bint lexeme_check_orth_flag(LexemeC* lexeme, size_t flag_id) except *:
    """Return whether bit `flag_id` is set in `lexeme.orth_flags`."""
    # Shift a flag_t-typed one instead of the bare literal: a literal `1` is
    # a C int, so `1 << flag_id` is undefined once flag_id reaches the width
    # of int.
    # NOTE(review): assumes orth_flags is declared as flag_t -- confirm
    # against the LexemeC definition.
    cdef flag_t one = 1
    # Compare against zero so a high bit is never lost when narrowing to bint.
    return (lexeme.orth_flags & (one << flag_id)) != 0
|
|
|
|
|
|
|
|
|
|
|
|
cdef bint lexeme_check_dist_flag(LexemeC* lexeme, size_t flag_id) except *:
    """Return whether bit `flag_id` is set in `lexeme.dist_flags`."""
    # Shift a flag_t-typed one instead of the bare literal: a literal `1` is
    # a C int, so `1 << flag_id` is undefined once flag_id reaches the width
    # of int.
    # NOTE(review): assumes dist_flags is declared as flag_t -- confirm
    # against the LexemeC definition.
    cdef flag_t one = 1
    # Compare against zero so a high bit is never lost when narrowing to bint.
    return (lexeme.dist_flags & (one << flag_id)) != 0
|
|
|
|
|
|
|
|
|
|
|
|
cdef dict lexeme_pack(LexemeC* lex):
    """Copy the fields of `lex` into a plain Python dict.

    The result has the keys 'ints', 'floats' and 'strings' (lists holding
    the first LexInt_N, LexFloat_N and LexStr_N array entries, with strings
    decoded from UTF-8) plus 'orth_flags' and 'dist_flags'. These are the
    same keys that `lexeme_unpack` reads.
    """
    cdef dict packed = {}
    packed['ints'] = [lex.ints[i] for i in range(LexInt_N)]
    packed['floats'] = [lex.floats[i] for i in range(LexFloat_N)]
    packed['strings'] = [lex.strings[i].decode('utf8') for i in range(LexStr_N)]
    packed['orth_flags'] = lex.orth_flags
    # Fixed: this used to store lex.orth_flags under 'dist_flags'.
    packed['dist_flags'] = lex.dist_flags
    return packed
|
|
|
|
|
|
|
|
|
|
|
|
cdef int lexeme_unpack(LexemeC* lex, dict p) except -1:
    """Fill `lex` from a dict with the keys written by `lexeme_pack`.

    `p['ints']` and `p['floats']` are copied element by element into
    `lex.ints` and `lex.floats`; each entry of `p['strings']` is passed
    through `intern_and_encode` and the resulting pointer is stored in
    `lex.strings`; `p['orth_flags']` and `p['dist_flags']` are copied as-is.

    A missing key raises KeyError.
    """
    cdef size_t i
    # Out-parameter required by intern_and_encode; the length is not used.
    cdef size_t byte_length
    cdef int lex_int
    cdef float lex_float
    # NOTE(review): the list lengths are not checked against the capacity of
    # the destination arrays; callers must supply lists of the right size.
    for i, lex_int in enumerate(p['ints']):
        lex.ints[i] = lex_int
    for i, lex_float in enumerate(p['floats']):
        lex.floats[i] = lex_float
    for i, lex_string in enumerate(p['strings']):
        lex.strings[i] = intern_and_encode(lex_string, &byte_length)
    lex.orth_flags = p['orth_flags']
    lex.dist_flags = p['dist_flags']
|