* Revising data model of lexeme. Compiles.

This commit is contained in:
Matthew Honnibal 2014-10-09 19:53:30 +11:00
parent e40caae51f
commit 1b0e01d3d8
2 changed files with 147 additions and 54 deletions

View File

@ -2,24 +2,86 @@ from .typedefs cimport hash_t, utf8_t, flag_t, id_t
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
cpdef flag_t OOV_DIST_FLAGS
cpdef enum LexInts:
LexInt_i
LexInt_length
LexInt_cluster
LexInt_pos
LexInt_supersense
LexInt_N
cpdef enum LexFloats:
LexFloat_prob
LexFloat_sentiment
LexFloat_N
cpdef enum LexStrs:
LexStr_key
LexStr_casefix
LexStr_shape
LexStr_unsparse
LexStr_asciied
LexStr_N
cpdef enum LexOrthFlags:
LexOrth_alpha
LexOrth_ascii
LexOrth_digit
LexOrth_lower
LexOrth_punct
LexOrth_space
LexOrth_title
LexOrth_upper
LexOrth_N
cpdef enum LexDistFlags:
LexDist_adj
LexDist_adp
LexDist_adv
LexDist_conj
LexDist_det
LexDist_noun
LexDist_num
LexDist_pdt
LexDist_pos
LexDist_pron
LexDist_prt
LexDist_punct
LexDist_verb
LexDist_lower
LexDist_title
LexDist_upper
LexDist_N
cdef struct LexemeC: cdef struct LexemeC:
size_t i int[<int>LexInt_N] ints
size_t length float[<int>LexFloat_N] floats
double prob utf8_t[<int>LexStr_N] strings
size_t cluster flag_t orth_flags
flag_t dist_flags
char* string
char** views
flag_t flags
cdef LexemeC* lexeme_init(Pool mem, size_t i, unicode string, double prob, size_t cluster, cdef char* intern_and_encode(unicode string, size_t* length) except NULL
list views, set flags)
cdef bint lexeme_check_flag(LexemeC* lexeme, size_t flag_id) cdef int lexeme_get_int(LexemeC* lexeme, size_t i) except *
cdef unicode lexeme_string_view(LexemeC* lexeme, size_t view_id)
cdef float lexeme_get_float(LexemeC* lexeme, size_t i) except *
cdef unicode lexeme_get_string(LexemeC* lexeme, size_t i)
cdef bint lexeme_check_orth_flag(LexemeC* lexeme, size_t flag_id) except *
cdef bint lexeme_check_dist_flag(LexemeC* lexeme, size_t flag_id) except *
cdef dict lexeme_pack(LexemeC* lexeme) cdef dict lexeme_pack(LexemeC* lexeme)
cdef int lexeme_unpack(LexemeC* lexeme, dict p) except -1 cdef int lexeme_unpack(LexemeC* lexeme, dict p) except -1

View File

@ -1,25 +1,46 @@
from cpython.ref cimport Py_INCREF from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
import orth
cdef LexemeC* lexeme_init(Pool mem, size_t i, unicode string, double prob, OOV_DIST_FLAGS = 0
size_t cluster, list views, set flags):
cdef LexemeC* lexeme = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
lexeme.i = i
lexeme.cluster = cluster
lexeme.prob = prob
lexeme.string = intern_and_encode(string, &lexeme.length)
lexeme.views = <char**>mem.alloc(len(views), sizeof(char*))
cdef size_t length = 0
for i, string in enumerate(views):
lexeme.views[i] = intern_and_encode(string, &length)
for active_flag in flags:
lexeme.flags |= (1 << active_flag)
return lexeme
cdef char* intern_and_encode(unicode string, size_t* length): def get_lexeme_dict(size_t i, unicode string):
ints = [None for _ in range(LexInt_N)]
ints[<int>LexInt_i] = i
ints[<int>LexInt_length] = len(string)
ints[<int>LexInt_cluster] = 0
ints[<int>LexInt_pos] = 0
ints[<int>LexInt_supersense] = 0
floats = [None for _ in range(LexFloat_N)]
floats[<int>LexFloat_prob] = 0
floats[<int>LexFloat_sentiment] = 0
cdef size_t length
strings = [None for _ in range(LexStr_N)]
strings[<int>LexStr_key] = intern_and_encode(string, &length)
strings[<int>LexStr_casefix] = strings[<int>LexStr_key]
strings[<int>LexStr_shape] = intern_and_encode(orth.word_shape(string), &length)
strings[<int>LexStr_unsparse] = strings[<int>LexStr_shape]
strings[<int>LexStr_asciied] = intern_and_encode(orth.asciied(string), &length)
orth_flags = get_orth_flags(string)
dist_flags = OOV_DIST_FLAGS
return {'ints': ints, 'floats': floats, 'strings': strings,
'orth_flags': orth_flags, 'dist_flags': dist_flags}
def get_orth_flags(unicode string):
return 0
def get_dist_flags(unicode string):
return 0
cdef char* intern_and_encode(unicode string, size_t* length) except NULL:
cdef bytes byte_string = string.encode('utf8') cdef bytes byte_string = string.encode('utf8')
cdef bytes utf8_string = intern(byte_string) cdef bytes utf8_string = intern(byte_string)
Py_INCREF(utf8_string) Py_INCREF(utf8_string)
@ -27,38 +48,48 @@ cdef char* intern_and_encode(unicode string, size_t* length):
return <char*>utf8_string return <char*>utf8_string
cdef bint lexeme_check_flag(LexemeC* lexeme, size_t flag_id): cdef int lexeme_get_int(LexemeC* lexeme, size_t i) except *:
return lexeme.flags & (1 << flag_id) return lexeme.ints[i]
cdef unicode lexeme_string_view(LexemeC* lexeme, size_t view_id): cdef float lexeme_get_float(LexemeC* lexeme, size_t i) except *:
cdef bytes byte_string = lexeme.views[view_id] return lexeme.floats[i]
cdef unicode lexeme_get_string(LexemeC* lexeme, size_t i):
cdef bytes byte_string = lexeme.strings[i]
return byte_string.decode('utf8') return byte_string.decode('utf8')
cdef dict lexeme_pack(LexemeC* lexeme): cdef bint lexeme_check_orth_flag(LexemeC* lexeme, size_t flag_id) except *:
return lexeme.orth_flags & (1 << flag_id)
cdef bint lexeme_check_dist_flag(LexemeC* lexeme, size_t flag_id) except *:
return lexeme.dist_flags & (1 << flag_id)
cdef dict lexeme_pack(LexemeC* lex):
cdef dict packed = {} cdef dict packed = {}
packed['i'] = lexeme.i packed['ints'] = [lex.ints[i] for i in range(LexInt_N)]
packed['length'] = lexeme.length packed['floats'] = [lex.floats[i] for i in range(LexFloat_N)]
packed['prob'] = lexeme.prob packed['strings'] = [lex.strings[i].decode('utf8') for i in range(LexStr_N)]
packed['cluster'] = lexeme.cluster packed['orth_flags'] = lex.orth_flags
packed['string'] = lexeme.string.decode('utf8') packed['dist_flags'] = lex.orth_flags
packed['views'] = []
cdef size_t i = 0
while lexeme.views[i] != NULL:
packed['views'].append(lexeme.views[i].decode('utf8'))
i += 1
packed['flags'] = lexeme.flags
return packed return packed
cdef int lexeme_unpack(LexemeC* lex, dict p) except -1: cdef int lexeme_unpack(LexemeC* lex, dict p) except -1:
cdef size_t length cdef size_t i
lex.i = p['i'] cdef int lex_int
lex.length = p['length'] cdef float lex_float
lex.prob = p['prob'] cdef unicode string
lex.cluster = p['cluster'] for i, lex_int in enumerate(p['ints']):
lex.string = intern_and_encode(p['string'], &length) lex.ints[i] = lex_int
for i, view in enumerate(p['views']): for i, lex_float in enumerate(p['floats']):
lex.views[i] = intern_and_encode(view, &length) lex.ints[i] = lex_int
lex.flags = p['flags'] cdef size_t _
for i, lex_string in enumerate(p['strings']):
lex.strings[i] = intern_and_encode(lex_string, &_)
lex.orth_flags = p['orth_flags']
lex.orth_flags = p['orth_flags']