mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 09:56:28 +03:00
* Revising data model of lexeme. Compiles.
This commit is contained in:
parent
e40caae51f
commit
1b0e01d3d8
|
@ -2,24 +2,86 @@ from .typedefs cimport hash_t, utf8_t, flag_t, id_t
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
|
|
||||||
|
cpdef flag_t OOV_DIST_FLAGS
|
||||||
|
|
||||||
|
|
||||||
|
cpdef enum LexInts:
|
||||||
|
LexInt_i
|
||||||
|
LexInt_length
|
||||||
|
LexInt_cluster
|
||||||
|
LexInt_pos
|
||||||
|
LexInt_supersense
|
||||||
|
LexInt_N
|
||||||
|
|
||||||
|
|
||||||
|
cpdef enum LexFloats:
|
||||||
|
LexFloat_prob
|
||||||
|
LexFloat_sentiment
|
||||||
|
LexFloat_N
|
||||||
|
|
||||||
|
|
||||||
|
cpdef enum LexStrs:
|
||||||
|
LexStr_key
|
||||||
|
LexStr_casefix
|
||||||
|
LexStr_shape
|
||||||
|
LexStr_unsparse
|
||||||
|
LexStr_asciied
|
||||||
|
LexStr_N
|
||||||
|
|
||||||
|
|
||||||
|
cpdef enum LexOrthFlags:
|
||||||
|
LexOrth_alpha
|
||||||
|
LexOrth_ascii
|
||||||
|
LexOrth_digit
|
||||||
|
LexOrth_lower
|
||||||
|
LexOrth_punct
|
||||||
|
LexOrth_space
|
||||||
|
LexOrth_title
|
||||||
|
LexOrth_upper
|
||||||
|
LexOrth_N
|
||||||
|
|
||||||
|
|
||||||
|
cpdef enum LexDistFlags:
|
||||||
|
LexDist_adj
|
||||||
|
LexDist_adp
|
||||||
|
LexDist_adv
|
||||||
|
LexDist_conj
|
||||||
|
LexDist_det
|
||||||
|
LexDist_noun
|
||||||
|
LexDist_num
|
||||||
|
LexDist_pdt
|
||||||
|
LexDist_pos
|
||||||
|
LexDist_pron
|
||||||
|
LexDist_prt
|
||||||
|
LexDist_punct
|
||||||
|
LexDist_verb
|
||||||
|
|
||||||
|
LexDist_lower
|
||||||
|
LexDist_title
|
||||||
|
LexDist_upper
|
||||||
|
|
||||||
|
LexDist_N
|
||||||
|
|
||||||
|
|
||||||
cdef struct LexemeC:
|
cdef struct LexemeC:
|
||||||
size_t i
|
int[<int>LexInt_N] ints
|
||||||
size_t length
|
float[<int>LexFloat_N] floats
|
||||||
double prob
|
utf8_t[<int>LexStr_N] strings
|
||||||
size_t cluster
|
flag_t orth_flags
|
||||||
|
flag_t dist_flags
|
||||||
char* string
|
|
||||||
|
|
||||||
char** views
|
|
||||||
flag_t flags
|
|
||||||
|
|
||||||
|
|
||||||
cdef LexemeC* lexeme_init(Pool mem, size_t i, unicode string, double prob, size_t cluster,
|
cdef char* intern_and_encode(unicode string, size_t* length) except NULL
|
||||||
list views, set flags)
|
|
||||||
|
|
||||||
cdef bint lexeme_check_flag(LexemeC* lexeme, size_t flag_id)
|
cdef int lexeme_get_int(LexemeC* lexeme, size_t i) except *
|
||||||
cdef unicode lexeme_string_view(LexemeC* lexeme, size_t view_id)
|
|
||||||
|
|
||||||
|
cdef float lexeme_get_float(LexemeC* lexeme, size_t i) except *
|
||||||
|
|
||||||
|
cdef unicode lexeme_get_string(LexemeC* lexeme, size_t i)
|
||||||
|
|
||||||
|
cdef bint lexeme_check_orth_flag(LexemeC* lexeme, size_t flag_id) except *
|
||||||
|
|
||||||
|
cdef bint lexeme_check_dist_flag(LexemeC* lexeme, size_t flag_id) except *
|
||||||
|
|
||||||
cdef dict lexeme_pack(LexemeC* lexeme)
|
cdef dict lexeme_pack(LexemeC* lexeme)
|
||||||
cdef int lexeme_unpack(LexemeC* lexeme, dict p) except -1
|
cdef int lexeme_unpack(LexemeC* lexeme, dict p) except -1
|
||||||
|
|
113
spacy/lexeme.pyx
113
spacy/lexeme.pyx
|
@ -1,25 +1,46 @@
|
||||||
from cpython.ref cimport Py_INCREF
|
from cpython.ref cimport Py_INCREF
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
|
import orth
|
||||||
|
|
||||||
cdef LexemeC* lexeme_init(Pool mem, size_t i, unicode string, double prob,
|
OOV_DIST_FLAGS = 0
|
||||||
size_t cluster, list views, set flags):
|
|
||||||
cdef LexemeC* lexeme = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
|
|
||||||
lexeme.i = i
|
|
||||||
lexeme.cluster = cluster
|
|
||||||
lexeme.prob = prob
|
|
||||||
lexeme.string = intern_and_encode(string, &lexeme.length)
|
|
||||||
lexeme.views = <char**>mem.alloc(len(views), sizeof(char*))
|
|
||||||
cdef size_t length = 0
|
|
||||||
for i, string in enumerate(views):
|
|
||||||
lexeme.views[i] = intern_and_encode(string, &length)
|
|
||||||
|
|
||||||
for active_flag in flags:
|
|
||||||
lexeme.flags |= (1 << active_flag)
|
|
||||||
return lexeme
|
|
||||||
|
|
||||||
|
|
||||||
cdef char* intern_and_encode(unicode string, size_t* length):
|
def get_lexeme_dict(size_t i, unicode string):
|
||||||
|
ints = [None for _ in range(LexInt_N)]
|
||||||
|
ints[<int>LexInt_i] = i
|
||||||
|
ints[<int>LexInt_length] = len(string)
|
||||||
|
ints[<int>LexInt_cluster] = 0
|
||||||
|
ints[<int>LexInt_pos] = 0
|
||||||
|
ints[<int>LexInt_supersense] = 0
|
||||||
|
|
||||||
|
floats = [None for _ in range(LexFloat_N)]
|
||||||
|
floats[<int>LexFloat_prob] = 0
|
||||||
|
floats[<int>LexFloat_sentiment] = 0
|
||||||
|
|
||||||
|
cdef size_t length
|
||||||
|
strings = [None for _ in range(LexStr_N)]
|
||||||
|
strings[<int>LexStr_key] = intern_and_encode(string, &length)
|
||||||
|
strings[<int>LexStr_casefix] = strings[<int>LexStr_key]
|
||||||
|
strings[<int>LexStr_shape] = intern_and_encode(orth.word_shape(string), &length)
|
||||||
|
strings[<int>LexStr_unsparse] = strings[<int>LexStr_shape]
|
||||||
|
strings[<int>LexStr_asciied] = intern_and_encode(orth.asciied(string), &length)
|
||||||
|
|
||||||
|
orth_flags = get_orth_flags(string)
|
||||||
|
dist_flags = OOV_DIST_FLAGS
|
||||||
|
|
||||||
|
return {'ints': ints, 'floats': floats, 'strings': strings,
|
||||||
|
'orth_flags': orth_flags, 'dist_flags': dist_flags}
|
||||||
|
|
||||||
|
def get_orth_flags(unicode string):
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
def get_dist_flags(unicode string):
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
cdef char* intern_and_encode(unicode string, size_t* length) except NULL:
|
||||||
cdef bytes byte_string = string.encode('utf8')
|
cdef bytes byte_string = string.encode('utf8')
|
||||||
cdef bytes utf8_string = intern(byte_string)
|
cdef bytes utf8_string = intern(byte_string)
|
||||||
Py_INCREF(utf8_string)
|
Py_INCREF(utf8_string)
|
||||||
|
@ -27,38 +48,48 @@ cdef char* intern_and_encode(unicode string, size_t* length):
|
||||||
return <char*>utf8_string
|
return <char*>utf8_string
|
||||||
|
|
||||||
|
|
||||||
cdef bint lexeme_check_flag(LexemeC* lexeme, size_t flag_id):
|
cdef int lexeme_get_int(LexemeC* lexeme, size_t i) except *:
|
||||||
return lexeme.flags & (1 << flag_id)
|
return lexeme.ints[i]
|
||||||
|
|
||||||
|
|
||||||
cdef unicode lexeme_string_view(LexemeC* lexeme, size_t view_id):
|
cdef float lexeme_get_float(LexemeC* lexeme, size_t i) except *:
|
||||||
cdef bytes byte_string = lexeme.views[view_id]
|
return lexeme.floats[i]
|
||||||
|
|
||||||
|
|
||||||
|
cdef unicode lexeme_get_string(LexemeC* lexeme, size_t i):
|
||||||
|
cdef bytes byte_string = lexeme.strings[i]
|
||||||
return byte_string.decode('utf8')
|
return byte_string.decode('utf8')
|
||||||
|
|
||||||
|
|
||||||
cdef dict lexeme_pack(LexemeC* lexeme):
|
cdef bint lexeme_check_orth_flag(LexemeC* lexeme, size_t flag_id) except *:
|
||||||
|
return lexeme.orth_flags & (1 << flag_id)
|
||||||
|
|
||||||
|
|
||||||
|
cdef bint lexeme_check_dist_flag(LexemeC* lexeme, size_t flag_id) except *:
|
||||||
|
return lexeme.dist_flags & (1 << flag_id)
|
||||||
|
|
||||||
|
|
||||||
|
cdef dict lexeme_pack(LexemeC* lex):
|
||||||
cdef dict packed = {}
|
cdef dict packed = {}
|
||||||
packed['i'] = lexeme.i
|
packed['ints'] = [lex.ints[i] for i in range(LexInt_N)]
|
||||||
packed['length'] = lexeme.length
|
packed['floats'] = [lex.floats[i] for i in range(LexFloat_N)]
|
||||||
packed['prob'] = lexeme.prob
|
packed['strings'] = [lex.strings[i].decode('utf8') for i in range(LexStr_N)]
|
||||||
packed['cluster'] = lexeme.cluster
|
packed['orth_flags'] = lex.orth_flags
|
||||||
packed['string'] = lexeme.string.decode('utf8')
|
packed['dist_flags'] = lex.orth_flags
|
||||||
packed['views'] = []
|
|
||||||
cdef size_t i = 0
|
|
||||||
while lexeme.views[i] != NULL:
|
|
||||||
packed['views'].append(lexeme.views[i].decode('utf8'))
|
|
||||||
i += 1
|
|
||||||
packed['flags'] = lexeme.flags
|
|
||||||
return packed
|
return packed
|
||||||
|
|
||||||
|
|
||||||
cdef int lexeme_unpack(LexemeC* lex, dict p) except -1:
|
cdef int lexeme_unpack(LexemeC* lex, dict p) except -1:
|
||||||
cdef size_t length
|
cdef size_t i
|
||||||
lex.i = p['i']
|
cdef int lex_int
|
||||||
lex.length = p['length']
|
cdef float lex_float
|
||||||
lex.prob = p['prob']
|
cdef unicode string
|
||||||
lex.cluster = p['cluster']
|
for i, lex_int in enumerate(p['ints']):
|
||||||
lex.string = intern_and_encode(p['string'], &length)
|
lex.ints[i] = lex_int
|
||||||
for i, view in enumerate(p['views']):
|
for i, lex_float in enumerate(p['floats']):
|
||||||
lex.views[i] = intern_and_encode(view, &length)
|
lex.ints[i] = lex_int
|
||||||
lex.flags = p['flags']
|
cdef size_t _
|
||||||
|
for i, lex_string in enumerate(p['strings']):
|
||||||
|
lex.strings[i] = intern_and_encode(lex_string, &_)
|
||||||
|
lex.orth_flags = p['orth_flags']
|
||||||
|
lex.orth_flags = p['orth_flags']
|
||||||
|
|
Loading…
Reference in New Issue
Block a user