diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index f45c581f2..09d10d0b7 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -2,24 +2,86 @@ from .typedefs cimport hash_t, utf8_t, flag_t, id_t
 from cymem.cymem cimport Pool
 
 
+cpdef flag_t OOV_DIST_FLAGS
+
+
+cpdef enum LexInts:
+    LexInt_i
+    LexInt_length
+    LexInt_cluster
+    LexInt_pos
+    LexInt_supersense
+    LexInt_N
+
+
+cpdef enum LexFloats:
+    LexFloat_prob
+    LexFloat_sentiment
+    LexFloat_N
+
+
+cpdef enum LexStrs:
+    LexStr_key
+    LexStr_casefix
+    LexStr_shape
+    LexStr_unsparse
+    LexStr_asciied
+    LexStr_N
+
+
+cpdef enum LexOrthFlags:
+    LexOrth_alpha
+    LexOrth_ascii
+    LexOrth_digit
+    LexOrth_lower
+    LexOrth_punct
+    LexOrth_space
+    LexOrth_title
+    LexOrth_upper
+    LexOrth_N
+
+
+cpdef enum LexDistFlags:
+    LexDist_adj
+    LexDist_adp
+    LexDist_adv
+    LexDist_conj
+    LexDist_det
+    LexDist_noun
+    LexDist_num
+    LexDist_pdt
+    LexDist_pos
+    LexDist_pron
+    LexDist_prt
+    LexDist_punct
+    LexDist_verb
+
+    LexDist_lower
+    LexDist_title
+    LexDist_upper
+
+    LexDist_N
+
+
 cdef struct LexemeC:
-    size_t i
-    size_t length
-    double prob
-    size_t cluster
-
-    char* string
-
-    char** views
-    flag_t flags
+    int[LexInt_N] ints
+    float[LexFloat_N] floats
+    utf8_t[LexStr_N] strings
+    flag_t orth_flags
+    flag_t dist_flags
 
 
-cdef LexemeC* lexeme_init(Pool mem, size_t i, unicode string, double prob, size_t cluster,
-                          list views, set flags)
+cdef char* intern_and_encode(unicode string, size_t* length) except NULL
 
-cdef bint lexeme_check_flag(LexemeC* lexeme, size_t flag_id)
-cdef unicode lexeme_string_view(LexemeC* lexeme, size_t view_id)
+cdef int lexeme_get_int(LexemeC* lexeme, size_t i) except *
+cdef float lexeme_get_float(LexemeC* lexeme, size_t i) except *
+
+cdef unicode lexeme_get_string(LexemeC* lexeme, size_t i)
+
+cdef bint lexeme_check_orth_flag(LexemeC* lexeme, size_t flag_id) except *
+
+cdef bint lexeme_check_dist_flag(LexemeC* lexeme, size_t flag_id) except *
 
 
 cdef dict lexeme_pack(LexemeC* lexeme)
 cdef int lexeme_unpack(LexemeC* lexeme, dict p) except -1
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 238a954e7..d09dfb72d 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -1,25 +1,47 @@
 from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool
+import orth
 
 
-cdef LexemeC* lexeme_init(Pool mem, size_t i, unicode string, double prob,
-                          size_t cluster, list views, set flags):
-    cdef LexemeC* lexeme = mem.alloc(1, sizeof(LexemeC))
-    lexeme.i = i
-    lexeme.cluster = cluster
-    lexeme.prob = prob
-    lexeme.string = intern_and_encode(string, &lexeme.length)
-    lexeme.views = mem.alloc(len(views), sizeof(char*))
-    cdef size_t length = 0
-    for i, string in enumerate(views):
-        lexeme.views[i] = intern_and_encode(string, &length)
-
-    for active_flag in flags:
-        lexeme.flags |= (1 << active_flag)
-    return lexeme
+OOV_DIST_FLAGS = 0
 
 
-cdef char* intern_and_encode(unicode string, size_t* length):
+def get_lexeme_dict(size_t i, unicode string):
+    """Build a dict of default lexeme attributes for ``string`` with index ``i``."""
+    ints = [None for _ in range(LexInt_N)]
+    ints[LexInt_i] = i
+    ints[LexInt_length] = len(string)
+    ints[LexInt_cluster] = 0
+    ints[LexInt_pos] = 0
+    ints[LexInt_supersense] = 0
+
+    floats = [None for _ in range(LexFloat_N)]
+    floats[LexFloat_prob] = 0
+    floats[LexFloat_sentiment] = 0
+
+    cdef size_t length
+    strings = [None for _ in range(LexStr_N)]
+    strings[LexStr_key] = intern_and_encode(string, &length)
+    strings[LexStr_casefix] = strings[LexStr_key]
+    strings[LexStr_shape] = intern_and_encode(orth.word_shape(string), &length)
+    strings[LexStr_unsparse] = strings[LexStr_shape]
+    strings[LexStr_asciied] = intern_and_encode(orth.asciied(string), &length)
+
+    orth_flags = get_orth_flags(string)
+    dist_flags = OOV_DIST_FLAGS
+
+    return {'ints': ints, 'floats': floats, 'strings': strings,
+            'orth_flags': orth_flags, 'dist_flags': dist_flags}
+
+def get_orth_flags(unicode string):
+    return 0
+
+
+def get_dist_flags(unicode string):
+    return 0
+
+
+cdef char* intern_and_encode(unicode string, size_t* length) except NULL:
     cdef bytes byte_string = string.encode('utf8')
     cdef bytes utf8_string = intern(byte_string)
     Py_INCREF(utf8_string)
@@ -27,38 +49,50 @@ cdef char* intern_and_encode(unicode string, size_t* length):
     return utf8_string
 
 
-cdef bint lexeme_check_flag(LexemeC* lexeme, size_t flag_id):
-    return lexeme.flags & (1 << flag_id)
+cdef int lexeme_get_int(LexemeC* lexeme, size_t i) except *:
+    return lexeme.ints[i]
 
 
-cdef unicode lexeme_string_view(LexemeC* lexeme, size_t view_id):
-    cdef bytes byte_string = lexeme.views[view_id]
+cdef float lexeme_get_float(LexemeC* lexeme, size_t i) except *:
+    return lexeme.floats[i]
+
+
+cdef unicode lexeme_get_string(LexemeC* lexeme, size_t i):
+    cdef bytes byte_string = lexeme.strings[i]
     return byte_string.decode('utf8')
 
 
-cdef dict lexeme_pack(LexemeC* lexeme):
+cdef bint lexeme_check_orth_flag(LexemeC* lexeme, size_t flag_id) except *:
+    return lexeme.orth_flags & (1 << flag_id)
+
+
+cdef bint lexeme_check_dist_flag(LexemeC* lexeme, size_t flag_id) except *:
+    return lexeme.dist_flags & (1 << flag_id)
+
+
+cdef dict lexeme_pack(LexemeC* lex):
+    """Return a dict holding the lexeme's ints, floats, strings and both flag fields."""
     cdef dict packed = {}
-    packed['i'] = lexeme.i
-    packed['length'] = lexeme.length
-    packed['prob'] = lexeme.prob
-    packed['cluster'] = lexeme.cluster
-    packed['string'] = lexeme.string.decode('utf8')
-    packed['views'] = []
-    cdef size_t i = 0
-    while lexeme.views[i] != NULL:
-        packed['views'].append(lexeme.views[i].decode('utf8'))
-        i += 1
-    packed['flags'] = lexeme.flags
+    packed['ints'] = [lex.ints[i] for i in range(LexInt_N)]
+    packed['floats'] = [lex.floats[i] for i in range(LexFloat_N)]
+    packed['strings'] = [lex.strings[i].decode('utf8') for i in range(LexStr_N)]
+    packed['orth_flags'] = lex.orth_flags
+    packed['dist_flags'] = lex.dist_flags
     return packed
 
 
 cdef int lexeme_unpack(LexemeC* lex, dict p) except -1:
-    cdef size_t length
-    lex.i = p['i']
-    lex.length = p['length']
-    lex.prob = p['prob']
-    lex.cluster = p['cluster']
-    lex.string = intern_and_encode(p['string'], &length)
-    for i, view in enumerate(p['views']):
-        lex.views[i] = intern_and_encode(view, &length)
-    lex.flags = p['flags']
+    """Fill ``lex`` from a dict with the keys written by ``lexeme_pack``."""
+    cdef size_t i
+    cdef int lex_int
+    cdef float lex_float
+    cdef unicode lex_string
+    for i, lex_int in enumerate(p['ints']):
+        lex.ints[i] = lex_int
+    for i, lex_float in enumerate(p['floats']):
+        lex.floats[i] = lex_float
+    cdef size_t _
+    for i, lex_string in enumerate(p['strings']):
+        lex.strings[i] = intern_and_encode(lex_string, &_)
+    lex.orth_flags = p['orth_flags']
+    lex.dist_flags = p['dist_flags']