* Revising data model of lexeme. Compiles.

2025-10-30 23:47:31 +03:00 · 2014-10-09 19:53:30 +11:00 · 2014-10-09 19:53:30 +11:00 · 1b0e01d3d8
commit 1b0e01d3d8
parent e40caae51f
2 changed files with 147 additions and 54 deletions
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@ -2,24 +2,86 @@ from .typedefs cimport hash_t, utf8_t, flag_t, id_t
 from cymem.cymem cimport Pool


+cpdef flag_t OOV_DIST_FLAGS
+
+
+# Indices into the LexemeC.ints array. Members are numbered implicitly from
+# zero, so the trailing LexInt_N member equals the number of integer
+# attributes; it is used as the declared length of LexemeC.ints.
+cpdef enum LexInts:
+    LexInt_i
+    LexInt_length
+    LexInt_cluster
+    LexInt_pos
+    LexInt_supersense
+    LexInt_N
+
+
+# Indices into the LexemeC.floats array. The trailing LexFloat_N member is the
+# number of float attributes and is used as the declared array length.
+cpdef enum LexFloats:
+    LexFloat_prob
+    LexFloat_sentiment
+    LexFloat_N
+
+
+# Indices into the LexemeC.strings array. The trailing LexStr_N member is the
+# number of string attributes and is used as the declared array length.
+cpdef enum LexStrs:
+    LexStr_key
+    LexStr_casefix
+    LexStr_shape
+    LexStr_unsparse
+    LexStr_asciied
+    LexStr_N
+
+
+# Flag identifiers, numbered implicitly from zero, with LexOrth_N as the
+# trailing count member.
+# NOTE(review): presumably these are the bit positions tested against
+# LexemeC.orth_flags by lexeme_check_orth_flag -- confirm against callers.
+cpdef enum LexOrthFlags:
+    LexOrth_alpha
+    LexOrth_ascii
+    LexOrth_digit
+    LexOrth_lower
+    LexOrth_punct
+    LexOrth_space
+    LexOrth_title
+    LexOrth_upper
+    LexOrth_N
+
+
+# Flag identifiers, numbered implicitly from zero, with LexDist_N as the
+# trailing count member.
+# NOTE(review): presumably these are the bit positions tested against
+# LexemeC.dist_flags by lexeme_check_dist_flag -- confirm against callers.
+cpdef enum LexDistFlags:
+    LexDist_adj
+    LexDist_adp
+    LexDist_adv
+    LexDist_conj
+    LexDist_det
+    LexDist_noun
+    LexDist_num
+    LexDist_pdt
+    LexDist_pos
+    LexDist_pron
+    LexDist_prt
+    LexDist_punct
+    LexDist_verb
+
+    LexDist_lower
+    LexDist_title
+    LexDist_upper
+
+    LexDist_N
+
+
 cdef struct LexemeC:
-    size_t i
-    size_t length
-    double prob
-    size_t cluster
-
-    char* string
-    
-    char** views
-    flag_t flags
+    # Fixed-size attribute arrays. Each one is sized by the trailing *_N
+    # member of the matching enum (LexInts, LexFloats, LexStrs), whose other
+    # members serve as the array indices.
+    int[<int>LexInt_N] ints
+    float[<int>LexFloat_N] floats
+    utf8_t[<int>LexStr_N] strings
+    # Bit sets queried through lexeme_check_orth_flag and
+    # lexeme_check_dist_flag respectively.
+    flag_t orth_flags
+    flag_t dist_flags


-cdef LexemeC* lexeme_init(Pool mem, size_t i, unicode string, double prob, size_t cluster,
-                     list views, set flags)
+cdef char* intern_and_encode(unicode string, size_t* length) except NULL

-cdef bint lexeme_check_flag(LexemeC* lexeme, size_t flag_id)
-cdef unicode lexeme_string_view(LexemeC* lexeme, size_t view_id)
+cdef int lexeme_get_int(LexemeC* lexeme, size_t i) except *

+cdef float lexeme_get_float(LexemeC* lexeme, size_t i) except *
+
+cdef unicode lexeme_get_string(LexemeC* lexeme, size_t i)
+
+cdef bint lexeme_check_orth_flag(LexemeC* lexeme, size_t flag_id) except *
+
+cdef bint lexeme_check_dist_flag(LexemeC* lexeme, size_t flag_id) except *

 cdef dict lexeme_pack(LexemeC* lexeme)
 cdef int lexeme_unpack(LexemeC* lexeme, dict p) except -1
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@ -1,25 +1,46 @@
 from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool

+import orth

-cdef LexemeC* lexeme_init(Pool mem, size_t i, unicode string, double prob,
-                          size_t cluster, list views, set flags):
-    cdef LexemeC* lexeme = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
-    lexeme.i = i
-    lexeme.cluster = cluster
-    lexeme.prob = prob
-    lexeme.string = intern_and_encode(string, &lexeme.length)
-    lexeme.views = <char**>mem.alloc(len(views), sizeof(char*))
-    cdef size_t length = 0
-    for i, string in enumerate(views):
-        lexeme.views[i] = intern_and_encode(string, &length)
-
-    for active_flag in flags:
-        lexeme.flags |= (1 << active_flag)
-    return lexeme
+OOV_DIST_FLAGS = 0


-cdef char* intern_and_encode(unicode string, size_t* length):
+def get_lexeme_dict(size_t i, unicode string):
+    """Build the default attribute dict for the lexeme `string` with index `i`.
+
+    Returns a dict with the keys 'ints', 'floats', 'strings', 'orth_flags'
+    and 'dist_flags'. The three list values are laid out by the members of
+    the LexInts, LexFloats and LexStrs enums.
+    """
+    ints = [None for _ in range(LexInt_N)]
+    ints[<int>LexInt_i] = i
+    ints[<int>LexInt_length] = len(string)
+    # The remaining integer attributes start out as zero.
+    ints[<int>LexInt_cluster] = 0
+    ints[<int>LexInt_pos] = 0
+    ints[<int>LexInt_supersense] = 0
+    
+    # Both float attributes start out as zero.
+    floats = [None for _ in range(LexFloat_N)]
+    floats[<int>LexFloat_prob] = 0
+    floats[<int>LexFloat_sentiment] = 0
+
+    # Out-parameter required by intern_and_encode; its value is not read here.
+    cdef size_t length
+    # NOTE(review): intern_and_encode returns char*, so these list entries are
+    # presumably stored as encoded byte strings, while lexeme_unpack passes
+    # each 'strings' entry to intern_and_encode (declared to take unicode) --
+    # confirm that the two agree.
+    strings = [None for _ in range(LexStr_N)]
+    strings[<int>LexStr_key] = intern_and_encode(string, &length)
+    # casefix reuses the key entry unchanged.
+    strings[<int>LexStr_casefix] = strings[<int>LexStr_key]
+    strings[<int>LexStr_shape] = intern_and_encode(orth.word_shape(string), &length)
+    # unsparse reuses the shape entry unchanged.
+    strings[<int>LexStr_unsparse] = strings[<int>LexStr_shape]
+    strings[<int>LexStr_asciied] = intern_and_encode(orth.asciied(string), &length)
+
+    orth_flags = get_orth_flags(string)
+    # dist_flags is taken from the module-level OOV_DIST_FLAGS value.
+    dist_flags = OOV_DIST_FLAGS
+
+    return {'ints': ints, 'floats': floats, 'strings': strings,
+            'orth_flags': orth_flags, 'dist_flags': dist_flags}
+
+def get_orth_flags(unicode string):
+    """Return the 'orth_flags' value for `string`.
+
+    Stub: the argument is ignored and 0 is always returned.
+    """
+    return 0
+
+
+def get_dist_flags(unicode string):
+    """Stub: the argument is ignored and 0 is always returned."""
+    return 0
+
+
+cdef char* intern_and_encode(unicode string, size_t* length) except NULL:
    cdef bytes byte_string = string.encode('utf8')
    cdef bytes utf8_string = intern(byte_string)
    Py_INCREF(utf8_string)
@ -27,38 +48,48 @@ cdef char* intern_and_encode(unicode string, size_t* length):
    return <char*>utf8_string


-cdef bint lexeme_check_flag(LexemeC* lexeme, size_t flag_id):
-    return lexeme.flags & (1 << flag_id)
+cdef int lexeme_get_int(LexemeC* lexeme, size_t i) except *:
+    """Return lexeme.ints[i]. The index is not bounds-checked."""
+    return lexeme.ints[i]


-cdef unicode lexeme_string_view(LexemeC* lexeme, size_t view_id):
-    cdef bytes byte_string = lexeme.views[view_id]
+cdef float lexeme_get_float(LexemeC* lexeme, size_t i) except *:
+    """Return lexeme.floats[i]. The index is not bounds-checked."""
+    return lexeme.floats[i]
+
+
+cdef unicode lexeme_get_string(LexemeC* lexeme, size_t i):
+    """Return lexeme.strings[i] decoded from UTF-8.
+
+    The index is not bounds-checked.
+    """
+    cdef bytes byte_string = lexeme.strings[i]
    return byte_string.decode('utf8')


-cdef dict lexeme_pack(LexemeC* lexeme):
+cdef bint lexeme_check_orth_flag(LexemeC* lexeme, size_t flag_id) except *:
+    """Return True if bit `flag_id` of lexeme.orth_flags is set."""
+    # Build the mask at flag_t width: a bare `1` is a C int, so shifting it
+    # by a large flag_id would overflow before the AND is applied. The
+    # explicit comparison keeps the result a clean 0/1 value.
+    return (lexeme.orth_flags & ((<flag_t>1) << flag_id)) != 0
+
+
+cdef bint lexeme_check_dist_flag(LexemeC* lexeme, size_t flag_id) except *:
+    """Return True if bit `flag_id` of lexeme.dist_flags is set."""
+    # Build the mask at flag_t width: a bare `1` is a C int, so shifting it
+    # by a large flag_id would overflow before the AND is applied. The
+    # explicit comparison keeps the result a clean 0/1 value.
+    return (lexeme.dist_flags & ((<flag_t>1) << flag_id)) != 0
+
+
+cdef dict lexeme_pack(LexemeC* lex):
+    """Copy the fields of `lex` into a new Python dict.
+
+    The result has the same keys as the dict built by get_lexeme_dict:
+    'ints', 'floats', 'strings' (each entry decoded from UTF-8),
+    'orth_flags' and 'dist_flags'.
+    """
    cdef dict packed = {}
-    packed['i'] = lexeme.i
-    packed['length'] = lexeme.length
-    packed['prob'] = lexeme.prob
-    packed['cluster'] = lexeme.cluster
-    packed['string'] = lexeme.string.decode('utf8')
-    packed['views'] = []
-    cdef size_t i = 0
-    while lexeme.views[i] != NULL:
-        packed['views'].append(lexeme.views[i].decode('utf8'))
-        i += 1
-    packed['flags'] = lexeme.flags
+    packed['ints'] = [lex.ints[i] for i in range(LexInt_N)]
+    packed['floats'] = [lex.floats[i] for i in range(LexFloat_N)]
+    packed['strings'] = [lex.strings[i].decode('utf8') for i in range(LexStr_N)]
+    packed['orth_flags'] = lex.orth_flags
+    # dist_flags is a separate field and must be read from lex.dist_flags,
+    # not copied from orth_flags.
+    packed['dist_flags'] = lex.dist_flags
    return packed


 cdef int lexeme_unpack(LexemeC* lex, dict p) except -1:
-    cdef size_t length
-    lex.i = p['i']
-    lex.length = p['length']
-    lex.prob = p['prob']
-    lex.cluster = p['cluster']
-    lex.string = intern_and_encode(p['string'], &length)
-    for i, view in enumerate(p['views']):
-        lex.views[i] = intern_and_encode(view, &length)
-    lex.flags = p['flags']
+    """Fill `lex` from a dict laid out like the result of lexeme_pack.
+
+    Reads p['ints'], p['floats'], p['strings'], p['orth_flags'] and
+    p['dist_flags']. Each string is passed through intern_and_encode before
+    being stored. The lists are copied position by position; their lengths
+    are not checked against the sizes of the LexemeC arrays.
+    """
+    cdef size_t i
+    cdef int lex_int
+    cdef float lex_float
+    # Out-parameter required by intern_and_encode; the length is not used.
+    cdef size_t _
+    for i, lex_int in enumerate(p['ints']):
+        lex.ints[i] = lex_int
+    # Floats are stored in their own array, not written over lex.ints.
+    for i, lex_float in enumerate(p['floats']):
+        lex.floats[i] = lex_float
+    for i, lex_string in enumerate(p['strings']):
+        lex.strings[i] = intern_and_encode(lex_string, &_)
+    lex.orth_flags = p['orth_flags']
+    # Restore dist_flags from its own key, not from orth_flags again.
+    lex.dist_flags = p['dist_flags']