diff --git a/setup.py b/setup.py
index ea75cc342..d33ab750e 100644
--- a/setup.py
+++ b/setup.py
@@ -48,6 +48,8 @@ exts = [
     Extension("spacy.lang", ["spacy/lang.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.word", ["spacy/word.pyx"], language="c++",
               include_dirs=includes),
+    Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++",
+              include_dirs=includes),
     Extension("spacy.ptb3", ["spacy/ptb3.pyx"], language="c++",
               include_dirs=includes),
     Extension("spacy.en", ["spacy/en.pyx"], language="c++", include_dirs=includes),
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
new file mode 100644
index 000000000..887ed0e59
--- /dev/null
+++ b/spacy/lexeme.pxd
@@ -0,0 +1,23 @@
+from .typedefs cimport hash_t, utf8_t, flag_t, id_t
+
+
+# C-level record for one lexeme; allocated by lexeme_init() and released
+# by lexeme_free().
+cdef struct LexemeC:
+    size_t length  # len() of the unicode string given to lexeme_init()
+    double prob
+    size_t cluster
+
+    char* string  # UTF-8 encoding of the lexeme's string
+
+    char** views  # UTF-8 strings, indexed by view id
+
+    flag_t flags  # bit field: bit i is set when flag i is active
+
+
+cdef LexemeC* lexeme_init(unicode string, double prob, size_t cluster,
+                          list views, set flags) except NULL
+cdef int lexeme_free(LexemeC* lexeme) except -1
+
+cdef bint lexeme_check_flag(LexemeC* lexeme, size_t flag_id)
+cdef unicode lexeme_string_view(LexemeC* lexeme, size_t view_id)
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
new file mode 100644
index 000000000..4d5d08f52
--- /dev/null
+++ b/spacy/lexeme.pyx
@@ -0,0 +1,89 @@
+from libc.stdlib cimport calloc, free
+
+
+cdef LexemeC* lexeme_init(unicode string, double prob, size_t cluster,
+                          list views, set flags) except NULL:
+    """Allocate a LexemeC and fill it from the given Python values.
+
+    string and every entry of views are stored as interned UTF-8 byte
+    strings; every entry of flags is the index of a bit to set in the flags
+    field.  Release the result with lexeme_free().
+
+    Raises MemoryError if an allocation fails and ValueError if a flag index
+    does not fit in flag_t.
+    """
+    cdef flag_t one = 1
+    cdef size_t i
+    cdef size_t flag_id
+    cdef unicode view
+    # calloc() returns void*; cast it explicitly to the typed pointer.
+    cdef LexemeC* lexeme = <LexemeC*>calloc(1, sizeof(LexemeC))
+    if lexeme == NULL:
+        raise MemoryError()
+    try:
+        lexeme.cluster = cluster
+        lexeme.prob = prob
+        lexeme.length = len(string)
+        lexeme.string = intern_and_encode(string)
+
+        if views:
+            lexeme.views = <char**>calloc(len(views), sizeof(char*))
+            if lexeme.views == NULL:
+                raise MemoryError()
+        for i, view in enumerate(views):
+            lexeme.views[i] = intern_and_encode(view)
+
+        for flag_id in flags:
+            if flag_id >= sizeof(flag_t) * 8:
+                raise ValueError("flag id %d does not fit in flag_t" % flag_id)
+            # Shift a flag_t-typed 1 so the shift is carried out at flag_t
+            # width rather than at the width of a plain integer literal.
+            lexeme.flags |= one << flag_id
+    except BaseException:
+        # Do not leak the partially built struct if anything above failed.
+        lexeme_free(lexeme)
+        raise
+    return lexeme
+
+
+cdef int lexeme_free(LexemeC* lexeme) except -1:
+    """Release a LexemeC created by lexeme_init(); a NULL pointer is ignored.
+
+    Only the struct and its views array are freed.  The string buffers they
+    point to belong to the bytes objects held in _strings.
+    """
+    if lexeme == NULL:
+        return 0
+    free(lexeme.views)
+    free(lexeme)
+    return 0
+
+
+# Holds a reference to every encoded string so that the char* pointers handed
+# out by intern_and_encode() stay valid.
+cdef set _strings = set()
+cdef char* intern_and_encode(unicode string) except NULL:
+    """Encode string as UTF-8 and return a pointer to the interned bytes."""
+    cdef bytes utf8_string = intern(string.encode('utf8'))
+    _strings.add(utf8_string)
+    return utf8_string
+
+
+cdef bint lexeme_check_flag(LexemeC* lexeme, size_t flag_id):
+    """Return True if bit flag_id is set in lexeme.flags.
+
+    A flag_id that does not fit in flag_t is reported as not set.
+    """
+    cdef flag_t one = 1
+    if flag_id >= sizeof(flag_t) * 8:
+        return False
+    return (lexeme.flags & (one << flag_id)) != 0
+
+
+cdef unicode lexeme_string_view(LexemeC* lexeme, size_t view_id):
+    """Return view number view_id of lexeme, decoded from UTF-8.
+
+    view_id is not range-checked: the struct does not record how many views
+    were stored, so the caller must pass an index that lexeme_init() filled.
+    """
+    cdef bytes byte_string = lexeme.views[view_id]
+    return byte_string.decode('utf8')