mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
* Restoring Lexeme-as-struct
This commit is contained in:
parent
e80d3b9784
commit
b488224c09
2
setup.py
2
setup.py
|
@ -48,6 +48,8 @@ exts = [
|
|||
Extension("spacy.lang", ["spacy/lang.pyx"], language="c++", include_dirs=includes),
|
||||
Extension("spacy.word", ["spacy/word.pyx"], language="c++",
|
||||
include_dirs=includes),
|
||||
Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++",
|
||||
include_dirs=includes),
|
||||
Extension("spacy.ptb3", ["spacy/ptb3.pyx"], language="c++", include_dirs=includes),
|
||||
Extension("spacy.en", ["spacy/en.pyx"], language="c++",
|
||||
include_dirs=includes),
|
||||
|
|
21
spacy/lexeme.pxd
Normal file
21
spacy/lexeme.pxd
Normal file
|
@ -0,0 +1,21 @@
|
|||
from .typedefs cimport hash_t, utf8_t, flag_t, id_t
|
||||
|
||||
|
||||
# Plain C record for one vocabulary entry. Instances are created by
# lexeme_init() and released by lexeme_free().
cdef struct LexemeC:
|
||||
# len() of the unicode string passed to lexeme_init (not the UTF-8 byte count).
size_t length
|
||||
# Copied unchanged from the `prob` argument of lexeme_init.
double prob
|
||||
# Copied unchanged from the `cluster` argument of lexeme_init.
size_t cluster
|
||||
|
||||
# UTF-8 encoded form of the lexeme's string, as returned by intern_and_encode.
char* string
|
||||
|
||||
# calloc'd array with one UTF-8 char* per entry of the `views` list given to
# lexeme_init. The number of entries is not stored in this struct.
char** views
|
||||
|
||||
# Bit field: lexeme_init ORs in (1 << flag) for every member of its `flags` set.
flag_t flags
|
||||
|
||||
|
||||
# Allocate a LexemeC with calloc and fill it from the given string, probability,
# cluster id, list of string views and set of active flag ids.
cdef LexemeC* lexeme_init(unicode string, double prob, size_t cluster,
|
||||
list views, set flags)
|
||||
# Free the lexeme's `views` array and then the struct itself.
cdef int lexeme_free(LexemeC* lexeme) except -1
|
||||
|
||||
# Test whether the bit for `flag_id` is set in lexeme.flags.
cdef bint lexeme_check_flag(LexemeC* lexeme, size_t flag_id)
|
||||
# Decode lexeme.views[view_id] from UTF-8 and return it as unicode.
cdef unicode lexeme_string_view(LexemeC* lexeme, size_t view_id)
|
40
spacy/lexeme.pyx
Normal file
40
spacy/lexeme.pyx
Normal file
|
@ -0,0 +1,40 @@
|
|||
from libc.stdlib cimport calloc, free
|
||||
|
||||
|
||||
cdef LexemeC* lexeme_init(unicode string, double prob, size_t cluster,
                          list views, set flags):
    """Allocate and populate a LexemeC record.

    string:  the lexeme's text; its len() is stored in `length` and its
             UTF-8 encoding in `string`.
    prob:    stored unchanged in `prob`.
    cluster: stored unchanged in `cluster`.
    views:   list of unicode strings; each is UTF-8 encoded and stored, in
             order, in the `views` array.
    flags:   set of flag ids; bit (1 << id) is set in `flags` for each one.

    Returns the new record, or NULL if a memory allocation fails. The
    result should be released with lexeme_free().
    """
    cdef size_t n_views = len(views)
    cdef size_t i
    cdef unicode view
    cdef LexemeC* lexeme = <LexemeC*>calloc(1, sizeof(LexemeC))
    if lexeme is NULL:
        # Out of memory: never write through a NULL pointer.
        return NULL
    lexeme.cluster = cluster
    lexeme.prob = prob
    lexeme.length = len(string)
    lexeme.string = intern_and_encode(string)

    lexeme.views = <char**>calloc(n_views, sizeof(char*))
    # calloc(0, ...) is allowed to return NULL, so a NULL result only means
    # failure when at least one slot was requested.
    if lexeme.views is NULL and n_views != 0:
        free(lexeme)
        return NULL
    # Use a dedicated loop variable instead of rebinding the `string`
    # parameter.
    for i, view in enumerate(views):
        lexeme.views[i] = intern_and_encode(view)

    # calloc zero-initialised `flags`, so the bits can simply be ORed in.
    for active_flag in flags:
        lexeme.flags |= (1 << active_flag)
    return lexeme
|
||||
|
||||
|
||||
cdef int lexeme_free(LexemeC* lexeme) except -1:
    """Release a LexemeC and its `views` array.

    The char* values stored in `string` and `views` are not freed here:
    they come from intern_and_encode(), which keeps the backing byte
    strings alive in the module-level `_strings` container.

    Passing NULL is a no-op, mirroring free(NULL). Returns 0.
    """
    if lexeme is NULL:
        return 0
    free(lexeme.views)
    free(lexeme)
    return 0
|
||||
|
||||
|
||||
# Owns every encoded byte string handed out by intern_and_encode(). Each key
# maps to itself so the canonical stored object can be retrieved again.
cdef dict _strings = {}


cdef char* intern_and_encode(unicode string):
    """Encode `string` as UTF-8 and return a char* to a retained copy.

    The returned pointer refers to the internal buffer of a bytes object
    held in `_strings`, so it remains valid for as long as that entry
    exists. Equal strings always yield the same pointer.

    This does not rely on the builtin intern(), which only accepts byte
    strings on Python 2.
    """
    cdef bytes utf8_string = string.encode('utf8')
    # setdefault returns the object already stored for an equal key, so the
    # pointer always comes from the object the dict is keeping alive, never
    # from a fresh temporary that is about to be deallocated.
    cdef bytes stored = _strings.setdefault(utf8_string, utf8_string)
    return <char*>stored
|
||||
|
||||
|
||||
cdef bint lexeme_check_flag(LexemeC* lexeme, size_t flag_id):
    """Return True if the bit for `flag_id` is set in lexeme.flags."""
    # Build the mask at the width of flag_t. A bare `1` is a C int, so
    # `1 << flag_id` would overflow for bit positions at or beyond the width
    # of int. NOTE(review): flag_t is declared in typedefs, which is not
    # shown here -- confirm it is wider than int.
    cdef flag_t one = 1
    return (lexeme.flags & (one << flag_id)) != 0
|
||||
|
||||
|
||||
cdef unicode lexeme_string_view(LexemeC* lexeme, size_t view_id):
    """Return the string view stored at index `view_id`, decoded from UTF-8.

    No bounds check is performed on `view_id`.
    """
    cdef char* raw_view = lexeme.views[view_id]
    # Assigning a char* to a bytes variable copies the C string into a
    # Python byte string, which is then decoded.
    cdef bytes encoded = raw_view
    cdef unicode decoded = encoded.decode('utf8')
    return decoded
|
Loading…
Reference in New Issue
Block a user