mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
* Fiddle with data types on Lexeme, to compress them to a much smaller size.
This commit is contained in:
parent
ac88893232
commit
87c2418a89
|
@ -32,9 +32,6 @@ cdef class Lexicon:
|
||||||
|
|
||||||
cdef PreshMap _dict
|
cdef PreshMap _dict
|
||||||
|
|
||||||
cdef list _string_features
|
|
||||||
cdef list _flag_features
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Language:
|
cdef class Language:
|
||||||
cdef Pool _mem
|
cdef Pool _mem
|
||||||
|
|
|
@ -266,10 +266,10 @@ cdef class Lexicon:
|
||||||
cpdef Lexeme lookup(self, unicode uni_string):
|
cpdef Lexeme lookup(self, unicode uni_string):
|
||||||
"""Retrieve (or create, if not found) a Lexeme for a string, and return it.
|
"""Retrieve (or create, if not found) a Lexeme for a string, and return it.
|
||||||
|
|
||||||
Args
|
Args
|
||||||
string (unicode): The string to be looked up. Must be unicode, not bytes.
|
string (unicode): The string to be looked up. Must be unicode, not bytes.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
lexeme (Lexeme): A reference to a lexical type.
|
lexeme (Lexeme): A reference to a lexical type.
|
||||||
"""
|
"""
|
||||||
cdef String string
|
cdef String string
|
||||||
|
|
|
@ -1,8 +1,7 @@
|
||||||
from .typedefs cimport hash_t, utf8_t, flag_t, id_t
|
from .typedefs cimport hash_t, utf8_t, flag_t, id_t, len_t, tag_t
|
||||||
|
|
||||||
from thinc.typedefs cimport atom_t
|
|
||||||
|
|
||||||
from .utf8string cimport StringStore
|
from .utf8string cimport StringStore
|
||||||
|
from libc.stdint cimport uint16_t
|
||||||
|
|
||||||
cpdef flag_t OOV_DIST_FLAGS
|
cpdef flag_t OOV_DIST_FLAGS
|
||||||
|
|
||||||
|
@ -23,23 +22,24 @@ cpdef enum:
|
||||||
|
|
||||||
|
|
||||||
cdef struct Lexeme:
|
cdef struct Lexeme:
|
||||||
atom_t length
|
flag_t flags
|
||||||
|
|
||||||
atom_t sic
|
id_t sic
|
||||||
atom_t norm
|
id_t norm
|
||||||
atom_t shape
|
id_t shape
|
||||||
atom_t vocab10k
|
id_t vocab10k
|
||||||
atom_t asciied
|
id_t asciied
|
||||||
atom_t prefix
|
id_t prefix
|
||||||
atom_t suffix
|
id_t suffix
|
||||||
|
|
||||||
atom_t cluster
|
|
||||||
atom_t pos
|
|
||||||
atom_t supersense
|
|
||||||
|
|
||||||
float prob
|
float prob
|
||||||
|
|
||||||
|
len_t length
|
||||||
|
tag_t cluster
|
||||||
|
tag_t pos
|
||||||
|
tag_t supersense
|
||||||
|
|
||||||
|
|
||||||
flag_t flags
|
|
||||||
|
|
||||||
|
|
||||||
cdef Lexeme EMPTY_LEXEME
|
cdef Lexeme EMPTY_LEXEME
|
||||||
|
|
|
@ -52,7 +52,7 @@ cpdef Lexeme init(unicode string, hash_t hashed,
|
||||||
lex.flags = get_flags(string, upper_pc, title_pc, lower_pc)
|
lex.flags = get_flags(string, upper_pc, title_pc, lower_pc)
|
||||||
return lex
|
return lex
|
||||||
|
|
||||||
cdef atom_t get_string_id(unicode string, StringStore store) except 0:
|
cdef id_t get_string_id(unicode string, StringStore store) except 0:
|
||||||
cdef bytes byte_string = string.encode('utf8')
|
cdef bytes byte_string = string.encode('utf8')
|
||||||
cdef Utf8Str* orig_str = store.intern(<char*>byte_string, len(byte_string))
|
cdef Utf8Str* orig_str = store.intern(<char*>byte_string, len(byte_string))
|
||||||
return orig_str.i
|
return orig_str.i
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
from libc.stdint cimport uint64_t, uintptr_t
|
from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
|
||||||
|
|
||||||
ctypedef uint64_t hash_t
|
ctypedef uint64_t hash_t
|
||||||
ctypedef char* utf8_t
|
ctypedef char* utf8_t
|
||||||
ctypedef uint64_t flag_t
|
ctypedef uint64_t flag_t
|
||||||
ctypedef uintptr_t id_t
|
ctypedef uint32_t id_t
|
||||||
|
ctypedef uint16_t len_t
|
||||||
|
ctypedef uint16_t tag_t
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user