* Change the data types of the Lexeme struct's fields to narrower integer types, so that each Lexeme occupies much less memory.

This commit is contained in:
Matthew Honnibal 2014-10-30 15:42:15 +11:00
parent ac88893232
commit 87c2418a89
5 changed files with 23 additions and 24 deletions

View File

@ -32,9 +32,6 @@ cdef class Lexicon:
cdef PreshMap _dict cdef PreshMap _dict
cdef list _string_features
cdef list _flag_features
cdef class Language: cdef class Language:
cdef Pool _mem cdef Pool _mem

View File

@ -266,10 +266,10 @@ cdef class Lexicon:
cpdef Lexeme lookup(self, unicode uni_string): cpdef Lexeme lookup(self, unicode uni_string):
"""Retrieve (or create, if not found) a Lexeme for a string, and return it. """Retrieve (or create, if not found) a Lexeme for a string, and return it.
Args: Args:
string (unicode): The string to be looked up. Must be unicode, not bytes. string (unicode): The string to be looked up. Must be unicode, not bytes.
Returns: Returns:
lexeme (Lexeme): A reference to a lexical type. lexeme (Lexeme): A reference to a lexical type.
""" """
cdef String string cdef String string

View File

@ -1,8 +1,7 @@
from .typedefs cimport hash_t, utf8_t, flag_t, id_t from .typedefs cimport hash_t, utf8_t, flag_t, id_t, len_t, tag_t
from thinc.typedefs cimport atom_t
from .utf8string cimport StringStore from .utf8string cimport StringStore
from libc.stdint cimport uint16_t
cpdef flag_t OOV_DIST_FLAGS cpdef flag_t OOV_DIST_FLAGS
@ -23,23 +22,24 @@ cpdef enum:
cdef struct Lexeme: cdef struct Lexeme:
atom_t length flag_t flags
atom_t sic id_t sic
atom_t norm id_t norm
atom_t shape id_t shape
atom_t vocab10k id_t vocab10k
atom_t asciied id_t asciied
atom_t prefix id_t prefix
atom_t suffix id_t suffix
atom_t cluster
atom_t pos
atom_t supersense
float prob float prob
len_t length
tag_t cluster
tag_t pos
tag_t supersense
flag_t flags
cdef Lexeme EMPTY_LEXEME cdef Lexeme EMPTY_LEXEME

View File

@ -52,7 +52,7 @@ cpdef Lexeme init(unicode string, hash_t hashed,
lex.flags = get_flags(string, upper_pc, title_pc, lower_pc) lex.flags = get_flags(string, upper_pc, title_pc, lower_pc)
return lex return lex
cdef atom_t get_string_id(unicode string, StringStore store) except 0: cdef id_t get_string_id(unicode string, StringStore store) except 0:
cdef bytes byte_string = string.encode('utf8') cdef bytes byte_string = string.encode('utf8')
cdef Utf8Str* orig_str = store.intern(<char*>byte_string, len(byte_string)) cdef Utf8Str* orig_str = store.intern(<char*>byte_string, len(byte_string))
return orig_str.i return orig_str.i

View File

@ -1,8 +1,10 @@
from libc.stdint cimport uint64_t, uintptr_t from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
ctypedef uint64_t hash_t ctypedef uint64_t hash_t
ctypedef char* utf8_t ctypedef char* utf8_t
ctypedef uint64_t flag_t ctypedef uint64_t flag_t
ctypedef uintptr_t id_t ctypedef uint32_t id_t
ctypedef uint16_t len_t
ctypedef uint16_t tag_t