* Fiddle with data types on Lexeme, to compress them to a much smaller size.

2025-10-26 13:41:21 +03:00 · 2014-10-30 15:42:15 +11:00 · 2014-10-30 15:42:15 +11:00 · 87c2418a89
commit 87c2418a89
parent ac88893232
5 changed files with 23 additions and 24 deletions
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -32,9 +32,6 @@ cdef class Lexicon:
    cdef PreshMap _dict
    cdef list _string_features
    cdef list _flag_features
 cdef class Language:
    cdef Pool _mem
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -266,10 +266,10 @@ cdef class Lexicon:
    cpdef Lexeme lookup(self, unicode uni_string):
        """Retrieve (or create, if not found) a Lexeme for a string, and return it.
-        Args
+       Args
            string (unicode):  The string to be looked up. Must be unicode, not bytes.
-        Returns:
+       Returns:
            lexeme (Lexeme): A reference to a lexical type.
        """
        cdef String string
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -1,8 +1,7 @@
-from .typedefs cimport hash_t, utf8_t, flag_t, id_t
+from .typedefs cimport hash_t, utf8_t, flag_t, id_t, len_t, tag_t
 from thinc.typedefs cimport atom_t
 from .utf8string cimport StringStore
 from libc.stdint cimport uint16_t
 cpdef flag_t OOV_DIST_FLAGS
@@ -23,23 +22,24 @@ cpdef enum:
 cdef struct Lexeme:
-    atom_t length
+    flag_t flags
-    atom_t sic
+    id_t sic
-    atom_t norm
+    id_t norm
-    atom_t shape
+    id_t shape
-    atom_t vocab10k
+    id_t vocab10k
-    atom_t asciied
+    id_t asciied
-    atom_t prefix
+    id_t prefix
-    atom_t suffix
+    id_t suffix
-    atom_t cluster
-    atom_t pos
-    atom_t supersense
    float prob
+    len_t length
+    tag_t cluster
+    tag_t pos
+    tag_t supersense
-    flag_t flags
 cdef Lexeme EMPTY_LEXEME
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -52,7 +52,7 @@ cpdef Lexeme init(unicode string, hash_t hashed,
    lex.flags = get_flags(string, upper_pc, title_pc, lower_pc)
    return lex
-cdef atom_t get_string_id(unicode string, StringStore store) except 0:
+cdef id_t get_string_id(unicode string, StringStore store) except 0:
    cdef bytes byte_string = string.encode('utf8')
    cdef Utf8Str* orig_str = store.intern(<char*>byte_string, len(byte_string))
    return orig_str.i
--- a/spacy/typedefs.pxd
+++ b/spacy/typedefs.pxd
@@ -1,8 +1,10 @@
-from libc.stdint cimport uint64_t, uintptr_t
+from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
 ctypedef uint64_t hash_t
 ctypedef char* utf8_t
 ctypedef uint64_t flag_t
-ctypedef uintptr_t id_t
+ctypedef uint32_t id_t
 ctypedef uint16_t len_t
 ctypedef uint16_t tag_t