From 87c2418a891dd3f9ceaf5afd18862610a73339a5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 30 Oct 2014 15:42:15 +1100 Subject: [PATCH] * Fiddle with data types on Lexeme, to compress them to a much smaller size. --- spacy/lang.pxd | 3 --- spacy/lang.pyx | 4 ++-- spacy/lexeme.pxd | 32 ++++++++++++++++---------------- spacy/lexeme.pyx | 2 +- spacy/typedefs.pxd | 6 ++++-- 5 files changed, 23 insertions(+), 24 deletions(-) diff --git a/spacy/lang.pxd b/spacy/lang.pxd index ba9d0a779..e2c7c56e6 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -32,9 +32,6 @@ cdef class Lexicon: cdef PreshMap _dict - cdef list _string_features - cdef list _flag_features - cdef class Language: cdef Pool _mem diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 9323dc052..5042ff4b2 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -266,10 +266,10 @@ cdef class Lexicon: cpdef Lexeme lookup(self, unicode uni_string): """Retrieve (or create, if not found) a Lexeme for a string, and return it. - Args + Args string (unicode): The string to be looked up. Must be unicode, not bytes. - Returns: + Returns: lexeme (Lexeme): A reference to a lexical type. """ cdef String string diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index b39a32522..066f05b20 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -1,8 +1,7 @@ -from .typedefs cimport hash_t, utf8_t, flag_t, id_t - -from thinc.typedefs cimport atom_t +from .typedefs cimport hash_t, utf8_t, flag_t, id_t, len_t, tag_t from .utf8string cimport StringStore +from libc.stdint cimport uint16_t cpdef flag_t OOV_DIST_FLAGS @@ -23,23 +22,24 @@ cpdef enum: cdef struct Lexeme: - atom_t length + flag_t flags - atom_t sic - atom_t norm - atom_t shape - atom_t vocab10k - atom_t asciied - atom_t prefix - atom_t suffix - - atom_t cluster - atom_t pos - atom_t supersense + id_t sic + id_t norm + id_t shape + id_t vocab10k + id_t asciied + id_t prefix + id_t suffix float prob + + len_t length + tag_t cluster + tag_t pos + tag_t supersense + - flag_t flags cdef Lexeme EMPTY_LEXEME diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 6760b3913..62804621d 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -52,7 +52,7 @@ cpdef Lexeme init(unicode string, hash_t hashed, lex.flags = get_flags(string, upper_pc, title_pc, lower_pc) return lex -cdef atom_t get_string_id(unicode string, StringStore store) except 0: +cdef id_t get_string_id(unicode string, StringStore store) except 0: cdef bytes byte_string = string.encode('utf8') cdef Utf8Str* orig_str = store.intern(byte_string, len(byte_string)) return orig_str.i diff --git a/spacy/typedefs.pxd b/spacy/typedefs.pxd index 34c327069..db6eb42ce 100644 --- a/spacy/typedefs.pxd +++ b/spacy/typedefs.pxd @@ -1,8 +1,10 @@ -from libc.stdint cimport uint64_t, uintptr_t +from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t ctypedef uint64_t hash_t ctypedef char* utf8_t ctypedef uint64_t flag_t -ctypedef uintptr_t id_t +ctypedef uint32_t id_t +ctypedef uint16_t len_t +ctypedef uint16_t tag_t