Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-26 01:46:28 +03:00.
Update lexeme serialization, using a binary file format

Commit 67c8c8019f; parent 13909a2e24.
Files changed: setup.py (5), spacy/lang.pyx, spacy/lexeme.pyx, spacy/utf8string.pyx, spacy/util.py
setup.py

@@ -6,7 +6,6 @@ import distutils.core
 import sys
 import os
 import os.path
-import numpy
 
 from os import path
 from glob import glob
@@ -35,7 +34,7 @@ compile_args = []
 link_args = []
 libs = []
 
-includes = ['.', numpy.get_include()]
+includes = ['.']
 cython_includes = ['.']
 
 
@@ -48,11 +47,11 @@ else:
 
 exts = [
     Extension("spacy.lang", ["spacy/lang.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.word", ["spacy/word.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.en", ["spacy/en.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.pos", ["spacy/pos.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes),
 ]
 
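With numpy's headers gone from the build, each module compiles against the local includes only. As a rough sketch (assumed, not the commit's full setup.py, which also defines compile_args, link_args, etc.), one entry from exts would build like this with Cython's distutils integration:

    # Minimal sketch, assuming Cython.Distutils is the build driver.
    import distutils.core
    from distutils.extension import Extension
    from Cython.Distutils import build_ext

    distutils.core.setup(
        name='spacy',
        cmdclass={'build_ext': build_ext},
        ext_modules=[
            Extension("spacy.lexeme", ["spacy/lexeme.pyx"],
                      language="c++", include_dirs=['.']),
        ],
    )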
spacy/lang.pyx

@@ -45,6 +45,8 @@ cdef class Language:
         self.suffix_re = re.compile(suffix)
         self.infix_re = re.compile(infix)
         self.lexicon = Lexicon(lexemes)
+        self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))
+        self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
         self._load_special_tokenization(rules)
 
     cpdef Tokens tokenize(self, unicode string):
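The two added lines make Language.__init__ populate its Lexicon from prebuilt data files: a binary lexemes file and a strings file, both under the language's directory in util.DATA_DIR. A small sketch of the implied on-disk layout (the DATA_DIR value and language name here are stand-ins, not taken from the commit):

    from os import path

    DATA_DIR = 'data'   # assumed stand-in for util.DATA_DIR
    name = 'en'         # language name, as passed to Language.__init__

    lexemes_loc = path.join(DATA_DIR, name, 'lexemes')  # packed Lexeme structs (binary)
    strings_loc = path.join(DATA_DIR, name, 'strings')  # JSON list from StringStore.dump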
@@ -244,6 +246,13 @@ cdef class Lexicon:
         self.lexemes.push_back(lexeme)
         self.size += 1
 
+    def set(self, unicode py_string, dict lexeme_dict):
+        cdef String string
+        string_from_unicode(&string, py_string)
+        cdef Lexeme* lex = self.get(&string)
+        lex[0] = lexeme_init(string.chars[:string.n], string.key, lex.i,
+                             self.strings, lexeme_dict)
+
     cdef Lexeme* get(self, String* string) except NULL:
         cdef Lexeme* lex
         lex = <Lexeme*>self._dict.get(string.key)
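The new Lexicon.set looks up (or creates) the slot for a string, then overwrites the whole struct in place via lex[0] = lexeme_init(...), keeping the entry's existing index lex.i. A rough Python analogy of that get-then-overwrite pattern (a dict-backed toy, not the commit's C-level code):

    class ToyLexicon:
        """Illustrative analogue of Lexicon: a string-keyed record store."""
        def __init__(self):
            self._dict = {}

        def get(self, string):
            # Like Lexicon.get: create an empty record on first lookup,
            # assigning it the next free index.
            return self._dict.setdefault(string, {'i': len(self._dict)})

        def set(self, string, props):
            # Like Lexicon.set: replace the record's fields in place,
            # but preserve the index the entry was first assigned.
            lex = self.get(string)
            i = lex['i']
            lex.clear()
            lex.update(props)
            lex['i'] = i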
@@ -278,7 +287,7 @@ cdef class Lexicon:
         cdef FILE* fp = fopen(<char*>bytes_loc, 'wb')
         assert fp != NULL
         cdef size_t st
-        for i in range(self.size):
+        for i in range(self.size-1):
             st = fwrite(self.lexemes[i], sizeof(Lexeme), 1, fp)
             assert st == 1
         st = fclose(fp)
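The binary format is as simple as it looks: fixed-size Lexeme structs written back to back, with no header or delimiters, which is what lets the matching fread loop below stream them back in. (The loop bound tightens to self.size-1, dropping one trailing entry; the diff alone doesn't show which slot is being reserved.) The same record-stream idea in Python, with a field layout invented purely for illustration:

    import struct

    # Invented stand-in for the Lexeme struct: three 64-bit fields.
    RECORD = struct.Struct('=QQQ')   # e.g. hash, i, length

    def dump_records(loc, records):
        # Like Lexicon.dump: one fixed-size record per entry, back to back.
        with open(loc, 'wb') as fp:
            for rec in records:
                fp.write(RECORD.pack(*rec))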
@@ -293,11 +302,12 @@ cdef class Lexicon:
         cdef Lexeme* lexeme
         while True:
             lexeme = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
-            st = fread(lexeme, sizeof(lexeme), 1, fp)
-            if st == 0:
+            st = fread(lexeme, sizeof(Lexeme), 1, fp)
+            if st != 1:
                 break
             self.lexemes.push_back(lexeme)
             self._dict.set(lexeme.hash, lexeme)
         fclose(fp)
+
 
 cdef void string_from_unicode(String* s, unicode uni):
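The reader fixes two genuine bugs. sizeof(lexeme) is the size of the pointer variable (8 bytes on a 64-bit build), not of the Lexeme struct, so fread had been pulling in partial records; sizeof(Lexeme) reads whole structs. And since fread returns the number of complete items read, st != 1 also catches short trailing reads, where st == 0 only caught a clean end of file. The same loop in Python, reusing the invented RECORD layout from the dump sketch:

    import struct

    RECORD = struct.Struct('=QQQ')   # same invented layout as the dump sketch

    def load_records(loc):
        # Like the corrected Lexicon.load: read whole fixed-size records
        # until a short read signals the end of the stream.
        records = []
        with open(loc, 'rb') as fp:
            while True:
                buf = fp.read(RECORD.size)
                if len(buf) != RECORD.size:   # analogue of st != 1
                    break
                records.append(RECORD.unpack(buf))
        return records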
spacy/lexeme.pyx

@@ -31,7 +31,6 @@ cpdef Lexeme init(unicode string, hash_t hashed, atom_t i,
     cdef Lexeme lex
     lex.hash = hashed
     lex.i = i
-    print string, i
     lex.length = len(string)
     lex.sic = get_string_id(string, store)
 
spacy/utf8string.pyx

@@ -58,10 +58,12 @@ cdef class StringStore:
         strings = []
         cdef Utf8Str* string
         cdef bytes py_string
+        print "Dump strings"
         for i in range(self.size):
             string = &self.strings[i]
             py_string = string.chars[:string.length]
             strings.append(py_string)
+        print len(strings)
         with open(loc, 'w') as file_:
             ujson.dump(strings, file_, ensure_ascii=False)
 
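Unlike the lexemes, the strings are serialized as plain JSON: StringStore.dump collects every stored byte string into a list and writes it with ujson (the two print statements read as debug output added alongside the feature). A sketch of the round-trip using the standard json module, which behaves the same for this usage:

    import json

    def dump_strings(loc, strings):
        # Like StringStore.dump: the whole store as one JSON array.
        with open(loc, 'w') as file_:
            json.dump(strings, file_, ensure_ascii=False)

    def load_strings(loc):
        # The matching load rebuilds the index -> string table in order.
        with open(loc) as file_:
            return list(json.load(file_))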
spacy/util.py

@@ -13,7 +13,7 @@ def utf8open(loc, mode='r'):
 
 def read_lang_data(name):
     data_dir = path.join(DATA_DIR, name)
-    tokenization = read_tokenization(data_dir)
+    tokenization = read_tokenization(name)
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
     infix = read_infix(data_dir)
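Passing the bare language name rather than the pre-joined data_dir suggests read_tokenization resolves the data directory against DATA_DIR itself; with the old argument, the directory components would have been applied twice.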