* Update lexeme serialization, using a binary file format

Matthew Honnibal 2014-10-30 01:01:00 +11:00
parent 13909a2e24
commit 67c8c8019f
5 changed files with 18 additions and 8 deletions
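The substance of the change is in Lexicon.dump and Lexicon.load below: lexemes are now persisted as raw, fixed-size Lexeme structs via fwrite/fread, and the load loop now reads sizeof(Lexeme) per record (sizeof(lexeme) measured the pointer, not the struct) and stops on a short read (st != 1) rather than st == 0. As a rough illustration only, here is a minimal Python sketch of the same fixed-record round trip, using ctypes in place of the C struct; the field layout is invented for the example and is not spaCy's actual Lexeme.

import ctypes

class LexemeRecord(ctypes.Structure):
    # Illustrative layout only; the real Lexeme struct has many more fields.
    _fields_ = [("hash", ctypes.c_uint64),
                ("i", ctypes.c_uint32),
                ("length", ctypes.c_uint32)]

def dump_lexemes(loc, lexemes):
    # One raw record per lexeme, like fwrite(self.lexemes[i], sizeof(Lexeme), 1, fp).
    with open(loc, "wb") as fp:
        for lex in lexemes:
            fp.write(bytes(lex))

def load_lexemes(loc):
    # Keep reading fixed-size records until a short read signals end of file,
    # the same role the new "if st != 1: break" check plays in the Cython loop.
    size = ctypes.sizeof(LexemeRecord)
    lexemes = []
    with open(loc, "rb") as fp:
        while True:
            buf = fp.read(size)
            if len(buf) != size:
                break
            lexemes.append(LexemeRecord.from_buffer_copy(buf))
    return lexemes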

View File

@@ -6,7 +6,6 @@ import distutils.core
import sys
import os
import os.path
import numpy
from os import path
from glob import glob
@@ -35,7 +34,7 @@ compile_args = []
link_args = []
libs = []
includes = ['.', numpy.get_include()]
includes = ['.']
cython_includes = ['.']
@@ -48,11 +47,11 @@ else:
exts = [
Extension("spacy.lang", ["spacy/lang.pyx"], language="c++", include_dirs=includes),
Extension("spacy.word", ["spacy/word.pyx"], language="c++", include_dirs=includes),
Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
Extension("spacy.en", ["spacy/en.pyx"], language="c++", include_dirs=includes),
Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
Extension("spacy.pos", ["spacy/pos.pyx"], language="c++", include_dirs=includes),
Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes),
]

View File

@@ -45,6 +45,8 @@ cdef class Language:
        self.suffix_re = re.compile(suffix)
        self.infix_re = re.compile(infix)
        self.lexicon = Lexicon(lexemes)
        self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))
        self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
        self._load_special_tokenization(rules)
    cpdef Tokens tokenize(self, unicode string):
@@ -244,6 +246,13 @@ cdef class Lexicon:
            self.lexemes.push_back(lexeme)
            self.size += 1
    def set(self, unicode py_string, dict lexeme_dict):
        cdef String string
        string_from_unicode(&string, py_string)
        cdef Lexeme* lex = self.get(&string)
        lex[0] = lexeme_init(string.chars[:string.n], string.key, lex.i,
                             self.strings, lexeme_dict)
    cdef Lexeme* get(self, String* string) except NULL:
        cdef Lexeme* lex
        lex = <Lexeme*>self._dict.get(string.key)
@@ -278,7 +287,7 @@ cdef class Lexicon:
        cdef FILE* fp = fopen(<char*>bytes_loc, 'wb')
        assert fp != NULL
        cdef size_t st
        for i in range(self.size):
        for i in range(self.size-1):
            st = fwrite(self.lexemes[i], sizeof(Lexeme), 1, fp)
            assert st == 1
        st = fclose(fp)
@@ -293,11 +302,12 @@ cdef class Lexicon:
        cdef Lexeme* lexeme
        while True:
            lexeme = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
            st = fread(lexeme, sizeof(lexeme), 1, fp)
            if st == 0:
            st = fread(lexeme, sizeof(Lexeme), 1, fp)
            if st != 1:
                break
            self.lexemes.push_back(lexeme)
            self._dict.set(lexeme.hash, lexeme)
        fclose(fp)
cdef void string_from_unicode(String* s, unicode uni):

View File

@@ -31,7 +31,6 @@ cpdef Lexeme init(unicode string, hash_t hashed, atom_t i,
    cdef Lexeme lex
    lex.hash = hashed
    lex.i = i
    print string, i
    lex.length = len(string)
    lex.sic = get_string_id(string, store)

View File

@@ -58,10 +58,12 @@ cdef class StringStore:
        strings = []
        cdef Utf8Str* string
        cdef bytes py_string
        print "Dump strings"
        for i in range(self.size):
            string = &self.strings[i]
            py_string = string.chars[:string.length]
            strings.append(py_string)
        print len(strings)
        with open(loc, 'w') as file_:
            ujson.dump(strings, file_, ensure_ascii=False)
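The string table, by contrast, stays as a plain JSON list (ujson.dump with ensure_ascii=False, above). The matching load path is not part of this hunk; a hypothetical sketch, assuming the file is simply a JSON array of strings:

import ujson  # plain json would behave the same here

def load_strings(loc):
    # Hypothetical counterpart to the dump above: read the JSON list back in.
    with open(loc, 'r') as file_:
        return ujson.load(file_)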

View File

@@ -13,7 +13,7 @@ def utf8open(loc, mode='r'):
def read_lang_data(name):
    data_dir = path.join(DATA_DIR, name)
    tokenization = read_tokenization(data_dir)
    tokenization = read_tokenization(name)
    prefix = read_prefix(data_dir)
    suffix = read_suffix(data_dir)
    infix = read_infix(data_dir)
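Taken together with the two load calls added to Language.__init__ above, each language's data directory is expected to hold a binary 'lexemes' file next to a JSON 'strings' file. A small sketch of how those paths resolve; util.DATA_DIR's real value is whatever spacy.util defines, and the helper name here is made up for illustration:

from os import path

DATA_DIR = '/path/to/spacy/data'  # stand-in; the real value comes from spacy.util

def lexicon_paths(name):
    # e.g. lexicon_paths('en') -> paths under .../data/en/
    lang_dir = path.join(DATA_DIR, name)
    return {
        'lexemes': path.join(lang_dir, 'lexemes'),  # raw Lexeme structs (binary)
        'strings': path.join(lang_dir, 'strings'),  # JSON list of strings
    }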