Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-26 01:46:28 +03:00.
Update lexeme serialization, using a binary file format

Commit 67c8c8019f; parent 13909a2e24.
Files changed: setup.py (5), spacy/lang.pyx, spacy/lexeme.pyx, spacy/utf8string.pyx, spacy/util.py
setup.py

@@ -6,7 +6,6 @@ import distutils.core
 import sys
 import os
 import os.path
-import numpy
 
 from os import path
 from glob import glob
@@ -35,7 +34,7 @@ compile_args = []
 link_args = []
 libs = []
 
-includes = ['.', numpy.get_include()]
+includes = ['.']
 cython_includes = ['.']
 
 
@@ -48,11 +47,11 @@ else:
 
 exts = [
     Extension("spacy.lang", ["spacy/lang.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.word", ["spacy/word.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.en", ["spacy/en.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.pos", ["spacy/pos.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes),
 ]
 
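With numpy's headers gone from the build, each module compiles against the local includes only. As a rough sketch (assumed, not the commit's full setup.py, which also defines compile_args, link_args, etc.), one entry from exts would build like this with Cython's distutils integration:

    # Minimal sketch, assuming Cython.Distutils is the build driver.
    import distutils.core
    from distutils.extension import Extension
    from Cython.Distutils import build_ext

    distutils.core.setup(
        name='spacy',
        cmdclass={'build_ext': build_ext},
        ext_modules=[
            Extension("spacy.lexeme", ["spacy/lexeme.pyx"],
                      language="c++", include_dirs=['.']),
        ],
    )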
spacy/lang.pyx

@@ -45,6 +45,8 @@ cdef class Language:
         self.suffix_re = re.compile(suffix)
         self.infix_re = re.compile(infix)
         self.lexicon = Lexicon(lexemes)
+        self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))
+        self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
         self._load_special_tokenization(rules)
 
     cpdef Tokens tokenize(self, unicode string):
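The two added lines make Language.__init__ populate its Lexicon from prebuilt data files: a binary lexemes file and a strings file, both under the language's directory in util.DATA_DIR. A small sketch of the implied on-disk layout (the DATA_DIR value and language name here are stand-ins, not taken from the commit):

    from os import path

    DATA_DIR = 'data'   # assumed stand-in for util.DATA_DIR
    name = 'en'         # language name, as passed to Language.__init__

    lexemes_loc = path.join(DATA_DIR, name, 'lexemes')  # packed Lexeme structs (binary)
    strings_loc = path.join(DATA_DIR, name, 'strings')  # JSON list from StringStore.dump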
@@ -244,6 +246,13 @@ cdef class Lexicon:
         self.lexemes.push_back(lexeme)
         self.size += 1
 
+    def set(self, unicode py_string, dict lexeme_dict):
+        cdef String string
+        string_from_unicode(&string, py_string)
+        cdef Lexeme* lex = self.get(&string)
+        lex[0] = lexeme_init(string.chars[:string.n], string.key, lex.i,
+                             self.strings, lexeme_dict)
+
     cdef Lexeme* get(self, String* string) except NULL:
         cdef Lexeme* lex
         lex = <Lexeme*>self._dict.get(string.key)
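The new Lexicon.set looks up (or creates) the slot for a string, then overwrites the whole struct in place via lex[0] = lexeme_init(...), keeping the entry's existing index lex.i. A rough Python analogy of that get-then-overwrite pattern (a dict-backed toy, not the commit's C-level code):

    class ToyLexicon:
        """Illustrative analogue of Lexicon: a string-keyed record store."""
        def __init__(self):
            self._dict = {}

        def get(self, string):
            # Like Lexicon.get: create an empty record on first lookup,
            # assigning it the next free index.
            return self._dict.setdefault(string, {'i': len(self._dict)})

        def set(self, string, props):
            # Like Lexicon.set: replace the record's fields in place,
            # but preserve the index the entry was first assigned.
            lex = self.get(string)
            i = lex['i']
            lex.clear()
            lex.update(props)
            lex['i'] = i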
@@ -278,7 +287,7 @@ cdef class Lexicon:
         cdef FILE* fp = fopen(<char*>bytes_loc, 'wb')
         assert fp != NULL
         cdef size_t st
-        for i in range(self.size):
+        for i in range(self.size-1):
             st = fwrite(self.lexemes[i], sizeof(Lexeme), 1, fp)
             assert st == 1
         st = fclose(fp)
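The binary format is as simple as it looks: fixed-size Lexeme structs written back to back, with no header or delimiters, which is what lets the matching fread loop below stream them back in. (The loop bound tightens to self.size-1, dropping one trailing entry; the diff alone doesn't show which slot is being reserved.) The same record-stream idea in Python, with a field layout invented purely for illustration:

    import struct

    # Invented stand-in for the Lexeme struct: three 64-bit fields.
    RECORD = struct.Struct('=QQQ')   # e.g. hash, i, length

    def dump_records(loc, records):
        # Like Lexicon.dump: one fixed-size record per entry, back to back.
        with open(loc, 'wb') as fp:
            for rec in records:
                fp.write(RECORD.pack(*rec))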
@@ -293,11 +302,12 @@ cdef class Lexicon:
         cdef Lexeme* lexeme
         while True:
             lexeme = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
-            st = fread(lexeme, sizeof(lexeme), 1, fp)
-            if st == 0:
+            st = fread(lexeme, sizeof(Lexeme), 1, fp)
+            if st != 1:
                 break
             self.lexemes.push_back(lexeme)
             self._dict.set(lexeme.hash, lexeme)
         fclose(fp)
+
 
 cdef void string_from_unicode(String* s, unicode uni):
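The reader fixes two genuine bugs. sizeof(lexeme) is the size of the pointer variable (8 bytes on a 64-bit build), not of the Lexeme struct, so fread had been pulling in partial records; sizeof(Lexeme) reads whole structs. And since fread returns the number of complete items read, st != 1 also catches short trailing reads, where st == 0 only caught a clean end of file. The same loop in Python, reusing the invented RECORD layout from the dump sketch:

    import struct

    RECORD = struct.Struct('=QQQ')   # same invented layout as the dump sketch

    def load_records(loc):
        # Like the corrected Lexicon.load: read whole fixed-size records
        # until a short read signals the end of the stream.
        records = []
        with open(loc, 'rb') as fp:
            while True:
                buf = fp.read(RECORD.size)
                if len(buf) != RECORD.size:   # analogue of st != 1
                    break
                records.append(RECORD.unpack(buf))
        return records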
spacy/lexeme.pyx

@@ -31,7 +31,6 @@ cpdef Lexeme init(unicode string, hash_t hashed, atom_t i,
     cdef Lexeme lex
     lex.hash = hashed
     lex.i = i
-    print string, i
     lex.length = len(string)
     lex.sic = get_string_id(string, store)
 
spacy/utf8string.pyx

@@ -58,10 +58,12 @@ cdef class StringStore:
         strings = []
         cdef Utf8Str* string
         cdef bytes py_string
+        print "Dump strings"
         for i in range(self.size):
             string = &self.strings[i]
             py_string = string.chars[:string.length]
             strings.append(py_string)
+        print len(strings)
         with open(loc, 'w') as file_:
             ujson.dump(strings, file_, ensure_ascii=False)
 
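Unlike the lexemes, the strings are serialized as plain JSON: StringStore.dump collects every stored byte string into a list and writes it with ujson (the two print statements read as debug output added alongside the feature). A sketch of the round-trip using the standard json module, which behaves the same for this usage:

    import json

    def dump_strings(loc, strings):
        # Like StringStore.dump: the whole store as one JSON array.
        with open(loc, 'w') as file_:
            json.dump(strings, file_, ensure_ascii=False)

    def load_strings(loc):
        # The matching load rebuilds the index -> string table in order.
        with open(loc) as file_:
            return list(json.load(file_))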
spacy/util.py

@@ -13,7 +13,7 @@ def utf8open(loc, mode='r'):
 
 def read_lang_data(name):
     data_dir = path.join(DATA_DIR, name)
-    tokenization = read_tokenization(data_dir)
+    tokenization = read_tokenization(name)
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
     infix = read_infix(data_dir)
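Passing the bare language name rather than the pre-joined data_dir suggests read_tokenization resolves the data directory against DATA_DIR itself; with the old argument, the directory components would have been applied twice.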