mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 10:46:29 +03:00
* Update lexeme serialization, using a binary file format
This commit is contained in:
parent
13909a2e24
commit
67c8c8019f
5
setup.py
5
setup.py
|
@ -6,7 +6,6 @@ import distutils.core
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
import os.path
|
import os.path
|
||||||
import numpy
|
|
||||||
|
|
||||||
from os import path
|
from os import path
|
||||||
from glob import glob
|
from glob import glob
|
||||||
|
@ -35,7 +34,7 @@ compile_args = []
|
||||||
link_args = []
|
link_args = []
|
||||||
libs = []
|
libs = []
|
||||||
|
|
||||||
includes = ['.', numpy.get_include()]
|
includes = ['.']
|
||||||
cython_includes = ['.']
|
cython_includes = ['.']
|
||||||
|
|
||||||
|
|
||||||
|
@ -48,11 +47,11 @@ else:
|
||||||
|
|
||||||
exts = [
|
exts = [
|
||||||
Extension("spacy.lang", ["spacy/lang.pyx"], language="c++", include_dirs=includes),
|
Extension("spacy.lang", ["spacy/lang.pyx"], language="c++", include_dirs=includes),
|
||||||
Extension("spacy.word", ["spacy/word.pyx"], language="c++", include_dirs=includes),
|
|
||||||
Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
|
Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
|
||||||
Extension("spacy.en", ["spacy/en.pyx"], language="c++", include_dirs=includes),
|
Extension("spacy.en", ["spacy/en.pyx"], language="c++", include_dirs=includes),
|
||||||
Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
|
Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
|
||||||
Extension("spacy.pos", ["spacy/pos.pyx"], language="c++", include_dirs=includes),
|
Extension("spacy.pos", ["spacy/pos.pyx"], language="c++", include_dirs=includes),
|
||||||
|
Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -45,6 +45,8 @@ cdef class Language:
|
||||||
self.suffix_re = re.compile(suffix)
|
self.suffix_re = re.compile(suffix)
|
||||||
self.infix_re = re.compile(infix)
|
self.infix_re = re.compile(infix)
|
||||||
self.lexicon = Lexicon(lexemes)
|
self.lexicon = Lexicon(lexemes)
|
||||||
|
self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))
|
||||||
|
self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
|
||||||
self._load_special_tokenization(rules)
|
self._load_special_tokenization(rules)
|
||||||
|
|
||||||
cpdef Tokens tokenize(self, unicode string):
|
cpdef Tokens tokenize(self, unicode string):
|
||||||
|
@ -244,6 +246,13 @@ cdef class Lexicon:
|
||||||
self.lexemes.push_back(lexeme)
|
self.lexemes.push_back(lexeme)
|
||||||
self.size += 1
|
self.size += 1
|
||||||
|
|
||||||
|
def set(self, unicode py_string, dict lexeme_dict):
|
||||||
|
cdef String string
|
||||||
|
string_from_unicode(&string, py_string)
|
||||||
|
cdef Lexeme* lex = self.get(&string)
|
||||||
|
lex[0] = lexeme_init(string.chars[:string.n], string.key, lex.i,
|
||||||
|
self.strings, lexeme_dict)
|
||||||
|
|
||||||
cdef Lexeme* get(self, String* string) except NULL:
|
cdef Lexeme* get(self, String* string) except NULL:
|
||||||
cdef Lexeme* lex
|
cdef Lexeme* lex
|
||||||
lex = <Lexeme*>self._dict.get(string.key)
|
lex = <Lexeme*>self._dict.get(string.key)
|
||||||
|
@ -278,7 +287,7 @@ cdef class Lexicon:
|
||||||
cdef FILE* fp = fopen(<char*>bytes_loc, 'wb')
|
cdef FILE* fp = fopen(<char*>bytes_loc, 'wb')
|
||||||
assert fp != NULL
|
assert fp != NULL
|
||||||
cdef size_t st
|
cdef size_t st
|
||||||
for i in range(self.size):
|
for i in range(self.size-1):
|
||||||
st = fwrite(self.lexemes[i], sizeof(Lexeme), 1, fp)
|
st = fwrite(self.lexemes[i], sizeof(Lexeme), 1, fp)
|
||||||
assert st == 1
|
assert st == 1
|
||||||
st = fclose(fp)
|
st = fclose(fp)
|
||||||
|
@ -293,11 +302,12 @@ cdef class Lexicon:
|
||||||
cdef Lexeme* lexeme
|
cdef Lexeme* lexeme
|
||||||
while True:
|
while True:
|
||||||
lexeme = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
|
lexeme = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
|
||||||
st = fread(lexeme, sizeof(lexeme), 1, fp)
|
st = fread(lexeme, sizeof(Lexeme), 1, fp)
|
||||||
if st == 0:
|
if st != 1:
|
||||||
break
|
break
|
||||||
self.lexemes.push_back(lexeme)
|
self.lexemes.push_back(lexeme)
|
||||||
self._dict.set(lexeme.hash, lexeme)
|
self._dict.set(lexeme.hash, lexeme)
|
||||||
|
fclose(fp)
|
||||||
|
|
||||||
|
|
||||||
cdef void string_from_unicode(String* s, unicode uni):
|
cdef void string_from_unicode(String* s, unicode uni):
|
||||||
|
|
|
@ -31,7 +31,6 @@ cpdef Lexeme init(unicode string, hash_t hashed, atom_t i,
|
||||||
cdef Lexeme lex
|
cdef Lexeme lex
|
||||||
lex.hash = hashed
|
lex.hash = hashed
|
||||||
lex.i = i
|
lex.i = i
|
||||||
print string, i
|
|
||||||
lex.length = len(string)
|
lex.length = len(string)
|
||||||
lex.sic = get_string_id(string, store)
|
lex.sic = get_string_id(string, store)
|
||||||
|
|
||||||
|
|
|
@ -58,10 +58,12 @@ cdef class StringStore:
|
||||||
strings = []
|
strings = []
|
||||||
cdef Utf8Str* string
|
cdef Utf8Str* string
|
||||||
cdef bytes py_string
|
cdef bytes py_string
|
||||||
|
print "Dump strings"
|
||||||
for i in range(self.size):
|
for i in range(self.size):
|
||||||
string = &self.strings[i]
|
string = &self.strings[i]
|
||||||
py_string = string.chars[:string.length]
|
py_string = string.chars[:string.length]
|
||||||
strings.append(py_string)
|
strings.append(py_string)
|
||||||
|
print len(strings)
|
||||||
with open(loc, 'w') as file_:
|
with open(loc, 'w') as file_:
|
||||||
ujson.dump(strings, file_, ensure_ascii=False)
|
ujson.dump(strings, file_, ensure_ascii=False)
|
||||||
|
|
||||||
|
|
|
@ -13,7 +13,7 @@ def utf8open(loc, mode='r'):
|
||||||
|
|
||||||
def read_lang_data(name):
|
def read_lang_data(name):
|
||||||
data_dir = path.join(DATA_DIR, name)
|
data_dir = path.join(DATA_DIR, name)
|
||||||
tokenization = read_tokenization(data_dir)
|
tokenization = read_tokenization(name)
|
||||||
prefix = read_prefix(data_dir)
|
prefix = read_prefix(data_dir)
|
||||||
suffix = read_suffix(data_dir)
|
suffix = read_suffix(data_dir)
|
||||||
infix = read_infix(data_dir)
|
infix = read_infix(data_dir)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user