mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 02:36:32 +03:00
* Change lexemes.bin format. Add a header specifying the size of LexemeC and the number of lexemes, and stop writing the redundant orth value before each lexeme record.
This commit is contained in:
parent
6047f2aa35
commit
6b586cdad4
|
@ -3,7 +3,7 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
|
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
|
||||||
from libc.string cimport memset
|
from libc.string cimport memset
|
||||||
from libc.stdint cimport int32_t
|
from libc.stdint cimport int32_t, uint64_t
|
||||||
|
|
||||||
import bz2
|
import bz2
|
||||||
from os import path
|
from os import path
|
||||||
|
@ -186,12 +186,17 @@ cdef class Vocab:
|
||||||
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
|
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
|
||||||
|
|
||||||
cdef CFile fp = CFile(bytes_loc, 'wb')
|
cdef CFile fp = CFile(bytes_loc, 'wb')
|
||||||
cdef size_t st
|
|
||||||
|
cdef uint64_t size_of_lexeme = sizeof(LexemeC)
|
||||||
|
items = list(self._by_hash.items())
|
||||||
|
cdef uint64_t n_lexemes = len(items)
|
||||||
|
fp.write_from(&size_of_lexeme, 1, sizeof(size_of_lexeme))
|
||||||
|
fp.write_from(&n_lexemes, 1, sizeof(n_lexemes))
|
||||||
|
|
||||||
cdef size_t addr
|
cdef size_t addr
|
||||||
cdef hash_t key
|
cdef hash_t key
|
||||||
for key, addr in self._by_hash.items():
|
for key, addr in items:
|
||||||
lexeme = <LexemeC*>addr
|
lexeme = <LexemeC*>addr
|
||||||
fp.write_from(&lexeme.orth, sizeof(lexeme.orth), 1)
|
|
||||||
fp.write_from(lexeme, sizeof(LexemeC), 1)
|
fp.write_from(lexeme, sizeof(LexemeC), 1)
|
||||||
fp.close()
|
fp.close()
|
||||||
|
|
||||||
|
@ -199,36 +204,28 @@ cdef class Vocab:
|
||||||
self.strings.load(strings_loc)
|
self.strings.load(strings_loc)
|
||||||
if not path.exists(loc):
|
if not path.exists(loc):
|
||||||
raise IOError('LexemeCs file not found at %s' % loc)
|
raise IOError('LexemeCs file not found at %s' % loc)
|
||||||
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
|
|
||||||
cdef FILE* fp = fopen(<char*>bytes_loc, b'rb')
|
cdef CFile fp = CFile(loc, 'rb')
|
||||||
if fp == NULL:
|
|
||||||
raise IOError('lexemes data file present, but cannot open from ' % loc)
|
cdef uint64_t size_of_lexeme
|
||||||
cdef size_t st
|
cdef uint64_t n_lexemes
|
||||||
cdef LexemeC* lexeme
|
fp.read_into(&size_of_lexeme, 1, sizeof(size_of_lexeme))
|
||||||
cdef attr_t orth
|
assert size_of_lexeme == sizeof(LexemeC)
|
||||||
|
fp.read_into(&n_lexemes, 1, sizeof(n_lexemes))
|
||||||
|
|
||||||
|
lexemes = <LexemeC*>self.mem.alloc(n_lexemes, sizeof(LexemeC))
|
||||||
cdef hash_t key
|
cdef hash_t key
|
||||||
cdef unicode py_str
|
cdef unicode py_str
|
||||||
i = 0
|
cdef int i
|
||||||
while True:
|
self.length = n_lexemes
|
||||||
st = fread(&orth, sizeof(orth), 1, fp)
|
for i in range(n_lexemes):
|
||||||
if st != 1:
|
fp.read_into(&lexemes[i], sizeof(LexemeC), 1)
|
||||||
break
|
lexemes[i].repvec = EMPTY_VEC
|
||||||
lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
|
py_str = self.strings[lexemes[i].orth]
|
||||||
# Copies data from the file into the lexeme
|
|
||||||
st = fread(lexeme, sizeof(LexemeC), 1, fp)
|
|
||||||
lexeme.repvec = EMPTY_VEC
|
|
||||||
if st != 1:
|
|
||||||
break
|
|
||||||
if orth != lexeme.orth:
|
|
||||||
# TODO: Improve this error message, pending resolution to Issue #64
|
|
||||||
raise IOError('Error reading from lexemes.bin. Integrity check fails.')
|
|
||||||
py_str = self.strings[orth]
|
|
||||||
key = hash_string(py_str)
|
key = hash_string(py_str)
|
||||||
self._by_hash.set(key, lexeme)
|
self._by_hash.set(key, &lexemes[i])
|
||||||
self._by_orth.set(lexeme.orth, lexeme)
|
self._by_orth.set(lexemes[i].orth, &lexemes[i])
|
||||||
self.length += 1
|
assert lexemes[i].length == len(py_str)
|
||||||
i += 1
|
|
||||||
fclose(fp)
|
|
||||||
|
|
||||||
def load_rep_vectors(self, loc):
|
def load_rep_vectors(self, loc):
|
||||||
cdef CFile file_ = CFile(loc, b'rb')
|
cdef CFile file_ = CFile(loc, b'rb')
|
||||||
|
|
Loading…
Reference in New Issue
Block a user