* Fix platform-specific lexicon bug.

This commit is contained in:
Matthew Honnibal 2015-01-31 16:38:58 +11:00
parent a1ed574b7b
commit ce3ae8b5d9

View File

@ -12,6 +12,7 @@ from .lexeme cimport Lexeme
from .strings cimport slice_unicode from .strings cimport slice_unicode
from .strings cimport hash_string from .strings cimport hash_string
from .orth cimport word_shape from .orth cimport word_shape
from .typedefs cimport attr_t
from cymem.cymem cimport Address from cymem.cymem cimport Address
@ -41,8 +42,8 @@ cdef class Vocab:
if data_dir is not None: if data_dir is not None:
if not path.isdir(data_dir): if not path.isdir(data_dir):
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir) raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
self.strings.load(path.join(data_dir, 'strings.txt')) self.load_lexemes(path.join(data_dir, 'strings.txt'),
self.load_lexemes(path.join(data_dir, 'lexemes.bin')) path.join(data_dir, 'lexemes.bin'))
if path.exists(path.join(data_dir, 'vec.bin')): if path.exists(path.join(data_dir, 'vec.bin')):
self.load_rep_vectors(path.join(data_dir, 'vec.bin')) self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
@ -129,14 +130,15 @@ cdef class Vocab:
if key == 0: if key == 0:
continue continue
lexeme = <LexemeC*>self._map.c_map.cells[i].value lexeme = <LexemeC*>self._map.c_map.cells[i].value
st = fwrite(&key, sizeof(key), 1, fp) st = fwrite(&lexeme.orth, sizeof(lexeme.orth), 1, fp)
assert st == 1 assert st == 1
st = fwrite(lexeme, sizeof(LexemeC), 1, fp) st = fwrite(lexeme, sizeof(LexemeC), 1, fp)
assert st == 1 assert st == 1
st = fclose(fp) st = fclose(fp)
assert st == 0 assert st == 0
def load_lexemes(self, loc): def load_lexemes(self, strings_loc, loc):
self.strings.load(strings_loc)
if not path.exists(loc): if not path.exists(loc):
raise IOError('LexemeCs file not found at %s' % loc) raise IOError('LexemeCs file not found at %s' % loc)
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
@ -144,10 +146,12 @@ cdef class Vocab:
assert fp != NULL assert fp != NULL
cdef size_t st cdef size_t st
cdef LexemeC* lexeme cdef LexemeC* lexeme
cdef attr_t orth
cdef hash_t key cdef hash_t key
cdef unicode py_str
i = 0 i = 0
while True: while True:
st = fread(&key, sizeof(key), 1, fp) st = fread(&orth, sizeof(orth), 1, fp)
if st != 1: if st != 1:
break break
lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1) lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
@ -156,6 +160,9 @@ cdef class Vocab:
lexeme.repvec = EMPTY_VEC lexeme.repvec = EMPTY_VEC
if st != 1: if st != 1:
break break
assert orth == lexeme.orth
py_str = self.strings[orth]
key = hash_string(py_str)
self._map.set(key, lexeme) self._map.set(key, lexeme)
while self.lexemes.size() < (lexeme.id + 1): while self.lexemes.size() < (lexeme.id + 1):
self.lexemes.push_back(&EMPTY_LEXEME) self.lexemes.push_back(&EMPTY_LEXEME)