This commit is contained in:
Matthew Honnibal 2015-09-09 14:33:26 +02:00
parent d6561988cf
commit a7f4b26c8c

View File

@ -112,7 +112,9 @@ cdef class Vocab:
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL: cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
cdef hash_t key cdef hash_t key
cdef bint is_oov = mem is not self.mem #cdef bint is_oov = mem is not self.mem
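# TODO: OOV detection is disabled for now -- is_oov is hard-coded to False below instead of being derived from `mem is not self.mem`; revisit and restore or remove.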
is_oov = False
mem = self.mem mem = self.mem
if len(string) < 3: if len(string) < 3:
mem = self.mem mem = self.mem
@ -197,7 +199,6 @@ cdef class Vocab:
cdef hash_t key cdef hash_t key
for key, addr in self._by_hash.items(): for key, addr in self._by_hash.items():
lexeme = <LexemeC*>addr lexeme = <LexemeC*>addr
fp.write_from(&lexeme.orth, sizeof(lexeme.orth), 1)
fp.write_from(&lexeme.flags, sizeof(lexeme.flags), 1) fp.write_from(&lexeme.flags, sizeof(lexeme.flags), 1)
fp.write_from(&lexeme.id, sizeof(lexeme.flags), 1) fp.write_from(&lexeme.id, sizeof(lexeme.flags), 1)
fp.write_from(&lexeme.length, sizeof(lexeme.length), 1) fp.write_from(&lexeme.length, sizeof(lexeme.length), 1)
@ -219,17 +220,17 @@ cdef class Vocab:
raise IOError('LexemeCs file not found at %s' % loc) raise IOError('LexemeCs file not found at %s' % loc)
fp = CFile(loc, 'rb') fp = CFile(loc, 'rb')
cdef LexemeC* lexeme cdef LexemeC* lexeme
cdef attr_t orth
cdef hash_t key cdef hash_t key
cdef unicode py_str cdef unicode py_str
cdef attr_t orth
assert sizeof(orth) == sizeof(lexeme.orth) assert sizeof(orth) == sizeof(lexeme.orth)
i = 0 i = 0
while True: while True:
lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
try: try:
fp.read_into(&orth, 1, sizeof(orth)) fp.read_into(&orth, 1, sizeof(orth))
except IOError: except IOError:
break break
lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
# Copy data from the file into the lexeme # Copy data from the file into the lexeme
fp.read_into(&lexeme.flags, 1, sizeof(lexeme.flags)) fp.read_into(&lexeme.flags, 1, sizeof(lexeme.flags))
fp.read_into(&lexeme.id, 1, sizeof(lexeme.id)) fp.read_into(&lexeme.id, 1, sizeof(lexeme.id))
@ -246,10 +247,8 @@ cdef class Vocab:
fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm)) fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm))
lexeme.repvec = EMPTY_VEC lexeme.repvec = EMPTY_VEC
if orth != lexeme.orth: py_str = self.strings[lexeme.orth]
# TODO: Improve this error message, pending resolution to Issue #64 assert py_str[-3:] == self.strings[lexeme.suffix], "%s (%d) suffix %s (%d)" % (repr(py_str), lexeme.orth, repr(self.strings[lexeme.suffix]), lexeme.suffix)
raise IOError('Error reading from lexemes.bin. Integrity check fails.')
py_str = self.strings[orth]
key = hash_string(py_str) key = hash_string(py_str)
self._by_hash.set(key, lexeme) self._by_hash.set(key, lexeme)
self._by_orth.set(lexeme.orth, lexeme) self._by_orth.set(lexeme.orth, lexeme)