From a7f4b26c8ca9cc4c5bd7a03aa57e06b053f34ce9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 9 Sep 2015 14:33:26 +0200 Subject: [PATCH] * Tmp --- spacy/vocab.pyx | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 5da29439b..571a37da9 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -112,7 +112,9 @@ cdef class Vocab: cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL: cdef hash_t key - cdef bint is_oov = mem is not self.mem + #cdef bint is_oov = mem is not self.mem + # TODO + is_oov = False mem = self.mem if len(string) < 3: mem = self.mem @@ -197,7 +199,6 @@ cdef class Vocab: cdef hash_t key for key, addr in self._by_hash.items(): lexeme = addr - fp.write_from(&lexeme.orth, sizeof(lexeme.orth), 1) fp.write_from(&lexeme.flags, sizeof(lexeme.flags), 1) fp.write_from(&lexeme.id, sizeof(lexeme.flags), 1) fp.write_from(&lexeme.length, sizeof(lexeme.length), 1) @@ -219,17 +220,17 @@ cdef class Vocab: raise IOError('LexemeCs file not found at %s' % loc) fp = CFile(loc, 'rb') cdef LexemeC* lexeme - cdef attr_t orth cdef hash_t key cdef unicode py_str + cdef attr_t orth assert sizeof(orth) == sizeof(lexeme.orth) i = 0 while True: - lexeme = self.mem.alloc(sizeof(LexemeC), 1) try: fp.read_into(&orth, 1, sizeof(orth)) except IOError: break + lexeme = self.mem.alloc(sizeof(LexemeC), 1) # Copy data from the file into the lexeme fp.read_into(&lexeme.flags, 1, sizeof(lexeme.flags)) fp.read_into(&lexeme.id, 1, sizeof(lexeme.id)) @@ -246,10 +247,8 @@ cdef class Vocab: fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm)) lexeme.repvec = EMPTY_VEC - if orth != lexeme.orth: - # TODO: Improve this error message, pending resolution to Issue #64 - raise IOError('Error reading from lexemes.bin. Integrity check fails.') - py_str = self.strings[orth] + py_str = self.strings[lexeme.orth] + assert py_str[-3:] == self.strings[lexeme.suffix], "%s (%d) suffix %s (%d)" % (repr(py_str), lexeme.orth, repr(self.strings[lexeme.suffix]), lexeme.suffix) key = hash_string(py_str) self._by_hash.set(key, lexeme) self._by_orth.set(lexeme.orth, lexeme)