* Reinstate the fixed vocabulary --- words are only added to the lexicon in init_model; after that, we create LexemeC structs with the Pool given to us.

This commit is contained in:
Matthew Honnibal 2015-07-20 01:37:34 +02:00
parent 5a7d060d9c
commit 1f7170e0e1

View File

@ -55,7 +55,8 @@ cdef class Vocab:
if load_vectors and path.exists(path.join(data_dir, 'vec.bin')): if load_vectors and path.exists(path.join(data_dir, 'vec.bin')):
self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin')) self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
self.packer = Packer(self, util.read_encoding_freqs(data_dir)) #self.packer = Packer(self, util.read_encoding_freqs(data_dir))
self.packer = None
def __len__(self): def __len__(self):
"""The current number of lexemes stored.""" """The current number of lexemes stored."""
@ -69,18 +70,17 @@ cdef class Vocab:
lex = <LexemeC*>self._by_hash.get(c_str.key) lex = <LexemeC*>self._by_hash.get(c_str.key)
if lex != NULL: if lex != NULL:
return lex return lex
#if c_str.n < 3: cdef bint is_oov = mem is not self.mem
oov = mem is not self.mem if c_str.n < 3:
mem = self.mem mem = self.mem
cdef unicode py_str = c_str.chars[:c_str.n] cdef unicode py_str = c_str.chars[:c_str.n]
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1) lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
props = self.lexeme_props_getter(py_str) props = self.lexeme_props_getter(py_str)
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC) set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
#if mem is self.mem: if is_oov:
#else:
if oov:
lex.id = 0 lex.id = 0
self._add_lex_to_vocab(c_str.key, lex) else:
self._add_lex_to_vocab(c_str.key, lex)
return lex return lex
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1: cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1: