* Index lexemes by orth, instead of a lexemes vector. Breaks the mechanism for deciding not to own LexemeC structs during parsing. Need to reinstate this.

This commit is contained in:
Matthew Honnibal 2015-07-18 22:42:15 +02:00
parent 4dddc8a69b
commit 82d84b0f2b
2 changed files with 46 additions and 36 deletions

View File

@ -27,11 +27,12 @@ cdef class Vocab:
cpdef public lexeme_props_getter cpdef public lexeme_props_getter
cdef Pool mem cdef Pool mem
cpdef readonly StringStore strings cpdef readonly StringStore strings
cdef vector[const LexemeC*] lexemes
cdef readonly object pos_tags cdef readonly object pos_tags
cdef readonly int length
cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
cdef PreshMap _map cdef PreshMap _by_hash
cdef PreshMap _by_orth
cdef readonly int repvec_length cdef readonly int repvec_length

View File

@ -33,12 +33,15 @@ cdef class Vocab:
def __init__(self, data_dir=None, get_lex_props=None, load_vectors=True, def __init__(self, data_dir=None, get_lex_props=None, load_vectors=True,
pos_tags=None): pos_tags=None):
self.mem = Pool() self.mem = Pool()
self._map = PreshMap(2 ** 20) self._by_hash = PreshMap()
self._by_orth = PreshMap()
self.strings = StringStore() self.strings = StringStore()
self.pos_tags = pos_tags if pos_tags is not None else {} self.pos_tags = pos_tags if pos_tags is not None else {}
self.lexemes.push_back(&EMPTY_LEXEME)
self.lexeme_props_getter = get_lex_props self.lexeme_props_getter = get_lex_props
self.repvec_length = 0 self.repvec_length = 0
self.length = 0
self._add_lex_to_vocab(0, &EMPTY_LEXEME)
if data_dir is not None: if data_dir is not None:
if not path.exists(data_dir): if not path.exists(data_dir):
raise IOError("Directory %s not found -- cannot load Vocab." % data_dir) raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
@ -52,34 +55,40 @@ cdef class Vocab:
def __len__(self): def __len__(self):
"""The current number of lexemes stored.""" """The current number of lexemes stored."""
return self.lexemes.size() return self.length
cdef const LexemeC* get(self, Pool mem, UniStr* c_str) except NULL: cdef const LexemeC* get(self, Pool mem, UniStr* c_str) except NULL:
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
if necessary, using memory acquired from the given pool. If the pool if necessary, using memory acquired from the given pool. If the pool
is the lexicon's own memory, the lexeme is saved in the lexicon.''' is the lexicon's own memory, the lexeme is saved in the lexicon.'''
cdef LexemeC* lex cdef LexemeC* lex
lex = <LexemeC*>self._map.get(c_str.key) lex = <LexemeC*>self._by_hash.get(c_str.key)
if lex != NULL: if lex != NULL:
return lex return lex
if c_str.n < 3: #if c_str.n < 3:
mem = self.mem oov = mem is not self.mem
mem = self.mem
cdef unicode py_str = c_str.chars[:c_str.n] cdef unicode py_str = c_str.chars[:c_str.n]
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1) lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
props = self.lexeme_props_getter(py_str) props = self.lexeme_props_getter(py_str)
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC) set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
if mem is self.mem: #if mem is self.mem:
lex.id = self.lexemes.size() #else:
self._add_lex_to_vocab(c_str.key, lex) if oov:
else: lex.id = 0
lex.id = 1 self._add_lex_to_vocab(c_str.key, lex)
return lex return lex
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1: cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
self._map.set(key, <void*>lex) self._by_hash.set(key, <void*>lex)
while self.lexemes.size() < (lex.id + 1): self._by_orth.set(lex.orth, <void*>lex)
self.lexemes.push_back(&EMPTY_LEXEME) self.length += 1
self.lexemes[lex.id] = lex
def __iter__(self):
cdef attr_t orth
cdef size_t addr
for orth, addr in self._by_orth.items():
yield Lexeme.from_ptr(<LexemeC*>addr, self.strings, self.repvec_length)
def __getitem__(self, id_or_string): def __getitem__(self, id_or_string):
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously '''Retrieve a lexeme, given an int ID or a unicode string. If a previously
@ -98,13 +107,17 @@ cdef class Vocab:
''' '''
cdef UniStr c_str cdef UniStr c_str
cdef const LexemeC* lexeme cdef const LexemeC* lexeme
cdef attr_t orth
if type(id_or_string) == int: if type(id_or_string) == int:
if id_or_string >= self.lexemes.size(): orth = id_or_string
raise IndexError lexeme = <LexemeC*>self._by_orth.get(orth)
lexeme = self.lexemes.at(id_or_string) if lexeme == NULL:
raise KeyError(id_or_string)
assert lexeme.orth == orth, ('%d vs %d' % (lexeme.orth, orth))
elif type(id_or_string) == unicode: elif type(id_or_string) == unicode:
slice_unicode(&c_str, id_or_string, 0, len(id_or_string)) slice_unicode(&c_str, id_or_string, 0, len(id_or_string))
lexeme = self.get(self.mem, &c_str) lexeme = self.get(self.mem, &c_str)
assert lexeme.orth == self.strings[id_or_string]
else: else:
raise ValueError("Vocab unable to map type: " raise ValueError("Vocab unable to map type: "
"%s. Maps unicode --> Lexeme or " "%s. Maps unicode --> Lexeme or "
@ -115,12 +128,11 @@ cdef class Vocab:
cdef UniStr c_str cdef UniStr c_str
slice_unicode(&c_str, py_str, 0, len(py_str)) slice_unicode(&c_str, py_str, 0, len(py_str))
cdef LexemeC* lex cdef LexemeC* lex
lex = <LexemeC*>self._map.get(c_str.key) lex = <LexemeC*>self._by_hash.get(c_str.key)
if lex == NULL: if lex == NULL:
lex = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1) lex = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
lex.id = self.lexemes.size()
self._add_lex_to_vocab(c_str.key, lex)
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC) set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
self._add_lex_to_vocab(c_str.key, lex)
def dump(self, loc): def dump(self, loc):
if path.exists(loc): if path.exists(loc):
@ -129,12 +141,10 @@ cdef class Vocab:
cdef FILE* fp = fopen(<char*>bytes_loc, 'wb') cdef FILE* fp = fopen(<char*>bytes_loc, 'wb')
assert fp != NULL assert fp != NULL
cdef size_t st cdef size_t st
cdef size_t addr
cdef hash_t key cdef hash_t key
for i in range(self._map.length): for key, addr in self._by_hash.items():
key = self._map.c_map.cells[i].key lexeme = <LexemeC*>addr
if key == 0:
continue
lexeme = <LexemeC*>self._map.c_map.cells[i].value
st = fwrite(&lexeme.orth, sizeof(lexeme.orth), 1, fp) st = fwrite(&lexeme.orth, sizeof(lexeme.orth), 1, fp)
assert st == 1 assert st == 1
st = fwrite(lexeme, sizeof(LexemeC), 1, fp) st = fwrite(lexeme, sizeof(LexemeC), 1, fp)
@ -171,10 +181,9 @@ cdef class Vocab:
raise IOError('Error reading from lexemes.bin. Integrity check fails.') raise IOError('Error reading from lexemes.bin. Integrity check fails.')
py_str = self.strings[orth] py_str = self.strings[orth]
key = hash_string(py_str) key = hash_string(py_str)
self._map.set(key, lexeme) self._by_hash.set(key, lexeme)
while self.lexemes.size() < (lexeme.id + 1): self._by_orth.set(lexeme.orth, lexeme)
self.lexemes.push_back(&EMPTY_LEXEME) self.length += 1
self.lexemes[lexeme.id] = lexeme
i += 1 i += 1
fclose(fp) fclose(fp)
@ -185,7 +194,7 @@ cdef class Vocab:
cdef int32_t prev_vec_len = 0 cdef int32_t prev_vec_len = 0
cdef float* vec cdef float* vec
cdef Address mem cdef Address mem
cdef id_t string_id cdef attr_t string_id
cdef bytes py_word cdef bytes py_word
cdef vector[float*] vectors cdef vector[float*] vectors
cdef int i cdef int i
@ -212,9 +221,9 @@ cdef class Vocab:
assert vec != NULL assert vec != NULL
vectors[string_id] = vec vectors[string_id] = vec
cdef LexemeC* lex cdef LexemeC* lex
for i in range(self.lexemes.size()): cdef size_t lex_addr
# Cast away the const, cos we can modify our lexemes for orth, lex_addr in self._by_orth.items():
lex = <LexemeC*>self.lexemes[i] lex = <LexemeC*>lex_addr
if lex.lower < vectors.size(): if lex.lower < vectors.size():
lex.repvec = vectors[lex.lower] lex.repvec = vectors[lex.lower]
for i in range(vec_len): for i in range(vec_len):