* Index lexemes by orth, instead of a lexemes vector. Breaks the mechanism for deciding not to own LexemeC structs during parsing. Need to reinstate this.
parent 4dddc8a69b
commit 82d84b0f2b
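The change replaces the id-indexed lexemes vector with two hash tables: _by_hash, keyed by the hash of the string, and _by_orth, keyed by the orth id, with a separate length counter. A rough pure-Python sketch of that two-map scheme (an illustration only: MiniVocab, Lex and plain dicts are stand-ins for the real Vocab, LexemeC and PreshMap, and hash() stands in for hash_string()):

# Pure-Python sketch of the two-map indexing scheme; not the Cython code.
from dataclasses import dataclass


@dataclass
class Lex:
    orth: int        # id of the original string in the string store
    lower: int = 0   # id of the lowercased form


class MiniVocab:
    def __init__(self):
        self._by_hash = {}   # hash(string) -> Lex
        self._by_orth = {}   # orth id      -> Lex
        self._strings = {}   # string       -> orth id (toy StringStore)
        self.length = 0

    def _add_lex_to_vocab(self, key, lex):
        # Index the same record under both keys, as _add_lex_to_vocab does.
        self._by_hash[key] = lex
        self._by_orth[lex.orth] = lex
        self.length += 1

    def get(self, string):
        key = hash(string)
        lex = self._by_hash.get(key)
        if lex is not None:
            return lex
        orth = self._strings.setdefault(string, len(self._strings) + 1)
        lex = Lex(orth=orth)
        self._add_lex_to_vocab(key, lex)
        return lex

    def __len__(self):
        return self.length

    def __getitem__(self, orth):
        # Integer lookup is now by orth id, not by position in a vector.
        lex = self._by_orth.get(orth)
        if lex is None:
            raise KeyError(orth)
        return lex

    def __iter__(self):
        return iter(self._by_orth.values())


if __name__ == '__main__':
    vocab = MiniVocab()
    apple = vocab.get('apple')
    assert vocab[apple.orth] is apple and len(vocab) == 1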
@@ -27,11 +27,12 @@ cdef class Vocab:
     cpdef public lexeme_props_getter
     cdef Pool mem
     cpdef readonly StringStore strings
-    cdef vector[const LexemeC*] lexemes
     cdef readonly object pos_tags
+    cdef readonly int length

     cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1

-    cdef PreshMap _map
+    cdef PreshMap _by_hash
+    cdef PreshMap _by_orth
     cdef readonly int repvec_length
@@ -33,12 +33,15 @@ cdef class Vocab:
     def __init__(self, data_dir=None, get_lex_props=None, load_vectors=True,
                  pos_tags=None):
         self.mem = Pool()
-        self._map = PreshMap(2 ** 20)
+        self._by_hash = PreshMap()
+        self._by_orth = PreshMap()
         self.strings = StringStore()
         self.pos_tags = pos_tags if pos_tags is not None else {}
-        self.lexemes.push_back(&EMPTY_LEXEME)
         self.lexeme_props_getter = get_lex_props
         self.repvec_length = 0
+        self.length = 0
+        self._add_lex_to_vocab(0, &EMPTY_LEXEME)
         if data_dir is not None:
             if not path.exists(data_dir):
                 raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
@@ -52,34 +55,40 @@ cdef class Vocab:

     def __len__(self):
         """The current number of lexemes stored."""
-        return self.lexemes.size()
+        return self.length

     cdef const LexemeC* get(self, Pool mem, UniStr* c_str) except NULL:
         '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
         if necessary, using memory acquired from the given pool. If the pool
         is the lexicon's own memory, the lexeme is saved in the lexicon.'''
         cdef LexemeC* lex
-        lex = <LexemeC*>self._map.get(c_str.key)
+        lex = <LexemeC*>self._by_hash.get(c_str.key)
         if lex != NULL:
             return lex
-        if c_str.n < 3:
-            mem = self.mem
+        #if c_str.n < 3:
+        oov = mem is not self.mem
+        mem = self.mem
         cdef unicode py_str = c_str.chars[:c_str.n]
         lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
         props = self.lexeme_props_getter(py_str)
         set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
-        if mem is self.mem:
-            lex.id = self.lexemes.size()
-            self._add_lex_to_vocab(c_str.key, lex)
-        else:
-            lex.id = 1
+        #if mem is self.mem:
+        #else:
+        if oov:
+            lex.id = 0
+        self._add_lex_to_vocab(c_str.key, lex)
         return lex

     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
-        self._map.set(key, <void*>lex)
-        while self.lexemes.size() < (lex.id + 1):
-            self.lexemes.push_back(&EMPTY_LEXEME)
-        self.lexemes[lex.id] = lex
+        self._by_hash.set(key, <void*>lex)
+        self._by_orth.set(lex.orth, <void*>lex)
+        self.length += 1
+
+    def __iter__(self):
+        cdef attr_t orth
+        cdef size_t addr
+        for orth, addr in self._by_orth.items():
+            yield Lexeme.from_ptr(<LexemeC*>addr, self.strings, self.repvec_length)

     def __getitem__(self, id_or_string):
         '''Retrieve a lexeme, given an int ID or a unicode string. If a previously
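The get() rewrite above is where the ownership mechanism breaks: previously a lexeme built from a caller-supplied pool was given id 1 and never inserted into the lexicon, whereas now every lexeme is allocated from the vocab's own memory and registered through _add_lex_to_vocab. A hypothetical pure-Python contrast of the two behaviours (Pool, get_old and get_new are toy stand-ins under that assumption, not the Cython code):

# Toy contrast of the ownership logic before and after this commit.
class Pool:
    """Stand-in for cymem's Pool: only identity matters for the check."""


def get_old(vocab, pool, string):
    # Old behaviour: lexemes built from a foreign pool are NOT stored.
    if string in vocab['lexemes']:
        return vocab['lexemes'][string]
    if len(string) < 3:
        pool = vocab['mem']                 # short strings always go in the vocab
    lex = {'string': string, 'id': None, 'pool': pool}
    if pool is vocab['mem']:
        lex['id'] = len(vocab['lexemes'])
        vocab['lexemes'][string] = lex      # vocab owns and indexes it
    else:
        lex['id'] = 1                       # transient, owned by the caller's pool
    return lex


def get_new(vocab, pool, string):
    # New behaviour: the lexeme is always built from vocab memory and always
    # indexed, which is the "broken" mechanism the commit message flags.
    if string in vocab['lexemes']:
        return vocab['lexemes'][string]
    oov = pool is not vocab['mem']
    lex = {'string': string, 'id': 0 if oov else None, 'pool': vocab['mem']}
    vocab['lexemes'][string] = lex
    return lex


if __name__ == '__main__':
    vocab = {'mem': Pool(), 'lexemes': {}}
    doc_pool = Pool()
    get_old(vocab, doc_pool, 'transient')    # not stored
    get_new(vocab, doc_pool, 'persistent')   # stored despite the foreign pool
    assert 'transient' not in vocab['lexemes']
    assert 'persistent' in vocab['lexemes']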
@@ -98,13 +107,17 @@ cdef class Vocab:
         '''
         cdef UniStr c_str
         cdef const LexemeC* lexeme
+        cdef attr_t orth
         if type(id_or_string) == int:
-            if id_or_string >= self.lexemes.size():
-                raise IndexError
-            lexeme = self.lexemes.at(id_or_string)
+            orth = id_or_string
+            lexeme = <LexemeC*>self._by_orth.get(orth)
+            if lexeme == NULL:
+                raise KeyError(id_or_string)
+            assert lexeme.orth == orth, ('%d vs %d' % (lexeme.orth, orth))
         elif type(id_or_string) == unicode:
             slice_unicode(&c_str, id_or_string, 0, len(id_or_string))
             lexeme = self.get(self.mem, &c_str)
+            assert lexeme.orth == self.strings[id_or_string]
         else:
             raise ValueError("Vocab unable to map type: "
                              "%s. Maps unicode --> Lexeme or "
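With this __getitem__ change, an integer key is interpreted as an orth id and looked up in _by_orth, so a miss raises KeyError instead of the old IndexError for out-of-range dense ids. A usage sketch against the MiniVocab toy from above (assumes that sketch has been run; the real Vocab wraps LexemeC pointers in Lexeme objects instead):

vocab = MiniVocab()
lex = vocab.get('apple')

assert vocab[lex.orth] is lex    # the int key is an orth id, not a row index
try:
    vocab[999999]                # unknown orth id
except KeyError:
    pass                         # an out-of-range id used to raise IndexError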
@@ -115,12 +128,11 @@ cdef class Vocab:
         cdef UniStr c_str
         slice_unicode(&c_str, py_str, 0, len(py_str))
         cdef LexemeC* lex
-        lex = <LexemeC*>self._map.get(c_str.key)
+        lex = <LexemeC*>self._by_hash.get(c_str.key)
         if lex == NULL:
             lex = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
-            lex.id = self.lexemes.size()
-            self._add_lex_to_vocab(c_str.key, lex)
         set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
+        self._add_lex_to_vocab(c_str.key, lex)

     def dump(self, loc):
         if path.exists(loc):
@@ -129,12 +141,10 @@ cdef class Vocab:
         cdef FILE* fp = fopen(<char*>bytes_loc, 'wb')
         assert fp != NULL
         cdef size_t st
+        cdef size_t addr
         cdef hash_t key
-        for i in range(self._map.length):
-            key = self._map.c_map.cells[i].key
-            if key == 0:
-                continue
-            lexeme = <LexemeC*>self._map.c_map.cells[i].value
+        for key, addr in self._by_hash.items():
+            lexeme = <LexemeC*>addr
             st = fwrite(&lexeme.orth, sizeof(lexeme.orth), 1, fp)
             assert st == 1
             st = fwrite(lexeme, sizeof(LexemeC), 1, fp)
@@ -171,10 +181,9 @@ cdef class Vocab:
                 raise IOError('Error reading from lexemes.bin. Integrity check fails.')
             py_str = self.strings[orth]
             key = hash_string(py_str)
-            self._map.set(key, lexeme)
-            while self.lexemes.size() < (lexeme.id + 1):
-                self.lexemes.push_back(&EMPTY_LEXEME)
-            self.lexemes[lexeme.id] = lexeme
+            self._by_hash.set(key, lexeme)
+            self._by_orth.set(lexeme.orth, lexeme)
+            self.length += 1
             i += 1
         fclose(fp)
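dump() and the loader above keep the same on-disk framing: each record is the lexeme's orth followed by the raw LexemeC struct, and loading now repopulates both maps plus the length counter. A hypothetical Python sketch of that framing (the 8-byte orth and 64-byte payload are assumed placeholder sizes, not the real sizeof(lexeme.orth) and sizeof(LexemeC)):

# Toy reader/writer for the lexemes.bin record framing.
import struct

ORTH_FMT = '<Q'   # assumed 8-byte unsigned orth id
LEX_SIZE = 64     # placeholder for sizeof(LexemeC)


def dump(path, lexemes):
    with open(path, 'wb') as fp:
        for orth, payload in lexemes:
            fp.write(struct.pack(ORTH_FMT, orth))
            fp.write(payload)


def load(path):
    lexemes = []
    with open(path, 'rb') as fp:
        while True:
            head = fp.read(struct.calcsize(ORTH_FMT))
            if not head:
                break
            (orth,) = struct.unpack(ORTH_FMT, head)
            payload = fp.read(LEX_SIZE)
            if len(payload) != LEX_SIZE:
                raise IOError('Error reading from lexemes.bin. '
                              'Integrity check fails.')
            lexemes.append((orth, payload))
    return lexemes


if __name__ == '__main__':
    records = [(1, b'\x00' * LEX_SIZE), (2, b'\x01' * LEX_SIZE)]
    dump('lexemes.bin', records)
    assert load('lexemes.bin') == records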
@@ -185,7 +194,7 @@ cdef class Vocab:
         cdef int32_t prev_vec_len = 0
         cdef float* vec
         cdef Address mem
-        cdef id_t string_id
+        cdef attr_t string_id
         cdef bytes py_word
         cdef vector[float*] vectors
         cdef int i
|
@ -212,9 +221,9 @@ cdef class Vocab:
|
||||||
assert vec != NULL
|
assert vec != NULL
|
||||||
vectors[string_id] = vec
|
vectors[string_id] = vec
|
||||||
cdef LexemeC* lex
|
cdef LexemeC* lex
|
||||||
for i in range(self.lexemes.size()):
|
cdef size_t lex_addr
|
||||||
# Cast away the const, cos we can modify our lexemes
|
for orth, lex_addr in self._by_orth.items():
|
||||||
lex = <LexemeC*>self.lexemes[i]
|
lex = <LexemeC*>lex_addr
|
||||||
if lex.lower < vectors.size():
|
if lex.lower < vectors.size():
|
||||||
lex.repvec = vectors[lex.lower]
|
lex.repvec = vectors[lex.lower]
|
||||||
for i in range(vec_len):
|
for i in range(vec_len):
|
||||||
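The vector hookup at the end now iterates _by_orth instead of the dense lexemes vector and assigns each lexeme the row indexed by its lower attribute, guarded by the size of the vector table. A toy sketch of that assignment (plain dicts stand in for the C vector table and the _by_orth PreshMap):

# Attach each lexeme's repvec by the string id of its lowercased form.
vectors = {
    1: [0.1, 0.2, 0.3],   # row for string id 1
    2: [0.4, 0.5, 0.6],   # row for string id 2
}

by_orth = {
    10: {'orth': 10, 'lower': 1, 'repvec': None},
    11: {'orth': 11, 'lower': 2, 'repvec': None},
    12: {'orth': 12, 'lower': 7, 'repvec': None},   # no vector available
}

for orth, lex in by_orth.items():
    if lex['lower'] in vectors:   # plays the role of lex.lower < vectors.size()
        lex['repvec'] = vectors[lex['lower']]

assert by_orth[10]['repvec'] == [0.1, 0.2, 0.3]
assert by_orth[12]['repvec'] is None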