mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-04 04:10:20 +03:00
Don't pass mem pool to new lexeme function
This commit is contained in:
parent
b052b1b47f
commit
51eb625cec
|
@ -39,8 +39,7 @@ cdef class Vocab:
|
||||||
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
|
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
|
||||||
cdef const TokenC* make_fused_token(self, substrings) except NULL
|
cdef const TokenC* make_fused_token(self, substrings) except NULL
|
||||||
|
|
||||||
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
|
cdef const LexemeC* _new_lexeme(self, str string) except NULL
|
||||||
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
|
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
|
||||||
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
|
|
||||||
|
|
||||||
cdef PreshMap _by_orth
|
cdef PreshMap _by_orth
|
||||||
|
|
|
@ -157,7 +157,7 @@ cdef class Vocab:
|
||||||
orth=key, orth_id=string))
|
orth=key, orth_id=string))
|
||||||
return lex
|
return lex
|
||||||
else:
|
else:
|
||||||
return self._new_lexeme(mem, string)
|
return self._new_lexeme(string)
|
||||||
|
|
||||||
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
|
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
|
||||||
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
|
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
|
||||||
|
@ -171,21 +171,10 @@ cdef class Vocab:
|
||||||
if lex != NULL:
|
if lex != NULL:
|
||||||
return lex
|
return lex
|
||||||
else:
|
else:
|
||||||
return self._new_lexeme(mem, self.strings[orth])
|
return self._new_lexeme(self.strings[orth])
|
||||||
|
|
||||||
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
|
cdef const LexemeC* _new_lexeme(self, str string) except NULL:
|
||||||
# I think this heuristic is bad, and the Vocab should always
|
lex = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
|
||||||
# own the lexemes. It avoids weird bugs this way, as it's how the thing
|
|
||||||
# was originally supposed to work. The best solution to the growing
|
|
||||||
# memory use is to periodically reset the vocab, which is an action
|
|
||||||
# that should be up to the user to do (so we don't need to keep track
|
|
||||||
# of the doc ownership).
|
|
||||||
# TODO: Change the C API so that the mem isn't passed in here.
|
|
||||||
mem = self.mem
|
|
||||||
#if len(string) < 3 or self.length < 10000:
|
|
||||||
# mem = self.mem
|
|
||||||
cdef bint is_oov = mem is not self.mem
|
|
||||||
lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
|
|
||||||
lex.orth = self.strings.add(string)
|
lex.orth = self.strings.add(string)
|
||||||
lex.length = len(string)
|
lex.length = len(string)
|
||||||
if self.vectors is not None:
|
if self.vectors is not None:
|
||||||
|
@ -199,7 +188,6 @@ cdef class Vocab:
|
||||||
value = self.strings.add(value)
|
value = self.strings.add(value)
|
||||||
if value is not None:
|
if value is not None:
|
||||||
Lexeme.set_struct_attr(lex, attr, value)
|
Lexeme.set_struct_attr(lex, attr, value)
|
||||||
if not is_oov:
|
|
||||||
self._add_lex_to_vocab(lex.orth, lex)
|
self._add_lex_to_vocab(lex.orth, lex)
|
||||||
if lex == NULL:
|
if lex == NULL:
|
||||||
raise ValueError(Errors.E085.format(string=string))
|
raise ValueError(Errors.E085.format(string=string))
|
||||||
|
|
Loading…
Reference in New Issue
Block a user