From 51eb625cec9f4c13d2f9452ceb713cbd57eea8c3 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 19 Jan 2023 14:48:43 +0900 Subject: [PATCH] Don't pass mem pool to new lexeme function --- spacy/vocab.pxd | 3 +-- spacy/vocab.pyx | 22 +++++----------------- 2 files changed, 6 insertions(+), 19 deletions(-) diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index 815de0765..35cdc6503 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -39,8 +39,7 @@ cdef class Vocab: cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL cdef const TokenC* make_fused_token(self, substrings) except NULL - cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL + cdef const LexemeC* _new_lexeme(self, str string) except NULL cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 - cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL cdef PreshMap _by_orth diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index fc496a68b..9fc49a5b8 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -157,7 +157,7 @@ cdef class Vocab: orth=key, orth_id=string)) return lex else: - return self._new_lexeme(mem, string) + return self._new_lexeme(string) cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL: """Get a pointer to a `LexemeC` from the lexicon, creating a new @@ -171,21 +171,10 @@ cdef class Vocab: if lex != NULL: return lex else: - return self._new_lexeme(mem, self.strings[orth]) + return self._new_lexeme(self.strings[orth]) - cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL: - # I think this heuristic is bad, and the Vocab should always - # own the lexemes. It avoids weird bugs this way, as it's how the thing - # was originally supposed to work. The best solution to the growing - # memory use is to periodically reset the vocab, which is an action - # that should be up to the user to do (so we don't need to keep track - # of the doc ownership). - # TODO: Change the C API so that the mem isn't passed in here. - mem = self.mem - #if len(string) < 3 or self.length < 10000: - # mem = self.mem - cdef bint is_oov = mem is not self.mem - lex = mem.alloc(1, sizeof(LexemeC)) + cdef const LexemeC* _new_lexeme(self, str string) except NULL: + lex = self.mem.alloc(1, sizeof(LexemeC)) lex.orth = self.strings.add(string) lex.length = len(string) if self.vectors is not None: @@ -199,8 +188,7 @@ cdef class Vocab: value = self.strings.add(value) if value is not None: Lexeme.set_struct_attr(lex, attr, value) - if not is_oov: - self._add_lex_to_vocab(lex.orth, lex) + self._add_lex_to_vocab(lex.orth, lex) if lex == NULL: raise ValueError(Errors.E085.format(string=string)) return lex