Don't pass the memory pool to the `_new_lexeme` function

2025-08-04 04:10:20 +03:00 · 2023-01-19 14:48:43 +09:00 · 2023-01-19 14:48:43 +09:00 · 51eb625cec
commit 51eb625cec
parent b052b1b47f
2 changed files with 6 additions and 19 deletions
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@ -39,8 +39,7 @@ cdef class Vocab:
    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
    cdef const TokenC* make_fused_token(self, substrings) except NULL
-    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
+    cdef const LexemeC* _new_lexeme(self, str string) except NULL
    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
    cdef PreshMap _by_orth
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -157,7 +157,7 @@ cdef class Vocab:
                                                  orth=key, orth_id=string))
            return lex
        else:
-            return self._new_lexeme(mem, string)
+            return self._new_lexeme(string)
    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
        """Get a pointer to a `LexemeC` from the lexicon, creating a new
@ -171,21 +171,10 @@ cdef class Vocab:
        if lex != NULL:
            return lex
        else:
-            return self._new_lexeme(mem, self.strings[orth])
+            return self._new_lexeme(self.strings[orth])
-    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
+    cdef const LexemeC* _new_lexeme(self, str string) except NULL:
-        # I think this heuristic is bad, and the Vocab should always
+        lex = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
        # own the lexemes. It avoids weird bugs this way, as it's how the thing
        # was originally supposed to work. The best solution to the growing
        # memory use is to periodically reset the vocab, which is an action
        # that should be up to the user to do (so we don't need to keep track
        # of the doc ownership).
        # TODO: Change the C API so that the mem isn't passed in here.
        mem = self.mem
        #if len(string) < 3 or self.length < 10000:
        #    mem = self.mem
        cdef bint is_oov = mem is not self.mem
        lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
        lex.orth = self.strings.add(string)
        lex.length = len(string)
        if self.vectors is not None:
@ -199,7 +188,6 @@ cdef class Vocab:
                    value = self.strings.add(value)
                if value is not None:
                    Lexeme.set_struct_attr(lex, attr, value)
        if not is_oov:
        self._add_lex_to_vocab(lex.orth, lex)
        if lex == NULL:
            raise ValueError(Errors.E085.format(string=string))