* Switch to using a Python ref counted gateway to malloc/free, to prevent memory leaks

Matthew Honnibal 2014-09-17 20:02:26 +02:00
parent 5a20dfc03e
commit 6266cac593
10 changed files with 28 additions and 32 deletions
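
The .memory module cimported throughout the diff below (Address, Pool) is not itself part of this commit. Based on how it is used in the hunks, where Address(n, sizeof(Cell)) exposes a raw .addr pointer and Pool().alloc(n, size) returns pooled memory, a minimal sketch of such ref-counted wrappers might look like the following Cython. Only the names Address.addr and Pool.alloc come from the diff; the zeroed calloc buffer, the internal reference list and the error handling are assumptions, not the actual implementation.

# Sketch only: not the .memory module shipped with this commit.
# The interface (Address.addr, Pool.alloc) is inferred from the diff;
# everything else is an assumption.
from libc.stdlib cimport calloc, free

cdef class Address:
    # Owns a single zeroed buffer; freed when the Python object is collected.
    cdef void* addr

    def __cinit__(self, size_t number, size_t elem_size):
        self.addr = calloc(number, elem_size)
        if self.addr == NULL:
            raise MemoryError()

    def __dealloc__(self):
        if self.addr != NULL:
            free(self.addr)

cdef class Pool:
    # Keeps a Python reference to every buffer it hands out, so all of them
    # are freed together when the Pool itself is garbage-collected.
    cdef list _refs

    def __cinit__(self):
        self._refs = []

    cdef void* alloc(self, size_t number, size_t elem_size) except NULL:
        cdef Address mem = Address(number, elem_size)
        self._refs.append(mem)   # the Pool's reference keeps the buffer alive
        return mem.addr

Because both classes release their memory in __dealloc__, a C pointer obtained from them is only valid while the owning Python object is alive, which is why the classes changed in this commit keep the owner in a cdef attribute (self._mem).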

View File

@@ -1,4 +1,5 @@
 from libc.stdint cimport uint64_t
+from .memory cimport Address
 ctypedef uint64_t key_t
 ctypedef void* val_t
@@ -13,6 +14,7 @@ cdef class PointerHash:
     cdef size_t size
     cdef size_t filled
     cdef Cell* cells
+    cdef Address _mem
     cdef val_t get(self, key_t key) nogil
     cdef void set(self, key_t key, val_t value) except *

View File

@@ -1,5 +1,5 @@
 # cython: profile=True
-from libc.stdlib cimport calloc, free
+from .memory cimport Address
 cimport cython
@@ -10,10 +10,8 @@ cdef class PointerHash:
         # Size must be power of two
         assert self.size != 0
         assert self.size & (self.size - 1) == 0
-        self.cells = <Cell*>calloc(self.size, sizeof(Cell))
-    def __dealloc__(self):
-        free(self.cells)
+        self._mem = Address(self.size, sizeof(Cell))
+        self.cells = <Cell*>self._mem.addr
     def __getitem__(self, key_t key):
         assert key != 0
@@ -47,7 +45,8 @@ cdef class PointerHash:
         cdef size_t old_size = self.size
         self.size = new_size
-        self.cells = <Cell*>calloc(new_size, sizeof(Cell))
+        cdef Address new_mem = Address(new_size, sizeof(Cell))
+        self.cells = <Cell*>new_mem.addr
         self.filled = 0
         cdef size_t i
@@ -56,7 +55,7 @@ cdef class PointerHash:
             if old_cells[i].key != 0:
                 assert old_cells[i].value != NULL, i
                 self.set(old_cells[i].key, old_cells[i].value)
-        free(old_cells)
+        self._mem = new_mem
     @cython.cdivision
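
A note on the resize hunks above: instead of free(old_cells), the method builds a new Address, rehashes into it, and then rebinds self._mem. Dropping the last reference to the old Address runs its __dealloc__, which frees the previous cell array. Below is a self-contained toy showing the same rebind-to-free pattern; GrowableKeys is hypothetical and assumes the Address sketch given earlier.

# Sketch only: illustrates why resize needs no explicit free().
from libc.stdint cimport uint64_t

cdef class GrowableKeys:
    cdef Address _mem        # owns the buffer currently in use
    cdef uint64_t* keys
    cdef size_t size

    def __cinit__(self, size_t size):
        self.size = size
        self._mem = Address(size, sizeof(uint64_t))
        self.keys = <uint64_t*>self._mem.addr

    cdef void grow(self, size_t new_size):
        # assumes new_size >= self.size
        cdef Address new_mem = Address(new_size, sizeof(uint64_t))
        cdef uint64_t* new_keys = <uint64_t*>new_mem.addr
        cdef size_t i
        for i in range(self.size):
            new_keys[i] = self.keys[i]       # copy (or rehash) the old entries
        self.keys = new_keys
        self.size = new_size
        self._mem = new_mem    # old Address loses its last reference; old buffer is freed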

View File

@@ -37,7 +37,6 @@ provides a fully Penn Treebank 3-compliant tokenizer.
 from __future__ import unicode_literals
-from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 cimport lang

View File

@@ -5,6 +5,8 @@ from spacy.tokens cimport Tokens
 from spacy.lexeme cimport LexemeC
 from spacy._hashing cimport PointerHash
+from spacy.memory cimport Pool
 from libcpp.utility cimport pair
 from libcpp.vector cimport vector
 from libc.stdint cimport uint64_t, int64_t
@@ -22,6 +24,7 @@ cdef struct String:
 cdef class Lexicon:
+    cdef Pool _mem
     cpdef readonly size_t size
     cpdef Lexeme lookup(self, unicode string)
@@ -34,6 +37,7 @@ cdef class Lexicon:
 cdef class Language:
+    cdef Pool _mem
     cdef unicode name
     cdef PointerHash cache
     cdef PointerHash specials

View File

@@ -8,8 +8,6 @@ Special-case tokenization rules are read from data/<lang>/tokenization .
 """
 from __future__ import unicode_literals
-from libc.stdlib cimport calloc, free
 import json
 import random
 from os import path
@@ -18,8 +16,11 @@ from .util import read_lang_data
 from spacy.tokens import Tokens
 from spacy.lexeme cimport LexemeC, lexeme_init
 from murmurhash.mrmr cimport hash64
 from cpython.ref cimport Py_INCREF
+from .memory cimport Pool
 from cython.operator cimport preincrement as preinc
 from cython.operator cimport dereference as deref
@@ -127,6 +128,7 @@ cdef class Language:
     def __cinit__(self, name, user_string_features, user_flag_features):
         self.name = name
+        self._mem = Pool()
         self.cache = PointerHash(2 ** 25)
         self.specials = PointerHash(2 ** 16)
         lang_data = util.read_lang_data(name)
@@ -203,7 +205,7 @@ cdef class Language:
         if lexemes != NULL:
             i = 0
             while lexemes[i] != NULL:
-                tokens.push_back(lexemes[i])
+                tokens_v.push_back(lexemes[i])
                 i += 1
             return 0
@@ -292,7 +294,7 @@ cdef class Language:
     cdef int _save_cached(self, vector[LexemeC*] *tokens,
                           uint64_t key, size_t n) except -1:
         assert tokens.size() > n
-        lexemes = <LexemeC**>calloc((tokens.size() - n) + 1, sizeof(LexemeC**))
+        lexemes = <LexemeC**>self._mem.alloc((tokens.size() - n) + 1, sizeof(LexemeC**))
         cdef size_t i, j
         for i, j in enumerate(range(n, tokens.size())):
             lexemes[i] = tokens.at(j)
@@ -404,7 +406,7 @@ cdef class Language:
         cdef uint64_t hashed
         cdef String string
         for uni_string, substrings in token_rules:
-            lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*))
+            lexemes = <LexemeC**>self._mem.alloc(len(substrings) + 1, sizeof(LexemeC*))
             for i, substring in enumerate(substrings):
                 string_from_unicode(&string, substring)
                 lexemes[i] = <LexemeC*>self.lexicon.get(&string)
@@ -417,6 +419,7 @@ cdef class Language:
 cdef class Lexicon:
     def __cinit__(self, words, probs, clusters, case_stats, tag_stats,
                   string_features, flag_features):
+        self._mem = Pool()
         self._flag_features = flag_features
         self._string_features = string_features
         self._dict = PointerHash(2 ** 20)
@@ -433,7 +436,7 @@ cdef class Lexicon:
            for i, flag_feature in enumerate(self._flag_features):
                if flag_feature(uni_string, prob, cluster, cases, tags):
                    flags.add(i)
-            lexeme = lexeme_init(uni_string, prob, cluster, views, flags)
+            lexeme = lexeme_init(self._mem, uni_string, prob, cluster, views, flags)
            string_from_unicode(&string, uni_string)
            self._dict.set(string.key, lexeme)
            self.size += 1
@@ -452,7 +455,7 @@ cdef class Lexicon:
            if flag_feature(uni_string, 0.0, {}, {}):
                flags.add(i)
-        lexeme = lexeme_init(self._mem, uni_string, 0, 0, views, flags)
+        lexeme = lexeme_init(self._mem, uni_string, 0, 0, views, flags)
         self._dict.set(string.key, lexeme)
         self.size += 1
         return lexeme

View File

@@ -1,4 +1,5 @@
 from .typedefs cimport hash_t, utf8_t, flag_t, id_t
+from .memory cimport Pool
 cdef struct LexemeC:
@@ -12,9 +13,8 @@ cdef struct LexemeC:
     flag_t flags
-cdef LexemeC* lexeme_init(unicode string, double prob, size_t cluster,
+cdef LexemeC* lexeme_init(Pool mem, unicode string, double prob, size_t cluster,
                           list views, set flags)
-cdef int lexeme_free(LexemeC* lexeme) except -1
 cdef bint lexeme_check_flag(LexemeC* lexeme, size_t flag_id)
 cdef unicode lexeme_string_view(LexemeC* lexeme, size_t view_id)

View File

@@ -1,14 +1,14 @@
-from libc.stdlib cimport calloc, free
 from cpython.ref cimport Py_INCREF
+from .memory cimport Pool
-cdef LexemeC* lexeme_init(unicode string, double prob, size_t cluster,
+cdef LexemeC* lexeme_init(Pool mem, unicode string, double prob, size_t cluster,
                           list views, set flags):
-    cdef LexemeC* lexeme = <LexemeC*>calloc(1, sizeof(LexemeC))
+    cdef LexemeC* lexeme = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
     lexeme.cluster = cluster
     lexeme.prob = prob
     lexeme.string = intern_and_encode(string, &lexeme.length)
-    lexeme.views = <char**>calloc(len(views), sizeof(char*))
+    lexeme.views = <char**>mem.alloc(len(views), sizeof(char*))
     cdef size_t length = 0
     for i, string in enumerate(views):
         lexeme.views[i] = intern_and_encode(string, &length)
@@ -18,11 +18,6 @@ cdef LexemeC* lexeme_init(unicode string, double prob, size_t cluster,
     return lexeme
-cdef int lexeme_free(LexemeC* lexeme) except -1:
-    free(lexeme.views)
-    free(lexeme)
 cdef char* intern_and_encode(unicode string, size_t* length):
     cdef bytes byte_string = string.encode('utf8')
     cdef bytes utf8_string = intern(byte_string)
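
With the Pool argument, a LexemeC and its views array are owned by whichever object created the pool (the Lexicon, in the hunks further up), so the per-lexeme lexeme_free helper disappears. An illustrative usage sketch follows; the literal values are placeholders and Pool is the assumed interface sketched near the top of this page.

# Sketch only: pool-owned lexeme lifetime.
from spacy.memory cimport Pool
from spacy.lexeme cimport LexemeC, lexeme_init

cdef Pool mem = Pool()
cdef LexemeC* lex = lexeme_init(mem, u'hello', -7.5, 42, [u'hello'], set())
# lex and lex.views stay valid for as long as mem is alive; there is no
# lexeme_free() call, because collecting mem releases everything it allocated.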

View File

@@ -5,7 +5,6 @@ boldly assume no collisions.
 from __future__ import unicode_literals
-from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t

View File

@@ -1,6 +1,4 @@
 # cython: profile=True
-from libc.stdlib cimport calloc, free, realloc
 from spacy.word cimport Lexeme
 from spacy.lexeme cimport lexeme_check_flag
 from spacy.lexeme cimport lexeme_string_view

View File

@@ -1,9 +1,6 @@
 # cython: profile=True
 # cython: embedsignature=True
-from libc.stdlib cimport calloc, free, realloc
-from spacy.lexeme cimport lexeme_free, lexeme_init
 from spacy.lexeme cimport lexeme_check_flag, lexeme_string_view