Prevent 0-length mem alloc (#6653)

* prevent 0-length mem alloc by adding asserts

* fix lexeme mem allocation
This commit is contained in:
Sofie Van Landeghem 2021-01-06 02:50:17 +01:00 committed by GitHub
parent 6f83abb971
commit 29b59086f9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 10 additions and 3 deletions

View File

@@ -133,8 +133,9 @@ cdef class Morphology:
"""
cdef MorphAnalysisC tag
tag.length = len(field_feature_pairs)
tag.fields = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
tag.features = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
if tag.length > 0:
tag.fields = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
tag.features = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
for i, (field, feature) in enumerate(field_feature_pairs):
tag.fields[i] = field
tag.features[i] = feature

View File

@@ -65,6 +65,7 @@ cdef GoldParseStateC create_gold_state(Pool mem, const StateC* state,
cdef GoldParseStateC gs
gs.length = len(heads)
gs.stride = 1
assert gs.length > 0
gs.labels = <attr_t*>mem.alloc(gs.length, sizeof(gs.labels[0]))
gs.heads = <int32_t*>mem.alloc(gs.length, sizeof(gs.heads[0]))
gs.n_kids = <int32_t*>mem.alloc(gs.length, sizeof(gs.n_kids[0]))
@@ -126,6 +127,7 @@ cdef GoldParseStateC create_gold_state(Pool mem, const StateC* state,
1
)
# Make an array of pointers, pointing into the gs_kids_flat array.
assert gs.length > 0
gs.kids = <int32_t**>mem.alloc(gs.length, sizeof(int32_t*))
for i in range(gs.length):
if gs.n_kids[i] != 0:

View File

@@ -63,6 +63,7 @@ cdef GoldNERStateC create_gold_state(
Example example
) except *:
cdef GoldNERStateC gs
assert example.x.length > 0
gs.ner = <Transition*>mem.alloc(example.x.length, sizeof(Transition))
ner_tags = example.get_aligned_ner()
for i, ner_tag in enumerate(ner_tags):

View File

@@ -258,6 +258,7 @@ cdef class Tokenizer:
tokens = doc.c
# Otherwise create a separate array to store modified tokens
else:
assert max_length > 0
tokens = <TokenC*>mem.alloc(max_length, sizeof(TokenC))
# Modify tokenization according to filtered special cases
offset = self._retokenize_special_spans(doc, tokens, span_data)

View File

@@ -225,6 +225,7 @@ cdef class Doc:
# Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
# However, we need to remember the true starting places, so that we can
# realloc.
assert size + (PADDING*2) > 0
data_start = <TokenC*>self.mem.alloc(size + (PADDING*2), sizeof(TokenC))
cdef int i
for i in range(size + (PADDING*2)):
@@ -1177,6 +1178,7 @@ cdef class Doc:
other.length = self.length
other.max_length = self.max_length
buff_size = other.max_length + (PADDING*2)
assert buff_size > 0
tokens = <TokenC*>other.mem.alloc(buff_size, sizeof(TokenC))
memcpy(tokens, self.c - PADDING, buff_size * sizeof(TokenC))
other.c = &tokens[PADDING]

View File

@@ -164,7 +164,7 @@ cdef class Vocab:
if len(string) < 3 or self.length < 10000:
mem = self.mem
cdef bint is_oov = mem is not self.mem
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
lex.orth = self.strings.add(string)
lex.length = len(string)
if self.vectors is not None: