Prevent 0-length mem alloc (#6653)

* prevent 0-length mem alloc by adding asserts

* fix lexeme mem allocation
This commit is contained in:
Sofie Van Landeghem 2021-01-06 02:50:17 +01:00 committed by GitHub
parent 6f83abb971
commit 29b59086f9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 10 additions and 3 deletions

View File

@@ -133,8 +133,9 @@ cdef class Morphology:
"""
cdef MorphAnalysisC tag
tag.length = len(field_feature_pairs)
tag.fields = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
tag.features = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
if tag.length > 0:
tag.fields = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
tag.features = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
for i, (field, feature) in enumerate(field_feature_pairs):
tag.fields[i] = field
tag.features[i] = feature

View File

@@ -65,6 +65,7 @@ cdef GoldParseStateC create_gold_state(Pool mem, const StateC* state,
cdef GoldParseStateC gs
gs.length = len(heads)
gs.stride = 1
assert gs.length > 0
gs.labels = <attr_t*>mem.alloc(gs.length, sizeof(gs.labels[0]))
gs.heads = <int32_t*>mem.alloc(gs.length, sizeof(gs.heads[0]))
gs.n_kids = <int32_t*>mem.alloc(gs.length, sizeof(gs.n_kids[0]))
@@ -126,6 +127,7 @@ cdef GoldParseStateC create_gold_state(Pool mem, const StateC* state,
1
)
# Make an array of pointers, pointing into the gs_kids_flat array.
assert gs.length > 0
gs.kids = <int32_t**>mem.alloc(gs.length, sizeof(int32_t*))
for i in range(gs.length):
if gs.n_kids[i] != 0:

View File

@@ -63,6 +63,7 @@ cdef GoldNERStateC create_gold_state(
Example example
) except *:
cdef GoldNERStateC gs
assert example.x.length > 0
gs.ner = <Transition*>mem.alloc(example.x.length, sizeof(Transition))
ner_tags = example.get_aligned_ner()
for i, ner_tag in enumerate(ner_tags):

View File

@@ -258,6 +258,7 @@ cdef class Tokenizer:
tokens = doc.c
# Otherwise create a separate array to store modified tokens
else:
assert max_length > 0
tokens = <TokenC*>mem.alloc(max_length, sizeof(TokenC))
# Modify tokenization according to filtered special cases
offset = self._retokenize_special_spans(doc, tokens, span_data)

View File

@@ -225,6 +225,7 @@ cdef class Doc:
# Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
# However, we need to remember the true starting places, so that we can
# realloc.
assert size + (PADDING*2) > 0
data_start = <TokenC*>self.mem.alloc(size + (PADDING*2), sizeof(TokenC))
cdef int i
for i in range(size + (PADDING*2)):
@@ -1177,6 +1178,7 @@ cdef class Doc:
other.length = self.length
other.max_length = self.max_length
buff_size = other.max_length + (PADDING*2)
assert buff_size > 0
tokens = <TokenC*>other.mem.alloc(buff_size, sizeof(TokenC))
memcpy(tokens, self.c - PADDING, buff_size * sizeof(TokenC))
other.c = &tokens[PADDING]

View File

@@ -164,7 +164,7 @@ cdef class Vocab:
if len(string) < 3 or self.length < 10000:
mem = self.mem
cdef bint is_oov = mem is not self.mem
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
lex.orth = self.strings.add(string)
lex.length = len(string)
if self.vectors is not None: