From 817e1fc5e57e9a57f61ea587bb9cbf2ef28454f1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 Oct 2018 00:46:30 +0200 Subject: [PATCH] Fix out-of-bounds access in NER training The helper method state.B(1) gets the index of the first token of the buffer, or -1 if no such token exists. Normally this is safe because we pass this to functions like state.safe_get(), which returns an empty token. Here we used it directly as an array index, which is not okay! This error may have been the cause of out-of-bounds access errors during training. Similar errors may still be around, so must be hunted down. Hunting this one down took a long time... I printed out values across training runs and diffed, looking for points of divergence between runs, when no randomness should be allowed. --- spacy/syntax/ner.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 55cba692f..b473e76b0 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -330,7 +330,7 @@ cdef class In: @staticmethod cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: move = IN - cdef int next_act = gold.ner[s.B(1)].move if s.B(0) < s.c.length else OUT + cdef int next_act = gold.ner[s.B(1)].move if s.B(1) >= 0 else OUT cdef int g_act = gold.ner[s.B(0)].move cdef attr_t g_tag = gold.ner[s.B(0)].label cdef bint is_sunk = _entity_is_sunk(s, gold.ner)