From 8f8bccecb9427448563b2d2c4c3cf7fb4eecdfb1 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 21 Oct 2017 00:51:42 +0200
Subject: [PATCH] Patch deserialisation for invalid loads, to avoid model failure

---
 spacy/vocab.pyx | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 205e5a2af..da4d21026 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -400,6 +400,7 @@ cdef class Vocab:
         cdef int j = 0
         cdef SerializedLexemeC lex_data
         chunk_size = sizeof(lex_data.data)
+        cdef void* ptr
         cdef unsigned char* bytes_ptr = bytes_data
         for i in range(0, len(bytes_data), chunk_size):
             lexeme = self.mem.alloc(1, sizeof(LexemeC))
@@ -407,6 +408,9 @@ cdef class Vocab:
                 lex_data.data[j] = bytes_ptr[i+j]
             Lexeme.c_from_bytes(lexeme, lex_data)
 
+            ptr = self.strings._map.get(lexeme.orth)
+            if ptr == NULL:
+                continue
             py_str = self.strings[lexeme.orth]
             assert self.strings[py_str] == lexeme.orth, (py_str, lexeme.orth)
             key = hash_string(py_str)