Fix vocab deserialization when loading already present lexemes (#3383)

* Fix vocab deserialization bug. Closes #2153

* Un-xfail test for #2153
This commit is contained in:
Matthew Honnibal 2019-03-10 17:21:19 +01:00 committed by GitHub
parent d6eaa71afc
commit 27dd820753
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 5 additions and 2 deletions

View File

@@ -68,7 +68,6 @@ def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
assert vocab2[strings[0]].norm_ == lex_attr
@pytest.mark.xfail
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
def test_deserialize_vocab_seen_entries(strings, lex_attr):
# Reported in #2153

View File

@@ -1,6 +1,7 @@
# coding: utf8
# cython: profile=True
from __future__ import unicode_literals
from libc.string cimport memcpy
import numpy
import srsly
@@ -518,7 +519,10 @@ cdef class Vocab:
for j in range(sizeof(lex_data.data)):
lex_data.data[j] = bytes_ptr[i+j]
Lexeme.c_from_bytes(lexeme, lex_data)
prev_entry = self._by_orth.get(lexeme.orth)
if prev_entry != NULL:
memcpy(prev_entry, lexeme, sizeof(LexemeC))
continue
ptr = self.strings._map.get(lexeme.orth)
if ptr == NULL:
continue