Fix vocab deserialization when loading already present lexemes (#3383)

* Fix vocab deserialization bug. Closes #2153

* Un-xfail test for #2153
This commit is contained in:
Matthew Honnibal 2019-03-10 17:21:19 +01:00 committed by GitHub
parent d6eaa71afc
commit 27dd820753
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 5 additions and 2 deletions

View File

@@ -68,7 +68,6 @@ def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
assert vocab2[strings[0]].norm_ == lex_attr
@pytest.mark.xfail
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
def test_deserialize_vocab_seen_entries(strings, lex_attr):
# Reported in #2153

View File

@@ -1,6 +1,7 @@
# coding: utf8
# cython: profile=True
from __future__ import unicode_literals
from libc.string cimport memcpy
import numpy
import srsly
@@ -518,7 +519,10 @@ cdef class Vocab:
for j in range(sizeof(lex_data.data)):
lex_data.data[j] = bytes_ptr[i+j]
Lexeme.c_from_bytes(lexeme, lex_data)
prev_entry = self._by_orth.get(lexeme.orth)
if prev_entry != NULL:
memcpy(prev_entry, lexeme, sizeof(LexemeC))
continue
ptr = self.strings._map.get(lexeme.orth)
if ptr == NULL:
continue