* Fix empty values in attributes and parts of speech, so symbols align correctly with the StringStore

2025-07-15 18:52:29 +03:00 · 2015-10-10 18:27:03 +11:00 · 2015-10-10 18:27:03 +11:00 · d70e8cac2c
commit d70e8cac2c
parent ce3e306376
4 changed files with 18 additions and 4 deletions
--- a/spacy/parts_of_speech.pyx
+++ b/spacy/parts_of_speech.pyx
@@ -2,7 +2,7 @@ from __future__ import unicode_literals


 IDS = {
-    "NO_TAG": NO_TAG,
+    "": NO_TAG,
    "ADJ": ADJ,
    "ADP": ADP,
    "ADV": ADV,
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@@ -1,5 +1,4 @@
 SYMBOL_IDS = {
-    "EMPTY_VALUE": EMPTY_VALUE,
    "Attr_is_alpha": Attr_is_alpha,
    "Attr_is_ascii": Attr_is_ascii,
    "Attr_is_digit": Attr_is_digit,
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -76,9 +76,11 @@ cdef class Vocab:
        # strings are loaded first, because the vocab is open-class, and these
        # symbols are closed class.
        for name in attrs.NAMES:
-            _ = self.strings[name]
+            if name:
+                _ = self.strings[name]
        for name in parts_of_speech.NAMES:
-            _ = self.strings[name]
+            if name:
+                _ = self.strings[name]
        #for morph_name in UNIV_MORPH_NAMES:
        #    _ = self.strings[morph_name]
        #for entity_type_name in entity_types.NAMES:
--- a/tests/vocab/test_vocab.py
+++ b/tests/vocab/test_vocab.py
@@ -1,6 +1,9 @@
 from __future__ import unicode_literals
 import pytest

+from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA
+from spacy.parts_of_speech import NOUN, VERB
+

 def test_neq(en_vocab):
    addr = en_vocab['Hello']
@@ -25,3 +28,13 @@ def test_punct_neq(en_vocab):
 def test_shape_attr(en_vocab):
    example = en_vocab['example']
    assert example.orth != example.shape
+
+
+def test_symbols(en_vocab):
+    assert en_vocab.strings['IS_ALPHA'] == IS_ALPHA
+    assert en_vocab.strings['NOUN'] == NOUN
+    assert en_vocab.strings['VERB'] == VERB
+    assert en_vocab.strings['LEMMA'] == LEMMA
+    assert en_vocab.strings['ORTH'] == ORTH
+    assert en_vocab.strings['PROB'] == PROB
+