From d70e8cac2c4302249720cfded3de836302bddb1c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 18:27:03 +1100 Subject: [PATCH] * Fix empty values in attributes and parts of speech, so symbols align correctly with the StringStore --- spacy/parts_of_speech.pyx | 2 +- spacy/symbols.pyx | 1 - spacy/vocab.pyx | 6 ++++-- tests/vocab/test_vocab.py | 13 +++++++++++++ 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx index 57d9c801b..14933480c 100644 --- a/spacy/parts_of_speech.pyx +++ b/spacy/parts_of_speech.pyx @@ -2,7 +2,7 @@ from __future__ import unicode_literals IDS = { - "NO_TAG": NO_TAG, + "": NO_TAG, "ADJ": ADJ, "ADP": ADP, "ADV": ADV, diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index 4251fb4ec..a0a39f2ff 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -1,5 +1,4 @@ SYMBOL_IDS = { - "EMPTY_VALUE": EMPTY_VALUE, "Attr_is_alpha": Attr_is_alpha, "Attr_is_ascii": Attr_is_ascii, "Attr_is_digit": Attr_is_digit, diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 1a787e7ac..6cf829344 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -76,9 +76,11 @@ cdef class Vocab: # strings are loaded first, because the vocab is open-class, and these # symbols are closed class. for name in attrs.NAMES: - _ = self.strings[name] + if name: + _ = self.strings[name] for name in parts_of_speech.NAMES: - _ = self.strings[name] + if name: + _ = self.strings[name] #for morph_name in UNIV_MORPH_NAMES: # _ = self.strings[morph_name] #for entity_type_name in entity_types.NAMES: diff --git a/tests/vocab/test_vocab.py b/tests/vocab/test_vocab.py index 7ad911626..153e0d546 100644 --- a/tests/vocab/test_vocab.py +++ b/tests/vocab/test_vocab.py @@ -1,6 +1,9 @@ from __future__ import unicode_literals import pytest +from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA +from spacy.parts_of_speech import NOUN, VERB + def test_neq(en_vocab): addr = en_vocab['Hello'] @@ -25,3 +28,13 @@ def test_punct_neq(en_vocab): def test_shape_attr(en_vocab): example = en_vocab['example'] assert example.orth != example.shape + + +def test_symbols(en_vocab): + assert en_vocab.strings['IS_ALPHA'] == IS_ALPHA + assert en_vocab.strings['NOUN'] == NOUN + assert en_vocab.strings['VERB'] == VERB + assert en_vocab.strings['LEMMA'] == LEMMA + assert en_vocab.strings['ORTH'] == ORTH + assert en_vocab.strings['PROB'] == PROB +