Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-12 10:16:27 +03:00)
Merge pull request #1442 from explosion/feature/fix-sp

💫 Fix SP tag, tweak Vectors.__init__, fix Morphology

This commit is contained in: commit ef3e5a361b
@@ -62,5 +62,5 @@ TAG_MAP = {
     "VVIZU": {POS: VERB, "VerbForm": "inf"},
     "VVPP": {POS: VERB, "Aspect": "perf", "VerbForm": "part"},
     "XY": {POS: X},
-    "SP": {POS: SPACE}
+    "_SP": {POS: SPACE}
 }
@@ -55,11 +55,11 @@ TAG_MAP = {
     "WP": {POS: NOUN, "PronType": "int|rel"},
     "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
     "WRB": {POS: ADV, "PronType": "int|rel"},
-    "SP": {POS: SPACE},
     "ADD": {POS: X},
     "NFP": {POS: PUNCT},
     "GW": {POS: X},
     "XX": {POS: X},
     "BES": {POS: VERB},
-    "HVS": {POS: VERB}
+    "HVS": {POS: VERB},
+    "_SP": {POS: SPACE},
 }
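The underscore prefix is deliberate: '_' sorts after every ASCII uppercase letter, so the renamed "_SP" tag always lands at the end of the sorted tag list that Morphology builds from the tag map. A minimal illustration, using tag names from the hunk above:

    # '_' is ASCII 95 and 'Z' is 90, so "_SP" sorts after ordinary tag names.
    tags = ["BES", "GW", "HVS", "NFP", "WP", "WRB", "XX", "_SP"]
    assert sorted(tags)[-1] == "_SP"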
@@ -303,5 +303,5 @@ TAG_MAP = {
     "VERB__VerbForm=Ger": {"morph": "VerbForm=Ger", "pos": "VERB"},
     "VERB__VerbForm=Inf": {"morph": "VerbForm=Inf", "pos": "VERB"},
     "X___": {"morph": "_", "pos": "X"},
-    "SP": {"morph": "_", "pos": "SPACE"},
+    "_SP": {"morph": "_", "pos": "SPACE"},
 }
@@ -19,63 +19,64 @@ TAG_MAP = {
     "NPRP": {POS: PRON},
     # ADJ
     "ADJ": {POS: ADJ},
-    "NONM": {POS: ADJ},
-    "VATT": {POS: ADJ},
-    "DONM": {POS: ADJ},
+    "NONM": {POS: ADJ},
+    "VATT": {POS: ADJ},
+    "DONM": {POS: ADJ},
     # ADV
     "ADV": {POS: ADV},
-    "ADVN": {POS: ADV},
-    "ADVI": {POS: ADV},
-    "ADVP": {POS: ADV},
-    "ADVS": {POS: ADV},
+    "ADVN": {POS: ADV},
+    "ADVI": {POS: ADV},
+    "ADVP": {POS: ADV},
+    "ADVS": {POS: ADV},
     # INT
     "INT": {POS: INTJ},
     # PRON
     "PROPN": {POS: PROPN},
-    "PPRS": {POS: PROPN},
-    "PDMN": {POS: PROPN},
-    "PNTR": {POS: PROPN},
+    "PPRS": {POS: PROPN},
+    "PDMN": {POS: PROPN},
+    "PNTR": {POS: PROPN},
     # DET
     "DET": {POS: DET},
-    "DDAN": {POS: DET},
-    "DDAC": {POS: DET},
-    "DDBQ": {POS: DET},
-    "DDAQ": {POS: DET},
-    "DIAC": {POS: DET},
-    "DIBQ": {POS: DET},
-    "DIAQ": {POS: DET},
-    "DCNM": {POS: DET},
+    "DDAN": {POS: DET},
+    "DDAC": {POS: DET},
+    "DDBQ": {POS: DET},
+    "DDAQ": {POS: DET},
+    "DIAC": {POS: DET},
+    "DIBQ": {POS: DET},
+    "DIAQ": {POS: DET},
+    "DCNM": {POS: DET},
     # NUM
     "NUM": {POS: NUM},
-    "NCNM": {POS: NUM},
-    "NLBL": {POS: NUM},
-    "DCNM": {POS: NUM},
+    "NCNM": {POS: NUM},
+    "NLBL": {POS: NUM},
+    "DCNM": {POS: NUM},
     # AUX
     "AUX": {POS: AUX},
-    "XVBM": {POS: AUX},
-    "XVAM": {POS: AUX},
-    "XVMM": {POS: AUX},
-    "XVBB": {POS: AUX},
-    "XVAE": {POS: AUX},
+    "XVBM": {POS: AUX},
+    "XVAM": {POS: AUX},
+    "XVMM": {POS: AUX},
+    "XVBB": {POS: AUX},
+    "XVAE": {POS: AUX},
     # ADP
     "ADP": {POS: ADP},
-    "RPRE": {POS: ADP},
+    "RPRE": {POS: ADP},
     # CCONJ
     "CCONJ": {POS: CCONJ},
-    "JCRG": {POS: CCONJ},
+    "JCRG": {POS: CCONJ},
     # SCONJ
     "SCONJ": {POS: SCONJ},
-    "PREL": {POS: SCONJ},
-    "JSBR": {POS: SCONJ},
-    "JCMP": {POS: SCONJ},
+    "PREL": {POS: SCONJ},
+    "JSBR": {POS: SCONJ},
+    "JCMP": {POS: SCONJ},
     # PART
-    "PART": {POS: PART},
-    "FIXN": {POS: PART},
-    "FIXV": {POS: PART},
-    "EAFF": {POS: PART},
-    "AITT": {POS: PART},
-    "NEG": {POS: PART},
+    "PART": {POS: PART},
+    "FIXN": {POS: PART},
+    "FIXV": {POS: PART},
+    "EAFF": {POS: PART},
+    "AITT": {POS: PART},
+    "NEG": {POS: PART},
     # PUNCT
     "PUNCT": {POS: PUNCT},
-    "PUNC": {POS: PUNCT}
+    "PUNC": {POS: PUNCT},
+    "_SP": {POS: SPACE}
 }
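For context, every entry in these TAG_MAP dictionaries maps a corpus-specific fine-grained tag to a coarse universal part of speech, plus optional morphological features. A minimal sketch of such a lookup, using a toy excerpt rather than the full Thai map:

    from spacy.symbols import POS, PUNCT, SPACE

    # Toy excerpt: fine-grained tag -> coarse POS attributes.
    TAG_MAP = {
        "PUNC": {POS: PUNCT},
        "_SP": {POS: SPACE},
    }
    # Looking up a tag yields the integer symbol for the coarse part of speech.
    assert TAG_MAP["_SP"][POS] == SPACE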
@@ -44,7 +44,7 @@ cdef class Morphology:
     cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1


-cpdef enum univ_morph_t:
+cdef enum univ_morph_t:
    NIL = 0
    Animacy_anim = symbols.Animacy_anim
    Animacy_inam
@@ -4,7 +4,7 @@ from __future__ import unicode_literals

 from libc.string cimport memset

-from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
+from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT, SPACE
 from .attrs cimport POS, IS_SPACE
 from .parts_of_speech import IDS as POS_IDS
 from .lexeme cimport Lexeme
@@ -36,14 +36,22 @@ cdef class Morphology:
     def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
         self.mem = Pool()
         self.strings = string_store
+        # Add special space symbol. We prefix with underscore, to make sure it
+        # always sorts to the end.
+        space_attrs = tag_map.pop('SP', {POS: SPACE})
+        if '_SP' not in tag_map:
+            self.strings.add('_SP')
+            tag_map = dict(tag_map)
+            tag_map['_SP'] = space_attrs
+        self.tag_names = tuple(sorted(tag_map.keys()))
         self.tag_map = {}
         self.lemmatizer = lemmatizer
         self.n_tags = len(tag_map)
-        self.tag_names = tuple(sorted(tag_map.keys()))
         self.reverse_index = {}
+
         self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
         for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
             self.strings.add(tag_str)
             self.tag_map[tag_str] = dict(attrs)
             attrs = _normalize_props(attrs)
             attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
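Stripped of the Cython declarations, the new tag-map handling above does the following: pop any attributes already registered for the old 'SP' tag (falling back to a plain SPACE entry), then re-register them under '_SP' on a copy of the tag map. A rough pure-Python sketch of just that step (the helper name add_space_tag is invented for illustration):

    from spacy.symbols import NOUN, POS, SPACE

    def add_space_tag(tag_map):
        # Reuse the attributes of the old 'SP' tag if present.
        space_attrs = tag_map.pop('SP', {POS: SPACE})
        if '_SP' not in tag_map:
            tag_map = dict(tag_map)       # copy before adding the new key
            tag_map['_SP'] = space_attrs
        return tag_map

    tag_map = add_space_tag({'NN': {POS: NOUN}, 'SP': {POS: SPACE}})
    assert '_SP' in tag_map and 'SP' not in tag_map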
@@ -93,7 +101,7 @@ cdef class Morphology:
         # the statistical model fails.
         # Related to Issue #220
         if Lexeme.c_check_flag(token.lex, IS_SPACE):
-            tag_id = self.reverse_index[self.strings.add('SP')]
+            tag_id = self.reverse_index[self.strings.add('_SP')]
         rich_tag = self.rich_tags[tag_id]
         analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
         if analysis is NULL:
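The whitespace fallback above goes through reverse_index, which maps the string-store ID of each tag name to its row in the tag table; the only change is that the lookup key is now the renamed '_SP' tag. Roughly, with plain dicts and made-up IDs:

    # Hypothetical string-store IDs, for illustration only.
    strings = {"NN": 101, "VBZ": 102, "_SP": 103}
    tag_names = sorted(strings)                   # ['NN', 'VBZ', '_SP']
    reverse_index = {strings[name]: i for i, name in enumerate(tag_names)}

    # A token consisting of whitespace is forced onto the '_SP' row,
    # regardless of what the statistical model predicted.
    tag_id = reverse_index[strings["_SP"]]
    assert tag_names[tag_id] == "_SP"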
@@ -426,3 +434,7 @@ IDS = {


 NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
+# Unfortunate hack here, to work around problem with long cpdef enum
+# (which is generating an enormous amount of C++ in Cython 0.24+)
+# We keep the enum cdef, and just make sure the names are available to Python
+locals().update(IDS)
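The comment added here spells out the trade-off: keeping univ_morph_t a plain cdef enum avoids the very large C++ output Cython 0.24+ generates for a long cpdef enum, but the names then have to be exported to Python by hand. At module scope locals() is the module namespace, so locals().update(IDS) turns every entry into a module attribute. A tiny sketch of the pattern, with made-up values:

    # At module level, locals() is the module's global namespace, so this
    # creates importable module attributes for each enum name.
    IDS = {"Animacy_anim": 2, "Animacy_inam": 3}   # made-up values
    locals().update(IDS)
    print(Animacy_anim)   # -> 2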
@@ -35,18 +35,18 @@ def vocab(en_vocab, vectors):


 def test_init_vectors_with_data(strings, data):
-    v = Vectors(strings, data)
+    v = Vectors(strings, data=data)
     assert v.shape == data.shape

 def test_init_vectors_with_width(strings):
-    v = Vectors(strings, 3)
+    v = Vectors(strings, width=3)
     for string in strings:
         v.add(string)
     assert v.shape == (len(strings), 3)


 def test_get_vector(strings, data):
-    v = Vectors(strings, data)
+    v = Vectors(strings, data=data)
     for string in strings:
         v.add(string)
     assert list(v[strings[0]]) == list(data[0])
@@ -56,7 +56,7 @@ def test_get_vector(strings, data):

 def test_set_vector(strings, data):
     orig = data.copy()
-    v = Vectors(strings, data)
+    v = Vectors(strings, data=data)
     for string in strings:
         v.add(string)
     assert list(v[strings[0]]) == list(orig[0])
@@ -32,22 +32,24 @@ cdef class Vectors:
     cdef public object keys
     cdef public int i

-    def __init__(self, strings, data_or_width=0):
+    def __init__(self, strings, width=0, data=None):
         if isinstance(strings, StringStore):
             self.strings = strings
         else:
             self.strings = StringStore()
             for string in strings:
                 self.strings.add(string)
-        if isinstance(data_or_width, int):
-            self.data = data = numpy.zeros((len(strings), data_or_width),
-                                           dtype='f')
+        if data is not None:
+            self.data = numpy.asarray(data, dtype='f')
         else:
-            data = data_or_width
+            self.data = numpy.zeros((len(self.strings), width), dtype='f')
         self.i = 0
-        self.data = data
         self.key2row = {}
-        self.keys = np.ndarray((self.data.shape[0],), dtype='uint64')
+        self.keys = numpy.zeros((self.data.shape[0],), dtype='uint64')
+        for i, string in enumerate(self.strings):
+            if i >= self.data.shape[0]:
+                break
+            self.add(self.strings[string], self.data[i])

     def __reduce__(self):
         return (Vectors, (self.strings, self.data))
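The constructor change splits the old data_or_width positional argument into two keywords: data= wraps an existing table (coerced to float32), while width= allocates a zeroed table with one row per string. A short usage sketch against the keyword signature introduced in this commit:

    import numpy
    from spacy.strings import StringStore
    from spacy.vectors import Vectors

    # Wrap an existing table.
    table = numpy.zeros((3, 300), dtype='f')
    v = Vectors(StringStore(), data=table)
    assert v.data.shape == (3, 300)

    # Or allocate by width: one zeroed 300-dimensional row per string.
    v2 = Vectors([u'cat', u'dog'], width=300)
    assert v2.data.shape == (2, 300)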
@@ -62,12 +62,9 @@ cdef class Vocab:
         if strings:
             for string in strings:
                 _ = self[string]
-        for name in tag_map.keys():
-            if name:
-                self.strings.add(name)
         self.lex_attr_getters = lex_attr_getters
         self.morphology = Morphology(self.strings, tag_map, lemmatizer)
-        self.vectors = Vectors(self.strings)
+        self.vectors = Vectors(self.strings, width=0)

     property lang:
         def __get__(self):
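Passing width=0 gives the vocab an allocated but empty vector table: one row per entry in the string store and zero columns, so no vector data is stored until the table is resized or replaced. The shape arithmetic is plain numpy:

    import numpy

    # An (n, 0) float32 array has n rows but holds no values at all.
    empty = numpy.zeros((5, 0), dtype='f')
    assert empty.shape == (5, 0)
    assert empty.size == 0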
@@ -255,7 +252,7 @@ cdef class Vocab:
         """
         if new_dim is None:
             new_dim = self.vectors.data.shape[1]
-        self.vectors = Vectors(self.strings, new_dim)
+        self.vectors = Vectors(self.strings, width=new_dim)

     def get_vector(self, orth):
         """Retrieve a vector for a word in the vocabulary.
@@ -338,7 +335,7 @@ cdef class Vocab:
             if self.vectors is None:
                 return None
             else:
-                return self.vectors.to_bytes(exclude='strings.json')
+                return self.vectors.to_bytes()

         getters = OrderedDict((
             ('strings', lambda: self.strings.to_bytes()),
@@ -358,7 +355,7 @@ cdef class Vocab:
             if self.vectors is None:
                 return None
             else:
-                return self.vectors.from_bytes(b, exclude='strings')
+                return self.vectors.from_bytes(b)
         setters = OrderedDict((
             ('strings', lambda b: self.strings.from_bytes(b)),
             ('lexemes', lambda b: self.lexemes_from_bytes(b)),
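With the exclude arguments gone, the vocab now delegates to plain Vectors.to_bytes() and Vectors.from_bytes() calls. A minimal round-trip sketch using only the calls visible in this diff (words and dimensions are arbitrary):

    import numpy
    from spacy.vectors import Vectors

    table = numpy.random.uniform(-1, 1, (2, 300)).astype('f')
    v = Vectors([u'cat', u'dog'], data=table)

    payload = v.to_bytes()                      # what Vocab.to_bytes stores
    v2 = Vectors([u'cat', u'dog'], width=300)   # same keys, empty table
    v2.from_bytes(payload)
    assert numpy.allclose(v.data, v2.data)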
@@ -12,7 +12,7 @@ p

 p
     | Create a new vector store. To keep the vector table empty, pass
-    | #[code data_or_width=0]. You can also create the vector table and add
+    | #[code width=0]. You can also create the vector table and add
     | vectors one by one, or set the vector values directly on initialisation.

 +aside-code("Example").
@@ -21,11 +21,11 @@ p

     empty_vectors = Vectors(StringStore())

-    vectors = Vectors([u'cat'], 300)
+    vectors = Vectors([u'cat'], width=300)
     vectors[u'cat'] = numpy.random.uniform(-1, 1, (300,))

     vector_table = numpy.zeros((3, 300), dtype='f')
-    vectors = Vectors(StringStore(), vector_table)
+    vectors = Vectors(StringStore(), data=vector_table)

 +table(["Name", "Type", "Description"])
     +row
@@ -36,9 +36,12 @@ p
         | that maps strings to hash values, and vice versa.

     +row
-        +cell #[code data_or_width]
-        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] or int
-        +cell Vector data or number of dimensions.
+        +cell #[code data]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+
+    +row
+        +cell #[code width]
+        +cell Number of dimensions.

     +row("foot")
         +cell returns