Rename 'SP' special tag to '_SP'

Renaming the tag with a leading underscore lets us add it to the tag map without changing the sequence of the existing tags, which would throw off the tag-to-ID mapping. For instance, if we inserted an 'SP' tag, the 'VERB' tag would be pushed to a different class ID, and the model's predicted class IDs would map to the wrong tags.
2026-02-04 22:39:50 +03:00 · 2017-10-20 14:01:12 +02:00 · 2017-10-20 14:01:12 +02:00 · 49895fbef6
commit 49895fbef6
parent 506cf2eb13
5 changed files with 54 additions and 45 deletions
--- a/spacy/lang/de/tag_map.py
+++ b/spacy/lang/de/tag_map.py
@ -62,5 +62,5 @@ TAG_MAP = {
    "VVIZU":    {POS: VERB, "VerbForm": "inf"},
    "VVPP":     {POS: VERB, "Aspect": "perf", "VerbForm": "part"},
    "XY":       {POS: X},
-    "SP":       {POS: SPACE}
+    "_SP":      {POS: SPACE}
 }
--- a/spacy/lang/en/tag_map.py
+++ b/spacy/lang/en/tag_map.py
@ -55,11 +55,11 @@ TAG_MAP = {
    "WP":       {POS: NOUN, "PronType": "int|rel"},
    "WP$":      {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
    "WRB":      {POS: ADV, "PronType": "int|rel"},
-    "SP":       {POS: SPACE},
    "ADD":      {POS: X},
    "NFP":      {POS: PUNCT},
    "GW":       {POS: X},
    "XX":       {POS: X},
    "BES":      {POS: VERB},
-    "HVS":      {POS: VERB}
+    "HVS":      {POS: VERB},
+    "_SP":       {POS: SPACE},
 }
--- a/spacy/lang/es/tag_map.py
+++ b/spacy/lang/es/tag_map.py
@ -303,5 +303,5 @@ TAG_MAP = {
    "VERB__VerbForm=Ger": {"morph": "VerbForm=Ger", "pos": "VERB"},
    "VERB__VerbForm=Inf": {"morph": "VerbForm=Inf", "pos": "VERB"},
    "X___": {"morph": "_", "pos": "X"},
-    "SP": {"morph": "_", "pos": "SPACE"},
+    "_SP": {"morph": "_", "pos": "SPACE"},
 }
--- a/spacy/lang/th/tag_map.py
+++ b/spacy/lang/th/tag_map.py
@ -19,63 +19,64 @@ TAG_MAP = {
    "NPRP":     {POS: PRON},
    # ADJ
    "ADJ":      {POS: ADJ},
-    "NONM":      {POS: ADJ},
-    "VATT":      {POS: ADJ},
-    "DONM":      {POS: ADJ},
+    "NONM":     {POS: ADJ},
+    "VATT":     {POS: ADJ},
+    "DONM":     {POS: ADJ},
    # ADV
    "ADV":      {POS: ADV},
-    "ADVN":      {POS: ADV},
-    "ADVI":      {POS: ADV},
-    "ADVP":      {POS: ADV},
-    "ADVS":      {POS: ADV},
+    "ADVN":     {POS: ADV},
+    "ADVI":     {POS: ADV},
+    "ADVP":     {POS: ADV},
+    "ADVS":     {POS: ADV},
 	# INT
    "INT":      {POS: INTJ},
    # PRON
    "PROPN":    {POS: PROPN},
-    "PPRS":    {POS: PROPN},
-    "PDMN":    {POS: PROPN},
-    "PNTR":    {POS: PROPN},
+    "PPRS":     {POS: PROPN},
+    "PDMN":     {POS: PROPN},
+    "PNTR":     {POS: PROPN},
    # DET
    "DET":      {POS: DET},
-    "DDAN":      {POS: DET},
-    "DDAC":      {POS: DET},
-    "DDBQ":      {POS: DET},
-    "DDAQ":      {POS: DET},
-    "DIAC":      {POS: DET},
-    "DIBQ":      {POS: DET},
-    "DIAQ":      {POS: DET},
-    "DCNM":      {POS: DET},
+    "DDAN":     {POS: DET},
+    "DDAC":     {POS: DET},
+    "DDBQ":     {POS: DET},
+    "DDAQ":     {POS: DET},
+    "DIAC":     {POS: DET},
+    "DIBQ":     {POS: DET},
+    "DIAQ":     {POS: DET},
+    "DCNM":     {POS: DET},
    # NUM
    "NUM":      {POS: NUM},
-    "NCNM":      {POS: NUM},
-    "NLBL":      {POS: NUM},
-    "DCNM":      {POS: NUM},
+    "NCNM":     {POS: NUM},
+    "NLBL":     {POS: NUM},
+    "DCNM":     {POS: NUM},
 	# AUX
    "AUX":      {POS: AUX},
-    "XVBM":      {POS: AUX},
-    "XVAM":      {POS: AUX},
-    "XVMM":      {POS: AUX},
-    "XVBB":      {POS: AUX},
-    "XVAE":      {POS: AUX},
+    "XVBM":     {POS: AUX},
+    "XVAM":     {POS: AUX},
+    "XVMM":     {POS: AUX},
+    "XVBB":     {POS: AUX},
+    "XVAE":     {POS: AUX},
 	# ADP
    "ADP":      {POS: ADP},
-    "RPRE":      {POS: ADP},
+    "RPRE":     {POS: ADP},
    # CCONJ
    "CCONJ":    {POS: CCONJ},
-    "JCRG":    {POS: CCONJ},
+    "JCRG":     {POS: CCONJ},
 	# SCONJ
    "SCONJ":    {POS: SCONJ},
-    "PREL":    {POS: SCONJ},
-    "JSBR":    {POS: SCONJ},
-    "JCMP":    {POS: SCONJ},
+    "PREL":     {POS: SCONJ},
+    "JSBR":     {POS: SCONJ},
+    "JCMP":     {POS: SCONJ},
    # PART
-    "PART":    {POS: PART},
-    "FIXN":    {POS: PART},
-    "FIXV":    {POS: PART},
-    "EAFF":    {POS: PART},
-    "AITT":    {POS: PART},
-    "NEG":    {POS: PART},
+    "PART":     {POS: PART},
+    "FIXN":     {POS: PART},
+    "FIXV":     {POS: PART},
+    "EAFF":     {POS: PART},
+    "AITT":     {POS: PART},
+    "NEG":      {POS: PART},
    # PUNCT
    "PUNCT":    {POS: PUNCT},
-    "PUNC":    {POS: PUNCT}
+    "PUNC":     {POS: PUNCT},
+    "_SP":      {POS: SPACE}
 }
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@ -4,7 +4,7 @@ from __future__ import unicode_literals

 from libc.string cimport memset

-from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
+from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT, SPACE
 from .attrs cimport POS, IS_SPACE
 from .parts_of_speech import IDS as POS_IDS
 from .lexeme cimport Lexeme
@ -36,14 +36,22 @@ cdef class Morphology:
    def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
        self.mem = Pool()
        self.strings = string_store
+        # Add special space symbol. We prefix with underscore, to make sure it
+        # always sorts to the end.
+        space_attrs = tag_map.pop('SP', {POS: SPACE})
+        if '_SP' not in tag_map:
+            self.strings.add('_SP')
+            tag_map = dict(tag_map)
+            tag_map['_SP'] = space_attrs
+        self.tag_names = tuple(sorted(tag_map.keys()))
        self.tag_map = {}
        self.lemmatizer = lemmatizer
        self.n_tags = len(tag_map)
-        self.tag_names = tuple(sorted(tag_map.keys()))
        self.reverse_index = {}

        self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
        for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
+            self.strings.add(tag_str)
            self.tag_map[tag_str] = dict(attrs)
            attrs = _normalize_props(attrs)
            attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
@ -93,7 +101,7 @@ cdef class Morphology:
        # the statistical model fails.
        # Related to Issue #220
        if Lexeme.c_check_flag(token.lex, IS_SPACE):
-            tag_id = self.reverse_index[self.strings.add('SP')]
+            tag_id = self.reverse_index[self.strings.add('_SP')]
        rich_tag = self.rich_tags[tag_id]
        analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
        if analysis is NULL: