From 49895fbef69598d18fd00197661ec3ad939de849 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Fri, 20 Oct 2017 14:01:12 +0200
Subject: [PATCH] Rename 'SP' special tag to '_SP'

Renaming the tag with a leading underscore lets us add it to the tag map
without changing the sorted sequence of the existing tags, which would
throw off the tag-to-ID mapping. For instance, if we inserted an 'SP' tag,
the 'VERB' tag would be pushed to a different class ID, and the model's
predictions would be all messed up.
---
 spacy/lang/de/tag_map.py |  2 +-
 spacy/lang/en/tag_map.py |  4 +--
 spacy/lang/es/tag_map.py |  2 +-
 spacy/lang/th/tag_map.py | 77 ++++++++++++++++++++--------------------
 spacy/morphology.pyx     | 14 ++++++--
 5 files changed, 54 insertions(+), 45 deletions(-)

diff --git a/spacy/lang/de/tag_map.py b/spacy/lang/de/tag_map.py
index d16bd17e0..730c15cfc 100644
--- a/spacy/lang/de/tag_map.py
+++ b/spacy/lang/de/tag_map.py
@@ -62,5 +62,5 @@ TAG_MAP = {
     "VVIZU":    {POS: VERB, "VerbForm": "inf"},
     "VVPP":     {POS: VERB, "Aspect": "perf", "VerbForm": "part"},
     "XY":       {POS: X},
-    "SP":       {POS: SPACE}
+    "_SP":      {POS: SPACE}
 }
diff --git a/spacy/lang/en/tag_map.py b/spacy/lang/en/tag_map.py
index a674c17e3..76eabf307 100644
--- a/spacy/lang/en/tag_map.py
+++ b/spacy/lang/en/tag_map.py
@@ -55,11 +55,11 @@ TAG_MAP = {
     "WP":       {POS: NOUN, "PronType": "int|rel"},
     "WP$":      {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
     "WRB":      {POS: ADV, "PronType": "int|rel"},
-    "SP":       {POS: SPACE},
     "ADD":      {POS: X},
     "NFP":      {POS: PUNCT},
     "GW":       {POS: X},
     "XX":       {POS: X},
     "BES":      {POS: VERB},
-    "HVS":      {POS: VERB}
+    "HVS":      {POS: VERB},
+    "_SP":       {POS: SPACE},
 }
diff --git a/spacy/lang/es/tag_map.py b/spacy/lang/es/tag_map.py
index 86dd48620..2095d23b1 100644
--- a/spacy/lang/es/tag_map.py
+++ b/spacy/lang/es/tag_map.py
@@ -303,5 +303,5 @@ TAG_MAP = {
     "VERB__VerbForm=Ger": {"morph": "VerbForm=Ger", "pos": "VERB"},
     "VERB__VerbForm=Inf": {"morph": "VerbForm=Inf", "pos": "VERB"},
     "X___": {"morph": "_", "pos": "X"},
-    "SP": {"morph": "_", "pos": "SPACE"},
+    "_SP": {"morph": "_", "pos": "SPACE"},
 }
diff --git a/spacy/lang/th/tag_map.py b/spacy/lang/th/tag_map.py
index 40e5ac44c..570871820 100644
--- a/spacy/lang/th/tag_map.py
+++ b/spacy/lang/th/tag_map.py
@@ -19,63 +19,64 @@ TAG_MAP = {
     "NPRP":     {POS: PRON},
     # ADJ
     "ADJ":      {POS: ADJ},
-    "NONM":      {POS: ADJ},
-    "VATT":      {POS: ADJ},
-    "DONM":      {POS: ADJ},
+    "NONM":     {POS: ADJ},
+    "VATT":     {POS: ADJ},
+    "DONM":     {POS: ADJ},
     # ADV
     "ADV":      {POS: ADV},
-    "ADVN":      {POS: ADV},
-    "ADVI":      {POS: ADV},
-    "ADVP":      {POS: ADV},
-    "ADVS":      {POS: ADV},
+    "ADVN":     {POS: ADV},
+    "ADVI":     {POS: ADV},
+    "ADVP":     {POS: ADV},
+    "ADVS":     {POS: ADV},
 	# INT
     "INT":      {POS: INTJ},
     # PRON
     "PROPN":    {POS: PROPN},
-    "PPRS":    {POS: PROPN},
-    "PDMN":    {POS: PROPN},
-    "PNTR":    {POS: PROPN},
+    "PPRS":     {POS: PROPN},
+    "PDMN":     {POS: PROPN},
+    "PNTR":     {POS: PROPN},
     # DET
     "DET":      {POS: DET},
-    "DDAN":      {POS: DET},
-    "DDAC":      {POS: DET},
-    "DDBQ":      {POS: DET},
-    "DDAQ":      {POS: DET},
-    "DIAC":      {POS: DET},
-    "DIBQ":      {POS: DET},
-    "DIAQ":      {POS: DET},
-    "DCNM":      {POS: DET},
+    "DDAN":     {POS: DET},
+    "DDAC":     {POS: DET},
+    "DDBQ":     {POS: DET},
+    "DDAQ":     {POS: DET},
+    "DIAC":     {POS: DET},
+    "DIBQ":     {POS: DET},
+    "DIAQ":     {POS: DET},
+    "DCNM":     {POS: DET},
     # NUM
     "NUM":      {POS: NUM},
-    "NCNM":      {POS: NUM},
-    "NLBL":      {POS: NUM},
-    "DCNM":      {POS: NUM},
+    "NCNM":     {POS: NUM},
+    "NLBL":     {POS: NUM},
+    "DCNM":     {POS: NUM},
 	# AUX
     "AUX":      {POS: AUX},
-    "XVBM":      {POS: AUX},
-    "XVAM":      {POS: AUX},
-    "XVMM":      {POS: AUX},
-    "XVBB":      {POS: AUX},
-    "XVAE":      {POS: AUX},
+    "XVBM":     {POS: AUX},
+    "XVAM":     {POS: AUX},
+    "XVMM":     {POS: AUX},
+    "XVBB":     {POS: AUX},
+    "XVAE":     {POS: AUX},
 	# ADP
     "ADP":      {POS: ADP},
-    "RPRE":      {POS: ADP},
+    "RPRE":     {POS: ADP},
     # CCONJ
     "CCONJ":    {POS: CCONJ},
-    "JCRG":    {POS: CCONJ},
+    "JCRG":     {POS: CCONJ},
 	# SCONJ
     "SCONJ":    {POS: SCONJ},
-    "PREL":    {POS: SCONJ},
-    "JSBR":    {POS: SCONJ},
-    "JCMP":    {POS: SCONJ},
+    "PREL":     {POS: SCONJ},
+    "JSBR":     {POS: SCONJ},
+    "JCMP":     {POS: SCONJ},
     # PART
-    "PART":    {POS: PART},
-    "FIXN":    {POS: PART},
-    "FIXV":    {POS: PART},
-    "EAFF":    {POS: PART},
-    "AITT":    {POS: PART},
-    "NEG":    {POS: PART},
+    "PART":     {POS: PART},
+    "FIXN":     {POS: PART},
+    "FIXV":     {POS: PART},
+    "EAFF":     {POS: PART},
+    "AITT":     {POS: PART},
+    "NEG":      {POS: PART},
     # PUNCT
     "PUNCT":    {POS: PUNCT},
-    "PUNC":    {POS: PUNCT}
+    "PUNC":     {POS: PUNCT},
+    "_SP":      {POS: SPACE}
 }
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 65b46fe08..7845ab4e7 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 
 from libc.string cimport memset
 
-from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
+from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT, SPACE
 from .attrs cimport POS, IS_SPACE
 from .parts_of_speech import IDS as POS_IDS
 from .lexeme cimport Lexeme
@@ -36,14 +36,22 @@ cdef class Morphology:
     def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
         self.mem = Pool()
         self.strings = string_store
+        # Add special space symbol. We prefix with underscore so that it sorts
+        # after all tags that start with an uppercase ASCII letter or digit.
+        space_attrs = tag_map.pop('SP', {POS: SPACE})
+        if '_SP' not in tag_map:
+            self.strings.add('_SP')
+            tag_map = dict(tag_map)
+            tag_map['_SP'] = space_attrs
+        self.tag_names = tuple(sorted(tag_map.keys()))
         self.tag_map = {}
         self.lemmatizer = lemmatizer
         self.n_tags = len(tag_map)
-        self.tag_names = tuple(sorted(tag_map.keys()))
         self.reverse_index = {}
 
         self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
         for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
+            self.strings.add(tag_str)
             self.tag_map[tag_str] = dict(attrs)
             attrs = _normalize_props(attrs)
             attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
@@ -93,7 +101,7 @@ cdef class Morphology:
         # the statistical model fails.
         # Related to Issue #220
         if Lexeme.c_check_flag(token.lex, IS_SPACE):
-            tag_id = self.reverse_index[self.strings.add('SP')]
+            tag_id = self.reverse_index[self.strings.add('_SP')]
         rich_tag = self.rich_tags[tag_id]
         analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
         if analysis is NULL: