From 92ac9316b5f3ff79db1c3ec44be54f8c4dfe95dc Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Fri, 20 Oct 2017 13:59:24 +0200
Subject: [PATCH 1/8] Fix initialization of vectors, to address serialization
 problem

---
 spacy/vectors.pyx | 12 +++++-------
 spacy/vocab.pyx   | 10 ++++------
 2 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 5512279ae..cea583110 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -32,22 +32,20 @@ cdef class Vectors:
     cdef public object keys
     cdef public int i
 
-    def __init__(self, strings, data_or_width=0):
+    def __init__(self, strings, data=None, width=0):
         if isinstance(strings, StringStore):
             self.strings = strings
         else:
             self.strings = StringStore()
             for string in strings:
                 self.strings.add(string)
-        if isinstance(data_or_width, int):
-            self.data = data = numpy.zeros((len(strings), data_or_width),
-                                           dtype='f')
+        if data is not None:
+            self.data = numpy.asarray(data, dtype='f')
         else:
-            data = data_or_width
+            self.data = numpy.zeros((len(self.strings), width), dtype='f')
         self.i = 0
-        self.data = data
         self.key2row = {}
-        self.keys = np.ndarray((self.data.shape[0],), dtype='uint64')
+        self.keys = numpy.zeros((self.data.shape[0],), dtype='uint64')
 
     def __reduce__(self):
         return (Vectors, (self.strings, self.data))
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 205e5a2af..e6ba9944b 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -62,12 +62,10 @@ cdef class Vocab:
         if strings:
             for string in strings:
                 _ = self[string]
-        for name in tag_map.keys():
-            if name:
-                self.strings.add(name)
         self.lex_attr_getters = lex_attr_getters
+        print("Create morphology", list(self.strings), tag_map)
         self.morphology = Morphology(self.strings, tag_map, lemmatizer)
-        self.vectors = Vectors(self.strings)
+        self.vectors = Vectors(self.strings, width=0)
 
     property lang:
         def __get__(self):
@@ -338,7 +336,7 @@ cdef class Vocab:
             if self.vectors is None:
                 return None
             else:
-                return self.vectors.to_bytes(exclude='strings.json')
+                return self.vectors.to_bytes()
 
         getters = OrderedDict((
             ('strings', lambda: self.strings.to_bytes()),
@@ -358,7 +356,7 @@ cdef class Vocab:
             if self.vectors is None:
                 return None
             else:
-                return self.vectors.from_bytes(b, exclude='strings')
+                return self.vectors.from_bytes(b)
         setters = OrderedDict((
             ('strings', lambda b: self.strings.from_bytes(b)),
             ('lexemes', lambda b: self.lexemes_from_bytes(b)),

From 6218af0105d1514089ecd76c4cbf6fec31d50423 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Fri, 20 Oct 2017 13:59:57 +0200
Subject: [PATCH 2/8] Remove cpdef enum, to avoid too much code generation

---
 spacy/morphology.pxd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index be6711bfd..9192f351f 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -44,7 +44,7 @@ cdef class Morphology:
     cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
 
 
-cpdef enum univ_morph_t:
+cdef enum univ_morph_t:
     NIL = 0
     Animacy_anim = symbols.Animacy_anim
     Animacy_inam

From 506cf2eb1389da6149f97de7db80df52ed0d2d1f Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Fri, 20 Oct 2017 14:00:23 +0200
Subject: [PATCH 3/8] Remove cpdef enum, to avoid too much code generation

---
 spacy/morphology.pyx | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 4a1a0aa54..65b46fe08 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -426,3 +426,7 @@ IDS = {
 
 
 NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
+# Unfortunate hack here, to work around problem with long cpdef enum
+# (which is generating an enormous amount of C++ in Cython 0.24+)
+# We keep the enum cdef, and just make sure the names are available to Python
+locals().update(IDS)

From 49895fbef69598d18fd00197661ec3ad939de849 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Fri, 20 Oct 2017 14:01:12 +0200
Subject: [PATCH 4/8] Rename 'SP' special tag to '_SP'

Renaming the tag with an underscore lets us add it to the tag map
without worrying that we'll change the sequence of tags, which would
throw off the tag-to-ID mapping. For instance, if we inserted an 'SP'
tag, the 'VERB' tag would be pushed to a different class ID, breaking
any model trained with the old mapping.
---
 spacy/lang/de/tag_map.py |  2 +-
 spacy/lang/en/tag_map.py |  4 +--
 spacy/lang/es/tag_map.py |  2 +-
 spacy/lang/th/tag_map.py | 77 ++++++++++++++++++++--------------------
 spacy/morphology.pyx     | 14 ++++++--
 5 files changed, 54 insertions(+), 45 deletions(-)

diff --git a/spacy/lang/de/tag_map.py b/spacy/lang/de/tag_map.py
index d16bd17e0..730c15cfc 100644
--- a/spacy/lang/de/tag_map.py
+++ b/spacy/lang/de/tag_map.py
@@ -62,5 +62,5 @@ TAG_MAP = {
     "VVIZU":    {POS: VERB, "VerbForm": "inf"},
     "VVPP":     {POS: VERB, "Aspect": "perf", "VerbForm": "part"},
     "XY":       {POS: X},
-    "SP":       {POS: SPACE}
+    "_SP":      {POS: SPACE}
 }
diff --git a/spacy/lang/en/tag_map.py b/spacy/lang/en/tag_map.py
index a674c17e3..76eabf307 100644
--- a/spacy/lang/en/tag_map.py
+++ b/spacy/lang/en/tag_map.py
@@ -55,11 +55,11 @@ TAG_MAP = {
     "WP":       {POS: NOUN, "PronType": "int|rel"},
     "WP$":      {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
     "WRB":      {POS: ADV, "PronType": "int|rel"},
-    "SP":       {POS: SPACE},
     "ADD":      {POS: X},
     "NFP":      {POS: PUNCT},
     "GW":       {POS: X},
     "XX":       {POS: X},
     "BES":      {POS: VERB},
-    "HVS":      {POS: VERB}
+    "HVS":      {POS: VERB},
+    "_SP":       {POS: SPACE},
 }
diff --git a/spacy/lang/es/tag_map.py b/spacy/lang/es/tag_map.py
index 86dd48620..2095d23b1 100644
--- a/spacy/lang/es/tag_map.py
+++ b/spacy/lang/es/tag_map.py
@@ -303,5 +303,5 @@ TAG_MAP = {
     "VERB__VerbForm=Ger": {"morph": "VerbForm=Ger", "pos": "VERB"},
     "VERB__VerbForm=Inf": {"morph": "VerbForm=Inf", "pos": "VERB"},
     "X___": {"morph": "_", "pos": "X"},
-    "SP": {"morph": "_", "pos": "SPACE"},
+    "_SP": {"morph": "_", "pos": "SPACE"},
 }
diff --git a/spacy/lang/th/tag_map.py b/spacy/lang/th/tag_map.py
index 40e5ac44c..570871820 100644
--- a/spacy/lang/th/tag_map.py
+++ b/spacy/lang/th/tag_map.py
@@ -19,63 +19,64 @@ TAG_MAP = {
     "NPRP":     {POS: PRON},
     # ADJ
     "ADJ":      {POS: ADJ},
-    "NONM":      {POS: ADJ},
-    "VATT":      {POS: ADJ},
-    "DONM":      {POS: ADJ},
+    "NONM":     {POS: ADJ},
+    "VATT":     {POS: ADJ},
+    "DONM":     {POS: ADJ},
     # ADV
     "ADV":      {POS: ADV},
-    "ADVN":      {POS: ADV},
-    "ADVI":      {POS: ADV},
-    "ADVP":      {POS: ADV},
-    "ADVS":      {POS: ADV},
+    "ADVN":     {POS: ADV},
+    "ADVI":     {POS: ADV},
+    "ADVP":     {POS: ADV},
+    "ADVS":     {POS: ADV},
 	# INT
     "INT":      {POS: INTJ},
     # PRON
     "PROPN":    {POS: PROPN},
-    "PPRS":    {POS: PROPN},
-    "PDMN":    {POS: PROPN},
-    "PNTR":    {POS: PROPN},
+    "PPRS":     {POS: PROPN},
+    "PDMN":     {POS: PROPN},
+    "PNTR":     {POS: PROPN},
     # DET
     "DET":      {POS: DET},
-    "DDAN":      {POS: DET},
-    "DDAC":      {POS: DET},
-    "DDBQ":      {POS: DET},
-    "DDAQ":      {POS: DET},
-    "DIAC":      {POS: DET},
-    "DIBQ":      {POS: DET},
-    "DIAQ":      {POS: DET},
-    "DCNM":      {POS: DET},
+    "DDAN":     {POS: DET},
+    "DDAC":     {POS: DET},
+    "DDBQ":     {POS: DET},
+    "DDAQ":     {POS: DET},
+    "DIAC":     {POS: DET},
+    "DIBQ":     {POS: DET},
+    "DIAQ":     {POS: DET},
+    "DCNM":     {POS: DET},
     # NUM
     "NUM":      {POS: NUM},
-    "NCNM":      {POS: NUM},
-    "NLBL":      {POS: NUM},
-    "DCNM":      {POS: NUM},
+    "NCNM":     {POS: NUM},
+    "NLBL":     {POS: NUM},
+    "DCNM":     {POS: NUM},
 	# AUX
     "AUX":      {POS: AUX},
-    "XVBM":      {POS: AUX},
-    "XVAM":      {POS: AUX},
-    "XVMM":      {POS: AUX},
-    "XVBB":      {POS: AUX},
-    "XVAE":      {POS: AUX},
+    "XVBM":     {POS: AUX},
+    "XVAM":     {POS: AUX},
+    "XVMM":     {POS: AUX},
+    "XVBB":     {POS: AUX},
+    "XVAE":     {POS: AUX},
 	# ADP
     "ADP":      {POS: ADP},
-    "RPRE":      {POS: ADP},
+    "RPRE":     {POS: ADP},
     # CCONJ
     "CCONJ":    {POS: CCONJ},
-    "JCRG":    {POS: CCONJ},
+    "JCRG":     {POS: CCONJ},
 	# SCONJ
     "SCONJ":    {POS: SCONJ},
-    "PREL":    {POS: SCONJ},
-    "JSBR":    {POS: SCONJ},
-    "JCMP":    {POS: SCONJ},
+    "PREL":     {POS: SCONJ},
+    "JSBR":     {POS: SCONJ},
+    "JCMP":     {POS: SCONJ},
     # PART
-    "PART":    {POS: PART},
-    "FIXN":    {POS: PART},
-    "FIXV":    {POS: PART},
-    "EAFF":    {POS: PART},
-    "AITT":    {POS: PART},
-    "NEG":    {POS: PART},
+    "PART":     {POS: PART},
+    "FIXN":     {POS: PART},
+    "FIXV":     {POS: PART},
+    "EAFF":     {POS: PART},
+    "AITT":     {POS: PART},
+    "NEG":      {POS: PART},
     # PUNCT
     "PUNCT":    {POS: PUNCT},
-    "PUNC":    {POS: PUNCT}
+    "PUNC":     {POS: PUNCT},
+    "_SP":      {POS: SPACE}
 }
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 65b46fe08..7845ab4e7 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 
 from libc.string cimport memset
 
-from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
+from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT, SPACE
 from .attrs cimport POS, IS_SPACE
 from .parts_of_speech import IDS as POS_IDS
 from .lexeme cimport Lexeme
@@ -36,14 +36,22 @@ cdef class Morphology:
     def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
         self.mem = Pool()
         self.strings = string_store
+        # Add special space symbol. We prefix with underscore, to make sure it
+        # always sorts to the end.
+        space_attrs = tag_map.pop('SP', {POS: SPACE})
+        if '_SP' not in tag_map:
+            self.strings.add('_SP')
+            tag_map = dict(tag_map)
+            tag_map['_SP'] = space_attrs
+        self.tag_names = tuple(sorted(tag_map.keys()))
         self.tag_map = {}
         self.lemmatizer = lemmatizer
         self.n_tags = len(tag_map)
-        self.tag_names = tuple(sorted(tag_map.keys()))
         self.reverse_index = {}
 
         self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
         for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
+            self.strings.add(tag_str)
             self.tag_map[tag_str] = dict(attrs)
             attrs = _normalize_props(attrs)
             attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
@@ -93,7 +101,7 @@ cdef class Morphology:
         # the statistical model fails.
         # Related to Issue #220
         if Lexeme.c_check_flag(token.lex, IS_SPACE):
-            tag_id = self.reverse_index[self.strings.add('SP')]
+            tag_id = self.reverse_index[self.strings.add('_SP')]
         rich_tag = self.rich_tags[tag_id]
         analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
         if analysis is NULL:

From ebecaddb765713aaaf7f5b2f51488f39f66655d9 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Fri, 20 Oct 2017 14:17:15 +0200
Subject: [PATCH 5/8] Make 'data_or_width' two keyword args in Vectors.__init__

Previously the data and width options were one argument in Vectors,
which meant you couldn't say vectors = Vectors(strings, width=300).
It's better to have two keywords.
---
 spacy/tests/vectors/test_vectors.py |  8 ++++----
 website/api/vectors.jade            | 15 +++++++++------
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/spacy/tests/vectors/test_vectors.py b/spacy/tests/vectors/test_vectors.py
index 798871edd..74ac26a10 100644
--- a/spacy/tests/vectors/test_vectors.py
+++ b/spacy/tests/vectors/test_vectors.py
@@ -35,18 +35,18 @@ def vocab(en_vocab, vectors):
 
 
 def test_init_vectors_with_data(strings, data):
-    v = Vectors(strings, data)
+    v = Vectors(strings, data=data)
     assert v.shape == data.shape
 
 def test_init_vectors_with_width(strings):
-    v = Vectors(strings, 3)
+    v = Vectors(strings, width=3)
     for string in strings:
         v.add(string)
     assert v.shape == (len(strings), 3)
 
 
 def test_get_vector(strings, data):
-    v = Vectors(strings, data)
+    v = Vectors(strings, data=data)
     for string in strings:
         v.add(string)
     assert list(v[strings[0]]) == list(data[0])
@@ -56,7 +56,7 @@ def test_get_vector(strings, data):
 
 def test_set_vector(strings, data):
     orig = data.copy()
-    v = Vectors(strings, data)
+    v = Vectors(strings, data=data)
     for string in strings:
         v.add(string)
     assert list(v[strings[0]]) == list(orig[0])
diff --git a/website/api/vectors.jade b/website/api/vectors.jade
index a58736506..e08f34643 100644
--- a/website/api/vectors.jade
+++ b/website/api/vectors.jade
@@ -12,7 +12,7 @@ p
 
 p
     |  Create a new vector store. To keep the vector table empty, pass
-    |  #[code data_or_width=0]. You can also create the vector table and add
+    |  #[code width=0]. You can also create the vector table and add
     |  vectors one by one, or set the vector values directly on initialisation.
 
 +aside-code("Example").
@@ -21,11 +21,11 @@ p
 
     empty_vectors = Vectors(StringStore())
 
-    vectors = Vectors([u'cat'], 300)
+    vectors = Vectors([u'cat'], width=300)
     vectors[u'cat'] = numpy.random.uniform(-1, 1, (300,))
 
     vector_table = numpy.zeros((3, 300), dtype='f')
-    vectors = Vectors(StringStore(), vector_table)
+    vectors = Vectors(StringStore(), data=vector_table)
 
 +table(["Name", "Type", "Description"])
     +row
@@ -36,9 +36,14 @@ p
             |  that maps strings to hash values, and vice versa.
 
     +row
-        +cell #[code data_or_width]
-        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] or int
-        +cell Vector data or number of dimensions.
+        +cell #[code data]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+        +cell The vector data.
+
+    +row
+        +cell #[code width]
+        +cell int
+        +cell Number of dimensions.
 
     +row("foot")
         +cell returns

From cfae54c507ab24a1da36d3008484d2ac8edb3071 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Fri, 20 Oct 2017 14:19:04 +0200
Subject: [PATCH 6/8] Make change to Vectors.__init__

---
 spacy/vectors.pyx | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index cea583110..fa5fcf624 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -32,7 +32,7 @@ cdef class Vectors:
     cdef public object keys
     cdef public int i
 
-    def __init__(self, strings, data=None, width=0):
+    def __init__(self, strings, width=0, data=None):
         if isinstance(strings, StringStore):
             self.strings = strings
         else:
@@ -46,6 +46,10 @@ cdef class Vectors:
         self.i = 0
         self.key2row = {}
         self.keys = numpy.zeros((self.data.shape[0],), dtype='uint64')
+        for i, string in enumerate(self.strings):
+            if i >= self.data.shape[0]:
+                break
+            self.add(self.strings[string], self.data[i])
 
     def __reduce__(self):
         return (Vectors, (self.strings, self.data))

From 33229b1c9ef53a49a3bbd00d61ca02c28c5481c8 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Fri, 20 Oct 2017 14:19:29 +0200
Subject: [PATCH 7/8] Remove print statement

---
 spacy/vocab.pyx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index e6ba9944b..2e189a02b 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -63,7 +63,6 @@ cdef class Vocab:
             for string in strings:
                 _ = self[string]
         self.lex_attr_getters = lex_attr_getters
-        print("Create morphology", list(self.strings), tag_map)
         self.morphology = Morphology(self.strings, tag_map, lemmatizer)
         self.vectors = Vectors(self.strings, width=0)
 

From 9010a1a0603fba85143bcd859b88aaed59937a9a Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Fri, 20 Oct 2017 14:19:46 +0200
Subject: [PATCH 8/8] Create vectors correctly

---
 spacy/vocab.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 2e189a02b..3f96b5144 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -252,7 +252,7 @@ cdef class Vocab:
         """
         if new_dim is None:
             new_dim = self.vectors.data.shape[1]
-        self.vectors = Vectors(self.strings, new_dim)
+        self.vectors = Vectors(self.strings, width=new_dim)
 
     def get_vector(self, orth):
         """Retrieve a vector for a word in the vocabulary.