From 3065f12ef206d13db3544213266973dcc2b08aa3 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 10 Oct 2017 22:57:31 +0200
Subject: [PATCH 01/13] Make add parser label work for hidden_depth=0

---
 spacy/syntax/nn_parser.pyx | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index a8a1d4334..939414bd3 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -800,11 +800,20 @@ cdef class Parser:
         if self.model not in (True, False, None) and resized:
             # Weights are stored in (nr_out, nr_in) format, so we're basically
             # just adding rows here.
-            smaller = self.model[-1]._layers[-1]
-            larger = Affine(self.moves.n_moves, smaller.nI)
-            copy_array(larger.W[:smaller.nO], smaller.W)
-            copy_array(larger.b[:smaller.nO], smaller.b)
-            self.model[-1]._layers[-1] = larger
+            if self.model[-1].is_noop:
+                smaller = self.model[1]
+                dims = dict(self.model[1]._dims)
+                dims['nO'] = self.moves.n_moves
+                larger = self.model[1].__class__(**dims)
+                copy_array(larger.W[:, :smaller.nO], smaller.W)
+                copy_array(larger.b[:smaller.nO], smaller.b)
+                self.model = (self.model[0], larger, self.model[2])
+            else:
+                smaller = self.model[-1]._layers[-1]
+                larger = Affine(self.moves.n_moves, smaller.nI)
+                copy_array(larger.W[:smaller.nO], smaller.W)
+                copy_array(larger.b[:smaller.nO], smaller.b)
+                self.model[-1]._layers[-1] = larger
 
     def begin_training(self, gold_tuples, pipeline=None, **cfg):
         if 'model' in cfg:

From d84136b4a9eb716be5771ed5634be6fef4c740ef Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 10 Oct 2017 22:57:41 +0200
Subject: [PATCH 02/13] Update add label test

---
 spacy/tests/parser/test_add_label.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py
index b89cca113..3fbfc96a6 100644
--- a/spacy/tests/parser/test_add_label.py
+++ b/spacy/tests/parser/test_add_label.py
@@ -22,14 +22,14 @@ def vocab():
 @pytest.fixture
 def parser(vocab):
     parser = NeuralDependencyParser(vocab)
-    parser.cfg['token_vector_width'] = 4
-    parser.cfg['hidden_width'] = 6
+    parser.cfg['token_vector_width'] = 8
+    parser.cfg['hidden_width'] = 30
     parser.cfg['hist_size'] = 0
     parser.add_label('left')
     parser.begin_training([], **parser.cfg)
     sgd = Adam(NumpyOps(), 0.001)
 
-    for i in range(30):
+    for i in range(10):
         losses = {}
         doc = Doc(vocab, words=['a', 'b', 'c', 'd'])
         gold = GoldParse(doc, heads=[1, 1, 3, 3],
@@ -37,6 +37,8 @@ def parser(vocab):
         parser.update([doc], [gold], sgd=sgd, losses=losses)
     return parser
 
+def test_init_parser(parser):
+    pass
 
 def test_add_label(parser):
     doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])

From 2c118ab3a6b516fae87280dac69cb9c5d7caa5a9 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 11 Oct 2017 03:21:23 +0200
Subject: [PATCH 03/13] Add tests for Doc creation

---
 spacy/tests/doc/test_creation.py | 37 ++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 spacy/tests/doc/test_creation.py

diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py
new file mode 100644
index 000000000..edadbf086
--- /dev/null
+++ b/spacy/tests/doc/test_creation.py
@@ -0,0 +1,37 @@
+'''Test Doc sets up tokens correctly.'''
+from __future__ import unicode_literals
+import pytest
+
+from ...vocab import Vocab
+from ...tokens.doc import Doc
+from ...lemmatizerlookup import Lemmatizer
+
+
+@pytest.fixture
+def lemmatizer():
+    return Lemmatizer({'dogs': 'dog', 'boxen': 'box', 'mice': 'mouse'})
+
+
+@pytest.fixture
+def vocab(lemmatizer):
+    return Vocab(lemmatizer=lemmatizer)
+
+
+def test_empty_doc(vocab):
+    doc = Doc(vocab)
+    assert len(doc) == 0
+
+
+def test_single_word(vocab):
+    doc = Doc(vocab, words=['a'])
+    assert doc.text == 'a '
+    doc = Doc(vocab, words=['a'], spaces=[False])
+    assert doc.text == 'a'
+
+
+def test_lookup_lemmatization(vocab):
+    doc = Doc(vocab, words=['dogs', 'dogses'])
+    assert doc[0].text == 'dogs'
+    assert doc[0].lemma_ == 'dog'
+    assert doc[1].text == 'dogses'
+    assert doc[1].lemma_ == 'dogses'

From d528b6e36dd13d70238b085191f844728d8a7535 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 11 Oct 2017 03:22:49 +0200
Subject: [PATCH 04/13] Add assign_untagged method in Morphology

---
 spacy/morphology.pxd |  2 ++
 spacy/morphology.pyx | 14 ++++++++++----
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index 922843d6d..be6711bfd 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -35,6 +35,8 @@ cdef class Morphology:
     cdef RichTagC* rich_tags
     cdef PreshMapArray _cache
 
+    cdef int assign_untagged(self, TokenC* token) except -1
+
     cdef int assign_tag(self, TokenC* token, tag) except -1
 
     cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 5ee11c151..5a4399698 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -42,7 +42,7 @@ cdef class Morphology:
         self.tag_names = tuple(sorted(tag_map.keys()))
         self.reverse_index = {}
 
-        self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
+        self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
         for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
             self.tag_map[tag_str] = dict(attrs)
             attrs = _normalize_props(attrs)
@@ -52,6 +52,10 @@ cdef class Morphology:
             self.rich_tags[i].morph = 0
             self.rich_tags[i].pos = attrs[POS]
             self.reverse_index[self.rich_tags[i].name] = i
+        # Add a 'null' tag, which we can reference when assign morphology to
+        # untagged tokens.
+        self.rich_tags[self.n_tags].id = self.n_tags
+ 
         self._cache = PreshMapArray(self.n_tags)
         self.exc = {}
         if exc is not None:
@@ -62,6 +66,10 @@ cdef class Morphology:
         return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
                              self.exc), None, None)
 
+    cdef int assign_untagged(self, TokenC* token) except -1:
+        '''Set morphological attributes on a token without a POS tag.'''
+        token.lemma = self.lemmatize(0, token.lex.orth, {})
+
     cdef int assign_tag(self, TokenC* token, tag) except -1:
         if isinstance(tag, basestring):
             tag = self.strings.add(tag)
@@ -72,7 +80,7 @@ cdef class Morphology:
             token.tag = tag
 
     cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
-        if tag_id >= self.n_tags:
+        if tag_id > self.n_tags:
             raise ValueError("Unknown tag ID: %s" % tag_id)
         # TODO: It's pretty arbitrary to put this logic here. I guess the justification
         # is that this is where the specific word and the tag interact. Still,
@@ -151,8 +159,6 @@ cdef class Morphology:
         cdef unicode py_string = self.strings[orth]
         if self.lemmatizer is None:
             return self.strings.add(py_string.lower())
-        if univ_pos not in (NOUN, VERB, ADJ, PUNCT):
-            return self.strings.add(py_string.lower())
         cdef set lemma_strings
         cdef unicode lemma_string
         lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)

From c15d8278cb3c382a7453b1b33c10700a3f4f0766 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 11 Oct 2017 03:23:23 +0200
Subject: [PATCH 05/13] Avoid lemmatizing inappropriate tags in English
 lemmatizer

---
 spacy/lemmatizer.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index 312c8db72..ff7666c37 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -24,6 +24,8 @@ class Lemmatizer(object):
             univ_pos = 'adj'
         elif univ_pos == PUNCT:
             univ_pos = 'punct'
+        else:
+            return set([string.lower()])
         # See Issue #435 for example of where this logic is requied.
         if self.is_base_form(univ_pos, morphology):
             return set([string.lower()])

From 3b527fa52bdd6f29131f3bfb7deb32816c2de4f0 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 11 Oct 2017 03:23:57 +0200
Subject: [PATCH 06/13] Call morphology.assign_untagged when pushing token to
 Doc

---
 spacy/tokens/doc.pyx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index df75ab3ec..400ca0f2a 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -512,6 +512,8 @@ cdef class Doc:
         assert t.lex.orth != 0
         t.spacy = has_space
         self.length += 1
+        # Set morphological attributes, e.g. by lemma, if possible
+        self.vocab.morphology.assign_untagged(t)
         self._py_tokens.append(None)
         return t.idx + t.lex.length + t.spacy
 

From fd47f8e89f55703ad1c527124d631ab8543e6213 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 11 Oct 2017 08:38:34 +0200
Subject: [PATCH 07/13] Fix failing test

---
 spacy/tests/parser/test_preset_sbd.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py
index 77326f797..f10b96192 100644
--- a/spacy/tests/parser/test_preset_sbd.py
+++ b/spacy/tests/parser/test_preset_sbd.py
@@ -64,7 +64,7 @@ def test_sents_1_3(parser):
     doc[1].sent_start = True
     doc[3].sent_start = True
     doc = parser(doc)
-    assert len(list(doc.sents)) == 4
+    assert len(list(doc.sents)) >= 3
     doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
     doc[1].sent_start = True
     doc[2].sent_start = False

From 74c2c6a58cabdb31b77df3b24f6068355d9738bb Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 11 Oct 2017 08:49:12 +0200
Subject: [PATCH 08/13] Add default name and lang to meta

---
 spacy/cli/train.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 05d035769..a8b45e8fa 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -68,6 +68,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
     if not isinstance(meta, dict):
         prints("Expected dict but got: {}".format(type(meta)),
                title="Not a valid meta.json format", exits=1)
+    meta.setdefault('lang', lang)
+    meta.setdefault('name', 'unnamed')
 
     pipeline = ['tagger', 'parser', 'ner']
     if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger')

From acba2e1051a0734d7d6ae2cc11211096039446bd Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 11 Oct 2017 08:55:52 +0200
Subject: [PATCH 09/13] Fix metadata in training

---
 spacy/cli/train.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index a8b45e8fa..3dae3f68b 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -91,6 +91,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
 
     lang_class = util.get_lang_class(lang)
     nlp = lang_class()
+    meta['pipeline'] = pipeline
+    nlp.meta.update(meta)
     if vectors:
         util.load_model(vectors, vocab=nlp.vocab)
     for name in pipeline:

From 188f62004694d89a040f5409164258a150abc2b1 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 11 Oct 2017 09:43:48 +0200
Subject: [PATCH 10/13] Improve parser defaults

---
 spacy/syntax/nn_parser.pyx | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 939414bd3..ce9ee39fa 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -239,13 +239,13 @@ cdef class Parser:
     """
     @classmethod
     def Model(cls, nr_class, **cfg):
-        depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 0))
-        token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128))
-        hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 128))
-        parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 3))
+        depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1))
+        token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 64))
+        hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 64))
+        parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 2))
         embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
-        hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
-        hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
+        hist_size = util.env_opt('history_feats', cfg.get('hist_size', 4))
+        hist_width = util.env_opt('history_width', cfg.get('hist_width', 16))
         if hist_size >= 1 and depth == 0:
             raise ValueError("Inconsistent hyper-params: "
                 "history_feats >= 1 but parser_hidden_depth==0")

From 76fe24f44d1238e3755c07cd377eddde2b74a913 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 11 Oct 2017 09:44:17 +0200
Subject: [PATCH 11/13] Improve embedding defaults

---
 spacy/_ml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/_ml.py b/spacy/_ml.py
index 62e0ceb9a..b07e179f0 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -311,7 +311,7 @@ def link_vectors_to_models(vocab):
 
 def Tok2Vec(width, embed_size, **kwargs):
     pretrained_dims = kwargs.get('pretrained_dims', 0)
-    cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3)
+    cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)
     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
     with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add,
                                  '*': reapply}):

From 6e552c9d83ed2010e8de2291680bc8527b58fec4 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 11 Oct 2017 02:46:44 -0500
Subject: [PATCH 12/13] Prune number of non-projective labels more aggressively

---
 spacy/gold.pyx             | 2 +-
 spacy/syntax/nn_parser.pyx | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 2512c179f..5729af667 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -213,7 +213,7 @@ class GoldCorpus(object):
         train_tuples = self.train_tuples
         if projectivize:
             train_tuples = nonproj.preprocess_training_data(
-                               self.train_tuples)
+                               self.train_tuples, label_freq_cutoff=100)
         random.shuffle(train_tuples)
         gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
                                         max_length=max_length,
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index a8a1d4334..9288b523f 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -809,7 +809,7 @@ cdef class Parser:
     def begin_training(self, gold_tuples, pipeline=None, **cfg):
         if 'model' in cfg:
             self.model = cfg['model']
-        gold_tuples = nonproj.preprocess_training_data(gold_tuples)
+        gold_tuples = nonproj.preprocess_training_data(gold_tuples, label_freq_cutoff=100)
         actions = self.moves.get_actions(gold_parses=gold_tuples)
         for action, labels in actions.items():
             for label in labels:

From 17c467e0ab143eb89c45917740b5d32be303f56a Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 11 Oct 2017 03:33:06 -0500
Subject: [PATCH 13/13] Avoid clobbering existing lemmas

---
 spacy/morphology.pyx | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 5a4399698..b8dbb83ba 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -55,7 +55,7 @@ cdef class Morphology:
         # Add a 'null' tag, which we can reference when assign morphology to
         # untagged tokens.
         self.rich_tags[self.n_tags].id = self.n_tags
- 
+
         self._cache = PreshMapArray(self.n_tags)
         self.exc = {}
         if exc is not None:
@@ -68,7 +68,8 @@ cdef class Morphology:
 
     cdef int assign_untagged(self, TokenC* token) except -1:
         '''Set morphological attributes on a token without a POS tag.'''
-        token.lemma = self.lemmatize(0, token.lex.orth, {})
+        if token.lemma == 0:
+            token.lemma = self.lemmatize(0, token.lex.orth, {})
 
     cdef int assign_tag(self, TokenC* token, tag) except -1:
         if isinstance(tag, basestring):