From e920885676c6e7019fdd2891b2173aa630d54c6b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 2 Sep 2017 12:46:01 -0500 Subject: [PATCH 01/37] Fix pickle during train --- spacy/cli/train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index ddec2c069..b2c87d2b5 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -80,6 +80,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, n_train_words = corpus.count_train() optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu) + nlp._optimizer = None print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %") try: From 33fa91feb7fa81a3346b8d34ac927c1578bc719b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 4 Sep 2017 21:19:30 +0200 Subject: [PATCH 02/37] Restore correctness of parser model --- spacy/syntax/nn_parser.pyx | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 34e504da9..2aaae4f05 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -393,8 +393,7 @@ cdef class Parser: tokvecs = self.model[0].ops.flatten(tokvecses) if USE_FINE_TUNE: - # TODO: This is incorrect! Unhack when training next model - tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) + tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) nr_state = len(docs) nr_class = self.moves.n_moves @@ -532,8 +531,8 @@ cdef class Parser: docs = [docs] golds = [golds] if USE_FINE_TUNE: - my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) - tokvecs += self.model[0].ops.flatten(my_tokvecs) + tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) + tokvecs = self.model[0].ops.flatten(tokvecs) cuda_stream = get_cuda_stream() @@ -606,8 +605,8 @@ cdef class Parser: assert min(lengths) >= 1 tokvecs = self.model[0].ops.flatten(tokvecs) if USE_FINE_TUNE: - my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) - tokvecs += self.model[0].ops.flatten(my_tokvecs) + tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) + tokvecs = self.model[0].ops.flatten(tokvecs) states = self.moves.init_batch(docs) for gold in golds: From 5384fff5ceaa694109b6a6efa790877aabd5be7e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 6 Sep 2017 18:40:18 +0200 Subject: [PATCH 03/37] Add test for 1305: Incorrect lemmatization of VBZ for English --- spacy/tests/regression/test_issue1305.py | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 spacy/tests/regression/test_issue1305.py diff --git a/spacy/tests/regression/test_issue1305.py b/spacy/tests/regression/test_issue1305.py new file mode 100644 index 000000000..e123ce0ba --- /dev/null +++ b/spacy/tests/regression/test_issue1305.py @@ -0,0 +1,8 @@ +import pytest + +@pytest.mark.models('en') +def test_issue1305(EN): + '''Test lemmatization of English VBZ''' + assert EN.vocab.morphology.lemmatizer('works', 'verb') == set(['work']) + doc = EN(u'This app works well') + assert doc[2].lemma_ == 'work' From 497a9308a8775b51c79621bfe5f03aab1bf6696c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 6 Sep 2017 18:41:22 +0200 Subject: [PATCH 04/37] Xfail new lemmatizer test --- spacy/tests/regression/test_issue1305.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/regression/test_issue1305.py b/spacy/tests/regression/test_issue1305.py index e123ce0ba..c75b42434 100644 --- 
a/spacy/tests/regression/test_issue1305.py +++ b/spacy/tests/regression/test_issue1305.py @@ -1,5 +1,6 @@ import pytest +@pytest.mark.xfail @pytest.mark.models('en') def test_issue1305(EN): '''Test lemmatization of English VBZ''' From dd9cab0fafe0c9df863a5f539303b1bcfa4f1feb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 6 Sep 2017 19:03:05 +0200 Subject: [PATCH 05/37] Fix type-check for int/long --- spacy/syntax/transition_system.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 9cf82e0c7..055129c8b 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -148,7 +148,7 @@ cdef class TransitionSystem: def add_action(self, int action, label_name): cdef attr_t label_id - if not isinstance(label_name, int): + if not isinstance(label_name, (int, long)): label_id = self.strings.add(label_name) else: label_id = label_name From 5c3ff069242804da4aba48ec6d09777eb91f74b9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 6 Sep 2017 19:13:24 +0200 Subject: [PATCH 06/37] Fix lemmatizer rules --- spacy/lemmatizer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 4d534b50f..3a04a471d 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -25,6 +25,7 @@ class Lemmatizer(object): elif univ_pos == PUNCT: univ_pos = 'punct' # See Issue #435 for example of where this logic is requied. + print("Check base form", string) if self.is_base_form(univ_pos, morphology): return set([string.lower()]) lemmas = lemmatize(string, self.index.get(univ_pos, {}), @@ -38,7 +39,8 @@ class Lemmatizer(object): avoid lemmatization entirely. """ morphology = {} if morphology is None else morphology - others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')] + others = [key for key in morphology + if key not in (POS, 'Number', 'POS', 'VerbForm', 'Tense')] true_morph_key = morphology.get('morph', 0) if univ_pos == 'noun' and morphology.get('Number') == 'sing': return True @@ -47,7 +49,9 @@ class Lemmatizer(object): # This maps 'VBP' to base form -- probably just need 'IS_BASE' # morphology elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \ - morphology.get('Tense') == 'pres'): + morphology.get('Tense') == 'pres' and \ + morphology.get('Number') is None and \ + not others): return True elif univ_pos == 'adj' and morphology.get('Degree') == 'pos': return True From 99e44fbdbbb34f33f3a02890cec03bea2b0a4b74 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 6 Sep 2017 19:13:51 +0200 Subject: [PATCH 07/37] Update regression test --- spacy/tests/regression/test_issue429.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/spacy/tests/regression/test_issue429.py b/spacy/tests/regression/test_issue429.py index 1baa9a1db..74f12bd9f 100644 --- a/spacy/tests/regression/test_issue429.py +++ b/spacy/tests/regression/test_issue429.py @@ -9,11 +9,14 @@ import pytest @pytest.mark.models('en') def test_issue429(EN): def merge_phrases(matcher, doc, i, matches): - if i != len(matches) - 1: - return None - spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches] - for ent_id, label, span in spans: - span.merge('NNP' if label else span.root.tag_, span.text, EN.vocab.strings[label]) + if i != len(matches) - 1: + return None + spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches] + for ent_id, 
label, span in spans: + span.merge( + tag=('NNP' if label else span.root.tag_), + lemma=span.text, + label='PERSON') doc = EN('a') matcher = Matcher(EN.vocab) From 456bb8a74c384bcef58d562525c5ce914b0bed76 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 6 Sep 2017 19:14:17 +0200 Subject: [PATCH 08/37] Unxfail and close #1305 --- spacy/tests/regression/test_issue1305.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/tests/regression/test_issue1305.py b/spacy/tests/regression/test_issue1305.py index c75b42434..e123ce0ba 100644 --- a/spacy/tests/regression/test_issue1305.py +++ b/spacy/tests/regression/test_issue1305.py @@ -1,6 +1,5 @@ import pytest -@pytest.mark.xfail @pytest.mark.models('en') def test_issue1305(EN): '''Test lemmatization of English VBZ''' From b8e81daccfd0ccf1388a7538ffcd9e6489e8d9ec Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 14 Sep 2017 12:49:59 +0200 Subject: [PATCH 09/37] Fix typo (closes #1312) --- website/docs/usage/customizing-tokenizer.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade index 7e0b4b479..0bc81771d 100644 --- a/website/docs/usage/customizing-tokenizer.jade +++ b/website/docs/usage/customizing-tokenizer.jade @@ -282,7 +282,7 @@ p def __call__(self, text): words = text.split(' ') # All tokens 'own' a subsequent space character in this tokenizer - spaces = [True] * len(word) + spaces = [True] * len(words) return Doc(self.vocab, words=words, spaces=spaces) p From ba23d63c35bf9187f093804f93af4fd345cfa1e3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 14 Sep 2017 13:37:41 +0200 Subject: [PATCH 10/37] Fix minibatch function, for fixed batch size --- spacy/gold.pyx | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index f00d04109..fc8d6622b 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -7,6 +7,7 @@ import re import ujson import random import cytoolz +import itertools from .syntax import nonproj from .util import ensure_path @@ -146,9 +147,13 @@ def minibatch(items, size=8): '''Iterate over batches of items. `size` may be an iterator, so that batch-size can vary on each step. ''' + if isinstance(size, int): + size_ = itertools.repeat(8) + else: + size_ = size items = iter(items) while True: - batch_size = next(size) #if hasattr(size, '__next__') else size + batch_size = next(size_) batch = list(cytoolz.take(int(batch_size), items)) if len(batch) == 0: break From 9cb2aef5877b342ef44cd77386328ee91039088e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 14 Sep 2017 13:38:28 +0200 Subject: [PATCH 11/37] Remove print statement --- spacy/lemmatizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 3a04a471d..312c8db72 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -25,7 +25,6 @@ class Lemmatizer(object): elif univ_pos == PUNCT: univ_pos = 'punct' # See Issue #435 for example of where this logic is requied. 
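A note on the minibatch fix in patch 10 above: wrapping a fixed batch size in an iterator lets the `next(size_)` call work for both plain ints and batch-size schedules. A standalone sketch of the intended behaviour follows — though note that the patch as written repeats the constant 8 rather than the `size` argument, so `minibatch(items, size=16)` would still yield batches of 8; `itertools.repeat(size)` is presumably what was meant:

    import itertools
    import cytoolz

    def minibatch(items, size=8):
        # Accept either a fixed int or an iterator of batch sizes.
        size_ = itertools.repeat(size) if isinstance(size, int) else size
        items = iter(items)
        while True:
            batch = list(cytoolz.take(int(next(size_)), items))
            if not batch:
                break
            yield batch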
- print("Check base form", string) if self.is_base_form(univ_pos, morphology): return set([string.lower()]) lemmas = lemmatize(string, self.index.get(univ_pos, {}), From 683d81bb49096867f5ad8d3dde23217ea54d6790 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 14 Sep 2017 16:15:59 +0200 Subject: [PATCH 12/37] Update example for adding entity type --- examples/training/train_new_entity_type.py | 87 ++++++++++------------ 1 file changed, 40 insertions(+), 47 deletions(-) diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index 4eae11c75..ab69285a6 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -25,7 +25,7 @@ For more details, see the documentation: * Saving and loading models: https://spacy.io/docs/usage/saving-loading Developed for: spaCy 1.7.6 -Last tested for: spaCy 1.7.6 +Last updated for: spaCy 2.0.0a13 """ from __future__ import unicode_literals, print_function @@ -34,55 +34,41 @@ from pathlib import Path import random import spacy -from spacy.gold import GoldParse -from spacy.tagger import Tagger +from spacy.gold import GoldParse, minibatch +from spacy.pipeline import NeuralEntityRecognizer +from spacy.pipeline import TokenVectorEncoder +def get_gold_parses(tokenizer, train_data): + '''Shuffle and create GoldParse objects''' + random.shuffle(train_data) + for raw_text, entity_offsets in train_data: + doc = tokenizer(raw_text) + gold = GoldParse(doc, entities=entity_offsets) + yield doc, gold + + def train_ner(nlp, train_data, output_dir): - # Add new words to vocab - for raw_text, _ in train_data: - doc = nlp.make_doc(raw_text) - for word in doc: - _ = nlp.vocab[word.orth] random.seed(0) - # You may need to change the learning rate. It's generally difficult to - # guess what rate you should set, especially when you have limited data. - nlp.entity.model.learn_rate = 0.001 - for itn in range(1000): - random.shuffle(train_data) - loss = 0. - for raw_text, entity_offsets in train_data: - gold = GoldParse(doc, entities=entity_offsets) - # By default, the GoldParse class assumes that the entities - # described by offset are complete, and all other words should - # have the tag 'O'. You can tell it to make no assumptions - # about the tag of a word by giving it the tag '-'. - # However, this allows a trivial solution to the current - # learning problem: if words are either 'any tag' or 'ANIMAL', - # the model can learn that all words can be tagged 'ANIMAL'. - #for i in range(len(gold.ner)): - #if not gold.ner[i].endswith('ANIMAL'): - # gold.ner[i] = '-' - doc = nlp.make_doc(raw_text) - nlp.tagger(doc) - # As of 1.9, spaCy's parser now lets you supply a dropout probability - # This might help the model generalize better from only a few - # examples. - loss += nlp.entity.update(doc, gold, drop=0.9) - if loss == 0: - break - # This step averages the model's weights. This may or may not be good for - # your situation --- it's empirical. 
- nlp.end_training() - if output_dir: - if not output_dir.exists(): - output_dir.mkdir() - nlp.save_to_directory(output_dir) + optimizer = nlp.begin_training(lambda: []) + nlp.meta['name'] = 'en_ent_animal' + for itn in range(50): + losses = {} + for batch in minibatch(get_gold_parses(nlp.make_doc, train_data), size=3): + docs, golds = zip(*batch) + nlp.update(docs, golds, losses=losses, sgd=optimizer, update_shared=True, + drop=0.35) + print(losses) + if not output_dir: + return + elif not output_dir.exists(): + output_dir.mkdir() + nlp.to_disk(output_dir) def main(model_name, output_directory=None): - print("Loading initial model", model_name) - nlp = spacy.load(model_name) + print("Creating initial model", model_name) + nlp = spacy.blank(model_name) if output_directory is not None: output_directory = Path(output_directory) @@ -91,6 +77,11 @@ def main(model_name, output_directory=None): "Horses are too tall and they pretend to care about your feelings", [(0, 6, 'ANIMAL')], ), + ( + "Do they bite?", + [], + ), + ( "horses are too tall and they pretend to care about your feelings", [(0, 6, 'ANIMAL')] @@ -109,18 +100,20 @@ def main(model_name, output_directory=None): ) ] - nlp.entity.add_label('ANIMAL') + nlp.pipeline.append(TokenVectorEncoder(nlp.vocab)) + nlp.pipeline.append(NeuralEntityRecognizer(nlp.vocab)) + nlp.pipeline[-1].add_label('ANIMAL') train_ner(nlp, train_data, output_directory) # Test that the entity is recognized - doc = nlp('Do you like horses?') + text = 'Do you like horses?' print("Ents in 'Do you like horses?':") + doc = nlp(text) for ent in doc.ents: print(ent.label_, ent.text) if output_directory: print("Loading from", output_directory) - nlp2 = spacy.load('en', path=output_directory) - nlp2.entity.add_label('ANIMAL') + nlp2 = spacy.load(output_directory) doc2 = nlp2('Do you like horses?') for ent in doc2.ents: print(ent.label_, ent.text) From daf869ab3b02a6e3ab36fe6b2bf5e4c7c0a72049 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 14 Sep 2017 16:16:41 +0200 Subject: [PATCH 13/37] Fix add_action for NER, so labelled 'O' actions aren't added --- spacy/syntax/ner.pyx | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 2f5cd4e48..11b429aa2 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -220,6 +220,31 @@ cdef class BiluoPushDown(TransitionSystem): raise Exception(move) return t + def add_action(self, int action, label_name): + cdef attr_t label_id + if not isinstance(label_name, (int, long)): + label_id = self.strings.add(label_name) + else: + label_id = label_name + if action == OUT and label_id != 0: + return + if action == MISSING or action == ISNT: + return + # Check we're not creating a move we already have, so that this is + # idempotent + for trans in self.c[:self.n_moves]: + if trans.move == action and trans.label == label_id: + return 0 + if self.n_moves >= self._size: + self._size *= 2 + self.c = self.mem.realloc(self.c, self._size * sizeof(self.c[0])) + self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id) + assert self.c[self.n_moves].label == label_id + self.n_moves += 1 + return 1 + + + cdef int initialize_state(self, StateC* st) nogil: # This is especially necessary when we use limited training data. 
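Patch 13's `add_action` override above encodes two rules: moves that carry no information (a labelled 'O', a missing annotation, or '!O') are silently dropped, and re-adding a known (move, label) pair is a no-op, so label registration stays idempotent. A pure-Python sketch of the filter, using string stand-ins for the C-level move constants and a hypothetical helper name:

    def should_add_action(existing_actions, move, label_id):
        if move == 'OUT' and label_id != 0:
            return False  # a labelled 'O' action can never be correct
        if move in ('MISSING', 'ISNT'):
            return False  # these moves add no trainable action
        # Idempotent: re-adding an existing (move, label) pair is a no-op.
        return (move, label_id) not in existing_actions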
for i in range(st.length): From c6395b057a6cd65fe931f5b9b8aece35e94f16d7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 14 Sep 2017 16:18:02 +0200 Subject: [PATCH 14/37] Improve parser feature extraction, for missing values --- spacy/syntax/_state.pxd | 13 +++++++++---- spacy/syntax/nn_parser.pyx | 16 +++++++++++++++- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd index 3da9e5d4c..9a08691de 100644 --- a/spacy/syntax/_state.pxd +++ b/spacy/syntax/_state.pxd @@ -101,9 +101,10 @@ cdef cppclass StateC: elif n == 6: if this.B(0) >= 0: ids[0] = this.B(0) + ids[1] = this.B(0)-1 else: ids[0] = -1 - ids[1] = this.B(0) + ids[1] = -1 ids[2] = this.B(1) ids[3] = this.E(0) if ids[3] >= 1: @@ -118,8 +119,12 @@ cdef cppclass StateC: # TODO error =/ pass for i in range(n): + # Token vectors should be padded, so that there's a vector for + # missing values at the start. if ids[i] >= 0: - ids[i] += this.offset + ids[i] += this.offset + 1 + else: + ids[i] = 0 int S(int i) nogil const: if i >= this._s_i: @@ -162,9 +167,9 @@ cdef cppclass StateC: int E(int i) nogil const: if this._e_i <= 0 or this._e_i >= this.length: - return 0 + return -1 if i < 0 or i >= this._e_i: - return 0 + return -1 return this._ents[this._e_i - (i+1)].start int L(int i, int idx) nogil const: diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 552ea4f8f..ad6ed280e 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -394,7 +394,7 @@ cdef class Parser: tokvecs = self.model[0].ops.flatten(tokvecses) if USE_FINE_TUNE: tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) - + tokvecs = self._pad_tokvecs(tokvecs) nr_state = len(docs) nr_class = self.moves.n_moves nr_dim = tokvecs.shape[1] @@ -454,6 +454,7 @@ cdef class Parser: tokvecs = self.model[0].ops.flatten(tokvecses) if USE_FINE_TUNE: tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) + tokvecs = self._pad_tokvecs(tokvecs) cuda_stream = get_cuda_stream() state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs, cuda_stream, 0.0) @@ -534,6 +535,8 @@ cdef class Parser: tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) tokvecs = self.model[0].ops.flatten(tokvecs) + tokvecs = self._pad_tokvecs(tokvecs) + cuda_stream = get_cuda_stream() states, golds, max_steps = self._init_gold_batch(docs, golds) @@ -583,6 +586,7 @@ cdef class Parser: break self._make_updates(d_tokvecs, backprops, sgd, cuda_stream) + d_tokvecs = self._unpad_tokvecs(d_tokvecs) d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) if USE_FINE_TUNE: d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd) @@ -639,10 +643,20 @@ cdef class Parser: d_tokvecs = self.model[0].ops.allocate(tokvecs.shape) self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream) d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths) + d_tokvecs = self._unpad_tokvecs(d_tokvecs) if USE_FINE_TUNE: d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd) return d_tokvecs + def _pad_tokvecs(self, tokvecs): + # Add a vector for missing values at the start of tokvecs + xp = get_array_module(tokvecs) + pad = xp.zeros((1, tokvecs.shape[1]), dtype=tokvecs.dtype) + return xp.vstack((pad, tokvecs)) + + def _unpad_tokvecs(self, d_tokvecs): + return d_tokvecs[1:] + def _init_gold_batch(self, whole_docs, whole_golds): """Make a square batch, of length equal to the shortest doc. A long doc will get multiple states. 
Let's say we have a doc of length 2*N, From 70da88a3a74e17b0c15fd9224c025a5c556625aa Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 14 Sep 2017 16:18:30 +0200 Subject: [PATCH 15/37] Update comment on Language.begin_training --- spacy/language.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 66b42ff94..e6a5304dd 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -347,15 +347,9 @@ class Language(object): """Allocate models, pre-process training data and acquire a trainer and optimizer. Used as a contextmanager. - gold_tuples (iterable): Gold-standard training data. + get_gold_tuples (function): Function returning gold data **cfg: Config parameters. - YIELDS (tuple): A trainer and an optimizer. - - EXAMPLE: - >>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer): - >>> for epoch in trainer.epochs(gold): - >>> for docs, golds in epoch: - >>> state = nlp.update(docs, golds, sgd=optimizer) + returns: An optimizer """ if self.parser: self.pipeline.append(NeuralLabeller(self.vocab)) From d1518027a980f57d6ee88d6d99e161267ab9ad25 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 14 Sep 2017 16:18:46 +0200 Subject: [PATCH 16/37] Increment version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index d566fbb1f..40444ffd1 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -3,7 +3,7 @@ # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py __title__ = 'spacy-nightly' -__version__ = '2.0.0a13' +__version__ = '2.0.0a14' __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' __uri__ = 'https://spacy.io' __author__ = 'Explosion AI' From 664c5af745786312725917cd9a44418777868350 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 14 Sep 2017 16:59:25 +0200 Subject: [PATCH 17/37] Revert padding in parser --- spacy/syntax/_state.pxd | 6 ++---- spacy/syntax/nn_parser.pyx | 6 ------ 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd index 9a08691de..4fb16881a 100644 --- a/spacy/syntax/_state.pxd +++ b/spacy/syntax/_state.pxd @@ -119,12 +119,10 @@ cdef cppclass StateC: # TODO error =/ pass for i in range(n): - # Token vectors should be padded, so that there's a vector for - # missing values at the start. if ids[i] >= 0: - ids[i] += this.offset + 1 + ids[i] += this.offset else: - ids[i] = 0 + ids[i] = -1 int S(int i) nogil const: if i >= this._s_i: diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index ad6ed280e..3ea17f2fe 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -394,7 +394,6 @@ cdef class Parser: tokvecs = self.model[0].ops.flatten(tokvecses) if USE_FINE_TUNE: tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) - tokvecs = self._pad_tokvecs(tokvecs) nr_state = len(docs) nr_class = self.moves.n_moves nr_dim = tokvecs.shape[1] @@ -454,7 +453,6 @@ cdef class Parser: tokvecs = self.model[0].ops.flatten(tokvecses) if USE_FINE_TUNE: tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) - tokvecs = self._pad_tokvecs(tokvecs) cuda_stream = get_cuda_stream() state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs, cuda_stream, 0.0) @@ -527,7 +525,6 @@ cdef class Parser: if losses is not None and self.name not in losses: losses[self.name] = 0. 
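Patches 14 and 17 bracket a short-lived experiment: patch 14 prepended a zero "missing value" row to the flattened token vectors (shifting the state feature ids by one so that id 0 selects the pad), and patch 17 backs it out again. For the record, a numpy sketch of the helper pair patch 14 introduced (names match the patch):

    import numpy

    def pad_tokvecs(tokvecs):
        # Row 0 stands in for missing values; real tokens shift up by one.
        pad = numpy.zeros((1, tokvecs.shape[1]), dtype=tokvecs.dtype)
        return numpy.vstack((pad, tokvecs))

    def unpad_tokvecs(d_tokvecs):
        # Drop the gradient belonging to the padding row.
        return d_tokvecs[1:]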
docs, tokvec_lists = docs_tokvecs - tokvecs = self.model[0].ops.flatten(tokvec_lists) if isinstance(docs, Doc) and isinstance(golds, GoldParse): docs = [docs] golds = [golds] @@ -535,7 +532,6 @@ cdef class Parser: tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) tokvecs = self.model[0].ops.flatten(tokvecs) - tokvecs = self._pad_tokvecs(tokvecs) cuda_stream = get_cuda_stream() @@ -586,7 +582,6 @@ cdef class Parser: break self._make_updates(d_tokvecs, backprops, sgd, cuda_stream) - d_tokvecs = self._unpad_tokvecs(d_tokvecs) d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) if USE_FINE_TUNE: d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd) @@ -643,7 +638,6 @@ cdef class Parser: d_tokvecs = self.model[0].ops.allocate(tokvecs.shape) self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream) d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths) - d_tokvecs = self._unpad_tokvecs(d_tokvecs) if USE_FINE_TUNE: d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd) return d_tokvecs From 8c503487af306e4ca1fc93372c28cecebede95ca Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 14 Sep 2017 16:59:45 +0200 Subject: [PATCH 18/37] Fix lookup of missing NER actions --- spacy/syntax/ner.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 11b429aa2..1a174aba8 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -161,8 +161,7 @@ cdef class BiluoPushDown(TransitionSystem): cdef Transition lookup_transition(self, object name) except *: cdef attr_t label if name == '-' or name == None: - move_str = 'M' - label = 0 + return Transition(clas=0, move=MISSING, label=0, score=0) elif name == '!O': return Transition(clas=0, move=ISNT, label=0, score=0) elif '-' in name: From 18347ab69ceb4d57a87269bd141b300081b82983 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 14 Sep 2017 19:07:35 +0200 Subject: [PATCH 19/37] Implement AddHistory layer wrapper --- spacy/_ml.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/spacy/_ml.py b/spacy/_ml.py index 003541f4b..d3c82897f 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -78,6 +78,37 @@ def add_tuples(X, drop=0.): return (vals1+vals2, length), add_tuples_bwd +def AddHistory(layer, decay=0.0001): + ops = layer.ops + nonlocals = [] + if layer.nI: + average_inputs = ops.allocate((layer.nO, layer.nI-layer.nO)) + nonlocals = [] + def history_fwd(X, drop=0.): + if not nonlocals: + nonlocals.append(ops.allocate((layer.nO, X.shape[1]))) + model.history = nonlocals[0] + average_inputs = nonlocals[0] + hist = ops.xp.tensordot(X, average_inputs, axes=[[1], [1]]) + X_hist = ops.xp.hstack((X, hist)) + Y, bp_Y = layer.begin_update(X_hist, drop=drop) + for i in range(Y.shape[0]): + amax = Y[i].argmax() + average_inputs[amax] *= 1-decay + average_inputs[amax] += decay * X[i] + def history_bwd(dY, sgd=None): + dX_hist = bp_Y(dY, sgd=sgd) + dX = dX_hist[:, :X.shape[1]] + return dX + return Y, history_bwd + model = wrap(history_fwd, layer) + if layer.nI: + model.history = average_inputs + else: + model.history = None + return model + + def _zero_init(model): def _zero_init_impl(self, X, y): self.W.fill(0) From bd3da3d6fb8626613e7ee76931ea6ae67786011e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 14 Sep 2017 19:23:13 +0200 Subject: [PATCH 20/37] Port over change from #1323 and tidy up --- spacy/lang/zh/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 3f68336f8..46ad3946f 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -14,8 +14,8 @@ class Chinese(Language): except ImportError: raise ImportError("The Chinese tokenizer requires the Jieba library: " "https://github.com/fxsjy/jieba") - words = list(jieba.cut(text, cut_all=True)) - words=[x for x in words if x] + words = list(jieba.cut(text, cut_all=False)) + words = [x for x in words if x] return Doc(self.vocab, words=words, spaces=[False]*len(words)) From d84607f6bb7fa561d65734b1d2d15770c5de05b9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 14 Sep 2017 20:34:40 +0200 Subject: [PATCH 21/37] Vectorize update in AddHistory --- spacy/_ml.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index d3c82897f..1f3d50cbd 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -81,31 +81,28 @@ def add_tuples(X, drop=0.): def AddHistory(layer, decay=0.0001): ops = layer.ops nonlocals = [] - if layer.nI: - average_inputs = ops.allocate((layer.nO, layer.nI-layer.nO)) - nonlocals = [] def history_fwd(X, drop=0.): if not nonlocals: - nonlocals.append(ops.allocate((layer.nO, X.shape[1]))) + if hasattr(layer, 'nO'): + nO = layer.nO + else: + nO = layer._layers[-1].nO + nonlocals.append(ops.allocate((nO, X.shape[1]))) model.history = nonlocals[0] average_inputs = nonlocals[0] hist = ops.xp.tensordot(X, average_inputs, axes=[[1], [1]]) X_hist = ops.xp.hstack((X, hist)) Y, bp_Y = layer.begin_update(X_hist, drop=drop) - for i in range(Y.shape[0]): - amax = Y[i].argmax() - average_inputs[amax] *= 1-decay - average_inputs[amax] += decay * X[i] + amax = Y.argmax(axis=1) + average_inputs *= 1-decay + ops.scatter_add(average_inputs, amax, X * decay) def history_bwd(dY, sgd=None): dX_hist = bp_Y(dY, sgd=sgd) dX = dX_hist[:, :X.shape[1]] - return dX + return ops.xp.ascontiguousarray(dX) return Y, history_bwd model = wrap(history_fwd, layer) - if layer.nI: - model.history = average_inputs - else: - model.history = None + model.history = None return model From 027a5d8b75c74fe2ae27d21ecb1d4ca36ec23cb3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 15 Sep 2017 10:36:46 +0200 Subject: [PATCH 22/37] Update train_ner_standalone example --- examples/training/train_ner_standalone.py | 192 +++++++++------------- 1 file changed, 80 insertions(+), 112 deletions(-) diff --git a/examples/training/train_ner_standalone.py b/examples/training/train_ner_standalone.py index 9591d1b71..6cca56c69 100644 --- a/examples/training/train_ner_standalone.py +++ b/examples/training/train_ner_standalone.py @@ -13,24 +13,27 @@ Input data: https://www.lt.informatik.tu-darmstadt.de/fileadmin/user_upload/Group_LangTech/data/GermEval2014_complete_data.zip Developed for: spaCy 1.7.1 -Last tested for: spaCy 1.7.1 +Last tested for: spaCy 2.0.0a13 ''' from __future__ import unicode_literals, print_function import plac from pathlib import Path import random import json +from thinc.neural.optimizers import Adam +from thinc.neural.ops import NumpyOps +import tqdm -import spacy.orth as orth_funcs from spacy.vocab import Vocab -from spacy.pipeline import BeamEntityRecognizer -from spacy.pipeline import EntityRecognizer +from spacy.pipeline import TokenVectorEncoder, NeuralEntityRecognizer from spacy.tokenizer import Tokenizer from spacy.tokens import Doc from spacy.attrs import * from spacy.gold import GoldParse -from spacy.gold import _iob_to_biluo as iob_to_biluo +from spacy.gold 
import iob_to_biluo +from spacy.gold import minibatch from spacy.scorer import Scorer +import spacy.util try: unicode @@ -38,95 +41,40 @@ except NameError: unicode = str +spacy.util.set_env_log(True) + + def init_vocab(): return Vocab( lex_attr_getters={ LOWER: lambda string: string.lower(), - SHAPE: orth_funcs.word_shape, + NORM: lambda string: string.lower(), PREFIX: lambda string: string[0], SUFFIX: lambda string: string[-3:], - CLUSTER: lambda string: 0, - IS_ALPHA: orth_funcs.is_alpha, - IS_ASCII: orth_funcs.is_ascii, - IS_DIGIT: lambda string: string.isdigit(), - IS_LOWER: orth_funcs.is_lower, - IS_PUNCT: orth_funcs.is_punct, - IS_SPACE: lambda string: string.isspace(), - IS_TITLE: orth_funcs.is_title, - IS_UPPER: orth_funcs.is_upper, - IS_STOP: lambda string: False, - IS_OOV: lambda string: True }) -def save_vocab(vocab, path): - path = Path(path) - if not path.exists(): - path.mkdir() - elif not path.is_dir(): - raise IOError("Can't save vocab to %s\nNot a directory" % path) - with (path / 'strings.json').open('w') as file_: - vocab.strings.dump(file_) - vocab.dump((path / 'lexemes.bin').as_posix()) - - -def load_vocab(path): - path = Path(path) - if not path.exists(): - raise IOError("Cannot load vocab from %s\nDoes not exist" % path) - if not path.is_dir(): - raise IOError("Cannot load vocab from %s\nNot a directory" % path) - return Vocab.load(path) - - -def init_ner_model(vocab, features=None): - if features is None: - features = tuple(EntityRecognizer.feature_templates) - return EntityRecognizer(vocab, features=features) - - -def save_ner_model(model, path): - path = Path(path) - if not path.exists(): - path.mkdir() - if not path.is_dir(): - raise IOError("Can't save model to %s\nNot a directory" % path) - model.model.dump((path / 'model').as_posix()) - with (path / 'config.json').open('w') as file_: - data = json.dumps(model.cfg) - if not isinstance(data, unicode): - data = data.decode('utf8') - file_.write(data) - - -def load_ner_model(vocab, path): - return EntityRecognizer.load(path, vocab) - - class Pipeline(object): - @classmethod - def load(cls, path): - path = Path(path) - if not path.exists(): - raise IOError("Cannot load pipeline from %s\nDoes not exist" % path) - if not path.is_dir(): - raise IOError("Cannot load pipeline from %s\nNot a directory" % path) - vocab = load_vocab(path) - tokenizer = Tokenizer(vocab, {}, None, None, None) - ner_model = load_ner_model(vocab, path / 'ner') - return cls(vocab, tokenizer, ner_model) - - def __init__(self, vocab=None, tokenizer=None, entity=None): + def __init__(self, vocab=None, tokenizer=None, tensorizer=None, entity=None): if vocab is None: vocab = init_vocab() if tokenizer is None: tokenizer = Tokenizer(vocab, {}, None, None, None) + if tensorizer is None: + tensorizer = TokenVectorEncoder(vocab) if entity is None: - entity = init_ner_model(self.vocab) + entity = NeuralEntityRecognizer(vocab) self.vocab = vocab self.tokenizer = tokenizer + self.tensorizer = tensorizer self.entity = entity - self.pipeline = [self.entity] + self.pipeline = [tensorizer, self.entity] + + def begin_training(self): + for model in self.pipeline: + model.begin_training([]) + optimizer = Adam(NumpyOps(), 0.001) + return optimizer def __call__(self, input_): doc = self.make_doc(input_) @@ -147,14 +95,18 @@ class Pipeline(object): gold = GoldParse(doc, entities=annotations) return gold - def update(self, input_, annot): - doc = self.make_doc(input_) - gold = self.make_gold(input_, annot) - for ner in gold.ner: - if ner not in (None, '-', 'O'): - 
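        # (Above: the pre-2.0 API, one (doc, gold) pair per update call.
        # The batched replacement below threads gradients from the entity
        # recognizer back through the tensorizer via bp_tensors.)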
action, label = ner.split('-', 1) - self.entity.add_label(label) - return self.entity.update(doc, gold) + def update(self, inputs, annots, sgd, losses=None, drop=0.): + if losses is None: + losses = {} + docs = [self.make_doc(input_) for input_ in inputs] + golds = [self.make_gold(input_, annot) for input_, annot in + zip(inputs, annots)] + + tensors, bp_tensors = self.tensorizer.update(docs, golds, drop=drop) + d_tensors = self.entity.update((docs, tensors), golds, drop=drop, + sgd=sgd, losses=losses) + bp_tensors(d_tensors, sgd=sgd) + return losses def evaluate(self, examples): scorer = Scorer() @@ -164,34 +116,38 @@ class Pipeline(object): scorer.score(doc, gold) return scorer.scores - def average_weights(self): - self.entity.model.end_training() - - def save(self, path): + def to_disk(self, path): path = Path(path) if not path.exists(): path.mkdir() elif not path.is_dir(): raise IOError("Can't save pipeline to %s\nNot a directory" % path) - save_vocab(self.vocab, path / 'vocab') - save_ner_model(self.entity, path / 'ner') + self.vocab.to_disk(path / 'vocab') + self.tensorizer.to_disk(path / 'tensorizer') + self.entity.to_disk(path / 'ner') + + def from_disk(self, path): + path = Path(path) + if not path.exists(): + raise IOError("Cannot load pipeline from %s\nDoes not exist" % path) + if not path.is_dir(): + raise IOError("Cannot load pipeline from %s\nNot a directory" % path) + self.vocab = self.vocab.from_disk(path / 'vocab') + self.tensorizer = self.tensorizer.from_disk(path / 'tensorizer') + self.entity = self.entity.from_disk(path / 'ner') -def train(nlp, train_examples, dev_examples, ctx, nr_epoch=5): - next_epoch = train_examples +def train(nlp, train_examples, dev_examples, nr_epoch=5): + sgd = nlp.begin_training() print("Iter", "Loss", "P", "R", "F") for i in range(nr_epoch): - this_epoch = next_epoch - next_epoch = [] - loss = 0 - for input_, annot in this_epoch: - loss += nlp.update(input_, annot) - if (i+1) < nr_epoch: - next_epoch.append((input_, annot)) - random.shuffle(next_epoch) + random.shuffle(train_examples) + losses = {} + for batch in minibatch(tqdm.tqdm(train_examples, leave=False), size=8): + inputs, annots = zip(*batch) + nlp.update(list(inputs), list(annots), sgd, losses=losses) scores = nlp.evaluate(dev_examples) - report_scores(i, loss, scores) - nlp.average_weights() + report_scores(i, losses['ner'], scores) scores = nlp.evaluate(dev_examples) report_scores(channels, i+1, loss, scores) @@ -208,7 +164,8 @@ def read_examples(path): with path.open() as file_: sents = file_.read().strip().split('\n\n') for sent in sents: - if not sent.strip(): + sent = sent.strip() + if not sent: continue tokens = sent.split('\n') while tokens and tokens[0].startswith('#'): @@ -217,28 +174,39 @@ def read_examples(path): iob = [] for token in tokens: if token.strip(): - pieces = token.split() + pieces = token.split('\t') words.append(pieces[1]) iob.append(pieces[2]) yield words, iob_to_biluo(iob) +def get_labels(examples): + labels = set() + for words, tags in examples: + for tag in tags: + if '-' in tag: + labels.add(tag.split('-')[1]) + return sorted(labels) + + @plac.annotations( model_dir=("Path to save the model", "positional", None, Path), train_loc=("Path to your training data", "positional", None, Path), dev_loc=("Path to your development data", "positional", None, Path), ) -def main(model_dir=Path('/home/matt/repos/spaCy/spacy/data/de-1.0.0'), - train_loc=None, dev_loc=None, nr_epoch=30): - - train_examples = read_examples(train_loc) +def main(model_dir, train_loc, 
dev_loc, nr_epoch=30): + print(model_dir, train_loc, dev_loc) + train_examples = list(read_examples(train_loc)) dev_examples = read_examples(dev_loc) - nlp = Pipeline.load(model_dir) + nlp = Pipeline() + for label in get_labels(train_examples): + nlp.entity.add_label(label) + print("Add label", label) - train(nlp, train_examples, list(dev_examples), ctx, nr_epoch) + train(nlp, train_examples, list(dev_examples), nr_epoch) - nlp.save(model_dir) + nlp.to_disk(model_dir) if __name__ == '__main__': - main() + plac.call(main) From 8b481e04658443013e132988dc77740b4aa6a167 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 15 Sep 2017 10:38:08 +0200 Subject: [PATCH 23/37] Remove redundant brackets --- spacy/syntax/nn_parser.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 3ea17f2fe..e2dc35966 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -262,8 +262,8 @@ cdef class Parser: upper.is_noop = True else: upper = chain( - clone(Maxout(hidden_width), (depth-1)), - zero_init(Affine(nr_class, drop_factor=0.0)) + clone(Maxout(hidden_width), depth-1), + zero_init(Affine(nr_class, hidden_width, drop_factor=0.0)) ) upper.is_noop = False # TODO: This is an unfortunate hack atm! From 2f08489694f0ad74f03ccf566814628c57a1976c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 15 Sep 2017 10:41:40 +0200 Subject: [PATCH 24/37] Remove AddHistory layer -- didnt work as planned --- spacy/_ml.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 1f3d50cbd..003541f4b 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -78,34 +78,6 @@ def add_tuples(X, drop=0.): return (vals1+vals2, length), add_tuples_bwd -def AddHistory(layer, decay=0.0001): - ops = layer.ops - nonlocals = [] - def history_fwd(X, drop=0.): - if not nonlocals: - if hasattr(layer, 'nO'): - nO = layer.nO - else: - nO = layer._layers[-1].nO - nonlocals.append(ops.allocate((nO, X.shape[1]))) - model.history = nonlocals[0] - average_inputs = nonlocals[0] - hist = ops.xp.tensordot(X, average_inputs, axes=[[1], [1]]) - X_hist = ops.xp.hstack((X, hist)) - Y, bp_Y = layer.begin_update(X_hist, drop=drop) - amax = Y.argmax(axis=1) - average_inputs *= 1-decay - ops.scatter_add(average_inputs, amax, X * decay) - def history_bwd(dY, sgd=None): - dX_hist = bp_Y(dY, sgd=sgd) - dX = dX_hist[:, :X.shape[1]] - return ops.xp.ascontiguousarray(dX) - return Y, history_bwd - model = wrap(history_fwd, layer) - model.history = None - return model - - def _zero_init(model): def _zero_init_impl(self, X, y): self.W.fill(0) From 86367ab092d75af98bfb68bc3b6c499d28d0067f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 15 Sep 2017 12:41:59 +0200 Subject: [PATCH 25/37] Start work on appveyor, for Windows build --- .appveyor.yml | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/.appveyor.yml b/.appveyor.yml index 4dd7b0a31..d63512fcf 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1 +1,53 @@ +environment: + + matrix: + + # For Python versions available on Appveyor, see + # http://www.appveyor.com/docs/installed-software#python + # The list here is complete (excluding Python 2.6, which + # isn't covered by this document) at the time of writing. 
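  # Only Python 2.7 is enabled to begin with; patch 26 later in this
  # series switches on C:\Python36-x64 as well.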
+ + - PYTHON: "C:\\Python27" + #- PYTHON: "C:\\Python33" + #- PYTHON: "C:\\Python34" + #- PYTHON: "C:\\Python35" + #- PYTHON: "C:\\Python27-x64" + #- PYTHON: "C:\\Python33-x64" + #- DISTUTILS_USE_SDK: "1" + #- PYTHON: "C:\\Python34-x64" + #- DISTUTILS_USE_SDK: "1" + #- PYTHON: "C:\\Python35-x64" + #- PYTHON: "C:\\Python36-x64" + +install: + # We need wheel installed to build wheels + - "%PYTHON%\\python.exe -m pip install wheel" + - "%PYTHON%\\python.exe -m pip install -e ." + build: off + +test_script: + # Put your test command here. + # If you don't need to build C extensions on 64-bit Python 3.3 or 3.4, + # you can remove "build.cmd" from the front of the command, as it's + # only needed to support those cases. + # Note that you must use the environment variable %PYTHON% to refer to + # the interpreter you're using - Appveyor does not do anything special + # to put the Python version you want to use on PATH. + - "%PYTHON%\\python.exe -m pytest spacy/" + +after_test: + # This step builds your wheels. + # Again, you only need build.cmd if you're building C extensions for + # 64-bit Python 3.3/3.4. And you need to use %PYTHON% to get the correct + # interpreter + - "%PYTHON%\\python.exe setup.py bdist_wheel" + +artifacts: + # bdist_wheel puts your built wheel in the dist directory + - path: dist\* + +#on_success: +# You can use this step to upload your artifacts to a public website. +# See Appveyor's documentation for more details. Or you can simply +# access your wheels from the Appveyor "artifacts" tab for your build. From 1f840a9211347ec5d3f7dc64eccf74e254aa414c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 15 Sep 2017 12:49:02 +0200 Subject: [PATCH 26/37] Appveyor --- .appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.appveyor.yml b/.appveyor.yml index d63512fcf..1fc3c920f 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -17,7 +17,7 @@ environment: #- PYTHON: "C:\\Python34-x64" #- DISTUTILS_USE_SDK: "1" #- PYTHON: "C:\\Python35-x64" - #- PYTHON: "C:\\Python36-x64" + - PYTHON: "C:\\Python36-x64" install: # We need wheel installed to build wheels From 25ec8935adfc50f25109b411dd59980c2f065c52 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 15 Sep 2017 12:53:07 +0200 Subject: [PATCH 27/37] Appveyor --- .appveyor.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.appveyor.yml b/.appveyor.yml index 1fc3c920f..f2d166754 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -22,6 +22,7 @@ environment: install: # We need wheel installed to build wheels - "%PYTHON%\\python.exe -m pip install wheel" + - "%PYTHON%\\python.exe -m pip install cython" - "%PYTHON%\\python.exe -m pip install -e ." build: off From 02273eeca8290e5fb7906871c76a9e6db1c6f943 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 15 Sep 2017 12:55:33 +0200 Subject: [PATCH 28/37] Appveyor --- .appveyor.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.appveyor.yml b/.appveyor.yml index f2d166754..a379cdd31 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -23,6 +23,7 @@ install: # We need wheel installed to build wheels - "%PYTHON%\\python.exe -m pip install wheel" - "%PYTHON%\\python.exe -m pip install cython" + - "%PYTHON%\\python.exe -m pip install -r requirements.txt" - "%PYTHON%\\python.exe -m pip install -e ." 
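  # MSBuild is disabled ("build: off" just below); the C extensions are
  # compiled by pip during the editable install above. Patch 30 later
  # adds an explicit build_ext step in front of it as well.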
build: off From 07cdbd121910c7984f7f597234b0cf986af9c2d2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 15 Sep 2017 22:47:53 +0200 Subject: [PATCH 29/37] Require thinc 6.8.1, for Windows --- requirements.txt | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index aae0f9388..54c888a11 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ -cython<0.24 +cython>=0.24 pathlib numpy>=1.7 cymem>=1.30,<1.32 preshed>=1.0.0,<2.0.0 -thinc>=6.8.0,<6.9.0 +thinc>=6.8.1,<6.9.0 murmurhash>=0.28,<0.29 plac<1.0.0,>=0.9.6 six diff --git a/setup.py b/setup.py index 6a22f4076..535dddd0d 100755 --- a/setup.py +++ b/setup.py @@ -195,7 +195,7 @@ def setup_package(): 'murmurhash>=0.28,<0.29', 'cymem>=1.30,<1.32', 'preshed>=1.0.0,<2.0.0', - 'thinc>=6.8.0,<6.9.0', + 'thinc>=6.8.1,<6.9.0', 'plac<1.0.0,>=0.9.6', 'pip>=9.0.0,<10.0.0', 'six', From 2432308f3ef136e5e90d7be39ea795d3c6e61510 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 15 Sep 2017 23:55:19 +0200 Subject: [PATCH 30/37] Build in separate step for appveyor --- .appveyor.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.appveyor.yml b/.appveyor.yml index a379cdd31..8dbdd2868 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -24,6 +24,7 @@ install: - "%PYTHON%\\python.exe -m pip install wheel" - "%PYTHON%\\python.exe -m pip install cython" - "%PYTHON%\\python.exe -m pip install -r requirements.txt" + - "%PYTHON%\\python.exe -m python setup.py build_ext --inplace" - "%PYTHON%\\python.exe -m pip install -e ." build: off From 1ffc9a7fbfdd7ef620b025a770c4ba07305ce81d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 15 Sep 2017 23:59:36 +0200 Subject: [PATCH 31/37] Fix appveyor --- .appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.appveyor.yml b/.appveyor.yml index 8dbdd2868..12399a5a1 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -24,7 +24,7 @@ install: - "%PYTHON%\\python.exe -m pip install wheel" - "%PYTHON%\\python.exe -m pip install cython" - "%PYTHON%\\python.exe -m pip install -r requirements.txt" - - "%PYTHON%\\python.exe -m python setup.py build_ext --inplace" + - "%PYTHON%\\python.exe setup.py build_ext --inplace" - "%PYTHON%\\python.exe -m pip install -e ." 
build: off From f730d07e4e7ab3d4cba3d16537b7ca7a5a098307 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 16 Sep 2017 00:25:18 +0200 Subject: [PATCH 32/37] Fix prange error for Windows --- spacy/syntax/nn_parser.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index e2dc35966..ea484f1c2 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -419,7 +419,7 @@ cdef class Parser: cdef int has_hidden = not getattr(vec2scores, 'is_noop', False) while not next_step.empty(): if not has_hidden: - for i in cython.parallel.prange( + for i in range( next_step.size(), num_threads=6, nogil=True): self._parse_step(next_step[i], feat_weights, nr_class, nr_feat, nr_piece) From 3fa5b40b5cace40a5b8fde8112354abce6488b77 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 16 Sep 2017 11:21:35 +0200 Subject: [PATCH 33/37] Add test for hash consistency --- spacy/tests/stringstore/test_stringstore.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/spacy/tests/stringstore/test_stringstore.py b/spacy/tests/stringstore/test_stringstore.py index 65b994606..602ebcee6 100644 --- a/spacy/tests/stringstore/test_stringstore.py +++ b/spacy/tests/stringstore/test_stringstore.py @@ -6,6 +6,16 @@ from ...strings import StringStore import pytest +def test_string_hash(stringstore): + '''Test that string hashing is stable across platforms''' + ss = stringstore + assert ss.add('apple') == 8566208034543834098 + heart = '\U0001f499' + print(heart) + h = ss.add(heart) + assert h == 11841826740069053588L + + def test_stringstore_from_api_docs(stringstore): apple_hash = stringstore.add('apple') assert apple_hash == 8566208034543834098 From 8a829eb98c41194a08f48fd1a9bec496ec673c98 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 16 Sep 2017 11:49:31 +0200 Subject: [PATCH 34/37] Fix travis.sh --- travis.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/travis.sh b/travis.sh index 4b7d8017c..eed6a96f2 100755 --- a/travis.sh +++ b/travis.sh @@ -17,6 +17,7 @@ fi if [ "${VIA}" == "compile" ]; then pip install -r requirements.txt + python setup.py build_ext --inplace pip install -e . 
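    # Mirrors the Appveyor change above: build the Cython extensions
    # in-place before the editable install, presumably so compilation
    # problems fail the build early and visibly.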
fi From 11f2a05ede85b547b0b33d5642db6e3eaa1fba07 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 16 Sep 2017 12:20:04 +0200 Subject: [PATCH 35/37] Fix code explosion from long enum in Python 3, Cython 0.24+ --- spacy/symbols.pxd | 2 +- spacy/symbols.pyx | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index 0b713cb21..e981de6ae 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -1,4 +1,4 @@ -cpdef enum symbol_t: +cdef enum symbol_t: NIL IS_ALPHA IS_ASCII diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index 9f4009579..dd0e38cad 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -1,4 +1,6 @@ # coding: utf8 +#cython: optimize.unpack_method_calls=False + from __future__ import unicode_literals IDS = { @@ -458,4 +460,11 @@ IDS = { "xcomp": xcomp } -NAMES = [it[0] for it in sorted(IDS.items(), key=lambda it: it[1])] +def sort_nums(x): + return x[1] + +NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)] +# Unfortunate hack here, to work around problem with long cpdef enum +# (which is generating an enormous amount of C++ in Cython 0.24+) +# We keep the enum cdef, and just make sure the names are available to Python +locals().update(IDS) From 8c945310fb16912a23ef8311cd4cd00aeb3798e2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 16 Sep 2017 16:21:13 +0200 Subject: [PATCH 36/37] Excuse emoji failure on narrow unicode builds --- spacy/tests/tokenizer/test_exceptions.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py index 57281b998..132f27433 100644 --- a/spacy/tests/tokenizer/test_exceptions.py +++ b/spacy/tests/tokenizer/test_exceptions.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import sys import pytest @@ -37,9 +38,10 @@ def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length): tokens = tokenizer(text) assert len(tokens) == length - @pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8), ('i💙you', 3), ('🤘🤘yay!', 4)]) def test_tokenizer_handles_emoji(tokenizer, text, length): - tokens = tokenizer(text) - assert len(tokens) == length + # These break on narrow unicode builds, e.g. Windows + if sys.maxunicode >= 1114111: + tokens = tokenizer(text) + assert len(tokens) == length From ebf8942564246729c79a299d597f13e8bd1215b2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 16 Sep 2017 16:22:38 +0200 Subject: [PATCH 37/37] Fix test for Python3 --- spacy/tests/stringstore/test_stringstore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/stringstore/test_stringstore.py b/spacy/tests/stringstore/test_stringstore.py index 602ebcee6..3f2992a6f 100644 --- a/spacy/tests/stringstore/test_stringstore.py +++ b/spacy/tests/stringstore/test_stringstore.py @@ -13,7 +13,7 @@ def test_string_hash(stringstore): heart = '\U0001f499' print(heart) h = ss.add(heart) - assert h == 11841826740069053588L + assert h == 11841826740069053588 def test_stringstore_from_api_docs(stringstore):
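The enum workaround in patch 35 amounts to keeping the fast C enum private and publishing the name-to-id table to Python by hand, which sidesteps the enormous C++ that Cython 0.24+ generates for a long cpdef enum. A minimal plain-Python sketch of the pattern (the three IDS entries are a hypothetical subset; the real module lists every symbol):

    # One dict is the single source of truth for symbol names and ids.
    IDS = {'NIL': 0, 'IS_ALPHA': 1, 'IS_ASCII': 2}

    def sort_nums(x):
        return x[1]

    # Names ordered by id, as consumers of NAMES expect.
    NAMES = [name for name, _ in sorted(IDS.items(), key=sort_nums)]

    # Expose NIL, IS_ALPHA, ... as module attributes without a cpdef enum.
    locals().update(IDS)

Patches 36 and 37 then make the new tests portable: the emoji tokenizer test only runs where sys.maxunicode >= 1114111 (wide unicode builds), and the hash assertion drops the Python-2-only L suffix from 11841826740069053588.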