From 66e1109b537c5f5900e528e3372271ec80e71f27 Mon Sep 17 00:00:00 2001 From: Roman Inflianskas Date: Sun, 26 Feb 2017 22:27:11 +0100 Subject: [PATCH 01/21] Add support for Universal Dependencies v2.0 --- spacy/attrs.pyx | 6 ++- spacy/de/tag_map.py | 2 +- spacy/en/tag_map.py | 2 +- spacy/language_data/tag_map.py | 1 + spacy/morphology.pxd | 6 ++- spacy/morphology.pyx | 4 ++ spacy/parts_of_speech.pxd | 1 + spacy/parts_of_speech.pyx | 3 +- spacy/symbols.pxd | 42 ++++++++++++++++-- spacy/symbols.pyx | 40 +++++++++++++++-- spacy/syntax/nonproj.pyx | 4 +- spacy/tagger.pyx | 12 +++--- spacy/tokens/doc.pyx | 78 +++++++++++++++++----------------- spacy/tokens/token.pyx | 28 ++++++------ 14 files changed, 155 insertions(+), 74 deletions(-) diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index 81554ecd3..07044ee2d 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -93,7 +93,7 @@ NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])] def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): '''Normalize a dictionary of attributes, converting them to ints. - + Arguments: stringy_attrs (dict): Dictionary keyed by attribute string names. Values can be ints or strings. @@ -125,7 +125,9 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): 'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss', 'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType', 'Number', 'PronType', 'AdjType', 'Person', 'Variant', 'AdpType', - 'Reflex', 'Negative', 'Mood', 'Aspect', 'Case'] + 'Reflex', 'Negative', 'Mood', 'Aspect', 'Case', + 'Polarity', # U20 + ] for key in morph_keys: if key in stringy_attrs: stringy_attrs.pop(key) diff --git a/spacy/de/tag_map.py b/spacy/de/tag_map.py index e5996b38c..050bc8255 100644 --- a/spacy/de/tag_map.py +++ b/spacy/de/tag_map.py @@ -41,7 +41,7 @@ TAG_MAP = { "PRF": {POS: PRON, "PronType": "prs", "Reflex": "yes"}, "PTKA": {POS: PART}, "PTKANT": {POS: PART, "PartType": "res"}, - "PTKNEG": {POS: PART, "Negative": "yes"}, + "PTKNEG": {POS: PART, "Polarity": "Neg"}, "PTKVZ": {POS: PART, "PartType": "vbp"}, "PTKZU": {POS: PART, "PartType": "inf"}, "PWAT": {POS: DET, "PronType": "int"}, diff --git a/spacy/en/tag_map.py b/spacy/en/tag_map.py index 7a3589d0e..5884d8fd4 100644 --- a/spacy/en/tag_map.py +++ b/spacy/en/tag_map.py @@ -16,7 +16,7 @@ TAG_MAP = { "$": {POS: SYM, "Other": {"SymType": "currency"}}, "#": {POS: SYM, "Other": {"SymType": "numbersign"}}, "AFX": {POS: ADJ, "Hyph": "yes"}, - "CC": {POS: CONJ, "ConjType": "coor"}, + "CC": {POS: CCONJ, "ConjType": "coor"}, "CD": {POS: NUM, "NumType": "card"}, "DT": {POS: DET}, "EX": {POS: ADV, "AdvType": "ex"}, diff --git a/spacy/language_data/tag_map.py b/spacy/language_data/tag_map.py index 966960721..b861f39f5 100644 --- a/spacy/language_data/tag_map.py +++ b/spacy/language_data/tag_map.py @@ -19,6 +19,7 @@ TAG_MAP = { "AUX": {POS: AUX}, "X": {POS: X}, "CONJ": {POS: CONJ}, + "CCONJ": {POS: CCONJ}, # U20 "ADJ": {POS: ADJ}, "VERB": {POS: VERB}, "PART": {POS: PART} diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 5dc1ce529..4d981b30d 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -37,7 +37,7 @@ cdef class Morphology: cdef int assign_tag(self, TokenC* token, tag) except -1 cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 - + cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1 @@ -80,6 +80,7 @@ cpdef enum univ_morph_t: Definite_two Definite_def Definite_red + Definite_cons # U20 
Definite_ind Degree_cmp Degree_comp @@ -103,6 +104,8 @@ cpdef enum univ_morph_t: Negative_neg Negative_pos Negative_yes + Polarity_neg # U20 + Polarity_pos # U20 Number_com Number_dual Number_none @@ -151,6 +154,7 @@ cpdef enum univ_morph_t: VerbForm_partPres VerbForm_sup VerbForm_trans + VerbForm_conv # U20 VerbForm_gdv # la Voice_act Voice_cau diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index c13ce1920..26405e988 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -192,6 +192,7 @@ IDS = { "Definite_two": Definite_two, "Definite_def": Definite_def, "Definite_red": Definite_red, + "Definite_cons": Definite_cons, # U20 "Definite_ind": Definite_ind, "Degree_cmp": Degree_cmp, "Degree_comp": Degree_comp, @@ -215,6 +216,8 @@ IDS = { "Negative_neg": Negative_neg, "Negative_pos": Negative_pos, "Negative_yes": Negative_yes, + "Polarity_neg": Polarity_neg, # U20 + "Polarity_pos": Polarity_pos, # U20 "Number_com": Number_com, "Number_dual": Number_dual, "Number_none": Number_none, @@ -263,6 +266,7 @@ IDS = { "VerbForm_partPres": VerbForm_partPres, "VerbForm_sup": VerbForm_sup, "VerbForm_trans": VerbForm_trans, + "VerbForm_conv": VerbForm_conv, # U20 "VerbForm_gdv ": VerbForm_gdv, # la, "Voice_act": Voice_act, "Voice_cau": Voice_cau, diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd index c97673a69..0bf5b4789 100644 --- a/spacy/parts_of_speech.pxd +++ b/spacy/parts_of_speech.pxd @@ -7,6 +7,7 @@ cpdef enum univ_pos_t: ADV AUX CONJ + CCONJ # U20 DET INTJ NOUN diff --git a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx index 006a1f006..a5c770f61 100644 --- a/spacy/parts_of_speech.pyx +++ b/spacy/parts_of_speech.pyx @@ -7,7 +7,8 @@ IDS = { "ADP": ADP, "ADV": ADV, "AUX": AUX, - "CONJ": CONJ, + "CONJ": CONJ, # U20 + "CCONJ": CCONJ, "DET": DET, "INTJ": INTJ, "NOUN": NOUN, diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index ca1d1ed79..1a46f509f 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -13,7 +13,7 @@ cpdef enum symbol_t: LIKE_EMAIL IS_STOP IS_OOV - + FLAG14 = 14 FLAG15 FLAG16 @@ -90,6 +90,7 @@ cpdef enum symbol_t: ADV AUX CONJ + CCONJ # U20 DET INTJ NOUN @@ -107,11 +108,14 @@ cpdef enum symbol_t: Animacy_anim Animacy_inam + Animacy_hum # U20 Aspect_freq Aspect_imp Aspect_mod Aspect_none Aspect_perf + Aspect_iter # U20 + Aspect_hab # U20 Case_abe Case_abl Case_abs @@ -120,10 +124,12 @@ cpdef enum symbol_t: Case_all Case_cau Case_com + Case_cmp # U20 Case_dat Case_del Case_dis Case_ela + Case_equ # U20 Case_ess Case_gen Case_ill @@ -142,7 +148,9 @@ cpdef enum symbol_t: Definite_two Definite_def Definite_red + Definite_cons # U20 Definite_ind + Definite_spec # U20 Degree_cmp Degree_comp Degree_none @@ -151,6 +159,8 @@ cpdef enum symbol_t: Degree_abs Degree_com Degree_dim # du + Degree_equ # U20 + Evident_nfh # U20 Gender_com Gender_fem Gender_masc @@ -162,16 +172,21 @@ cpdef enum symbol_t: Mood_pot Mood_sub Mood_opt + Mood_prp # U20 + Mood_adm # U20 Negative_neg Negative_pos Negative_yes + Polarity_neg # U20 + Polarity_pos # U20 Number_com Number_dual Number_none Number_plur Number_sing Number_ptan # bg - Number_count # bg + Number_count # bg, U20 + Number_tri # U20 NumType_card NumType_dist NumType_frac @@ -197,7 +212,8 @@ cpdef enum symbol_t: PronType_rel PronType_tot PronType_clit - PronType_exc # es, ca, it, fa + PronType_exc # es, ca, it, fa, U20 + PronType_emp # U20 Reflex_yes Tense_fut Tense_imp @@ -213,12 +229,17 @@ cpdef enum symbol_t: VerbForm_partPres VerbForm_sup VerbForm_trans + VerbForm_conv # U20 VerbForm_gdv # la + 
VerbForm_vnoun # U20 Voice_act Voice_cau Voice_pass - Voice_mid # gkc + Voice_mid # gkc, U20 Voice_int # hb + Voice_antip # U20 + Voice_dir # U20 + Voice_inv # U20 Abbr_yes # cz, fi, sl, U AdpType_prep # cz, U AdpType_post # U @@ -284,6 +305,10 @@ cpdef enum symbol_t: Number_psee_plur # U Number_psor_sing # cz, fi, sl, U Number_psor_plur # cz, fi, sl, U + Number_pauc # U20 + Number_grpa # U20 + Number_grpl # U20 + Number_inv # U20 NumForm_digit # cz, sl, U NumForm_roman # cz, sl, U NumForm_word # cz, sl, U @@ -311,6 +336,8 @@ cpdef enum symbol_t: Person_psor_one # fi, U Person_psor_two # fi, U Person_psor_three # fi, U + Person_zero # U20 + Person_four # U20 Polite_inf # bq, U Polite_pol # bq, U Polite_abs_inf # bq, U @@ -319,6 +346,10 @@ cpdef enum symbol_t: Polite_erg_pol # bq, U Polite_dat_inf # bq, U Polite_dat_pol # bq, U + Polite_infm # U20 + Polite_form # U20 + Polite_form_elev # U20 + Polite_form_humb # U20 Prefix_yes # U PrepCase_npr # cz PrepCase_pre # U @@ -383,6 +414,7 @@ cpdef enum symbol_t: ccomp complm conj + cop # U20 csubj csubjpass dep @@ -405,6 +437,8 @@ cpdef enum symbol_t: num number oprd + obj # U20 + obl # U20 parataxis partmod pcomp diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index 7254297d4..56b27512e 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -91,6 +91,7 @@ IDS = { "ADV": ADV, "AUX": AUX, "CONJ": CONJ, + "CCONJ": CCONJ, # U20 "DET": DET, "INTJ": INTJ, "NOUN": NOUN, @@ -108,11 +109,14 @@ IDS = { "Animacy_anim": Animacy_anim, "Animacy_inam": Animacy_inam, + "Animacy_hum": Animacy_hum, # U20 "Aspect_freq": Aspect_freq, "Aspect_imp": Aspect_imp, "Aspect_mod": Aspect_mod, "Aspect_none": Aspect_none, "Aspect_perf": Aspect_perf, + "Aspect_iter": Aspect_iter, # U20 + "Aspect_hab": Aspect_hab, # U20 "Case_abe": Case_abe, "Case_abl": Case_abl, "Case_abs": Case_abs, @@ -121,10 +125,12 @@ IDS = { "Case_all": Case_all, "Case_cau": Case_cau, "Case_com": Case_com, + "Case_cmp": Case_cmp, # U20 "Case_dat": Case_dat, "Case_del": Case_del, "Case_dis": Case_dis, "Case_ela": Case_ela, + "Case_equ": Case_equ, # U20 "Case_ess": Case_ess, "Case_gen": Case_gen, "Case_ill": Case_ill, @@ -143,7 +149,9 @@ IDS = { "Definite_two": Definite_two, "Definite_def": Definite_def, "Definite_red": Definite_red, + "Definite_cons": Definite_cons, # U20 "Definite_ind": Definite_ind, + "Definite_spec": Definite_spec, # U20 "Degree_cmp": Degree_cmp, "Degree_comp": Degree_comp, "Degree_none": Degree_none, @@ -152,6 +160,8 @@ IDS = { "Degree_abs": Degree_abs, "Degree_com": Degree_com, "Degree_dim ": Degree_dim, # du + "Degree_equ": Degree_equ, # U20 + "Evident_nfh": Evident_nfh, # U20 "Gender_com": Gender_com, "Gender_fem": Gender_fem, "Gender_masc": Gender_masc, @@ -163,16 +173,21 @@ IDS = { "Mood_pot": Mood_pot, "Mood_sub": Mood_sub, "Mood_opt": Mood_opt, + "Mood_prp": Mood_prp, # U20 + "Mood_adm": Mood_adm, # U20 "Negative_neg": Negative_neg, "Negative_pos": Negative_pos, "Negative_yes": Negative_yes, + "Polarity_neg": Polarity_neg, # U20 + "Polarity_pos": Polarity_pos, # U20 "Number_com": Number_com, "Number_dual": Number_dual, "Number_none": Number_none, "Number_plur": Number_plur, "Number_sing": Number_sing, "Number_ptan ": Number_ptan, # bg - "Number_count ": Number_count, # bg + "Number_count ": Number_count, # bg, U20 + "Number_tri": Number_tri, # U20 "NumType_card": NumType_card, "NumType_dist": NumType_dist, "NumType_frac": NumType_frac, @@ -198,7 +213,8 @@ IDS = { "PronType_rel": PronType_rel, "PronType_tot": PronType_tot, "PronType_clit": PronType_clit, - "PronType_exc ": 
PronType_exc, # es, ca, it, fa, + "PronType_exc": PronType_exc, # es, ca, it, fa, U20 + "PronType_emp": PronType_emp, # U20 "Reflex_yes": Reflex_yes, "Tense_fut": Tense_fut, "Tense_imp": Tense_imp, @@ -214,12 +230,17 @@ IDS = { "VerbForm_partPres": VerbForm_partPres, "VerbForm_sup": VerbForm_sup, "VerbForm_trans": VerbForm_trans, + "VerbForm_conv": VerbForm_conv, # U20 "VerbForm_gdv ": VerbForm_gdv, # la, + "VerbForm_vnoun": VerbForm_vnoun, # U20 "Voice_act": Voice_act, "Voice_cau": Voice_cau, "Voice_pass": Voice_pass, - "Voice_mid ": Voice_mid, # gkc, + "Voice_mid ": Voice_mid, # gkc, U20 "Voice_int ": Voice_int, # hb, + "Voice_antip": Voice_antip, # U20 + "Voice_dir": Voice_dir, # U20 + "Voice_inv": Voice_inv, # U20 "Abbr_yes ": Abbr_yes, # cz, fi, sl, U, "AdpType_prep ": AdpType_prep, # cz, U, "AdpType_post ": AdpType_post, # U, @@ -285,6 +306,10 @@ IDS = { "Number_psee_plur ": Number_psee_plur, # U, "Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U, "Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U, + "Number_pauc": Number_pauc, # U20 + "Number_grpa": Number_grpa, # U20 + "Number_grpl": Number_grpl, # U20 + "Number_inv": Number_inv, # U20 "NumForm_digit ": NumForm_digit, # cz, sl, U, "NumForm_roman ": NumForm_roman, # cz, sl, U, "NumForm_word ": NumForm_word, # cz, sl, U, @@ -312,6 +337,8 @@ IDS = { "Person_psor_one ": Person_psor_one, # fi, U, "Person_psor_two ": Person_psor_two, # fi, U, "Person_psor_three ": Person_psor_three, # fi, U, + "Person_zero ": Person_zero, # U20 + "Person_four ": Person_four, # U20 "Polite_inf ": Polite_inf, # bq, U, "Polite_pol ": Polite_pol, # bq, U, "Polite_abs_inf ": Polite_abs_inf, # bq, U, @@ -320,6 +347,10 @@ IDS = { "Polite_erg_pol ": Polite_erg_pol, # bq, U, "Polite_dat_inf ": Polite_dat_inf, # bq, U, "Polite_dat_pol ": Polite_dat_pol, # bq, U, + "Polite_infm ": Polite_infm, # U20 + "Polite_form ": Polite_form, # U20 + "Polite_form_elev ": Polite_form_elev, # U20 + "Polite_form_humb ": Polite_form_humb, # U20 "Prefix_yes ": Prefix_yes, # U, "PrepCase_npr ": PrepCase_npr, # cz, "PrepCase_pre ": PrepCase_pre, # U, @@ -384,6 +415,7 @@ IDS = { "ccomp": ccomp, "complm": complm, "conj": conj, + "cop": cop, # U20 "csubj": csubj, "csubjpass": csubjpass, "dep": dep, @@ -406,6 +438,8 @@ IDS = { "num": num, "number": number, "oprd": oprd, + "obj": obj, # U20 + "obl": obl, # U20 "parataxis": parataxis, "partmod": partmod, "pcomp": pcomp, diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx index 566588da4..1f4878247 100644 --- a/spacy/syntax/nonproj.pyx +++ b/spacy/syntax/nonproj.pyx @@ -8,7 +8,7 @@ from spacy.attrs import DEP, HEAD def ancestors(tokenid, heads): # returns all words going from the word up the path to the root # the path to root cannot be longer than the number of words in the sentence - # this function ends after at most len(heads) steps + # this function ends after at most len(heads) steps # because it would otherwise loop indefinitely on cycles head = tokenid cnt = 0 @@ -180,7 +180,7 @@ class PseudoProjectivity: next_queue = [] for qtoken in queue: for child in qtoken.children: - if child.is_space: continue + if child.is_space: continue if child == token: continue if child.dep_ == headlabel: return child diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 7903c44fb..954bced53 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -13,13 +13,13 @@ from thinc.linalg cimport VecVec from .typedefs cimport attr_t from .tokens.doc cimport Doc from .attrs cimport TAG -from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, 
CONJ, DET, NOUN, NUM, PRON +from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CCONJ, DET, NOUN, NUM, PRON from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE from .gold cimport GoldParse from .attrs cimport * - + cpdef enum: P2_orth P2_cluster @@ -71,7 +71,7 @@ cpdef enum: cdef class TaggerModel(AveragedPerceptron): cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *: - + _fill_from_token(&eg.atoms[P2_orth], &tokens[i-2]) _fill_from_token(&eg.atoms[P1_orth], &tokens[i-1]) _fill_from_token(&eg.atoms[W_orth], &tokens[i]) @@ -191,7 +191,7 @@ cdef class Tagger: nr_class=self.vocab.morphology.n_tags, nr_feat=self.model.nr_feat) for i in range(tokens.length): - if tokens.c[i].pos == 0: + if tokens.c[i].pos == 0: self.model.set_featuresC(&eg.c, tokens.c, i) self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat) @@ -217,7 +217,7 @@ cdef class Tagger: for doc in stream: self(doc) yield doc - + def update(self, Doc tokens, GoldParse gold): """Update the statistical model, with tags supplied for the given document. @@ -251,7 +251,7 @@ cdef class Tagger: self.model.updateC(&eg.c) self.vocab.morphology.assign_tag_id(&tokens.c[i], eg.guess) - + correct += eg.cost == 0 self.freqs[TAG][tokens.c[i].tag] += 1 eg.fill_scores(0, eg.c.nr_class) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 805a5b30c..bda528383 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -16,7 +16,7 @@ from ..typedefs cimport attr_t, flags_t from ..attrs cimport attr_id_t from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE -from ..parts_of_speech cimport CONJ, PUNCT, NOUN +from ..parts_of_speech cimport CCONJ, PUNCT, NOUN from ..parts_of_speech cimport univ_pos_t from ..lexeme cimport Lexeme from .span cimport Span @@ -59,13 +59,13 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: cdef class Doc: """ - A sequence of `Token` objects. Access sentences and named entities, - export annotations to numpy arrays, losslessly serialize to compressed + A sequence of `Token` objects. Access sentences and named entities, + export annotations to numpy arrays, losslessly serialize to compressed binary strings. Aside: Internals - The `Doc` object holds an array of `TokenC` structs. - The Python-level `Token` and `Span` objects are views of this + The `Doc` object holds an array of `TokenC` structs. + The Python-level `Token` and `Span` objects are views of this array, i.e. they don't own the data themselves. Code: Construction 1 @@ -80,13 +80,13 @@ cdef class Doc: Create a Doc object. Aside: Implementation - This method of constructing a `Doc` object is usually only used - for deserialization. Standard usage is to construct the document via + This method of constructing a `Doc` object is usually only used + for deserialization. Standard usage is to construct the document via a call to the language object. Arguments: vocab: - A Vocabulary object, which must match any models you want to + A Vocabulary object, which must match any models you want to use (e.g. tokenizer, parser, entity recognizer). words: @@ -156,19 +156,19 @@ cdef class Doc: if self.length == 0: self.is_tagged = True self.is_parsed = True - + def __getitem__(self, object i): ''' doc[i] - Get the Token object at position i, where i is an integer. - Negative indexing is supported, and follows the usual Python + Get the Token object at position i, where i is an integer. 
+ Negative indexing is supported, and follows the usual Python semantics, i.e. doc[-2] is doc[len(doc) - 2]. doc[start : end]] Get a `Span` object, starting at position `start` and ending at position `end`, where `start` and `end` are token indices. For instance, - `doc[2:5]` produces a span consisting of - tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`) + `doc[2:5]` produces a span consisting of + tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`) are not supported, as `Span` objects must be contiguous (cannot have gaps). You can use negative indices and open-ended ranges, which have their normal Python semantics. @@ -188,11 +188,11 @@ cdef class Doc: def __iter__(self): ''' for token in doc - Iterate over `Token` objects, from which the annotations can - be easily accessed. This is the main way of accessing Token - objects, which are the main way annotations are accessed from - Python. If faster-than-Python speeds are required, you can - instead access the annotations as a numpy array, or access the + Iterate over `Token` objects, from which the annotations can + be easily accessed. This is the main way of accessing Token + objects, which are the main way annotations are accessed from + Python. If faster-than-Python speeds are required, you can + instead access the annotations as a numpy array, or access the underlying C data directly from Cython. ''' cdef int i @@ -251,13 +251,13 @@ cdef class Doc: def __get__(self): if 'has_vector' in self.user_hooks: return self.user_hooks['has_vector'](self) - + return any(token.has_vector for token in self) property vector: ''' A real-valued meaning representation. Defaults to an average of the token vectors. - + Type: numpy.ndarray[ndim=1, dtype='float32'] ''' def __get__(self): @@ -285,14 +285,14 @@ cdef class Doc: norm += value * value self._vector_norm = sqrt(norm) if norm != 0 else 0 return self._vector_norm - + def __set__(self, value): - self._vector_norm = value + self._vector_norm = value @property def string(self): return self.text - + property text: '''A unicode representation of the document text.''' def __get__(self): @@ -306,7 +306,7 @@ cdef class Doc: property ents: ''' Yields named-entity `Span` objects, if the entity recognizer - has been applied to the document. Iterate over the span to get + has been applied to the document. Iterate over the span to get individual Token objects, or access the label: Example: @@ -352,7 +352,7 @@ cdef class Doc: cdef int i for i in range(self.length): self.c[i].ent_type = 0 - # At this point we don't know whether the NER has run over the + # At this point we don't know whether the NER has run over the # Doc. If the ent_iob is missing, leave it missing. if self.c[i].ent_iob != 0: self.c[i].ent_iob = 2 # Means O. Non-O are set from ents. @@ -384,9 +384,9 @@ cdef class Doc: property noun_chunks: ''' Yields base noun-phrase #[code Span] objects, if the document - has been syntactically parsed. A base noun phrase, or - 'NP chunk', is a noun phrase that does not permit other NPs to - be nested within it – so no NP-level coordination, no prepositional + has been syntactically parsed. A base noun phrase, or + 'NP chunk', is a noun phrase that does not permit other NPs to + be nested within it – so no NP-level coordination, no prepositional phrases, and no relative clauses. 
For example: ''' def __get__(self): @@ -422,7 +422,7 @@ cdef class Doc: def __get__(self): if 'sents' in self.user_hooks: return self.user_hooks['sents'](self) - + if not self.is_parsed: raise ValueError( "sentence boundary detection requires the dependency parse, which " @@ -465,8 +465,8 @@ cdef class Doc: @cython.boundscheck(False) cpdef np.ndarray to_array(self, object py_attr_ids): """ - Given a list of M attribute IDs, export the tokens to a numpy - `ndarray` of shape (N, M), where `N` is the length + Given a list of M attribute IDs, export the tokens to a numpy + `ndarray` of shape (N, M), where `N` is the length of the document. The values will be 32-bit integers. Example: @@ -474,7 +474,7 @@ cdef class Doc: doc = nlp(text) # All strings mapped to integers, for easy export to numpy np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA]) - + Arguments: attr_ids (list[int]): A list of attribute ID ints. @@ -520,7 +520,7 @@ cdef class Doc: cdef int i cdef attr_t attr cdef size_t count - + if counts is None: counts = PreshCounter() output_dict = True @@ -570,7 +570,7 @@ cdef class Doc: cdef TokenC* tokens = self.c cdef int length = len(array) cdef attr_t[:] values - for col, attr_id in enumerate(attrs): + for col, attr_id in enumerate(attrs): values = array[:, col] if attr_id == HEAD: for i in range(length): @@ -612,11 +612,11 @@ cdef class Doc: '''Deserialize, loading from bytes.''' self.vocab.serializer.unpack_into(data[4:], self) return self - + @staticmethod def read_bytes(file_): ''' - A static method, used to read serialized #[code Doc] objects from + A static method, used to read serialized #[code Doc] objects from a file. For example: Example: @@ -673,7 +673,7 @@ cdef class Doc: "Expected either 3 arguments (deprecated), or 0 (use keyword arguments). " "Arguments supplied:\n%s\n" "Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes))) - + cdef int start = token_by_start(self.c, self.length, start_idx) if start == -1: return None @@ -784,7 +784,7 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1: if child.l_edge < head.l_edge: head.l_edge = child.l_edge head.l_kids += 1 - + # Set right edges --- same as above, but iterate in reverse for i in range(length-1, -1, -1): child = &tokens[i] @@ -798,4 +798,4 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1: for i in range(length): if tokens[i].head == 0 and tokens[i].dep != 0: tokens[tokens[i].l_edge].sent_start = True - + diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index fc84ba350..69bd9fa6e 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -20,7 +20,7 @@ from .. import parts_of_speech from ..attrs cimport LEMMA from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport POS, LEMMA, TAG, DEP -from ..parts_of_speech cimport CONJ, PUNCT +from ..parts_of_speech cimport CCONJ, PUNCT from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from ..attrs cimport IS_BRACKET @@ -84,7 +84,7 @@ cdef class Token: cpdef bint check_flag(self, attr_id_t flag_id) except -1: '''Check the value of a boolean flag. - + Arguments: flag_id (int): The ID of the flag attribute. Returns: @@ -225,7 +225,7 @@ cdef class Token: property vector: ''' A real-valued meaning representation. 
- + Type: numpy.ndarray[ndim=1, dtype='float32'] ''' def __get__(self): @@ -343,7 +343,7 @@ cdef class Token: ''' def __get__(self): cdef const TokenC* head_ptr = self.c - # guard against infinite loop, no token can have + # guard against infinite loop, no token can have # more ancestors than tokens in the tree cdef int i = 0 while head_ptr.head != 0 and i < self.doc.length: @@ -370,7 +370,7 @@ cdef class Token: property head: '''The syntactic parent, or "governor", of this token. - + Returns: Token ''' def __get__(self): @@ -390,7 +390,7 @@ cdef class Token: # is the new head a descendant of the old head cdef bint is_desc = old_head.is_ancestor_of(new_head) - + cdef int new_edge cdef Token anc, child @@ -420,7 +420,7 @@ cdef class Token: if anc.c.l_edge <= new_edge: break anc.c.l_edge = new_edge - + elif self.c.head < 0: # right dependent old_head.c.r_kids -= 1 # do the same thing as for l_edge @@ -435,7 +435,7 @@ cdef class Token: if child.c.r_edge > new_edge: new_edge = child.c.r_edge old_head.c.r_edge = new_edge - + for anc in old_head.ancestors: if anc.c.r_edge >= new_edge: break @@ -598,19 +598,19 @@ cdef class Token: property is_punct: def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT) - property is_space: + property is_space: def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE) - - property is_bracket: + + property is_bracket: def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET) - property is_quote: + property is_quote: def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE) - property is_left_punct: + property is_left_punct: def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT) - property is_right_punct: + property is_right_punct: def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT) property like_url: From 6d67213b80350fe63e46ea2a18688f4a5a3f0d81 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 7 Mar 2017 15:55:28 +0100 Subject: [PATCH 02/21] Add test for 850: Matcher fails on zero-or-more. --- spacy/tests/regression/test_issue850.py | 29 +++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 spacy/tests/regression/test_issue850.py diff --git a/spacy/tests/regression/test_issue850.py b/spacy/tests/regression/test_issue850.py new file mode 100644 index 000000000..4113ec512 --- /dev/null +++ b/spacy/tests/regression/test_issue850.py @@ -0,0 +1,29 @@ +''' +Test Matcher matches with '*' operator and Boolean flag +''' +from __future__ import unicode_literals +import pytest + +from ...matcher import Matcher +from ...vocab import Vocab +from ...attrs import LOWER +from ...tokens import Doc + + +@pytest.mark.xfail +def test_issue850(): + matcher = Matcher(Vocab()) + IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True) + matcher.add_pattern( + "FarAway", + [ + {LOWER: "bob"}, + {'OP': '*', IS_ANY_TOKEN: True}, + {LOWER: 'frank'} + ]) + doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'cat', 'frank']) + match = matcher(doc) + assert len(match) == 1 + start, end, label, ent_id = match + assert start == 0 + assert end == 4 From 4e75e742475236cf7358b4481a29a54eb607dd4d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 7 Mar 2017 16:08:32 +0100 Subject: [PATCH 03/21] Update regression test for variable-length pattern problem in the matcher. 
--- spacy/tests/regression/test_issue850.py | 28 ++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/spacy/tests/regression/test_issue850.py b/spacy/tests/regression/test_issue850.py index 4113ec512..3b3952744 100644 --- a/spacy/tests/regression/test_issue850.py +++ b/spacy/tests/regression/test_issue850.py @@ -2,6 +2,7 @@ Test Matcher matches with '*' operator and Boolean flag ''' from __future__ import unicode_literals +from __future__ import print_function import pytest from ...matcher import Matcher @@ -10,9 +11,30 @@ from ...attrs import LOWER from ...tokens import Doc +def test_basic_case(): + matcher = Matcher(Vocab( + lex_attr_getters={LOWER: lambda string: string.lower()})) + IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True) + matcher.add_pattern( + "FarAway", + [ + {LOWER: "bob"}, + {'OP': '*', LOWER: 'and'}, + {LOWER: 'frank'} + ]) + doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank']) + match = matcher(doc) + assert len(match) == 1 + ent_id, label, start, end = match[0] + assert start == 0 + assert end == 4 + @pytest.mark.xfail def test_issue850(): - matcher = Matcher(Vocab()) + '''The problem here is that the variable-length pattern matches the + succeeding token. We then don't handle the ambiguity correctly.''' + matcher = Matcher(Vocab( + lex_attr_getters={LOWER: lambda string: string.lower()})) IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True) matcher.add_pattern( "FarAway", @@ -21,9 +43,9 @@ def test_issue850(): {'OP': '*', IS_ANY_TOKEN: True}, {LOWER: 'frank'} ]) - doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'cat', 'frank']) + doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank']) match = matcher(doc) assert len(match) == 1 - start, end, label, ent_id = match + ent_id, label, start, end = match[0] assert start == 0 assert end == 4 From 5de7e712b758829afbd0d9d000ec9139c474f737 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 7 Mar 2017 17:15:18 +0100 Subject: [PATCH 04/21] Add support for pickling StringStore. 
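The round trip this enables looks roughly like the following (a sketch mirroring
the new test in spacy/tests/test_pickles.py; not part of the diff below):

    from __future__ import unicode_literals
    import pickle
    from spacy.strings import StringStore

    sstore = StringStore()
    hello_id = sstore['hello']          # interning returns an integer ID
    data = pickle.dumps(sstore, protocol=-1)
    unpickled = pickle.loads(data)      # rebuilt via StringStore.__reduce__
    assert unpickled['hello'] == hello_id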
--- spacy/strings.pyx | 37 +++++++++++++++++++++---------------- spacy/tests/test_pickles.py | 17 +++++++++++++++++ 2 files changed, 38 insertions(+), 16 deletions(-) create mode 100644 spacy/tests/test_pickles.py diff --git a/spacy/strings.pyx b/spacy/strings.pyx index ddfddc29c..403ebd3c0 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -3,7 +3,7 @@ from __future__ import unicode_literals, absolute_import cimport cython from libc.string cimport memcpy -from libc.stdint cimport uint64_t +from libc.stdint cimport uint64_t, uint32_t from murmurhash.mrmr cimport hash64, hash32 @@ -12,22 +12,19 @@ from preshed.maps cimport map_iter, key_t from .typedefs cimport hash_t from libc.stdint cimport uint32_t -try: - import ujson as json -except ImportError: - import json +import ujson cpdef hash_t hash_string(unicode string) except 0: chars = string.encode('utf8') - return _hash_utf8(chars, len(chars)) + return hash_utf8(chars, len(chars)) -cdef hash_t _hash_utf8(char* utf8_string, int length): +cdef hash_t hash_utf8(char* utf8_string, int length) nogil: return hash64(utf8_string, length, 1) -cdef uint32_t _hash32_utf8(char* utf8_string, int length): +cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil: return hash32(utf8_string, length, 1) @@ -48,11 +45,11 @@ cdef unicode _decode(const Utf8Str* string): return string.p[i:length + i].decode('utf8') -cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except *: +cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *: cdef int n_length_bytes cdef int i cdef Utf8Str string - assert length != 0 + cdef uint32_t ulength = length if length < sizeof(string.s): string.s[0] = length memcpy(&string.s[1], chars, length) @@ -98,6 +95,14 @@ cdef class StringStore: def __get__(self): return self.size -1 + def __reduce__(self): + # TODO: OOV words, for the is_frozen stuff? + if self.is_frozen: + raise NotImplementedError( + "Currently missing support for pickling StringStore when " + "is_frozen=True") + return (StringStore, (list(self),)) + def __len__(self): """The number of strings in the store. @@ -149,7 +154,7 @@ cdef class StringStore: # pretty bad. # We could also get unlucky here, and hash into a value that # collides with the 'real' strings. - return _hash32_utf8(byte_string, len(byte_string)) + return hash32_utf8(byte_string, len(byte_string)) else: return utf8str - self.c @@ -200,7 +205,7 @@ cdef class StringStore: cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length): # TODO: This function's API/behaviour is an unholy mess... # 0 means missing, but we don't bother offsetting the index. - cdef hash_t key = _hash_utf8(utf8_string, length) + cdef hash_t key = hash_utf8(utf8_string, length) cdef Utf8Str* value = self._map.get(key) if value is not NULL: return value @@ -209,7 +214,7 @@ cdef class StringStore: return value if self.is_frozen: # OOV store uses 32 bit hashes. Pretty ugly :( - key32 = _hash32_utf8(utf8_string, length) + key32 = hash32_utf8(utf8_string, length) # Important: Make the OOV store own the memory. That way it's trivial # to flush them all. value = self._oov.mem.alloc(1, sizeof(Utf8Str)) @@ -232,7 +237,7 @@ cdef class StringStore: Returns: None """ - string_data = json.dumps(list(self)) + string_data = ujson.dumps(list(self)) if not isinstance(string_data, unicode): string_data = string_data.decode('utf8') # TODO: OOV? 
@@ -246,7 +251,7 @@ cdef class StringStore: Returns: None """ - strings = json.load(file_) + strings = ujson.load(file_) if strings == ['']: return None cdef unicode string @@ -271,7 +276,7 @@ cdef class StringStore: # Find array index with pointer arithmetic offset = ((value) - self.c) keys[offset] = key - + self._resize_at *= 2 cdef size_t new_size = self._resize_at * sizeof(Utf8Str) self.c = self.mem.realloc(self.c, new_size) diff --git a/spacy/tests/test_pickles.py b/spacy/tests/test_pickles.py new file mode 100644 index 000000000..46221fd8b --- /dev/null +++ b/spacy/tests/test_pickles.py @@ -0,0 +1,17 @@ +from __future__ import unicode_literals + +import io +import pickle + +from ..strings import StringStore + + +def test_pickle_string_store(): + sstore = StringStore() + hello = sstore['hello'] + bye = sstore['bye'] + bdata = pickle.dumps(sstore, protocol=-1) + unpickled = pickle.loads(bdata) + assert unpickled['hello'] == hello + assert unpickled['bye'] == bye + From 3edb8ae207a44fe9bf40f55d5e211c22cdf085f8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 7 Mar 2017 17:16:26 +0100 Subject: [PATCH 05/21] Whitespace --- spacy/tests/regression/test_issue850.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/regression/test_issue850.py b/spacy/tests/regression/test_issue850.py index 3b3952744..8237763ea 100644 --- a/spacy/tests/regression/test_issue850.py +++ b/spacy/tests/regression/test_issue850.py @@ -29,6 +29,7 @@ def test_basic_case(): assert start == 0 assert end == 4 + @pytest.mark.xfail def test_issue850(): '''The problem here is that the variable-length pattern matches the From 26614e028f94212810159995004a4330aca6ce43 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 7 Mar 2017 20:24:37 +0100 Subject: [PATCH 06/21] Add hacky support for StringCFile, to make pickling easier. 
--- spacy/cfile.pxd | 14 ++++++++++++++ spacy/cfile.pyx | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/spacy/cfile.pxd b/spacy/cfile.pxd index c9a6aec41..cb0077587 100644 --- a/spacy/cfile.pxd +++ b/spacy/cfile.pxd @@ -4,6 +4,20 @@ from cymem.cymem cimport Pool cdef class CFile: cdef FILE* fp cdef bint is_open + cdef Pool mem + cdef int size # For compatibility with subclass + cdef int _capacity # For compatibility with subclass + + cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1 + + cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1 + + cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except * + + + +cdef class StringCFile(CFile): + cdef unsigned char* data cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1 diff --git a/spacy/cfile.pyx b/spacy/cfile.pyx index b119d3b9b..95c61a468 100644 --- a/spacy/cfile.pyx +++ b/spacy/cfile.pyx @@ -1,4 +1,5 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE +from libc.string cimport memcpy cdef class CFile: @@ -9,6 +10,7 @@ cdef class CFile: mode_str = mode if hasattr(loc, 'as_posix'): loc = loc.as_posix() + self.mem = Pool() cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc self.fp = fopen(bytes_loc, mode_str) if self.fp == NULL: @@ -45,3 +47,43 @@ cdef class CFile: cdef bytes py_bytes = value.encode('utf8') cdef char* chars = py_bytes self.write(sizeof(char), len(py_bytes), chars) + + +cdef class StringCFile: + def __init__(self, mode, bytes data=b'', on_open_error=None): + self.mem = Pool() + self.is_open = 'w' in mode + self._capacity = max(len(data), 8) + self.size = len(data) + self.data = self.mem.alloc(1, self._capacity) + for i in range(len(data)): + self.data[i] = data + + def close(self): + self.is_open = False + + def string_data(self): + return (self.data-self.size)[:self.size] + + cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1: + memcpy(dest, self.data, elem_size * number) + self.data += elem_size * number + + cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1: + write_size = number * elem_size + if (self.size + write_size) >= self._capacity: + self._capacity = (self.size + write_size) * 2 + self.data = self.mem.realloc(self.data, self._capacity) + memcpy(self.data, src, elem_size * number) + self.data += write_size + self.size += write_size + + cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *: + cdef void* dest = mem.alloc(number, elem_size) + self.read_into(dest, number, elem_size) + return dest + + def write_unicode(self, unicode value): + cdef bytes py_bytes = value.encode('utf8') + cdef char* chars = py_bytes + self.write(sizeof(char), len(py_bytes), chars) From d814892805c364d9d52fb0eec2c97a8a1bdfea30 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 7 Mar 2017 20:25:12 +0100 Subject: [PATCH 07/21] Hackish pickle support for Vocab. 
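The intended round trip (a sketch mirroring the test added to
spacy/tests/test_pickles.py in the next commit; that test is later marked xfail,
so the support is still partial at the end of this series):

    from __future__ import unicode_literals
    import dill as pickle
    from spacy.vocab import Vocab
    from spacy.attrs import NORM

    vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]})
    dog_orth = vocab['dog'].orth
    unpickled = pickle.loads(pickle.dumps(vocab))  # uses pickle_vocab/unpickle_vocab
    assert unpickled['dog'].orth == dog_orth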
--- spacy/vocab.pyx | 116 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 107 insertions(+), 9 deletions(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index da3a67e56..e7994c127 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -9,11 +9,16 @@ import bz2 import ujson as json import re +try: + import cPickle as pickle +except ImportError: + import pickle + from .lexeme cimport EMPTY_LEXEME from .lexeme cimport Lexeme from .strings cimport hash_string from .typedefs cimport attr_t -from .cfile cimport CFile +from .cfile cimport CFile, StringCFile from .lemmatizer import Lemmatizer from .attrs import intify_attrs from .tokens.token cimport Token @@ -346,17 +351,18 @@ cdef class Vocab: Token.set_struct_attr(token, attr_id, value) return tokens - def dump(self, loc): - """Save the lexemes binary data to the given location. + def dump(self, loc=None): + """Save the lexemes binary data to the given location, or + return a byte-string with the data if loc is None. Arguments: - loc (Path): The path to save to. + loc (Path or None): The path to save to, or None. """ - if hasattr(loc, 'as_posix'): - loc = loc.as_posix() - cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc - - cdef CFile fp = CFile(bytes_loc, 'wb') + cdef CFile fp + if loc is None: + fp = StringCFile('wb') + else: + fp = CFile(loc, 'wb') cdef size_t st cdef size_t addr cdef hash_t key @@ -378,6 +384,8 @@ cdef class Vocab: fp.write_from(&lexeme.l2_norm, sizeof(lexeme.l2_norm), 1) fp.write_from(&lexeme.lang, sizeof(lexeme.lang), 1) fp.close() + if loc is None: + return fp.string_data() def load_lexemes(self, loc): '''Load the binary vocabulary data from the given location. @@ -427,6 +435,60 @@ cdef class Vocab: i += 1 fp.close() + def _deserialize_lexemes(self, CFile fp): + '''Load the binary vocabulary data from the given CFile. 
+ ''' + cdef LexemeC* lexeme + cdef hash_t key + cdef unicode py_str + cdef attr_t orth + assert sizeof(orth) == sizeof(lexeme.orth) + i = 0 + cdef int todo = fp.size + cdef int lex_size = sizeof(lexeme.flags) + lex_size += sizeof(lexeme.id) + lex_size += sizeof(lexeme.length) + lex_size += sizeof(lexeme.orth) + lex_size += sizeof(lexeme.lower) + lex_size += sizeof(lexeme.norm) + lex_size += sizeof(lexeme.shape) + lex_size += sizeof(lexeme.prefix) + lex_size += sizeof(lexeme.suffix) + lex_size += sizeof(lexeme.cluster) + lex_size += sizeof(lexeme.prob) + lex_size += sizeof(lexeme.sentiment) + lex_size += sizeof(lexeme.l2_norm) + lex_size += sizeof(lexeme.lang) + while True: + if todo < lex_size: + break + todo -= lex_size + lexeme = self.mem.alloc(sizeof(LexemeC), 1) + # Copy data from the file into the lexeme + fp.read_into(&lexeme.flags, 1, sizeof(lexeme.flags)) + fp.read_into(&lexeme.id, 1, sizeof(lexeme.id)) + fp.read_into(&lexeme.length, 1, sizeof(lexeme.length)) + fp.read_into(&lexeme.orth, 1, sizeof(lexeme.orth)) + fp.read_into(&lexeme.lower, 1, sizeof(lexeme.lower)) + fp.read_into(&lexeme.norm, 1, sizeof(lexeme.norm)) + fp.read_into(&lexeme.shape, 1, sizeof(lexeme.shape)) + fp.read_into(&lexeme.prefix, 1, sizeof(lexeme.prefix)) + fp.read_into(&lexeme.suffix, 1, sizeof(lexeme.suffix)) + fp.read_into(&lexeme.cluster, 1, sizeof(lexeme.cluster)) + fp.read_into(&lexeme.prob, 1, sizeof(lexeme.prob)) + fp.read_into(&lexeme.sentiment, 1, sizeof(lexeme.sentiment)) + fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm)) + fp.read_into(&lexeme.lang, 1, sizeof(lexeme.lang)) + + lexeme.vector = EMPTY_VEC + py_str = self.strings[lexeme.orth] + key = hash_string(py_str) + self._by_hash.set(key, lexeme) + self._by_orth.set(lexeme.orth, lexeme) + self.length += 1 + i += 1 + fp.close() + def dump_vectors(self, out_loc): '''Save the word vectors to a binary file. 
@@ -553,6 +615,42 @@ cdef class Vocab: return vec_len +def pickle_vocab(vocab): + sstore = vocab.strings + morph = vocab.morphology + length = vocab.length + serializer = vocab._serializer + data_dir = vocab.data_dir + lex_attr_getters = vocab.lex_attr_getters + + lexemes_data = vocab.dump() + vectors_length = vocab.vectors_length + + return (unpickle_vocab, + (sstore, morph, serializer, data_dir, lex_attr_getters, + lexemes_data, length, vectors_length)) + + +def unpickle_vocab(sstore, morphology, serializer, data_dir, + lex_attr_getters, bytes lexemes_data, int length, int vectors_length): + cdef Vocab vocab = Vocab() + vocab.length = length + vocab.vectors_length = vectors_length + vocab.strings = sstore + cdef CFile fp = StringCFile('r', data=lexemes_data) + vocab.morphology = morphology + vocab._serializer = serializer + vocab.data_dir = data_dir + vocab.lex_attr_getters = lex_attr_getters + vocab._deserialize_lexemes(fp) + vocab.length = length + vocab.vectors_length = vectors_length + return vocab + + +copy_reg.pickle(Vocab, pickle_vocab, unpickle_vocab) + + def write_binary_vectors(in_loc, out_loc): cdef CFile out_file = CFile(out_loc, 'wb') cdef Address mem From a89c3500f69e8c9aebaad6d2d137729eab1c8458 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 7 Mar 2017 20:58:55 +0100 Subject: [PATCH 08/21] Fixes to hacky vocab pickling --- spacy/cfile.pyx | 7 +++---- spacy/tests/test_pickles.py | 24 +++++++++++++++++++++++- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/spacy/cfile.pyx b/spacy/cfile.pyx index 95c61a468..ceebe2e59 100644 --- a/spacy/cfile.pyx +++ b/spacy/cfile.pyx @@ -57,7 +57,7 @@ cdef class StringCFile: self.size = len(data) self.data = self.mem.alloc(1, self._capacity) for i in range(len(data)): - self.data[i] = data + self.data[i] = data[i] def close(self): self.is_open = False @@ -69,13 +69,12 @@ cdef class StringCFile: memcpy(dest, self.data, elem_size * number) self.data += elem_size * number - cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1: + cdef int write_from(self, void* src, size_t elem_size, size_t number) except -1: write_size = number * elem_size if (self.size + write_size) >= self._capacity: self._capacity = (self.size + write_size) * 2 self.data = self.mem.realloc(self.data, self._capacity) - memcpy(self.data, src, elem_size * number) - self.data += write_size + memcpy(&self.data[self.size], src, elem_size * number) self.size += write_size cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *: diff --git a/spacy/tests/test_pickles.py b/spacy/tests/test_pickles.py index 46221fd8b..4464b890e 100644 --- a/spacy/tests/test_pickles.py +++ b/spacy/tests/test_pickles.py @@ -1,9 +1,12 @@ from __future__ import unicode_literals import io -import pickle +import pytest +import dill as pickle from ..strings import StringStore +from ..vocab import Vocab +from ..attrs import NORM def test_pickle_string_store(): @@ -14,4 +17,23 @@ def test_pickle_string_store(): unpickled = pickle.loads(bdata) assert unpickled['hello'] == hello assert unpickled['bye'] == bye + assert len(sstore) == len(unpickled) + +def test_pickle_vocab(): + vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]}) + dog = vocab[u'dog'] + cat = vocab[u'cat'] + assert dog.norm_ == 'do' + assert cat.norm_ == 'ca' + + bdata = pickle.dumps(vocab) + unpickled = pickle.loads(bdata) + + assert unpickled[u'dog'].orth == dog.orth + assert unpickled[u'cat'].orth == cat.orth + assert unpickled[u'dog'].norm == 
dog.norm + assert unpickled[u'cat'].norm == cat.norm + dog_ = unpickled[u'dog'] + cat_ = unpickled[u'cat'] + assert dog_.norm != cat_.norm From 16670d325121bb183d818d0061e572ac7f962cef Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 7 Mar 2017 21:43:28 +0100 Subject: [PATCH 09/21] Xfail the vocab pickling for now --- spacy/tests/test_pickles.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/test_pickles.py b/spacy/tests/test_pickles.py index 4464b890e..2e7fc6bf7 100644 --- a/spacy/tests/test_pickles.py +++ b/spacy/tests/test_pickles.py @@ -20,6 +20,7 @@ def test_pickle_string_store(): assert len(sstore) == len(unpickled) +@pytest.mark.xfail def test_pickle_vocab(): vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]}) dog = vocab[u'dog'] From 04a51dab623eb06a0b5cdcc5c8142ccd4cfdc8b8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 8 Mar 2017 01:37:19 +0100 Subject: [PATCH 10/21] Print active parser features during training --- bin/parser/train.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/bin/parser/train.py b/bin/parser/train.py index 574797ba5..24484f7cf 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -66,8 +66,8 @@ def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False): def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, entity_cfg, n_iter=15, seed=0, gold_preproc=False, n_sents=0, corruption_level=0): - print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %") - format_str = '{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}' + print("Itn.\tP.Loss\tN feats\tUAS\tNER F.\tTag %\tToken %") + format_str = '{:d}\t{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}' with Language.train(model_dir, train_data, tagger_cfg, parser_cfg, entity_cfg) as trainer: loss = 0 @@ -76,7 +76,8 @@ def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, ent for doc, gold in epoch: trainer.update(doc, gold) dev_scores = trainer.evaluate(dev_data, gold_preproc=gold_preproc) - print(format_str.format(itn, loss, **dev_scores.scores)) + print(format_str.format(itn, loss, + trainer.nlp.parser.model.nr_active_feat, **dev_scores.scores)) def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False, @@ -160,6 +161,7 @@ def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc= if not eval_only: gold_train = list(read_json_file(train_loc)) gold_dev = list(read_json_file(dev_loc)) + gold_train = gold_train[:n_sents] train(lang, gold_train, gold_dev, model_dir, tagger_cfg, parser_cfg, entity_cfg, n_sents=n_sents, gold_preproc=gold_preproc, corruption_level=corruption_level, n_iter=n_iter) From d108534dc289f4f1342194be8cc0151bad769153 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 8 Mar 2017 01:37:52 +0100 Subject: [PATCH 11/21] Fix 2/3 problems for training --- spacy/en/__init__.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index 56cf4d184..f39faf308 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals, print_function from os import path +from pathlib import Path from ..util import match_best_version from ..util import get_data_path @@ -13,6 +14,11 @@ from ..attrs import LANG from .language_data import * +try: + basestring +except NameError: + basestring = str + class English(Language): lang = 'en' @@ -43,14 +49,15 @@ def 
_fix_deprecated_glove_vectors_loading(overrides): data_path = get_data_path() else: path = overrides['path'] + if isinstance(path, basestring): + path = Path(path) data_path = path.parent vec_path = None if 'add_vectors' not in overrides: if 'vectors' in overrides: vec_path = match_best_version(overrides['vectors'], None, data_path) if vec_path is None: - raise IOError( - 'Could not load data pack %s from %s' % (overrides['vectors'], data_path)) + return overrides else: vec_path = match_best_version('en_glove_cc_300_1m_vectors', None, data_path) if vec_path is not None: From 40703988bc01ea6c41d5e8a920634d4692231c05 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 8 Mar 2017 01:38:51 +0100 Subject: [PATCH 12/21] Use FTRL training in parser --- spacy/syntax/parser.pyx | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index eb094fa97..34ee920c6 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -124,6 +124,8 @@ cdef class Parser: elif 'features' not in cfg: cfg['features'] = self.feature_templates self.model = ParserModel(cfg['features']) + self.model.l1_penalty = 1e-7 + self.cfg = cfg def __reduce__(self): @@ -258,15 +260,20 @@ cdef class Parser: self.model.set_featuresC(&eg.c, stcls.c) self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold) self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat) - self.model.updateC(&eg.c) + self.model.time += 1 guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class) - - action = self.moves.c[eg.guess] + if eg.c.costs[guess] > 0: + best = VecVec.arg_max_if_zero(eg.c.scores, eg.c.costs, eg.c.nr_class) + for feat in eg.c.features[:eg.c.nr_feat]: + self.model.update_weight_ftrl(feat.key, best, -feat.value * eg.costs[guess]) + self.model.update_weight_ftrl(feat.key, guess, feat.value * eg.costs[guess]) + + action = self.moves.c[guess] action.do(stcls.c, action.label) - loss += eg.costs[eg.guess] - eg.fill_scores(0, eg.nr_class) - eg.fill_costs(0, eg.nr_class) - eg.fill_is_valid(1, eg.nr_class) + loss += eg.costs[guess] + eg.fill_scores(0, eg.c.nr_class) + eg.fill_costs(0, eg.c.nr_class) + eg.fill_is_valid(1, eg.c.nr_class) return loss def step_through(self, Doc doc): @@ -296,7 +303,7 @@ cdef class Parser: # Doesn't set label into serializer -- subclasses override it to do that. 
for action in self.moves.action_types: self.moves.add_action(action, label) - + cdef class StepwiseState: cdef readonly StateClass stcls From cd33b39a04c52e288c9a6e9a1043a29f72cf6527 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 8 Mar 2017 01:39:13 +0100 Subject: [PATCH 13/21] Fix 2/3 problem for json save/load --- spacy/language.py | 47 +++++++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index bebdeab20..9f8cc49e1 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -5,7 +5,7 @@ import pathlib from contextlib import contextmanager import shutil -import ujson as json +import ujson try: @@ -13,6 +13,10 @@ try: except NameError: basestring = str +try: + unicode +except NameError: + unicode = str from .tokenizer import Tokenizer from .vocab import Vocab @@ -226,12 +230,21 @@ class Language(object): parser_cfg['actions'] = ArcEager.get_actions(gold_parses=gold_tuples) entity_cfg['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples) - with (dep_model_dir / 'config.json').open('w') as file_: - json.dump(parser_cfg, file_) - with (ner_model_dir / 'config.json').open('w') as file_: - json.dump(entity_cfg, file_) - with (pos_model_dir / 'config.json').open('w') as file_: - json.dump(tagger_cfg, file_) + with (dep_model_dir / 'config.json').open('wb') as file_: + data = ujson.dumps(parser_cfg) + if isinstance(data, unicode): + data = data.encode('utf8') + file_.write(data) + with (ner_model_dir / 'config.json').open('wb') as file_: + data = ujson.dumps(entity_cfg) + if isinstance(data, unicode): + data = data.encode('utf8') + file_.write(data) + with (pos_model_dir / 'config.json').open('wb') as file_: + data = ujson.dumps(tagger_cfg) + if isinstance(data, unicode): + data = data.encode('utf8') + file_.write(data) self = cls( path=path, @@ -391,12 +404,14 @@ class Language(object): else: entity_iob_freqs = [] entity_type_freqs = [] - with (path / 'vocab' / 'serializer.json').open('w') as file_: - file_.write( - json.dumps([ - (TAG, tagger_freqs), - (DEP, dep_freqs), - (ENT_IOB, entity_iob_freqs), - (ENT_TYPE, entity_type_freqs), - (HEAD, head_freqs) - ])) + with (path / 'vocab' / 'serializer.json').open('wb') as file_: + data = ujson.dumps([ + (TAG, tagger_freqs), + (DEP, dep_freqs), + (ENT_IOB, entity_iob_freqs), + (ENT_TYPE, entity_type_freqs), + (HEAD, head_freqs) + ]) + if isinstance(data, unicode): + data = data.encode('utf8') + file_.write(data) From ffe0f0c6c4be01aa356cc127e2df8103ba4cbf74 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 8 Mar 2017 14:11:54 +0100 Subject: [PATCH 14/21] Add dill to requirements --- requirements.txt | 1 + setup.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 538862aed..4a75f6be3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,4 @@ six ujson>=1.35 cloudpickle sputnik>=0.9.2,<0.10.0 +dill>=0.2,<0.3 diff --git a/setup.py b/setup.py index fc316e72f..49ea639e2 100644 --- a/setup.py +++ b/setup.py @@ -241,7 +241,8 @@ def setup_package(): 'cloudpickle', 'pathlib', 'sputnik>=0.9.2,<0.10.0', - 'ujson>=1.35'], + 'ujson>=1.35', + 'dill>=0.2,<0.3'], classifiers=[ 'Development Status :: 5 - Production/Stable', 'Environment :: Console', From 0a6d7ca2006d520883361d9922282679c4d2d6cc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 8 Mar 2017 14:33:32 +0100 Subject: [PATCH 15/21] Fix spacing after token_match The boolean flag indicating a space after the token was 
being set incorrectly after the token_match regex was applied. Fixes #859. --- spacy/tokenizer.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 8f2f111e7..1b74431ff 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -275,7 +275,10 @@ cdef class Tokenizer: if cache_hit: pass elif self.token_match and self.token_match(string): - tokens.push_back(self.vocab.get(tokens.mem, string), not suffixes.size()) + # We're always saying 'no' to spaces here -- the caller will + # fix up the outermost one, with reference to the original. + # See Issue #859 + tokens.push_back(self.vocab.get(tokens.mem, string), False) else: matches = self.find_infix(string) if not matches: From c2e3e651b84f519f6ef021e064c161bc4ba5e89a Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 8 Mar 2017 14:36:09 +0100 Subject: [PATCH 16/21] Re-add regression test for #859 --- spacy/tests/regression/test_issue859.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 spacy/tests/regression/test_issue859.py diff --git a/spacy/tests/regression/test_issue859.py b/spacy/tests/regression/test_issue859.py new file mode 100644 index 000000000..4a2d08df7 --- /dev/null +++ b/spacy/tests/regression/test_issue859.py @@ -0,0 +1,12 @@ +# encoding: utf8 +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('text', ["aaabbb@ccc.com\nThank you!", + "aaabbb@ccc.com \nThank you!"]) +def test_issue859(en_tokenizer, text): + """Test that no extra space is added in doc.text method.""" + doc = en_tokenizer(text) + assert doc.text == text From 0ac3d2768991521205a6d0e365303560521b6108 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 8 Mar 2017 15:01:40 +0100 Subject: [PATCH 17/21] Fix handling of trailing whitespace Fix off-by-one error that meant trailing spaces were being dropped. 
Closes #792 --- spacy/tests/regression/test_issue792.py | 12 +++++++++--- spacy/tokenizer.pyx | 1 - 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/spacy/tests/regression/test_issue792.py b/spacy/tests/regression/test_issue792.py index 563e061a6..df8b5ef50 100644 --- a/spacy/tests/regression/test_issue792.py +++ b/spacy/tests/regression/test_issue792.py @@ -4,9 +4,15 @@ from __future__ import unicode_literals import pytest -@pytest.mark.xfail @pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"]) def test_issue792(en_tokenizer, text): - """Test for Issue #792: Trailing whitespace is removed after parsing.""" + """Test for Issue #792: Trailing whitespace is removed after tokenization.""" doc = en_tokenizer(text) - assert doc.text_with_ws == text + assert ''.join([token.text_with_ws for token in doc]) == text + + +@pytest.mark.parametrize('text', ["This is a string", "This is a string\n"]) +def test_control_issue792(en_tokenizer, text): + """Test base case for Issue #792: Non-trailing whitespace""" + doc = en_tokenizer(text) + assert ''.join([token.text_with_ws for token in doc]) == text diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 1b74431ff..5a4eb844a 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -163,7 +163,6 @@ cdef class Tokenizer: start = i in_ws = not in_ws i += 1 - i += 1 if start < i: span = string[start:] key = hash_string(span) From f71eeef9bb620032aa6c83cead39ded983a8be3f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 9 Mar 2017 18:42:40 -0600 Subject: [PATCH 18/21] Pass path argument to end_training --- spacy/language.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index 9f8cc49e1..66acec781 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -265,7 +265,7 @@ class Language(object): self.entity = self.Defaults.create_entity(self) self.pipeline = self.Defaults.create_pipeline(self) yield Trainer(self, gold_tuples) - self.end_training() + self.end_training(path=path) def __init__(self, **overrides): if 'data_dir' in overrides and 'path' not in overrides: From c62da0234480cc30a2c41dfc2e054d72db3015ee Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 9 Mar 2017 18:43:21 -0600 Subject: [PATCH 19/21] Use ftrl training, to learn compressed model. --- spacy/syntax/parser.pyx | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 34ee920c6..093186518 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -68,7 +68,7 @@ def get_templates(name): cdef class ParserModel(AveragedPerceptron): - cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil: + cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil: fill_context(eg.atoms, state) eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms) @@ -124,7 +124,7 @@ cdef class Parser: elif 'features' not in cfg: cfg['features'] = self.feature_templates self.model = ParserModel(cfg['features']) - self.model.l1_penalty = 1e-7 + self.model.l1_penalty = cfg.get('L1', 0.0) self.cfg = cfg @@ -234,7 +234,7 @@ cdef class Parser: free(eg.scores) free(eg.is_valid) return 0 - + def update(self, Doc tokens, GoldParse gold): """Update the statistical model. 
@@ -263,11 +263,11 @@ cdef class Parser: self.model.time += 1 guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class) if eg.c.costs[guess] > 0: - best = VecVec.arg_max_if_zero(eg.c.scores, eg.c.costs, eg.c.nr_class) + best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class) for feat in eg.c.features[:eg.c.nr_feat]: - self.model.update_weight_ftrl(feat.key, best, -feat.value * eg.costs[guess]) - self.model.update_weight_ftrl(feat.key, guess, feat.value * eg.costs[guess]) - + self.model.update_weight_ftrl(feat.key, best, -feat.value * eg.c.costs[guess]) + self.model.update_weight_ftrl(feat.key, guess, feat.value * eg.c.costs[guess]) + action = self.moves.c[guess] action.do(stcls.c, action.label) loss += eg.costs[guess] @@ -392,6 +392,14 @@ class ParserStateError(ValueError): "Please include the text that the parser failed on, which is:\n" "%s" % repr(doc.text)) +cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs, int n) nogil: + cdef int best = -1 + for i in range(n): + if costs[i] <= 0: + if best == -1 or scores[i] > scores[best]: + best = i + return best + cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions, int nr_class) except -1: From 798450136dc30068f81ffb88bbd947596a931b32 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 9 Mar 2017 18:43:47 -0600 Subject: [PATCH 20/21] Set L1 penalty to 0 in tagger. --- spacy/tagger.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 954bced53..eab0d1126 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -152,6 +152,7 @@ cdef class Tagger: model = TaggerModel(cfg.get('features', self.feature_templates)) self.vocab = vocab self.model = model + self.model.l1_penalty = 0.0 # TODO: Move this to tag map self.freqs = {TAG: defaultdict(int)} for tag in self.tag_names: From 35124b144a4b25f8377fcbbf0ab32fbffc3320eb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 9 Mar 2017 18:44:53 -0600 Subject: [PATCH 21/21] Add L1 penalty option to parser --- bin/parser/train.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/bin/parser/train.py b/bin/parser/train.py index 24484f7cf..26b545b6d 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -66,7 +66,7 @@ def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False): def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, entity_cfg, n_iter=15, seed=0, gold_preproc=False, n_sents=0, corruption_level=0): - print("Itn.\tP.Loss\tN feats\tUAS\tNER F.\tTag %\tToken %") + print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %") format_str = '{:d}\t{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}' with Language.train(model_dir, train_data, tagger_cfg, parser_cfg, entity_cfg) as trainer: @@ -76,12 +76,13 @@ def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, ent for doc, gold in epoch: trainer.update(doc, gold) dev_scores = trainer.evaluate(dev_data, gold_preproc=gold_preproc) - print(format_str.format(itn, loss, + print(format_str.format(itn, trainer.nlp.parser.model.nr_weight, trainer.nlp.parser.model.nr_active_feat, **dev_scores.scores)) def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False, beam_width=None, cand_preproc=None): + print("Load parser", model_dir) nlp = Language(path=model_dir) if nlp.lang == 'de': nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string]) @@ -146,22 +147,25 @@ def write_parses(Language, dev_loc, 
model_dir, out_loc): verbose=("Verbose error reporting", "flag", "v", bool), debug=("Debug mode", "flag", "d", bool), pseudoprojective=("Use pseudo-projective parsing", "flag", "p", bool), + L1=("L1 regularization penalty", "option", "L", float), ) def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, - debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False): + debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False, + L1=1e-6): parser_cfg = dict(locals()) tagger_cfg = dict(locals()) entity_cfg = dict(locals()) lang = spacy.util.get_lang_class(language) - + parser_cfg['features'] = lang.Defaults.parser_features entity_cfg['features'] = lang.Defaults.entity_features if not eval_only: gold_train = list(read_json_file(train_loc)) gold_dev = list(read_json_file(dev_loc)) - gold_train = gold_train[:n_sents] + if n_sents > 0: + gold_train = gold_train[:n_sents] train(lang, gold_train, gold_dev, model_dir, tagger_cfg, parser_cfg, entity_cfg, n_sents=n_sents, gold_preproc=gold_preproc, corruption_level=corruption_level, n_iter=n_iter)
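
Editor's note on patch 19: switching the parser's cost-sensitive update to `update_weight_ftrl` is what makes a non-zero L1 penalty useful -- FTRL-proximal applies per-weight L1 regularization, driving rarely useful weights to exactly zero so the saved model is smaller, which matches the "compressed model" intent of the commit. The exact behaviour of `update_weight_ftrl` lives in thinc's `AveragedPerceptron` and is not shown in this diff. The snippet below is only a plain-Python restatement of the `arg_max_if_gold` helper added in that patch, for readers who don't want to parse the Cython:

```python
def arg_max_if_gold(scores, costs):
    """Return the index of the best-scoring action among the zero-cost
    ("gold") actions, or -1 if no zero-cost action exists.

    Plain-Python restatement of the Cython helper added in patch 19. It is
    used when the current guess has non-zero cost: the FTRL update then moves
    weights toward this action and away from the guess, scaled by the cost.
    """
    best = -1
    for i, (score, cost) in enumerate(zip(scores, costs)):
        if cost <= 0 and (best == -1 or score > scores[best]):
            best = i
    return best


# Four candidate actions; only actions 1 and 3 are consistent with the gold
# parse (cost 0), so the update targets the better-scoring of those two.
assert arg_max_if_gold([0.9, 0.1, 0.5, 0.4], [1.0, 0.0, 2.0, 0.0]) == 3
```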
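
And on patches 20-21: the L1 strength is no longer hard-coded in the parser; it travels from the training CLI into `parser_cfg` and is pinned to zero for the tagger. A hedged sketch of that flow follows -- the corpus and model paths are placeholders, the `-L` short flag is read off the plac annotation above (plac's long-option spelling is not shown in the diff), and `build_configs` is a hypothetical stand-in for the `dict(locals())` wiring in `main()`:

```python
# Approximate invocation (placeholder paths):
#   bin/parser/train.py en train.json dev.json /models/en-parser -L 1e-6

def build_configs(L1=1e-6, n_sents=0, **kwargs):
    """Hypothetical stand-in for main(): CLI arguments, including L1, land in the cfg dicts."""
    return dict(L1=L1, n_sents=n_sents, **kwargs)

parser_cfg = build_configs()
# Parser.__init__ (patch 19): previously hard-coded to 1e-7, now configurable.
l1_penalty = parser_cfg.get('L1', 0.0)        # 1e-6 when trained via the CLI default
# Tagger.__init__ (patch 20) pins its penalty to 0.0 regardless of the CLI.

# Related guard added in patch 21: slicing the training data is now gated,
# because gold_train[:0] would otherwise discard the whole corpus when the
# default n_sents=0 ("use everything") is left in place.
gold_train = list(range(100))                 # stand-in for read_json_file(train_loc)
if parser_cfg['n_sents'] > 0:
    gold_train = gold_train[:parser_cfg['n_sents']]
assert len(gold_train) == 100
```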