From eae35e9b271605fb29c353e6655f0ac41e2d228b Mon Sep 17 00:00:00 2001 From: Wolfgang Seeker Date: Thu, 18 Feb 2016 13:24:20 +0100 Subject: [PATCH 1/6] add tokenizer files for German, add/change code to train German pos tagger - add files to specify rules for German tokenization - change generate_specials.py to generate from an external file (abbrev.de.tab) - copy gazetteer.json from lang_data/en/ - init_model.py - change doc freq threshold to 0 - add train_german_tagger.py - expects conll09-formatted input --- bin/init_model.py | 12 +- bin/tagger/train_german_tagger.py | 160 +++++++++ lang_data/de/abbrev.de.tab | 319 +++++++++++++++++ lang_data/de/gazetteer.json | 194 +++++++++++ lang_data/de/generate_specials.py | 25 +- lang_data/de/infix.txt | 5 +- lang_data/de/prefix.txt | 5 + lang_data/de/specials.json | 560 ++++++++++++++++++++++++++++-- lang_data/de/suffix.txt | 49 ++- spacy/tagger.pyx | 13 +- 10 files changed, 1290 insertions(+), 52 deletions(-) create mode 100644 bin/tagger/train_german_tagger.py create mode 100644 lang_data/de/abbrev.de.tab create mode 100644 lang_data/de/gazetteer.json diff --git a/bin/init_model.py b/bin/init_model.py index 991b5dd58..19cfcdc25 100644 --- a/bin/init_model.py +++ b/bin/init_model.py @@ -98,7 +98,7 @@ def _read_probs(loc): return probs, probs['-OOV-'] -def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200): +def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200): if not loc.exists(): print("Warning: Frequencies file not found") return {}, 0.0 @@ -125,7 +125,8 @@ def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200): doc_freq = int(doc_freq) freq = int(freq) if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length: - word = literal_eval(key) +# word = literal_eval(key) + word = key smooth_count = counts.smoother(int(freq)) log_smooth_count = math.log(smooth_count) probs[word] = math.log(smooth_count) - log_total @@ -165,7 +166,7 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir): clusters = _read_clusters(src_dir / 'clusters.txt') probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob') if not probs: - probs, oov_prob = _read_freqs(src_dir / 'freqs.txt.gz') + probs, oov_prob = _read_freqs(src_dir / 'freqs.txt') if not probs: oov_prob = -20 else: @@ -223,9 +224,8 @@ def main(lang_id, lang_data_dir, corpora_dir, model_dir): copyfile(str(lang_data_dir / 'gazetteer.json'), str(model_dir / 'vocab' / 'gazetteer.json')) - if (lang_data_dir / 'tag_map.json').exists(): - copyfile(str(lang_data_dir / 'tag_map.json'), - str(model_dir / 'vocab' / 'tag_map.json')) + copyfile(str(lang_data_dir / 'tag_map.json'), + str(model_dir / 'vocab' / 'tag_map.json')) if (lang_data_dir / 'lemma_rules.json').exists(): copyfile(str(lang_data_dir / 'lemma_rules.json'), diff --git a/bin/tagger/train_german_tagger.py b/bin/tagger/train_german_tagger.py new file mode 100644 index 000000000..4927a6e9a --- /dev/null +++ b/bin/tagger/train_german_tagger.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python +from __future__ import division +from __future__ import unicode_literals + +import os +from os import path +import shutil +import io +import random +import time +import gzip +import ujson + +import plac +import cProfile +import pstats + +import spacy.util +from spacy.de import German +from spacy.gold import GoldParse +from spacy.tagger import Tagger +from spacy.scorer import PRFScore + +from spacy.tagger import P2_orth, P2_cluster, P2_shape, P2_prefix, P2_suffix, P2_pos, P2_lemma, P2_flags +from spacy.tagger import P1_orth, 
P1_cluster, P1_shape, P1_prefix, P1_suffix, P1_pos, P1_lemma, P1_flags +from spacy.tagger import W_orth, W_cluster, W_shape, W_prefix, W_suffix, W_pos, W_lemma, W_flags +from spacy.tagger import N1_orth, N1_cluster, N1_shape, N1_prefix, N1_suffix, N1_pos, N1_lemma, N1_flags +from spacy.tagger import N2_orth, N2_cluster, N2_shape, N2_prefix, N2_suffix, N2_pos, N2_lemma, N2_flags, N_CONTEXT_FIELDS + + +def default_templates(): + return spacy.tagger.Tagger.default_templates() + +def default_templates_without_clusters(): + return ( + (W_orth,), + (P1_lemma, P1_pos), + (P2_lemma, P2_pos), + (N1_orth,), + (N2_orth,), + + (W_suffix,), + (W_prefix,), + + (P1_pos,), + (P2_pos,), + (P1_pos, P2_pos), + (P1_pos, W_orth), + (P1_suffix,), + (N1_suffix,), + + (W_shape,), + + (W_flags,), + (N1_flags,), + (N2_flags,), + (P1_flags,), + (P2_flags,), + ) + + +def make_tagger(vocab, templates): + model = spacy.tagger.TaggerModel(templates) + return spacy.tagger.Tagger(vocab,model) + + +def read_conll(file_): + def sentences(): + words, tags = [], [] + for line in file_: + line = line.strip() + if line: + word, tag = line.split('\t')[1::3][:2] # get column 1 and 4 (CoNLL09) + words.append(word) + tags.append(tag) + elif words: + yield words, tags + words, tags = [], [] + if words: + yield words, tags + return [ s for s in sentences() ] + + +def score_model(score, nlp, words, gold_tags): + tokens = nlp.tokenizer.tokens_from_list(words) + assert(len(tokens) == len(gold_tags)) + nlp.tagger(tokens) + + for token, gold_tag in zip(tokens,gold_tags): + score.score_set(set([token.tag_]),set([gold_tag])) + + +def train(Language, train_sents, dev_sents, model_dir, n_iter=15, seed=21): + # make shuffling deterministic + random.seed(seed) + + # set up directory for model + pos_model_dir = path.join(model_dir, 'pos') + if path.exists(pos_model_dir): + shutil.rmtree(pos_model_dir) + os.mkdir(pos_model_dir) + + nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False) + nlp.tagger = make_tagger(nlp.vocab,default_templates()) + + print("Itn.\ttrain acc %\tdev acc %") + for itn in range(n_iter): + # train on train set + #train_acc = PRFScore() + correct, total = 0., 0. 
+ for words, gold_tags in train_sents: + tokens = nlp.tokenizer.tokens_from_list(words) + correct += nlp.tagger.train(tokens, gold_tags) + total += len(words) + train_acc = correct/total + + # test on dev set + dev_acc = PRFScore() + for words, gold_tags in dev_sents: + score_model(dev_acc, nlp, words, gold_tags) + + random.shuffle(train_sents) + print('%d:\t%6.2f\t%6.2f' % (itn, 100*train_acc, 100*dev_acc.precision)) + + + print('end training') + nlp.end_training(model_dir) + print('done') + + +@plac.annotations( + train_loc=("Location of CoNLL 09 formatted training file"), + dev_loc=("Location of CoNLL 09 formatted development file"), + model_dir=("Location of output model directory"), + eval_only=("Skip training, and only evaluate", "flag", "e", bool), + n_iter=("Number of training iterations", "option", "i", int), +) +def main(train_loc, dev_loc, model_dir, eval_only=False, n_iter=15): + # training + if not eval_only: + with io.open(train_loc, 'r', encoding='utf8') as trainfile_, \ + io.open(dev_loc, 'r', encoding='utf8') as devfile_: + train_sents = read_conll(trainfile_) + dev_sents = read_conll(devfile_) + train(German, train_sents, dev_sents, model_dir, n_iter=n_iter) + + # testing + with io.open(dev_loc, 'r', encoding='utf8') as file_: + dev_sents = read_conll(file_) + nlp = German(data_dir=model_dir) + + dev_acc = PRFScore() + for words, gold_tags in dev_sents: + score_model(dev_acc, nlp, words, gold_tags) + + print('POS: %6.2f %%' % (100*dev_acc.precision)) + + +if __name__ == '__main__': + plac.call(main) diff --git a/lang_data/de/abbrev.de.tab b/lang_data/de/abbrev.de.tab new file mode 100644 index 000000000..97374c83d --- /dev/null +++ b/lang_data/de/abbrev.de.tab @@ -0,0 +1,319 @@ +# surface form lemma pos +# multiple values are separated by | +# empty lines and lines starting with # are being ignored + +'' '' +\") \") +\n \n SP +\t \t SP + SP + +# example: Wie geht's? +'s 's es +'S 'S es + +# example: Haste mal 'nen Euro? +'n 'n ein +'ne 'ne eine +'nen 'nen einen + +# example: Kommen S’ nur herein! +s' s' sie +S' S' sie + +# example: Da haben wir's! +ich's ich|'s ich|es +du's du|'s du|es +er's er|'s er|es +sie's sie|'s sie|es +wir's wir|'s wir|es +ihr's ihr|'s ihr|es + +# example: Die katze auf'm dach. +auf'm auf|'m auf|dem +unter'm unter|'m unter|dem +über'm über|'m über|dem +vor'm vor|'m vor|dem +hinter'm hinter|'m hinter|dem + +# persons +B.A. B.A. +B.Sc. B.Sc. +Dipl. Dipl. +Dipl.-Ing. Dipl.-Ing. +Dr. Dr. +Fr. Fr. +Frl. Frl. +Hr. Hr. +Hrn. Hrn. +Frl. Frl. +Prof. Prof. +St. St. +Hrgs. Hrgs. +Hg. Hg. +a.Z. a.Z. +a.D. a.D. +h.c. h.c. +Jr. Jr. +jr. jr. +jun. jun. +sen. sen. +rer. rer. +Ing. Ing. +M.A. M.A. +Mr. Mr. +M.Sc. M.Sc. +nat. nat. +phil. phil. + +# companies +Co. Co. +co. co. +Cie. Cie. +A.G. A.G. +G.m.b.H. G.m.b.H. +i.G. i.G. +e.V. e.V. + +# popular german abbreviations +Abb. Abb. +Abk. Abk. +Abs. Abs. +Abt. Abt. +abzgl. abzgl. +allg. allg. +a.M. a.M. +Bd. Bd. +betr. betr. +Betr. Betr. +Biol. Biol. +biol. biol. +Bf. Bf. +Bhf. Bhf. +Bsp. Bsp. +bspw. bspw. +bzgl. bzgl. +bzw. bzw. +d.h. d.h. +dgl. dgl. +ebd. ebd. +ehem. ehem. +eigtl. eigtl. +entspr. entspr. +erm. erm. +ev. ev. +evtl. evtl. +Fa. Fa. +Fam. Fam. +geb. geb. +Gebr. Gebr. +gem. gem. +ggf. ggf. +ggü. ggü. +ggfs. ggfs. +gegr. gegr. +Hbf. Hbf. +Hrsg. Hrsg. +hrsg. hrsg. +i.A. i.A. +i.d.R. i.d.R. +inkl. inkl. +insb. insb. +i.O. i.O. +i.Tr. i.Tr. +i.V. i.V. +jur. jur. +kath. kath. +K.O. K.O. +lt. lt. +max. max. +m.E. m.E. +m.M. m.M. +mtl. mtl. +min. min. +mind. mind. +MwSt. MwSt. +Nr. Nr. +o.a. o.a. +o.ä. o.ä. 
+o.Ä. o.Ä. +o.g. o.g. +o.k. o.k. +O.K. O.K. +Orig. Orig. +orig. orig. +pers. pers. +Pkt. Pkt. +Red. Red. +röm. röm. +s.o. s.o. +sog. sog. +std. std. +stellv. stellv. +Str. Str. +tägl. tägl. +Tel. Tel. +u.a. u.a. +usf. usf. +u.s.w. u.s.w. +usw. usw. +u.U. u.U. +u.v.m. u.v.m. +uvm. uvm. +v.a. v.a. +vgl. vgl. +vllt. vllt. +v.l.n.r. v.l.n.r. +vlt. vlt. +Vol. Vol. +wiss. wiss. +Univ. Univ. +z.B. z.B. +z.b. z.b. +z.Bsp. z.Bsp. +z.T. z.T. +z.Z. z.Z. +zzgl. zzgl. +z.Zt. z.Zt. + +# popular latin abbreviations +vs. vs. +adv. adv. +Chr. Chr. +A.C. A.C. +A.D. A.D. +e.g. e.g. +i.e. i.e. +al. al. +p.a. p.a. +P.S. P.S. +q.e.d. q.e.d. +R.I.P. R.I.P. +etc. etc. +incl. incl. +ca. ca. +n.Chr. n.Chr. +p.s. p.s. +v.Chr. v.Chr. + +# popular english abbreviations +D.C. D.C. +N.Y. N.Y. +N.Y.C. N.Y.C. +U.S. U.S. +U.S.A. U.S.A. +L.A. L.A. +U.S.S. U.S.S. + +# dates & time +Jan. Jan. +Feb. Feb. +Mrz. Mrz. +Mär. Mär. +Apr. Apr. +Jun. Jun. +Jul. Jul. +Aug. Aug. +Sep. Sep. +Sept. Sept. +Okt. Okt. +Nov. Nov. +Dez. Dez. +Mo. Mo. +Di. Di. +Mi. Mi. +Do. Do. +Fr. Fr. +Sa. Sa. +So. So. +Std. Std. +Jh. Jh. +Jhd. Jhd. + +# numbers +Tsd. Tsd. +Mio. Mio. +Mrd. Mrd. + +# countries & languages +engl. engl. +frz. frz. +lat. lat. +österr. österr. + +# smileys +:) :) +<3 <3 +;) ;) +(: (: +:( :( +-_- -_- +=) =) +:/ :/ +:> :> +;-) ;-) +:Y :Y +:P :P +:-P :-P +:3 :3 +=3 =3 +xD xD +^_^ ^_^ +=] =] +=D =D +<333 <333 +:)) :)) +:0 :0 +-__- -__- +xDD xDD +o_o o_o +o_O o_O +V_V V_V +=[[ =[[ +<33 <33 +;p ;p +;D ;D +;-p ;-p +;( ;( +:p :p +:] :] +:O :O +:-/ :-/ +:-) :-) +:((( :((( +:(( :(( +:') :') +(^_^) (^_^) +(= (= +o.O o.O + +# single letters +a. a. +b. b. +c. c. +d. d. +e. e. +f. f. +g. g. +h. h. +i. i. +j. j. +k. k. +l. l. +m. m. +n. n. +o. o. +p. p. +q. q. +r. r. +s. s. +t. t. +u. u. +v. v. +w. w. +x. x. +y. y. +z. z. +ä. ä. +ö. ö. +ü. ü. 
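
A minimal, self-contained sketch of how rows of abbrev.de.tab are expanded into specials entries, mirroring the read_hardcoded() helper added to lang_data/de/generate_specials.py further below (the only difference is itertools.zip_longest for Python 3 in place of the patch's Python 2 itertools.izip_longest):

from itertools import zip_longest

FIELDNAMES = ['F', 'L', 'pos']

def parse_row(line):
    # columns: surface form <TAB> lemma <TAB> pos; multi-token values use '|'
    key, _, rest = line.partition('\t')
    values = []
    for annotation in zip(*[e.split('|') for e in rest.split('\t')]):
        values.append({k: v for k, v in zip_longest(FIELDNAMES, annotation) if v})
    return key, values

print(parse_row("d.h.\td.h."))
# ('d.h.', [{'F': 'd.h.'}])
print(parse_row("auf'm\tauf|'m\tauf|dem"))
# ("auf'm", [{'F': 'auf', 'L': 'auf'}, {'F': "'m", 'L': 'dem'}])

The multi-token case matches the "auf'm" entry generated into specials.json below: one sub-token per '|'-separated column value, with empty fields dropped.
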
diff --git a/lang_data/de/gazetteer.json b/lang_data/de/gazetteer.json new file mode 100644 index 000000000..d52fed839 --- /dev/null +++ b/lang_data/de/gazetteer.json @@ -0,0 +1,194 @@ +{ + "Reddit": [ + "PRODUCT", + {}, + [ + [{"lower": "reddit"}] + ] + ], + "SeptemberElevenAttacks": [ + "EVENT", + {}, + [ + [ + {"orth": "9/11"} + ], + [ + {"lower": "september"}, + {"orth": "11"} + ] + ] + ], + "Linux": [ + "PRODUCT", + {}, + [ + [{"lower": "linux"}] + ] + ], + "Haskell": [ + "PRODUCT", + {}, + [ + [{"lower": "haskell"}] + ] + ], + "HaskellCurry": [ + "PERSON", + {}, + [ + [ + {"lower": "haskell"}, + {"lower": "curry"} + ] + ] + ], + "Javascript": [ + "PRODUCT", + {}, + [ + [{"lower": "javascript"}] + ] + ], + "CSS": [ + "PRODUCT", + {}, + [ + [{"lower": "css"}], + [{"lower": "css3"}] + ] + ], + "displaCy": [ + "PRODUCT", + {}, + [ + [{"lower": "displacy"}] + ] + ], + "spaCy": [ + "PRODUCT", + {}, + [ + [{"orth": "spaCy"}] + ] + ], + + "HTML": [ + "PRODUCT", + {}, + [ + [{"lower": "html"}], + [{"lower": "html5"}] + ] + ], + "Python": [ + "PRODUCT", + {}, + [ + [{"orth": "Python"}] + ] + ], + "Ruby": [ + "PRODUCT", + {}, + [ + [{"orth": "Ruby"}] + ] + ], + "Digg": [ + "PRODUCT", + {}, + [ + [{"lower": "digg"}] + ] + ], + "FoxNews": [ + "ORG", + {}, + [ + [{"orth": "Fox"}], + [{"orth": "News"}] + ] + ], + "Google": [ + "ORG", + {}, + [ + [{"lower": "google"}] + ] + ], + "Mac": [ + "PRODUCT", + {}, + [ + [{"lower": "mac"}] + ] + ], + "Wikipedia": [ + "PRODUCT", + {}, + [ + [{"lower": "wikipedia"}] + ] + ], + "Windows": [ + "PRODUCT", + {}, + [ + [{"orth": "Windows"}] + ] + ], + "Dell": [ + "ORG", + {}, + [ + [{"lower": "dell"}] + ] + ], + "Facebook": [ + "ORG", + {}, + [ + [{"lower": "facebook"}] + ] + ], + "Blizzard": [ + "ORG", + {}, + [ + [{"orth": "Blizzard"}] + ] + ], + "Ubuntu": [ + "ORG", + {}, + [ + [{"orth": "Ubuntu"}] + ] + ], + "Youtube": [ + "PRODUCT", + {}, + [ + [{"lower": "youtube"}] + ] + ], + "false_positives": [ + null, + {}, + [ + [{"orth": "Shit"}], + [{"orth": "Weed"}], + [{"orth": "Cool"}], + [{"orth": "Btw"}], + [{"orth": "Bah"}], + [{"orth": "Bullshit"}], + [{"orth": "Lol"}], + [{"orth": "Yo"}, {"lower": "dawg"}], + [{"orth": "Yay"}], + [{"orth": "Ahh"}], + [{"orth": "Yea"}], + [{"orth": "Bah"}] + ] + ] +} diff --git a/lang_data/de/generate_specials.py b/lang_data/de/generate_specials.py index 44e674800..b3dc52e4f 100644 --- a/lang_data/de/generate_specials.py +++ b/lang_data/de/generate_specials.py @@ -1,5 +1,7 @@ # coding=utf8 import json +import io +import itertools contractions = {} @@ -262,14 +264,30 @@ def get_token_properties(token, capitalize=False, remove_contractions=False): props["F"] = token return props + def create_entry(token, endings, capitalize=False, remove_contractions=False): - properties = [] properties.append(get_token_properties(token, capitalize=capitalize, remove_contractions=remove_contractions)) for e in endings: properties.append(get_token_properties(e, remove_contractions=remove_contractions)) return properties + +FIELDNAMES = ['F','L','pos'] +def read_hardcoded(stream): + hc_specials = {} + for line in stream: + line = line.strip() + if line.startswith('#') or not line: + continue + key,_,rest = line.partition('\t') + values = [] + for annotation in zip(*[ e.split('|') for e in rest.split('\t') ]): + values.append({ k:v for k,v in itertools.izip_longest(FIELDNAMES,annotation) if v }) + hc_specials[key] = values + return hc_specials + + def generate_specials(): specials = {} @@ -303,7 +321,10 @@ def generate_specials(): specials[special] = 
create_entry(token, endings, capitalize=True, remove_contractions=True) # add in hardcoded specials - specials = dict(specials, **hardcoded_specials) + # changed it so it generates them from a file + with io.open('abbrev.de.tab','r',encoding='utf8') as abbrev_: + hc_specials = read_hardcoded(abbrev_) + specials = dict(specials, **hc_specials) return specials diff --git a/lang_data/de/infix.txt b/lang_data/de/infix.txt index 37eca7350..8398d5d42 100644 --- a/lang_data/de/infix.txt +++ b/lang_data/de/infix.txt @@ -1,3 +1,6 @@ \.\.\. (?<=[a-z])\.(?=[A-Z]) -(?<=[a-zA-Z])-(?=[a-zA-z]) +(?<=[a-zöäüßA-ZÖÄÜ"]):(?=[a-zöäüßA-ZÖÄÜ]) +(?<=[a-zöäüßA-ZÖÄÜ"])>(?=[a-zöäüßA-ZÖÄÜ]) +(?<=[a-zöäüßA-ZÖÄÜ"])<(?=[a-zöäüßA-ZÖÄÜ]) +(?<=[a-zöäüßA-ZÖÄÜ"])=(?=[a-zöäüßA-ZÖÄÜ]) diff --git a/lang_data/de/prefix.txt b/lang_data/de/prefix.txt index 1082bef7d..e37542a9c 100644 --- a/lang_data/de/prefix.txt +++ b/lang_data/de/prefix.txt @@ -5,6 +5,7 @@ { * < +> $ £ „ @@ -20,3 +21,7 @@ a- ‘ .... ... +‚ +» +_ +§ diff --git a/lang_data/de/specials.json b/lang_data/de/specials.json index b8d084503..282ec6df4 100644 --- a/lang_data/de/specials.json +++ b/lang_data/de/specials.json @@ -1,27 +1,4 @@ { - "\t": [ - { - "F": "\t", - "pos": "SP" - } - ], - "\n": [ - { - "F": "\n", - "pos": "SP" - } - ], - " ": [ - { - "F": " ", - "pos": "SP" - } - ], - "\")": [ - { - "F": "\")" - } - ], "''": [ { "F": "''" @@ -217,6 +194,11 @@ "F": "<333" } ], + "": [ + { + "F": "SP" + } + ], "=)": [ { "F": "=)" @@ -267,6 +249,16 @@ "F": "Abk." } ], + "Abs.": [ + { + "F": "Abs." + } + ], + "Abt.": [ + { + "F": "Abt." + } + ], "Apr.": [ { "F": "Apr." @@ -277,6 +269,26 @@ "F": "Aug." } ], + "B.A.": [ + { + "F": "B.A." + } + ], + "B.Sc.": [ + { + "F": "B.Sc." + } + ], + "Bd.": [ + { + "F": "Bd." + } + ], + "Betr.": [ + { + "F": "Betr." + } + ], "Bf.": [ { "F": "Bf." @@ -292,6 +304,11 @@ "F": "Biol." } ], + "Bsp.": [ + { + "F": "Bsp." + } + ], "Chr.": [ { "F": "Chr." @@ -342,6 +359,16 @@ "F": "Dr." } ], + "Fa.": [ + { + "F": "Fa." + } + ], + "Fam.": [ + { + "F": "Fam." + } + ], "Feb.": [ { "F": "Feb." @@ -387,6 +414,16 @@ "F": "Hrgs." } ], + "Hrn.": [ + { + "F": "Hrn." + } + ], + "Hrsg.": [ + { + "F": "Hrsg." + } + ], "Ing.": [ { "F": "Ing." @@ -397,11 +434,21 @@ "F": "Jan." } ], + "Jh.": [ + { + "F": "Jh." + } + ], "Jhd.": [ { "F": "Jhd." } ], + "Jr.": [ + { + "F": "Jr." + } + ], "Jul.": [ { "F": "Jul." @@ -412,21 +459,61 @@ "F": "Jun." } ], + "K.O.": [ + { + "F": "K.O." + } + ], + "L.A.": [ + { + "F": "L.A." + } + ], + "M.A.": [ + { + "F": "M.A." + } + ], + "M.Sc.": [ + { + "F": "M.Sc." + } + ], "Mi.": [ { "F": "Mi." } ], + "Mio.": [ + { + "F": "Mio." + } + ], "Mo.": [ { "F": "Mo." } ], + "Mr.": [ + { + "F": "Mr." + } + ], + "Mrd.": [ + { + "F": "Mrd." + } + ], "Mrz.": [ { "F": "Mrz." } ], + "MwSt.": [ + { + "F": "MwSt." + } + ], "M\u00e4r.": [ { "F": "M\u00e4r." @@ -452,16 +539,31 @@ "F": "Nr." } ], + "O.K.": [ + { + "F": "O.K." + } + ], "Okt.": [ { "F": "Okt." } ], + "Orig.": [ + { + "F": "Orig." + } + ], "P.S.": [ { "F": "P.S." } ], + "Pkt.": [ + { + "F": "Pkt." + } + ], "Prof.": [ { "F": "Prof." @@ -472,6 +574,11 @@ "F": "R.I.P." } ], + "Red.": [ + { + "F": "Red." + } + ], "S'": [ { "F": "S'", @@ -503,6 +610,41 @@ "F": "St." } ], + "Std.": [ + { + "F": "Std." + } + ], + "Str.": [ + { + "F": "Str." + } + ], + "Tel.": [ + { + "F": "Tel." + } + ], + "Tsd.": [ + { + "F": "Tsd." + } + ], + "U.S.": [ + { + "F": "U.S." + } + ], + "U.S.A.": [ + { + "F": "U.S.A." + } + ], + "U.S.S.": [ + { + "F": "U.S.S." + } + ], "Univ.": [ { "F": "Univ." 
@@ -513,6 +655,30 @@ "F": "V_V" } ], + "Vol.": [ + { + "F": "Vol." + } + ], + "\\\")": [ + { + "F": "\\\")" + } + ], + "\\n": [ + { + "F": "\\n", + "L": "", + "pos": "SP" + } + ], + "\\t": [ + { + "F": "\\t", + "L": "", + "pos": "SP" + } + ], "^_^": [ { "F": "^_^" @@ -528,6 +694,11 @@ "F": "a.D." } ], + "a.M.": [ + { + "F": "a.M." + } + ], "a.Z.": [ { "F": "a.Z." @@ -548,9 +719,15 @@ "F": "al." } ], + "allg.": [ + { + "F": "allg." + } + ], "auf'm": [ { - "F": "auf" + "F": "auf", + "L": "auf" }, { "F": "'m", @@ -572,11 +749,31 @@ "F": "biol." } ], + "bspw.": [ + { + "F": "bspw." + } + ], + "bzgl.": [ + { + "F": "bzgl." + } + ], + "bzw.": [ + { + "F": "bzw." + } + ], "c.": [ { "F": "c." } ], + "ca.": [ + { + "F": "ca." + } + ], "co.": [ { "F": "co." @@ -587,9 +784,20 @@ "F": "d." } ], + "d.h.": [ + { + "F": "d.h." + } + ], + "dgl.": [ + { + "F": "dgl." + } + ], "du's": [ { - "F": "du" + "F": "du", + "L": "du" }, { "F": "'s", @@ -611,19 +819,35 @@ "F": "e.g." } ], + "ebd.": [ + { + "F": "ebd." + } + ], "ehem.": [ { "F": "ehem." } ], + "eigtl.": [ + { + "F": "eigtl." + } + ], "engl.": [ { "F": "engl." } ], + "entspr.": [ + { + "F": "entspr." + } + ], "er's": [ { - "F": "er" + "F": "er", + "L": "er" }, { "F": "'s", @@ -640,11 +864,26 @@ "F": "etc." } ], + "ev.": [ + { + "F": "ev." + } + ], + "evtl.": [ + { + "F": "evtl." + } + ], "f.": [ { "F": "f." } ], + "frz.": [ + { + "F": "frz." + } + ], "g.": [ { "F": "g." @@ -660,6 +899,11 @@ "F": "gegr." } ], + "gem.": [ + { + "F": "gem." + } + ], "ggf.": [ { "F": "ggf." @@ -687,23 +931,39 @@ ], "hinter'm": [ { - "F": "hinter" + "F": "hinter", + "L": "hinter" }, { "F": "'m", "L": "dem" } ], + "hrsg.": [ + { + "F": "hrsg." + } + ], "i.": [ { "F": "i." } ], + "i.A.": [ + { + "F": "i.A." + } + ], "i.G.": [ { "F": "i.G." } ], + "i.O.": [ + { + "F": "i.O." + } + ], "i.Tr.": [ { "F": "i.Tr." @@ -714,6 +974,11 @@ "F": "i.V." } ], + "i.d.R.": [ + { + "F": "i.d.R." + } + ], "i.e.": [ { "F": "i.e." @@ -721,7 +986,8 @@ ], "ich's": [ { - "F": "ich" + "F": "ich", + "L": "ich" }, { "F": "'s", @@ -730,7 +996,8 @@ ], "ihr's": [ { - "F": "ihr" + "F": "ihr", + "L": "ihr" }, { "F": "'s", @@ -757,6 +1024,11 @@ "F": "j." } ], + "jr.": [ + { + "F": "jr." + } + ], "jun.": [ { "F": "jun." @@ -772,11 +1044,21 @@ "F": "k." } ], + "kath.": [ + { + "F": "kath." + } + ], "l.": [ { "F": "l." } ], + "lat.": [ + { + "F": "lat." + } + ], "lt.": [ { "F": "lt." @@ -787,11 +1069,46 @@ "F": "m." } ], + "m.E.": [ + { + "F": "m.E." + } + ], + "m.M.": [ + { + "F": "m.M." + } + ], + "max.": [ + { + "F": "max." + } + ], + "min.": [ + { + "F": "min." + } + ], + "mind.": [ + { + "F": "mind." + } + ], + "mtl.": [ + { + "F": "mtl." + } + ], "n.": [ { "F": "n." } ], + "n.Chr.": [ + { + "F": "n.Chr." + } + ], "nat.": [ { "F": "nat." @@ -807,6 +1124,31 @@ "F": "o.O" } ], + "o.a.": [ + { + "F": "o.a." + } + ], + "o.g.": [ + { + "F": "o.g." + } + ], + "o.k.": [ + { + "F": "o.k." + } + ], + "o.\u00c4.": [ + { + "F": "o.\u00c4." + } + ], + "o.\u00e4.": [ + { + "F": "o.\u00e4." + } + ], "o_O": [ { "F": "o_O" @@ -817,6 +1159,11 @@ "F": "o_o" } ], + "orig.": [ + { + "F": "orig." + } + ], "p.": [ { "F": "p." @@ -827,6 +1174,21 @@ "F": "p.a." } ], + "p.s.": [ + { + "F": "p.s." + } + ], + "pers.": [ + { + "F": "pers." + } + ], + "phil.": [ + { + "F": "phil." + } + ], "q.": [ { "F": "q." @@ -847,6 +1209,11 @@ "F": "rer." } ], + "r\u00f6m.": [ + { + "F": "r\u00f6m." + } + ], "s'": [ { "F": "s'", @@ -858,6 +1225,11 @@ "F": "s." } ], + "s.o.": [ + { + "F": "s.o." + } + ], "sen.": [ { "F": "sen." 
@@ -865,23 +1237,49 @@ ], "sie's": [ { - "F": "sie" + "F": "sie", + "L": "sie" }, { "F": "'s", "L": "es" } ], + "sog.": [ + { + "F": "sog." + } + ], + "std.": [ + { + "F": "std." + } + ], + "stellv.": [ + { + "F": "stellv." + } + ], "t.": [ { "F": "t." } ], + "t\u00e4gl.": [ + { + "F": "t\u00e4gl." + } + ], "u.": [ { "F": "u." } ], + "u.U.": [ + { + "F": "u.U." + } + ], "u.a.": [ { "F": "u.a." @@ -892,28 +1290,75 @@ "F": "u.s.w." } ], + "u.v.m.": [ + { + "F": "u.v.m." + } + ], "unter'm": [ { - "F": "unter" + "F": "unter", + "L": "unter" }, { "F": "'m", "L": "dem" } ], + "usf.": [ + { + "F": "usf." + } + ], + "usw.": [ + { + "F": "usw." + } + ], + "uvm.": [ + { + "F": "uvm." + } + ], "v.": [ { "F": "v." } ], + "v.Chr.": [ + { + "F": "v.Chr." + } + ], + "v.a.": [ + { + "F": "v.a." + } + ], + "v.l.n.r.": [ + { + "F": "v.l.n.r." + } + ], "vgl.": [ { "F": "vgl." } ], + "vllt.": [ + { + "F": "vllt." + } + ], + "vlt.": [ + { + "F": "vlt." + } + ], "vor'm": [ { - "F": "vor" + "F": "vor", + "L": "vor" }, { "F": "'m", @@ -932,13 +1377,19 @@ ], "wir's": [ { - "F": "wir" + "F": "wir", + "L": "wir" }, { "F": "'s", "L": "es" } ], + "wiss.": [ + { + "F": "wiss." + } + ], "x.": [ { "F": "x." @@ -969,19 +1420,60 @@ "F": "z.B." } ], + "z.Bsp.": [ + { + "F": "z.Bsp." + } + ], + "z.T.": [ + { + "F": "z.T." + } + ], "z.Z.": [ { "F": "z.Z." } ], + "z.Zt.": [ + { + "F": "z.Zt." + } + ], + "z.b.": [ + { + "F": "z.b." + } + ], "zzgl.": [ { "F": "zzgl." } ], + "\u00e4.": [ + { + "F": "\u00e4." + } + ], + "\u00f6.": [ + { + "F": "\u00f6." + } + ], + "\u00f6sterr.": [ + { + "F": "\u00f6sterr." + } + ], + "\u00fc.": [ + { + "F": "\u00fc." + } + ], "\u00fcber'm": [ { - "F": "\u00fcber" + "F": "\u00fcber", + "L": "\u00fcber" }, { "F": "'m", diff --git a/lang_data/de/suffix.txt b/lang_data/de/suffix.txt index d8c6bc2c2..aeecb85a2 100644 --- a/lang_data/de/suffix.txt +++ b/lang_data/de/suffix.txt @@ -13,14 +13,61 @@ ; ' ” +“ +« +_ '' 's 'S ’s ’S ’ +‘ +° +€ \.\. \.\.\. \.\.\.\. -(?<=[a-z0-9)\]"'%\)])\. +(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\. +\-\- +´ +(?<=[0-9])km² +(?<=[0-9])m² +(?<=[0-9])cm² +(?<=[0-9])mm² +(?<=[0-9])km³ +(?<=[0-9])m³ +(?<=[0-9])cm³ +(?<=[0-9])mm³ +(?<=[0-9])ha (?<=[0-9])km +(?<=[0-9])m +(?<=[0-9])cm +(?<=[0-9])mm +(?<=[0-9])µm +(?<=[0-9])nm +(?<=[0-9])yd +(?<=[0-9])in +(?<=[0-9])ft +(?<=[0-9])kg +(?<=[0-9])g +(?<=[0-9])mg +(?<=[0-9])µg +(?<=[0-9])t +(?<=[0-9])lb +(?<=[0-9])oz +(?<=[0-9])m/s +(?<=[0-9])km/h +(?<=[0-9])mph +(?<=[0-9])°C +(?<=[0-9])°K +(?<=[0-9])°F +(?<=[0-9])hPa +(?<=[0-9])Pa +(?<=[0-9])mbar +(?<=[0-9])mb +(?<=[0-9])T +(?<=[0-9])G +(?<=[0-9])M +(?<=[0-9])K +(?<=[0-9])kb diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 11a6a2005..26f8fd3e5 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -153,10 +153,8 @@ cdef class Tagger: @classmethod def from_package(cls, pkg, vocab): # TODO: templates.json deprecated? 
not present in latest package - templates = cls.default_templates() - # templates = package.load_utf8(json.load, - # 'pos', 'templates.json', - # default=cls.default_templates()) + # templates = cls.default_templates() + templates = pkg.load_json(('pos', 'templates.json'), default=cls.default_templates()) model = TaggerModel(templates) if pkg.has_file('pos', 'model'): @@ -203,7 +201,7 @@ cdef class Tagger: nr_class=self.vocab.morphology.n_tags, nr_feat=self.model.nr_feat) for i in range(tokens.length): - if tokens.c[i].pos == 0: + if tokens.c[i].pos == 0: self.model.set_featuresC(&eg.c, tokens.c, i) self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat) @@ -221,7 +219,7 @@ cdef class Tagger: def train(self, Doc tokens, object gold_tag_strs): assert len(tokens) == len(gold_tag_strs) for tag in gold_tag_strs: - if tag not in self.tag_names: + if tag != None and tag not in self.tag_names: msg = ("Unrecognized gold tag: %s. tag_map.json must contain all" "gold tags, to maintain coarse-grained mapping.") raise ValueError(msg % tag) @@ -234,10 +232,9 @@ cdef class Tagger: nr_feat=self.model.nr_feat) for i in range(tokens.length): self.model.set_featuresC(&eg.c, tokens.c, i) - eg.set_label(golds[i]) + eg.costs = [ 1 if golds[i] not in (c, -1) else 0 for c in xrange(eg.nr_class) ] self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat) - self.model.updateC(&eg.c) self.vocab.morphology.assign_tag(&tokens.c[i], eg.guess) From 8d531c958b8ac6d58e527162c10bca8e5885d916 Mon Sep 17 00:00:00 2001 From: Wolfgang Seeker Date: Mon, 22 Feb 2016 14:40:40 +0100 Subject: [PATCH 2/6] replace tests for non-projectivity - add functions to find non-projective edges - add test file for non-projectivity functions --- spacy/gold.pyx | 71 ++++++++++++++----------------------- spacy/nonproj.py | 55 ++++++++++++++++++++++++++++ spacy/tagger.pyx | 5 +++ spacy/tests/test_nonproj.py | 42 ++++++++++++++++++++++ 4 files changed, 128 insertions(+), 45 deletions(-) create mode 100644 spacy/nonproj.py create mode 100644 spacy/tests/test_nonproj.py diff --git a/spacy/gold.pyx b/spacy/gold.pyx index d8b100744..dd29a42c7 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -14,6 +14,8 @@ try: except ImportError: import json +import nonproj + def tags_to_entities(tags): entities = [] @@ -236,34 +238,20 @@ cdef class GoldParse: self.heads[i] = self.gold_to_cand[annot_tuples[3][gold_i]] self.labels[i] = annot_tuples[4][gold_i] self.ner[i] = annot_tuples[5][gold_i] - - # If we have any non-projective arcs, i.e. crossing brackets, consider - # the heads for those words missing in the gold-standard. - # This way, we can train from these sentences - cdef int w1, w2, h1, h2 - if make_projective: - heads = list(self.heads) - for w1 in range(self.length): - if heads[w1] is not None: - h1 = heads[w1] - for w2 in range(w1+1, self.length): - if heads[w2] is not None: - h2 = heads[w2] - if _arcs_cross(w1, h1, w2, h2): - self.heads[w1] = None - self.labels[w1] = '' - self.heads[w2] = None - self.labels[w2] = '' - # Check there are no cycles in the dependencies, i.e. 
we are a tree - for w in range(self.length): - seen = set([w]) - head = w - while self.heads[head] != head and self.heads[head] != None: - head = self.heads[head] - if head in seen: - raise Exception("Cycle found: %s" % seen) - seen.add(head) + cycle = nonproj.contains_cycle(self.heads) + if cycle != None: + raise Exception("Cycle found: %s" % cycle) + + if make_projective: + # projectivity here means non-proj arcs are being disconnected + np_arcs = [] + for word in range(self.length): + if nonproj.is_non_projective_arc(word,self.heads): + np_arcs.append(word) + for np_arc in np_arcs: + self.heads[np_arc] = None + self.labels[np_arc] = '' self.brackets = {} for (gold_start, gold_end, label_str) in brackets: @@ -278,25 +266,18 @@ cdef class GoldParse: @property def is_projective(self): - heads = list(self.heads) - for w1 in range(self.length): - if heads[w1] is not None: - h1 = heads[w1] - for w2 in range(self.length): - if heads[w2] is not None and _arcs_cross(w1, h1, w2, heads[w2]): - return False - return True - - -cdef int _arcs_cross(int w1, int h1, int w2, int h2) except -1: - if w1 > h1: - w1, h1 = h1, w1 - if w2 > h2: - w2, h2 = h2, w2 - if w1 > w2: - w1, h1, w2, h2 = w2, h2, w1, h1 - return w1 < w2 < h1 < h2 or w1 < w2 == h2 < h1 + return not nonproj.is_non_projective_tree(self.heads) def is_punct_label(label): return label == 'P' or label.lower() == 'punct' + + + + + + + + + + diff --git a/spacy/nonproj.py b/spacy/nonproj.py new file mode 100644 index 000000000..58f9f3e9b --- /dev/null +++ b/spacy/nonproj.py @@ -0,0 +1,55 @@ + + +def ancestors(word, heads): + # returns all words going from the word up the path to the root + # the path to root cannot be longer than the number of words in the sentence + # this function ends after at most len(heads) steps + # because it would otherwise loop indefinitely on cycles + head = word + cnt = 0 + while heads[head] != head and cnt < len(heads): + head = heads[head] + cnt += 1 + yield head + if head == None: + break + + +def contains_cycle(heads): + # in an acyclic tree, the path from each word following + # the head relation upwards always ends at the root node + for word in range(len(heads)): + seen = set([word]) + for ancestor in ancestors(word,heads): + if ancestor in seen: + return seen + seen.add(ancestor) + return None + + +def is_non_projective_arc(word, heads): + # definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective + # if there is a word k, h < k < d such that h is not + # an ancestor of k. 
Same for h -> d, h > d + head = heads[word] + if head == word: # root arcs cannot be non-projective + return False + elif head == None: # unattached tokens cannot be non-projective + return False + + start, end = (head+1, word) if head < word else (word+1, head) + for k in range(start,end): + for ancestor in ancestors(k,heads): + if ancestor == None: # for unattached tokens/subtrees + break + elif ancestor == head: # normal case: k dominated by h + break + else: # head not in ancestors: d -> h is non-projective + return True + return False + + +def is_non_projective_tree(heads): + # a tree is non-projective if at least one arc is non-projective + return any( is_non_projective_arc(word,heads) for word in range(len(heads)) ) + diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 26f8fd3e5..1c5baced7 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -211,6 +211,11 @@ cdef class Tagger: tokens.is_tagged = True tokens._py_tokens = [None] * tokens.length + def tags_from_list(self, Doc tokens, list strings): + assert(tokens.length == len(strings)) + for i in range(tokens.length): + self.vocab.morphology.assign_tag(&tokens.c[i], strings[i]) + def pipe(self, stream, batch_size=1000, n_threads=2): for doc in stream: self(doc) diff --git a/spacy/tests/test_nonproj.py b/spacy/tests/test_nonproj.py new file mode 100644 index 000000000..bd7f12bff --- /dev/null +++ b/spacy/tests/test_nonproj.py @@ -0,0 +1,42 @@ +from __future__ import unicode_literals +import pytest + +from spacy.nonproj import ancestors, contains_cycle, is_non_projective_arc, is_non_projective_tree + +def test_ancestors(): + tree = [1,2,2,4,5,2,2] + cyclic_tree = [1,2,2,4,5,3,2] + partial_tree = [1,2,2,4,5,None,2] + assert([ a for a in ancestors(3,tree) ] == [4,5,2]) + assert([ a for a in ancestors(3,cyclic_tree) ] == [4,5,3,4,5,3,4]) + assert([ a for a in ancestors(3,partial_tree) ] == [4,5,None]) + +def test_contains_cycle(): + tree = [1,2,2,4,5,2,2] + cyclic_tree = [1,2,2,4,5,3,2] + partial_tree = [1,2,2,4,5,None,2] + assert(contains_cycle(tree) == None) + assert(contains_cycle(cyclic_tree) == set([3,4,5])) + assert(contains_cycle(partial_tree) == None) + +def test_is_non_projective_arc(): + nonproj_tree = [1,2,2,4,5,2,7,4,2] + assert(is_non_projective_arc(0,nonproj_tree) == False) + assert(is_non_projective_arc(1,nonproj_tree) == False) + assert(is_non_projective_arc(2,nonproj_tree) == False) + assert(is_non_projective_arc(3,nonproj_tree) == False) + assert(is_non_projective_arc(4,nonproj_tree) == False) + assert(is_non_projective_arc(5,nonproj_tree) == False) + assert(is_non_projective_arc(6,nonproj_tree) == False) + assert(is_non_projective_arc(7,nonproj_tree) == True) + assert(is_non_projective_arc(8,nonproj_tree) == False) + partial_tree = [1,2,2,4,5,None,7,4,2] + assert(is_non_projective_arc(7,partial_tree) == False) + +def test_is_non_projective_tree(): + proj_tree = [1,2,2,4,5,2,7,5,2] + nonproj_tree = [1,2,2,4,5,2,7,4,2] + partial_tree = [1,2,2,4,5,None,7,4,2] + assert(is_non_projective_tree(proj_tree) == False) + assert(is_non_projective_tree(nonproj_tree) == True) + assert(is_non_projective_tree(partial_tree) == False) From 4b2297d5d43d9148779dc54b446264cfd5c692db Mon Sep 17 00:00:00 2001 From: Wolfgang Seeker Date: Wed, 24 Feb 2016 11:26:25 +0100 Subject: [PATCH 3/6] add class PseudoProjective for pseudo-projective parsing PseudoProjective() implements the algorithm from Nivre & Nilsson 2005 using their HEAD decoration scheme. 
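
As a quick illustration, a sketch of the projectivize/deprojectivize round trip using the example tree from the test file added in this commit (the import path matches this commit; a later commit in this series moves the module to spacy.syntax.nonproj):

from spacy.nonproj import PseudoProjective

# token 7 attaches non-projectively to head 4 with label 'RC'
heads  = [1, 2, 2, 4, 5, 2, 7, 4, 2]
labels = ['NK', 'SB', 'ROOT', 'NK', 'OA', 'OC', 'SB', 'RC', '--']

pp = PseudoProjective()
proj_heads, deco_labels = pp.projectivize(heads, labels)
# the non-projective arc is lifted to the grandparent (head 4 -> 5) and the
# original head's label is recorded after the '||' separator:
# proj_heads  == [1, 2, 2, 4, 5, 2, 7, 5, 2]
# deco_labels == ['NK', 'SB', 'ROOT', 'NK', 'OA', 'OC', 'SB', 'RC||OA', '--']

deproj_heads, undeco_labels = pp.deprojectivize(proj_heads, deco_labels)
# searching breadth-first below the lifted head for a token labelled 'OA'
# restores the original attachment:
# deproj_heads  == [1, 2, 2, 4, 5, 2, 7, 4, 2]
# undeco_labels == labels
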
--- spacy/gold.pyx | 4 +- spacy/nonproj.py | 152 +++++++++++++++++++++++++++++++++--- spacy/tagger.pyx | 5 -- spacy/tests/test_nonproj.py | 103 ++++++++++++++++++++---- 4 files changed, 228 insertions(+), 36 deletions(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index dd29a42c7..7ab034195 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -247,7 +247,7 @@ cdef class GoldParse: # projectivity here means non-proj arcs are being disconnected np_arcs = [] for word in range(self.length): - if nonproj.is_non_projective_arc(word,self.heads): + if nonproj.is_nonproj_arc(word,self.heads): np_arcs.append(word) for np_arc in np_arcs: self.heads[np_arc] = None @@ -266,7 +266,7 @@ cdef class GoldParse: @property def is_projective(self): - return not nonproj.is_non_projective_tree(self.heads) + return not nonproj.is_nonproj_tree(self.heads) def is_punct_label(label): diff --git a/spacy/nonproj.py b/spacy/nonproj.py index 58f9f3e9b..facf9f299 100644 --- a/spacy/nonproj.py +++ b/spacy/nonproj.py @@ -1,11 +1,12 @@ +from copy import copy +from collections import Counter - -def ancestors(word, heads): +def ancestors(tokenid, heads): # returns all words going from the word up the path to the root # the path to root cannot be longer than the number of words in the sentence # this function ends after at most len(heads) steps # because it would otherwise loop indefinitely on cycles - head = word + head = tokenid cnt = 0 while heads[head] != head and cnt < len(heads): head = heads[head] @@ -18,26 +19,26 @@ def ancestors(word, heads): def contains_cycle(heads): # in an acyclic tree, the path from each word following # the head relation upwards always ends at the root node - for word in range(len(heads)): - seen = set([word]) - for ancestor in ancestors(word,heads): + for tokenid in range(len(heads)): + seen = set([tokenid]) + for ancestor in ancestors(tokenid,heads): if ancestor in seen: return seen seen.add(ancestor) return None -def is_non_projective_arc(word, heads): +def is_nonproj_arc(tokenid, heads): # definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective - # if there is a word k, h < k < d such that h is not + # if there is a token k, h < k < d such that h is not # an ancestor of k. 
Same for h -> d, h > d - head = heads[word] - if head == word: # root arcs cannot be non-projective + head = heads[tokenid] + if head == tokenid: # root arcs cannot be non-projective return False elif head == None: # unattached tokens cannot be non-projective return False - start, end = (head+1, word) if head < word else (word+1, head) + start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head) for k in range(start,end): for ancestor in ancestors(k,heads): if ancestor == None: # for unattached tokens/subtrees @@ -49,7 +50,132 @@ def is_non_projective_arc(word, heads): return False -def is_non_projective_tree(heads): +def is_nonproj_tree(heads): # a tree is non-projective if at least one arc is non-projective - return any( is_non_projective_arc(word,heads) for word in range(len(heads)) ) + return any( is_nonproj_arc(word,heads) for word in range(len(heads)) ) + +class PseudoProjective: + # implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005 + # for doing pseudo-projective parsing + # implementation uses the HEAD decoration scheme + + def preprocess_training_data(self, labeled_trees, label_freq_cutoff=30): + # expects a sequence of pairs of head arrays and labels + preprocessed = [] + for heads,labels in labeled_trees: + proj_heads,deco_labels = self.projectivize(heads,labels) + # set the label to ROOT for each root dependent + deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ] + preprocessed.append((proj_heads,deco_labels)) + + if label_freq_cutoff > 0: + return self._filter_labels(preprocessed,label_freq_cutoff) + return preprocessed + + + def projectivize(self, heads, labels): + # use the algorithm by Nivre & Nilsson 2005 + # assumes heads to be a proper tree, i.e. connected and cycle-free + # returns a new pair (heads,labels) which encode + # a projective and decorated tree + proj_heads = copy(heads) + smallest_np_arc = self._get_smallest_nonproj_arc(proj_heads) + if smallest_np_arc == None: # this sentence is already projective + return proj_heads, copy(labels) + while smallest_np_arc != None: + self._lift(smallest_np_arc, proj_heads) + smallest_np_arc = self._get_smallest_nonproj_arc(proj_heads) + deco_labels = self._decorate(heads, proj_heads, labels) + return proj_heads, deco_labels + + + def deprojectivize(self, heads, labels): + # reattach arcs with decorated labels (following HEAD scheme) + # for each decorated arc X||Y, search top-down, left-to-right, + # breadth-first until hitting a Y then make this the new head + newheads, newlabels = copy(heads), copy(labels) + spans = None + for tokenid, head in enumerate(heads): + if labels[tokenid].find('||') != -1: + newlabel,_,headlabel = labels[tokenid].partition('||') + newhead = self._find_new_head(head,tokenid,headlabel,heads,labels,spans=spans) + newheads[tokenid] = newhead + newlabels[tokenid] = newlabel + return newheads, newlabels + + + def _decorate(self, heads, proj_heads, labels): + # uses decoration scheme HEAD from Nivre & Nilsson 2005 + assert(len(heads) == len(proj_heads) == len(labels)) + deco_labels = [] + for tokenid,head in enumerate(heads): + if head != proj_heads[tokenid]: + deco_labels.append('%s||%s' % (labels[tokenid],labels[head])) + else: + deco_labels.append(labels[tokenid]) + return deco_labels + + + def _get_smallest_nonproj_arc(self, heads): + # return the smallest non-proj arc or None + # where size is defined as the distance between dep and head + # and ties are broken left to right + smallest_size = float('inf') + 
smallest_np_arc = None + for tokenid,head in enumerate(heads): + size = abs(tokenid-head) + if size < smallest_size and is_nonproj_arc(tokenid,heads): + smallest_size = size + smallest_np_arc = tokenid + return smallest_np_arc + + + def _lift(self, tokenid, heads): + # reattaches a word to it's grandfather + head = heads[tokenid] + ghead = heads[head] + # attach to ghead if head isn't attached to root else attach to root + heads[tokenid] = ghead if head != ghead else tokenid + + + def _find_new_head(self, rootid, tokenid, headlabel, heads, labels, spans=None): + # search through the tree starting from root + # returns the id of the first descendant with the given label + # if there is none, return the current head (no change) + if not spans: + spans = self._make_span_index(heads) + queue = spans.get(rootid,[]) + queue.remove(tokenid) # don't search in the subtree of the nonproj arc + while queue: + next_queue = [] + for idx in queue: + if labels[idx] == headlabel: + return idx + next_queue.extend(spans.get(idx,[])) + queue = next_queue + return heads[tokenid] + + + def _make_span_index(self, heads): + # stores the direct dependents for each token + # for searching top-down through a tree + spans = {} + for tokenid, head in enumerate(heads): + if tokenid == head: # root + continue + if head not in spans: + spans[head] = [] + spans[head].append(tokenid) + return spans + + + def _filter_labels(self, labeled_trees, cutoff): + # throw away infrequent decorated labels + # can't learn them reliably anyway and keeps label set smaller + freqs = Counter([ label for _,labels in labeled_trees for label in labels if label.find('||') != -1 ]) + filtered = [] + for proj_heads,deco_labels in labeled_trees: + filtered_labels = [ label.partition('||')[0] if freqs.get(label,cutoff) < cutoff else label for label in deco_labels ] + filtered.append((proj_heads,filtered_labels)) + return filtered diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 1c5baced7..26f8fd3e5 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -211,11 +211,6 @@ cdef class Tagger: tokens.is_tagged = True tokens._py_tokens = [None] * tokens.length - def tags_from_list(self, Doc tokens, list strings): - assert(tokens.length == len(strings)) - for i in range(tokens.length): - self.vocab.morphology.assign_tag(&tokens.c[i], strings[i]) - def pipe(self, stream, batch_size=1000, n_threads=2): for doc in stream: self(doc) diff --git a/spacy/tests/test_nonproj.py b/spacy/tests/test_nonproj.py index bd7f12bff..d5290e342 100644 --- a/spacy/tests/test_nonproj.py +++ b/spacy/tests/test_nonproj.py @@ -1,42 +1,113 @@ from __future__ import unicode_literals import pytest -from spacy.nonproj import ancestors, contains_cycle, is_non_projective_arc, is_non_projective_tree +from spacy.nonproj import ancestors, contains_cycle, is_nonproj_arc, is_nonproj_tree, PseudoProjective def test_ancestors(): tree = [1,2,2,4,5,2,2] cyclic_tree = [1,2,2,4,5,3,2] partial_tree = [1,2,2,4,5,None,2] + multirooted_tree = [3,2,0,3,3,7,7,3,7,10,7,10,11,12,18,16,18,17,12,3] assert([ a for a in ancestors(3,tree) ] == [4,5,2]) assert([ a for a in ancestors(3,cyclic_tree) ] == [4,5,3,4,5,3,4]) assert([ a for a in ancestors(3,partial_tree) ] == [4,5,None]) + assert([ a for a in ancestors(17,multirooted_tree) ] == []) def test_contains_cycle(): tree = [1,2,2,4,5,2,2] cyclic_tree = [1,2,2,4,5,3,2] partial_tree = [1,2,2,4,5,None,2] + multirooted_tree = [3,2,0,3,3,7,7,3,7,10,7,10,11,12,18,16,18,17,12,3] assert(contains_cycle(tree) == None) assert(contains_cycle(cyclic_tree) == 
set([3,4,5])) assert(contains_cycle(partial_tree) == None) + assert(contains_cycle(multirooted_tree) == None) -def test_is_non_projective_arc(): +def test_is_nonproj_arc(): nonproj_tree = [1,2,2,4,5,2,7,4,2] - assert(is_non_projective_arc(0,nonproj_tree) == False) - assert(is_non_projective_arc(1,nonproj_tree) == False) - assert(is_non_projective_arc(2,nonproj_tree) == False) - assert(is_non_projective_arc(3,nonproj_tree) == False) - assert(is_non_projective_arc(4,nonproj_tree) == False) - assert(is_non_projective_arc(5,nonproj_tree) == False) - assert(is_non_projective_arc(6,nonproj_tree) == False) - assert(is_non_projective_arc(7,nonproj_tree) == True) - assert(is_non_projective_arc(8,nonproj_tree) == False) partial_tree = [1,2,2,4,5,None,7,4,2] - assert(is_non_projective_arc(7,partial_tree) == False) + multirooted_tree = [3,2,0,3,3,7,7,3,7,10,7,10,11,12,18,16,18,17,12,3] + assert(is_nonproj_arc(0,nonproj_tree) == False) + assert(is_nonproj_arc(1,nonproj_tree) == False) + assert(is_nonproj_arc(2,nonproj_tree) == False) + assert(is_nonproj_arc(3,nonproj_tree) == False) + assert(is_nonproj_arc(4,nonproj_tree) == False) + assert(is_nonproj_arc(5,nonproj_tree) == False) + assert(is_nonproj_arc(6,nonproj_tree) == False) + assert(is_nonproj_arc(7,nonproj_tree) == True) + assert(is_nonproj_arc(8,nonproj_tree) == False) + assert(is_nonproj_arc(7,partial_tree) == False) + assert(is_nonproj_arc(17,multirooted_tree) == False) + assert(is_nonproj_arc(16,multirooted_tree) == True) -def test_is_non_projective_tree(): +def test_is_nonproj_tree(): proj_tree = [1,2,2,4,5,2,7,5,2] nonproj_tree = [1,2,2,4,5,2,7,4,2] partial_tree = [1,2,2,4,5,None,7,4,2] - assert(is_non_projective_tree(proj_tree) == False) - assert(is_non_projective_tree(nonproj_tree) == True) - assert(is_non_projective_tree(partial_tree) == False) + multirooted_tree = [3,2,0,3,3,7,7,3,7,10,7,10,11,12,18,16,18,17,12,3] + assert(is_nonproj_tree(proj_tree) == False) + assert(is_nonproj_tree(nonproj_tree) == True) + assert(is_nonproj_tree(partial_tree) == False) + assert(is_nonproj_tree(multirooted_tree) == True) + +def test_pseudoprojective(): + tree = [1,2,2] + nonproj_tree = [1,2,2,4,5,2,7,4,2] + labels = ['NK','SB','ROOT','NK','OA','OC','SB','RC','--'] + nonproj_tree2 = [9,1,3,1,5,6,9,8,6,1,6,12,13,10,1] + labels2 = ['MO','ROOT','NK','SB','MO','NK','OA','NK','AG','OC','MNR','MO','NK','NK','--'] + + pp = PseudoProjective() + + assert(pp._make_span_index(tree) == { 1:[0], 2:[1] }) + assert(pp._make_span_index(nonproj_tree) == { 1:[0], 2:[1,5,8], 4:[3,7], 5:[4], 7:[6] }) + + pp._lift(0,tree) + assert(tree == [2,2,2]) + + np_arc = pp._get_smallest_nonproj_arc(nonproj_tree) + assert(np_arc == 7) + + np_arc = pp._get_smallest_nonproj_arc(nonproj_tree2) + assert(np_arc == 10) + + proj_heads, deco_labels = pp.projectivize(nonproj_tree,labels) + assert(proj_heads == [1,2,2,4,5,2,7,5,2]) + assert(deco_labels == ['NK','SB','ROOT','NK','OA','OC','SB','RC||OA','--']) + deproj_heads, undeco_labels = pp.deprojectivize(proj_heads,deco_labels) + assert(deproj_heads == nonproj_tree) + assert(undeco_labels == labels) + + proj_heads, deco_labels = pp.projectivize(nonproj_tree2,labels2) + assert(proj_heads == [1,1,3,1,5,6,9,8,6,1,9,12,13,10,1]) + assert(deco_labels == ['MO||OC','ROOT','NK','SB','MO','NK','OA','NK','AG','OC','MNR||OA','MO','NK','NK','--']) + deproj_heads, undeco_labels = pp.deprojectivize(proj_heads,deco_labels) + assert(deproj_heads == nonproj_tree2) + assert(undeco_labels == labels2) + + # if decoration is wrong such that there is no head 
with the desired label + # the structure is kept and the label is undecorated + deproj_heads, undeco_labels = pp.deprojectivize([1,2,2,4,5,2,7,5,2],['NK','SB','ROOT','NK','OA','OC','SB','RC||DA','--']) + assert(deproj_heads == [1,2,2,4,5,2,7,5,2]) + assert(undeco_labels == ['NK','SB','ROOT','NK','OA','OC','SB','RC','--']) + + # if there are two potential new heads, the first one is chosen even if it's wrong + deproj_heads, undeco_labels = pp.deprojectivize([1,1,3,1,5,6,9,8,6,1,9,12,13,10,1], \ + ['MO||OC','ROOT','NK','OC','MO','NK','OA','NK','AG','OC','MNR||OA','MO','NK','NK','--']) + assert(deproj_heads == [3,1,3,1,5,6,9,8,6,1,6,12,13,10,1]) + assert(undeco_labels == ['MO','ROOT','NK','OC','MO','NK','OA','NK','AG','OC','MNR','MO','NK','NK','--']) + + + + + + + + + + + + + + + From 56b7210e824da89012105dedf4c2d335c7d97005 Mon Sep 17 00:00:00 2001 From: Wolfgang Seeker Date: Thu, 25 Feb 2016 15:08:49 +0100 Subject: [PATCH 4/6] moved nonproj.py to syntax/nonproj.pyx --- spacy/{nonproj.py => syntax/nonproj.pyx} | 0 spacy/syntax/parser.pyx | 1 - 2 files changed, 1 deletion(-) rename spacy/{nonproj.py => syntax/nonproj.pyx} (100%) diff --git a/spacy/nonproj.py b/spacy/syntax/nonproj.pyx similarity index 100% rename from spacy/nonproj.py rename to spacy/syntax/nonproj.pyx diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 6c77c6c96..7a352c0d9 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -12,7 +12,6 @@ from cpython.exc cimport PyErr_CheckSignals from libc.stdint cimport uint32_t, uint64_t from libc.string cimport memset, memcpy from libc.stdlib cimport malloc, calloc, free -import random import os.path from os import path import shutil From 3448cb40a4daa5128a8590e06087d44dda25e046 Mon Sep 17 00:00:00 2001 From: Wolfgang Seeker Date: Tue, 1 Mar 2016 10:09:08 +0100 Subject: [PATCH 5/6] integrated pseudo-projective parsing into parser - nonproj.pyx holds a class PseudoProjectivity which currently holds all functionality to implement Nivre & Nilsson 2005's pseudo-projective parsing using the HEAD decoration scheme - changed lefts/rights in Token to account for possible non-projective structures --- setup.py | 1 + spacy/gold.pyx | 2 +- spacy/syntax/nonproj.pxd | 0 spacy/syntax/nonproj.pyx | 131 +++++++++++++---------- spacy/syntax/parser.pxd | 1 + spacy/syntax/parser.pyx | 9 +- spacy/tests/{ => parser}/test_nonproj.py | 53 +++++---- spacy/tokens/token.pyx | 24 +---- 8 files changed, 120 insertions(+), 101 deletions(-) create mode 100644 spacy/syntax/nonproj.pxd rename spacy/tests/{ => parser}/test_nonproj.py (60%) diff --git a/setup.py b/setup.py index 5c6cbbf01..d2a62dc90 100644 --- a/setup.py +++ b/setup.py @@ -47,6 +47,7 @@ MOD_NAMES = [ 'spacy.syntax._state', 'spacy.tokenizer', 'spacy.syntax.parser', + 'spacy.syntax.nonproj', 'spacy.syntax.transition_system', 'spacy.syntax.arc_eager', 'spacy.syntax._parse_features', diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 7ab034195..5c7326d12 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -14,7 +14,7 @@ try: except ImportError: import json -import nonproj +from .syntax import nonproj def tags_to_entities(tags): diff --git a/spacy/syntax/nonproj.pxd b/spacy/syntax/nonproj.pxd new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx index facf9f299..dbc5555c3 100644 --- a/spacy/syntax/nonproj.pyx +++ b/spacy/syntax/nonproj.pyx @@ -1,6 +1,10 @@ from copy import copy from collections import Counter +from ..tokens.doc cimport Doc +from spacy.attrs import 
DEP, HEAD + + def ancestors(tokenid, heads): # returns all words going from the word up the path to the root # the path to root cannot be longer than the number of words in the sentence @@ -55,69 +59,90 @@ def is_nonproj_tree(heads): return any( is_nonproj_arc(word,heads) for word in range(len(heads)) ) -class PseudoProjective: +cdef class PseudoProjectivity: # implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005 # for doing pseudo-projective parsing # implementation uses the HEAD decoration scheme - def preprocess_training_data(self, labeled_trees, label_freq_cutoff=30): - # expects a sequence of pairs of head arrays and labels + delimiter = '||' + + @classmethod + def decompose(cls, label): + return label.partition(cls.delimiter)[::2] + + @classmethod + def is_decorated(cls, label): + return label.find(cls.delimiter) != -1 + + @classmethod + def preprocess_training_data(cls, gold_tuples, label_freq_cutoff=30): preprocessed = [] - for heads,labels in labeled_trees: - proj_heads,deco_labels = self.projectivize(heads,labels) - # set the label to ROOT for each root dependent - deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ] - preprocessed.append((proj_heads,deco_labels)) + freqs = Counter() + for raw_text, sents in gold_tuples: + prepro_sents = [] + for (ids, words, tags, heads, labels, iob), ctnts in sents: + proj_heads,deco_labels = cls.projectivize(heads,labels) + # set the label to ROOT for each root dependent + deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ] + # count label frequencies + if label_freq_cutoff > 0: + freqs.update( label for label in deco_labels if cls.is_decorated(label) ) + prepro_sents.append(((ids,words,tags,proj_heads,deco_labels,iob), ctnts)) + preprocessed.append((raw_text, prepro_sents)) if label_freq_cutoff > 0: - return self._filter_labels(preprocessed,label_freq_cutoff) + return cls._filter_labels(preprocessed,label_freq_cutoff,freqs) return preprocessed - def projectivize(self, heads, labels): + @classmethod + def projectivize(cls, heads, labels): # use the algorithm by Nivre & Nilsson 2005 # assumes heads to be a proper tree, i.e. 
connected and cycle-free # returns a new pair (heads,labels) which encode # a projective and decorated tree proj_heads = copy(heads) - smallest_np_arc = self._get_smallest_nonproj_arc(proj_heads) + smallest_np_arc = cls._get_smallest_nonproj_arc(proj_heads) if smallest_np_arc == None: # this sentence is already projective return proj_heads, copy(labels) while smallest_np_arc != None: - self._lift(smallest_np_arc, proj_heads) - smallest_np_arc = self._get_smallest_nonproj_arc(proj_heads) - deco_labels = self._decorate(heads, proj_heads, labels) + cls._lift(smallest_np_arc, proj_heads) + smallest_np_arc = cls._get_smallest_nonproj_arc(proj_heads) + deco_labels = cls._decorate(heads, proj_heads, labels) return proj_heads, deco_labels - def deprojectivize(self, heads, labels): + @classmethod + def deprojectivize(cls, Doc tokens): # reattach arcs with decorated labels (following HEAD scheme) # for each decorated arc X||Y, search top-down, left-to-right, # breadth-first until hitting a Y then make this the new head - newheads, newlabels = copy(heads), copy(labels) - spans = None - for tokenid, head in enumerate(heads): - if labels[tokenid].find('||') != -1: - newlabel,_,headlabel = labels[tokenid].partition('||') - newhead = self._find_new_head(head,tokenid,headlabel,heads,labels,spans=spans) - newheads[tokenid] = newhead - newlabels[tokenid] = newlabel - return newheads, newlabels + parse = tokens.to_array([HEAD, DEP]) + labels = [ tokens.vocab.strings[int(p[1])] for p in parse ] + for token in tokens: + if cls.is_decorated(token.dep_): + newlabel,headlabel = cls.decompose(token.dep_) + newhead = cls._find_new_head(token,headlabel) + parse[token.i,1] = tokens.vocab.strings[newlabel] + parse[token.i,0] = newhead.i - token.i + tokens.from_array([HEAD, DEP],parse) - def _decorate(self, heads, proj_heads, labels): + @classmethod + def _decorate(cls, heads, proj_heads, labels): # uses decoration scheme HEAD from Nivre & Nilsson 2005 assert(len(heads) == len(proj_heads) == len(labels)) deco_labels = [] for tokenid,head in enumerate(heads): if head != proj_heads[tokenid]: - deco_labels.append('%s||%s' % (labels[tokenid],labels[head])) + deco_labels.append('%s%s%s' % (labels[tokenid],cls.delimiter,labels[head])) else: deco_labels.append(labels[tokenid]) return deco_labels - def _get_smallest_nonproj_arc(self, heads): + @classmethod + def _get_smallest_nonproj_arc(cls, heads): # return the smallest non-proj arc or None # where size is defined as the distance between dep and head # and ties are broken left to right @@ -131,7 +156,8 @@ class PseudoProjective: return smallest_np_arc - def _lift(self, tokenid, heads): + @classmethod + def _lift(cls, tokenid, heads): # reattaches a word to it's grandfather head = heads[tokenid] ghead = heads[head] @@ -139,43 +165,36 @@ class PseudoProjective: heads[tokenid] = ghead if head != ghead else tokenid - def _find_new_head(self, rootid, tokenid, headlabel, heads, labels, spans=None): + @classmethod + def _find_new_head(cls, token, headlabel): # search through the tree starting from root # returns the id of the first descendant with the given label # if there is none, return the current head (no change) - if not spans: - spans = self._make_span_index(heads) - queue = spans.get(rootid,[]) - queue.remove(tokenid) # don't search in the subtree of the nonproj arc + queue = [token.head] while queue: next_queue = [] - for idx in queue: - if labels[idx] == headlabel: - return idx - next_queue.extend(spans.get(idx,[])) + for qtoken in queue: + for child in qtoken.children: + 
if child == token: + continue + if child.dep_ == headlabel: + return child + next_queue.append(child) queue = next_queue - return heads[tokenid] + return token.head - def _make_span_index(self, heads): - # stores the direct dependents for each token - # for searching top-down through a tree - spans = {} - for tokenid, head in enumerate(heads): - if tokenid == head: # root - continue - if head not in spans: - spans[head] = [] - spans[head].append(tokenid) - return spans - - - def _filter_labels(self, labeled_trees, cutoff): + @classmethod + def _filter_labels(cls, gold_tuples, cutoff, freqs): # throw away infrequent decorated labels # can't learn them reliably anyway and keeps label set smaller - freqs = Counter([ label for _,labels in labeled_trees for label in labels if label.find('||') != -1 ]) filtered = [] - for proj_heads,deco_labels in labeled_trees: - filtered_labels = [ label.partition('||')[0] if freqs.get(label,cutoff) < cutoff else label for label in deco_labels ] - filtered.append((proj_heads,filtered_labels)) + for raw_text, sents in gold_tuples: + filtered_sents = [] + for (ids, words, tags, heads, labels, iob), ctnts in sents: + filtered_labels = [ cls.decompose(label)[0] if freqs.get(label,cutoff) < cutoff else label for label in labels ] + filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts)) + filtered.append((raw_text, filtered_sents)) return filtered + + diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd index 77ea376a1..e10049fb6 100644 --- a/spacy/syntax/parser.pxd +++ b/spacy/syntax/parser.pxd @@ -15,5 +15,6 @@ cdef class ParserModel(AveragedPerceptron): cdef class Parser: cdef readonly ParserModel model cdef readonly TransitionSystem moves + cdef int _projectivize cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 7a352c0d9..2e2f009fd 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -17,6 +17,7 @@ from os import path import shutil import json import sys +from .nonproj import PseudoProjectivity from cymem.cymem cimport Pool, Address from murmurhash.mrmr cimport hash64 @@ -78,9 +79,10 @@ cdef class ParserModel(AveragedPerceptron): cdef class Parser: - def __init__(self, StringStore strings, transition_system, ParserModel model): + def __init__(self, StringStore strings, transition_system, ParserModel model, int projectivize = 0): self.moves = transition_system self.model = model + self._projectivize = projectivize @classmethod def from_dir(cls, model_dir, strings, transition_system): @@ -94,7 +96,7 @@ cdef class Parser: model = ParserModel(templates) if path.exists(path.join(model_dir, 'model')): model.load(path.join(model_dir, 'model')) - return cls(strings, moves, model) + return cls(strings, moves, model, cfg.projectivize) @classmethod def load(cls, pkg_or_str_or_file, vocab): @@ -113,6 +115,9 @@ cdef class Parser: tokens.is_parsed = True # Check for KeyboardInterrupt etc. 
Untested PyErr_CheckSignals() + # projectivize output + if self._projectivize: + PseudoProjectivity.deprojectivize(tokens) def pipe(self, stream, int batch_size=1000, int n_threads=2): cdef Pool mem = Pool() diff --git a/spacy/tests/test_nonproj.py b/spacy/tests/parser/test_nonproj.py similarity index 60% rename from spacy/tests/test_nonproj.py rename to spacy/tests/parser/test_nonproj.py index d5290e342..443db18ae 100644 --- a/spacy/tests/test_nonproj.py +++ b/spacy/tests/parser/test_nonproj.py @@ -1,7 +1,13 @@ from __future__ import unicode_literals import pytest -from spacy.nonproj import ancestors, contains_cycle, is_nonproj_arc, is_nonproj_tree, PseudoProjective +from spacy.tokens.doc import Doc +from spacy.vocab import Vocab +from spacy.tokenizer import Tokenizer +from spacy.attrs import DEP, HEAD +import numpy + +from spacy.syntax.nonproj import ancestors, contains_cycle, is_nonproj_arc, is_nonproj_tree, PseudoProjectivity def test_ancestors(): tree = [1,2,2,4,5,2,2] @@ -50,52 +56,53 @@ def test_is_nonproj_tree(): assert(is_nonproj_tree(partial_tree) == False) assert(is_nonproj_tree(multirooted_tree) == True) -def test_pseudoprojective(): +def test_pseudoprojectivity(): tree = [1,2,2] nonproj_tree = [1,2,2,4,5,2,7,4,2] labels = ['NK','SB','ROOT','NK','OA','OC','SB','RC','--'] nonproj_tree2 = [9,1,3,1,5,6,9,8,6,1,6,12,13,10,1] labels2 = ['MO','ROOT','NK','SB','MO','NK','OA','NK','AG','OC','MNR','MO','NK','NK','--'] - pp = PseudoProjective() + assert(PseudoProjectivity.decompose('X||Y') == ('X','Y')) + assert(PseudoProjectivity.decompose('X') == ('X','')) - assert(pp._make_span_index(tree) == { 1:[0], 2:[1] }) - assert(pp._make_span_index(nonproj_tree) == { 1:[0], 2:[1,5,8], 4:[3,7], 5:[4], 7:[6] }) + assert(PseudoProjectivity.is_decorated('X||Y') == True) + assert(PseudoProjectivity.is_decorated('X') == False) - pp._lift(0,tree) + PseudoProjectivity._lift(0,tree) assert(tree == [2,2,2]) - np_arc = pp._get_smallest_nonproj_arc(nonproj_tree) + np_arc = PseudoProjectivity._get_smallest_nonproj_arc(nonproj_tree) assert(np_arc == 7) - np_arc = pp._get_smallest_nonproj_arc(nonproj_tree2) + np_arc = PseudoProjectivity._get_smallest_nonproj_arc(nonproj_tree2) assert(np_arc == 10) - proj_heads, deco_labels = pp.projectivize(nonproj_tree,labels) + proj_heads, deco_labels = PseudoProjectivity.projectivize(nonproj_tree,labels) assert(proj_heads == [1,2,2,4,5,2,7,5,2]) assert(deco_labels == ['NK','SB','ROOT','NK','OA','OC','SB','RC||OA','--']) - deproj_heads, undeco_labels = pp.deprojectivize(proj_heads,deco_labels) - assert(deproj_heads == nonproj_tree) - assert(undeco_labels == labels) + # deproj_heads, undeco_labels = PseudoProjectivity.deprojectivize(proj_heads,deco_labels) + # assert(deproj_heads == nonproj_tree) + # assert(undeco_labels == labels) - proj_heads, deco_labels = pp.projectivize(nonproj_tree2,labels2) + proj_heads, deco_labels = PseudoProjectivity.projectivize(nonproj_tree2,labels2) assert(proj_heads == [1,1,3,1,5,6,9,8,6,1,9,12,13,10,1]) assert(deco_labels == ['MO||OC','ROOT','NK','SB','MO','NK','OA','NK','AG','OC','MNR||OA','MO','NK','NK','--']) - deproj_heads, undeco_labels = pp.deprojectivize(proj_heads,deco_labels) - assert(deproj_heads == nonproj_tree2) - assert(undeco_labels == labels2) + # deproj_heads, undeco_labels = PseudoProjectivity.deprojectivize(proj_heads,deco_labels) + # assert(deproj_heads == nonproj_tree2) + # assert(undeco_labels == labels2) # if decoration is wrong such that there is no head with the desired label # the structure is kept and the label is 
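The decompose/is_decorated behaviour asserted in the tests above is just string splitting on the '||' delimiter; a plain-function equivalent (not the spaCy API, purely for illustration) looks like this:

DELIMITER = '||'

def is_decorated(label):
    return DELIMITER in label

def decompose(label):
    # 'acl||dobj' -> ('acl', 'dobj'); an undecorated label maps to (label, '')
    new_label, _, head_label = label.partition(DELIMITER)
    return new_label, head_label

assert decompose('X||Y') == ('X', 'Y')
assert decompose('X') == ('X', '')
assert is_decorated('X||Y') and not is_decorated('X')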
undecorated - deproj_heads, undeco_labels = pp.deprojectivize([1,2,2,4,5,2,7,5,2],['NK','SB','ROOT','NK','OA','OC','SB','RC||DA','--']) - assert(deproj_heads == [1,2,2,4,5,2,7,5,2]) - assert(undeco_labels == ['NK','SB','ROOT','NK','OA','OC','SB','RC','--']) + # deproj_heads, undeco_labels = PseudoProjectivity.deprojectivize([1,2,2,4,5,2,7,5,2],['NK','SB','ROOT','NK','OA','OC','SB','RC||DA','--']) + # assert(deproj_heads == [1,2,2,4,5,2,7,5,2]) + # assert(undeco_labels == ['NK','SB','ROOT','NK','OA','OC','SB','RC','--']) # if there are two potential new heads, the first one is chosen even if it's wrong - deproj_heads, undeco_labels = pp.deprojectivize([1,1,3,1,5,6,9,8,6,1,9,12,13,10,1], \ - ['MO||OC','ROOT','NK','OC','MO','NK','OA','NK','AG','OC','MNR||OA','MO','NK','NK','--']) - assert(deproj_heads == [3,1,3,1,5,6,9,8,6,1,6,12,13,10,1]) - assert(undeco_labels == ['MO','ROOT','NK','OC','MO','NK','OA','NK','AG','OC','MNR','MO','NK','NK','--']) + # deproj_heads, undeco_labels = PseudoProjectivity.deprojectivize([1,1,3,1,5,6,9,8,6,1,9,12,13,10,1], \ + # ['MO||OC','ROOT','NK','OC','MO','NK','OA','NK','AG','OC','MNR||OA','MO','NK','NK','--']) + # assert(deproj_heads == [3,1,3,1,5,6,9,8,6,1,6,12,13,10,1]) + # assert(undeco_labels == ['MO','ROOT','NK','OC','MO','NK','OA','NK','AG','OC','MNR','MO','NK','NK','--']) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 342bcf409..0ff574f1b 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -201,17 +201,9 @@ cdef class Token: cdef int nr_iter = 0 cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge) while ptr < self.c: - # If this head is still to the right of us, we can skip to it - # No token that's between this token and this head could be our - # child. - if (ptr.head >= 1) and (ptr + ptr.head) < self.c: - ptr += ptr.head - - elif ptr + ptr.head == self.c: + if ptr + ptr.head == self.c: yield self.doc[ptr - (self.c - self.i)] - ptr += 1 - else: - ptr += 1 + ptr += 1 nr_iter += 1 # This is ugly, but it's a way to guard out infinite loops if nr_iter >= 10000000: @@ -226,16 +218,10 @@ cdef class Token: tokens = [] cdef int nr_iter = 0 while ptr > self.c: - # If this head is still to the right of us, we can skip to it - # No token that's between this token and this head could be our - # child. 
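The simplified scans in Token.lefts/Token.rights above rely on heads being stored as relative offsets: token j is a child of token i exactly when j + offset[j] == i. A throwaway list-based illustration (it scans the whole sentence for simplicity, whereas the Cython code only walks between the token and its left/right edge):

def children_of(i, offsets):
    # offsets[j] is the relative distance from token j to its head
    lefts = [j for j in range(0, i) if j + offsets[j] == i]
    rights = [j for j in range(i + 1, len(offsets)) if j + offsets[j] == i]
    return lefts, rights

# e.g. for absolute heads [1, 2, 2] the offsets are [1, 1, 0],
# so children_of(2, [1, 1, 0]) == ([1], [])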
- if (ptr.head < 0) and ((ptr + ptr.head) > self.c): - ptr += ptr.head - elif ptr + ptr.head == self.c: + if ptr + ptr.head == self.c: tokens.append(self.doc[ptr - (self.c - self.i)]) - ptr -= 1 - else: - ptr -= 1 + ptr -= 1 + nr_iter += 1 if nr_iter >= 10000000: raise RuntimeError( "Possibly infinite loop encountered while looking for token.rights") From 690c5acabf5db9a46c403dd48067f5f1e2f71637 Mon Sep 17 00:00:00 2001 From: Wolfgang Seeker Date: Thu, 3 Mar 2016 15:21:00 +0100 Subject: [PATCH 6/6] adjust train.py to train both english and german models --- bin/parser/train.py | 37 ++++++++++++------- spacy/de/__init__.py | 2 +- spacy/gold.pyx | 10 ++---- spacy/syntax/parser.pyx | 3 +- spacy/tagger.pyx | 2 +- spacy/tests/parser/test_nonproj.py | 58 +++++++++++++++++++----------- 6 files changed, 67 insertions(+), 45 deletions(-) diff --git a/bin/parser/train.py b/bin/parser/train.py index a7dc74770..642ed53e7 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -14,6 +14,7 @@ import re import spacy.util from spacy.en import English +from spacy.de import German from spacy.syntax.util import Config from spacy.gold import read_json_file @@ -25,6 +26,7 @@ from spacy.syntax.arc_eager import ArcEager from spacy.syntax.ner import BiluoPushDown from spacy.tagger import Tagger from spacy.syntax.parser import Parser +from spacy.syntax.nonproj import PseudoProjectivity def _corrupt(c, noise_level): @@ -82,7 +84,7 @@ def _merge_sents(sents): def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0, gold_preproc=False, n_sents=0, corruption_level=0, beam_width=1, verbose=False, - use_orig_arc_eager=False): + use_orig_arc_eager=False, pseudoprojective=False): dep_model_dir = path.join(model_dir, 'deps') ner_model_dir = path.join(model_dir, 'ner') pos_model_dir = path.join(model_dir, 'pos') @@ -96,9 +98,13 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', os.mkdir(ner_model_dir) os.mkdir(pos_model_dir) + if pseudoprojective: + # preprocess training data here before ArcEager.get_labels() is called + gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples) + Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, labels=ArcEager.get_labels(gold_tuples), - beam_width=beam_width) + beam_width=beam_width,projectivize=pseudoprojective) Config.write(ner_model_dir, 'config', features='ner', seed=seed, labels=BiluoPushDown.get_labels(gold_tuples), beam_width=0) @@ -107,6 +113,8 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', gold_tuples = gold_tuples[:n_sents] nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False) + if nlp.lang == 'de': + nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string]) nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates()) nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager) nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown) @@ -131,12 +139,9 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', raw_text = add_noise(raw_text, corruption_level) tokens = nlp.tokenizer(raw_text) nlp.tagger(tokens) - gold = GoldParse(tokens, annot_tuples, make_projective=True) + gold = GoldParse(tokens, annot_tuples) if not gold.is_projective: - raise Exception( - "Non-projective sentence in training, after we should " - "have enforced projectivity: %s" % annot_tuples - ) + raise Exception("Non-projective sentence in training: %s" % annot_tuples) loss += nlp.parser.train(tokens, gold) 
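PseudoProjectivity.preprocess_training_data() itself is not shown in this patch, but based on the classmethods above it has to projectivize the gold annotations (so that decorated labels such as 'acl||dobj' end up in ArcEager.get_labels()) before training starts. A rough, assumption-laden sketch using the list-based projectivize helper from earlier; the real classmethod presumably also counts decorated-label frequencies and drops rare ones via _filter_labels, which is omitted here:

def preprocess_training_data(gold_tuples):
    # gold_tuples follows spaCy's (raw_text, [((ids, words, tags, heads,
    # labels, iob), ctnts), ...]) layout, as used in _filter_labels above
    preprocessed = []
    for raw_text, sents in gold_tuples:
        new_sents = []
        for (ids, words, tags, heads, labels, iob), ctnts in sents:
            proj_heads, deco_labels = projectivize(heads, labels)
            new_sents.append(((ids, words, tags, proj_heads, deco_labels, iob), ctnts))
        preprocessed.append((raw_text, new_sents))
    return preprocessed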
nlp.entity.train(tokens, gold) nlp.tagger.train(tokens, gold.tags) @@ -152,6 +157,8 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False, beam_width=None, cand_preproc=None): nlp = Language(data_dir=model_dir) + if nlp.lang == 'de': + nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string]) if beam_width is not None: nlp.parser.cfg.beam_width = beam_width scorer = Scorer() @@ -200,6 +207,7 @@ def write_parses(Language, dev_loc, model_dir, out_loc): @plac.annotations( + language=("The language to train", "positional", None, str, ['en','de']), train_loc=("Location of training file or directory"), dev_loc=("Location of development file or directory"), model_dir=("Location of output model directory",), @@ -211,19 +219,22 @@ def write_parses(Language, dev_loc, model_dir, out_loc): n_iter=("Number of training iterations", "option", "i", int), verbose=("Verbose error reporting", "flag", "v", bool), debug=("Debug mode", "flag", "d", bool), + pseudoprojective=("Use pseudo-projective parsing", "flag", "p", bool), ) -def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, - debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False): +def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, + debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False): + lang = {'en':English, 'de':German}.get(language) + if not eval_only: gold_train = list(read_json_file(train_loc)) - train(English, gold_train, model_dir, + train(lang, gold_train, model_dir, feat_set='basic' if not debug else 'debug', gold_preproc=gold_preproc, n_sents=n_sents, corruption_level=corruption_level, n_iter=n_iter, - verbose=verbose) + verbose=verbose,pseudoprojective=pseudoprojective) if out_loc: - write_parses(English, dev_loc, model_dir, out_loc) - scorer = evaluate(English, list(read_json_file(dev_loc)), + write_parses(lang, dev_loc, model_dir, out_loc) + scorer = evaluate(lang, list(read_json_file(dev_loc)), model_dir, gold_preproc=gold_preproc, verbose=verbose) print('TOK', scorer.token_acc) print('POS', scorer.tags_acc) diff --git a/spacy/de/__init__.py b/spacy/de/__init__.py index d7cc3dc65..76817ccff 100644 --- a/spacy/de/__init__.py +++ b/spacy/de/__init__.py @@ -6,4 +6,4 @@ from ..language import Language class German(Language): - pass + lang = 'de' diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 5c7326d12..67716b0ab 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -244,14 +244,8 @@ cdef class GoldParse: raise Exception("Cycle found: %s" % cycle) if make_projective: - # projectivity here means non-proj arcs are being disconnected - np_arcs = [] - for word in range(self.length): - if nonproj.is_nonproj_arc(word,self.heads): - np_arcs.append(word) - for np_arc in np_arcs: - self.heads[np_arc] = None - self.labels[np_arc] = '' + proj_heads,_ = nonproj.PseudoProjectivity.projectivize(self.heads,self.labels) + self.heads = proj_heads self.brackets = {} for (gold_start, gold_end, label_str) in brackets: diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 2e2f009fd..a83c397dc 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -94,9 +94,10 @@ cdef class Parser: moves = transition_system(strings, cfg.labels) templates = get_templates(cfg.features) model = ParserModel(templates) + project = cfg.projectivize if hasattr(cfg,'projectivize') else False if 
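With the new positional language argument and the -p flag defined above, a pseudo-projective German training run would look roughly like this (paths are placeholders; the input is the JSON training format consumed by read_json_file):

python bin/parser/train.py de /path/to/train.json /path/to/dev.json /path/to/model -p

English training keeps working the same way by passing 'en' as the language and omitting -p.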
path.exists(path.join(model_dir, 'model')): model.load(path.join(model_dir, 'model')) - return cls(strings, moves, model, cfg.projectivize) + return cls(strings, moves, model, project) @classmethod def load(cls, pkg_or_str_or_file, vocab): diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 26f8fd3e5..9e4c8ac43 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -143,7 +143,7 @@ cdef class Tagger: @classmethod def blank(cls, vocab, templates): - model = TaggerModel(N_CONTEXT_FIELDS, templates) + model = TaggerModel(templates) return cls(vocab, model) @classmethod diff --git a/spacy/tests/parser/test_nonproj.py b/spacy/tests/parser/test_nonproj.py index 443db18ae..71b20bf74 100644 --- a/spacy/tests/parser/test_nonproj.py +++ b/spacy/tests/parser/test_nonproj.py @@ -1,9 +1,6 @@ from __future__ import unicode_literals import pytest -from spacy.tokens.doc import Doc -from spacy.vocab import Vocab -from spacy.tokenizer import Tokenizer from spacy.attrs import DEP, HEAD import numpy @@ -56,12 +53,28 @@ def test_is_nonproj_tree(): assert(is_nonproj_tree(partial_tree) == False) assert(is_nonproj_tree(multirooted_tree) == True) -def test_pseudoprojectivity(): + +def deprojectivize(proj_heads, deco_labels, EN): + slen = len(proj_heads) + sent = EN.tokenizer.tokens_from_list(['whatever'] * slen) + rel_proj_heads = [ head-i for i,head in enumerate(proj_heads) ] + labelids = [ EN.vocab.strings[label] for label in deco_labels ] + parse = numpy.asarray(zip(rel_proj_heads,labelids), dtype=numpy.int32) + sent.from_array([HEAD,DEP],parse) + PseudoProjectivity.deprojectivize(sent) + parse = sent.to_array([HEAD,DEP]) + deproj_heads = [ i+head for i,head in enumerate(parse[:,0]) ] + undeco_labels = [ EN.vocab.strings[int(labelid)] for labelid in parse[:,1] ] + return deproj_heads, undeco_labels + + +@pytest.mark.models +def test_pseudoprojectivity(EN): tree = [1,2,2] nonproj_tree = [1,2,2,4,5,2,7,4,2] - labels = ['NK','SB','ROOT','NK','OA','OC','SB','RC','--'] + labels = ['det','nsubj','root','det','dobj','aux','nsubj','acl','punct'] nonproj_tree2 = [9,1,3,1,5,6,9,8,6,1,6,12,13,10,1] - labels2 = ['MO','ROOT','NK','SB','MO','NK','OA','NK','AG','OC','MNR','MO','NK','NK','--'] + labels2 = ['advmod','root','det','nsubj','advmod','det','dobj','det','nmod','aux','nmod','advmod','det','amod','punct'] assert(PseudoProjectivity.decompose('X||Y') == ('X','Y')) assert(PseudoProjectivity.decompose('X') == ('X','')) @@ -80,29 +93,32 @@ def test_pseudoprojectivity(): proj_heads, deco_labels = PseudoProjectivity.projectivize(nonproj_tree,labels) assert(proj_heads == [1,2,2,4,5,2,7,5,2]) - assert(deco_labels == ['NK','SB','ROOT','NK','OA','OC','SB','RC||OA','--']) - # deproj_heads, undeco_labels = PseudoProjectivity.deprojectivize(proj_heads,deco_labels) - # assert(deproj_heads == nonproj_tree) - # assert(undeco_labels == labels) + assert(deco_labels == ['det','nsubj','root','det','dobj','aux','nsubj','acl||dobj','punct']) + deproj_heads, undeco_labels = deprojectivize(proj_heads,deco_labels,EN) + assert(deproj_heads == nonproj_tree) + assert(undeco_labels == labels) proj_heads, deco_labels = PseudoProjectivity.projectivize(nonproj_tree2,labels2) assert(proj_heads == [1,1,3,1,5,6,9,8,6,1,9,12,13,10,1]) - assert(deco_labels == ['MO||OC','ROOT','NK','SB','MO','NK','OA','NK','AG','OC','MNR||OA','MO','NK','NK','--']) - # deproj_heads, undeco_labels = PseudoProjectivity.deprojectivize(proj_heads,deco_labels) - # assert(deproj_heads == nonproj_tree2) - # assert(undeco_labels == labels2) + assert(deco_labels == 
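The deprojectivize test helper above round-trips the parse through Doc.from_array/to_array, which store one (relative head offset, label id) pair per token. The two conversions it performs, isolated for clarity (illustrative helpers, not spaCy API):

import numpy

def to_parse_array(heads, label_ids):
    # absolute heads -> relative offsets, packed together with the label ids;
    # list(zip(...)) so this also works on Python 3
    rel_heads = [head - i for i, head in enumerate(heads)]
    return numpy.asarray(list(zip(rel_heads, label_ids)), dtype=numpy.int32)

def from_parse_array(parse):
    # inverse conversion after to_array([HEAD, DEP])
    heads = [i + int(offset) for i, (offset, _) in enumerate(parse)]
    label_ids = [int(label_id) for _, label_id in parse]
    return heads, label_ids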
['advmod||aux','root','det','nsubj','advmod','det','dobj','det','nmod','aux','nmod||dobj','advmod','det','amod','punct']) + deproj_heads, undeco_labels = deprojectivize(proj_heads,deco_labels,EN) + assert(deproj_heads == nonproj_tree2) + assert(undeco_labels == labels2) # if decoration is wrong such that there is no head with the desired label # the structure is kept and the label is undecorated - # deproj_heads, undeco_labels = PseudoProjectivity.deprojectivize([1,2,2,4,5,2,7,5,2],['NK','SB','ROOT','NK','OA','OC','SB','RC||DA','--']) - # assert(deproj_heads == [1,2,2,4,5,2,7,5,2]) - # assert(undeco_labels == ['NK','SB','ROOT','NK','OA','OC','SB','RC','--']) + proj_heads = [1,2,2,4,5,2,7,5,2] + deco_labels = ['det','nsubj','root','det','dobj','aux','nsubj','acl||iobj','punct'] + deproj_heads, undeco_labels = deprojectivize(proj_heads,deco_labels,EN) + assert(deproj_heads == proj_heads) + assert(undeco_labels == ['det','nsubj','root','det','dobj','aux','nsubj','acl','punct']) # if there are two potential new heads, the first one is chosen even if it's wrong - # deproj_heads, undeco_labels = PseudoProjectivity.deprojectivize([1,1,3,1,5,6,9,8,6,1,9,12,13,10,1], \ - # ['MO||OC','ROOT','NK','OC','MO','NK','OA','NK','AG','OC','MNR||OA','MO','NK','NK','--']) - # assert(deproj_heads == [3,1,3,1,5,6,9,8,6,1,6,12,13,10,1]) - # assert(undeco_labels == ['MO','ROOT','NK','OC','MO','NK','OA','NK','AG','OC','MNR','MO','NK','NK','--']) + proj_heads = [1,1,3,1,5,6,9,8,6,1,9,12,13,10,1] + deco_labels = ['advmod||aux','root','det','aux','advmod','det','dobj','det','nmod','aux','nmod||dobj','advmod','det','amod','punct'] + deproj_heads, undeco_labels = deprojectivize(proj_heads,deco_labels,EN) + assert(deproj_heads == [3,1,3,1,5,6,9,8,6,1,6,12,13,10,1]) + assert(undeco_labels == ['advmod','root','det','aux','advmod','det','dobj','det','nmod','aux','nmod','advmod','det','amod','punct'])
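The 'no head with the desired label' fallback tested above can also be checked without the English model, using the list-based helpers sketched earlier (find_new_head and friends are illustrative stand-ins, not the classmethods under test):

proj_heads = [1, 2, 2, 4, 5, 2, 7, 5, 2]
deco_labels = ['det', 'nsubj', 'root', 'det', 'dobj', 'aux', 'nsubj', 'acl||iobj', 'punct']
plain_labels = [label.partition('||')[0] for label in deco_labels]
new_label, _, head_label = deco_labels[7].partition('||')
# no descendant of the projectivized head is labelled 'iobj',
# so the structure is kept and only the label is undecorated
assert find_new_head(7, head_label, proj_heads, plain_labels) == proj_heads[7]
assert new_label == 'acl'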