Merge pull request #280 from wbwseeker/german_parser

German parser
Matthew Honnibal committed on 2016-03-04 03:27:42 +11:00 (commit fcaa0ad7ce)
20 changed files with 1687 additions and 134 deletions

View File

@ -98,7 +98,7 @@ def _read_probs(loc):
return probs, probs['-OOV-']
def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200):
def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200):
if not loc.exists():
print("Warning: Frequencies file not found")
return {}, 0.0
@ -125,7 +125,8 @@ def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200):
doc_freq = int(doc_freq)
freq = int(freq)
if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
word = literal_eval(key)
# word = literal_eval(key)
word = key
smooth_count = counts.smoother(int(freq))
log_smooth_count = math.log(smooth_count)
probs[word] = math.log(smooth_count) - log_total
@ -165,7 +166,7 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
clusters = _read_clusters(src_dir / 'clusters.txt')
probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob')
if not probs:
probs, oov_prob = _read_freqs(src_dir / 'freqs.txt.gz')
probs, oov_prob = _read_freqs(src_dir / 'freqs.txt')
if not probs:
oov_prob = -20
else:
@ -223,9 +224,8 @@ def main(lang_id, lang_data_dir, corpora_dir, model_dir):
copyfile(str(lang_data_dir / 'gazetteer.json'),
str(model_dir / 'vocab' / 'gazetteer.json'))
if (lang_data_dir / 'tag_map.json').exists():
copyfile(str(lang_data_dir / 'tag_map.json'),
str(model_dir / 'vocab' / 'tag_map.json'))
copyfile(str(lang_data_dir / 'tag_map.json'),
str(model_dir / 'vocab' / 'tag_map.json'))
if (lang_data_dir / 'lemma_rules.json').exists():
copyfile(str(lang_data_dir / 'lemma_rules.json'),

View File

@ -14,6 +14,7 @@ import re
import spacy.util
from spacy.en import English
from spacy.de import German
from spacy.syntax.util import Config
from spacy.gold import read_json_file
@ -25,6 +26,7 @@ from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.ner import BiluoPushDown
from spacy.tagger import Tagger
from spacy.syntax.parser import Parser
from spacy.syntax.nonproj import PseudoProjectivity
def _corrupt(c, noise_level):
@ -82,7 +84,7 @@ def _merge_sents(sents):
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
beam_width=1, verbose=False,
use_orig_arc_eager=False):
use_orig_arc_eager=False, pseudoprojective=False):
dep_model_dir = path.join(model_dir, 'deps')
ner_model_dir = path.join(model_dir, 'ner')
pos_model_dir = path.join(model_dir, 'pos')
@ -96,9 +98,13 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
os.mkdir(ner_model_dir)
os.mkdir(pos_model_dir)
if pseudoprojective:
# preprocess training data here before ArcEager.get_labels() is called
gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)
Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
labels=ArcEager.get_labels(gold_tuples),
beam_width=beam_width)
beam_width=beam_width,projectivize=pseudoprojective)
Config.write(ner_model_dir, 'config', features='ner', seed=seed,
labels=BiluoPushDown.get_labels(gold_tuples),
beam_width=0)
@ -107,6 +113,8 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
gold_tuples = gold_tuples[:n_sents]
nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
if nlp.lang == 'de':
nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string])
nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
@ -131,12 +139,9 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
raw_text = add_noise(raw_text, corruption_level)
tokens = nlp.tokenizer(raw_text)
nlp.tagger(tokens)
gold = GoldParse(tokens, annot_tuples, make_projective=True)
gold = GoldParse(tokens, annot_tuples)
if not gold.is_projective:
raise Exception(
"Non-projective sentence in training, after we should "
"have enforced projectivity: %s" % annot_tuples
)
raise Exception("Non-projective sentence in training: %s" % annot_tuples)
loss += nlp.parser.train(tokens, gold)
nlp.entity.train(tokens, gold)
nlp.tagger.train(tokens, gold.tags)
@ -152,6 +157,8 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
beam_width=None, cand_preproc=None):
nlp = Language(data_dir=model_dir)
if nlp.lang == 'de':
nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string])
if beam_width is not None:
nlp.parser.cfg.beam_width = beam_width
scorer = Scorer()
@ -200,6 +207,7 @@ def write_parses(Language, dev_loc, model_dir, out_loc):
@plac.annotations(
language=("The language to train", "positional", None, str, ['en','de']),
train_loc=("Location of training file or directory"),
dev_loc=("Location of development file or directory"),
model_dir=("Location of output model directory",),
@ -211,19 +219,22 @@ def write_parses(Language, dev_loc, model_dir, out_loc):
n_iter=("Number of training iterations", "option", "i", int),
verbose=("Verbose error reporting", "flag", "v", bool),
debug=("Debug mode", "flag", "d", bool),
pseudoprojective=("Use pseudo-projective parsing", "flag", "p", bool),
)
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False):
def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False):
lang = {'en':English, 'de':German}.get(language)
if not eval_only:
gold_train = list(read_json_file(train_loc))
train(English, gold_train, model_dir,
train(lang, gold_train, model_dir,
feat_set='basic' if not debug else 'debug',
gold_preproc=gold_preproc, n_sents=n_sents,
corruption_level=corruption_level, n_iter=n_iter,
verbose=verbose)
verbose=verbose,pseudoprojective=pseudoprojective)
if out_loc:
write_parses(English, dev_loc, model_dir, out_loc)
scorer = evaluate(English, list(read_json_file(dev_loc)),
write_parses(lang, dev_loc, model_dir, out_loc)
scorer = evaluate(lang, list(read_json_file(dev_loc)),
model_dir, gold_preproc=gold_preproc, verbose=verbose)
print('TOK', scorer.token_acc)
print('POS', scorer.tags_acc)
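
A note on the comment above ("preprocess training data here before ArcEager.get_labels() is called"): the pseudo-projective transform introduces decorated labels such as 'acl||dobj', which presumably need to be part of the transition system's label set, so the preprocessing has to run first. A minimal sketch of that ordering, with a placeholder path for the training file:

from spacy.gold import read_json_file
from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.nonproj import PseudoProjectivity

gold_tuples = list(read_json_file('train.json'))  # placeholder location
# rewrite non-projective arcs and decorate their labels first ...
gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)
# ... so the decorated labels end up in the parser's label set
labels = ArcEager.get_labels(gold_tuples)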

View File

@ -0,0 +1,160 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals
import os
from os import path
import shutil
import io
import random
import time
import gzip
import ujson
import plac
import cProfile
import pstats
import spacy.util
from spacy.de import German
from spacy.gold import GoldParse
from spacy.tagger import Tagger
from spacy.scorer import PRFScore
from spacy.tagger import P2_orth, P2_cluster, P2_shape, P2_prefix, P2_suffix, P2_pos, P2_lemma, P2_flags
from spacy.tagger import P1_orth, P1_cluster, P1_shape, P1_prefix, P1_suffix, P1_pos, P1_lemma, P1_flags
from spacy.tagger import W_orth, W_cluster, W_shape, W_prefix, W_suffix, W_pos, W_lemma, W_flags
from spacy.tagger import N1_orth, N1_cluster, N1_shape, N1_prefix, N1_suffix, N1_pos, N1_lemma, N1_flags
from spacy.tagger import N2_orth, N2_cluster, N2_shape, N2_prefix, N2_suffix, N2_pos, N2_lemma, N2_flags, N_CONTEXT_FIELDS
def default_templates():
return spacy.tagger.Tagger.default_templates()
def default_templates_without_clusters():
return (
(W_orth,),
(P1_lemma, P1_pos),
(P2_lemma, P2_pos),
(N1_orth,),
(N2_orth,),
(W_suffix,),
(W_prefix,),
(P1_pos,),
(P2_pos,),
(P1_pos, P2_pos),
(P1_pos, W_orth),
(P1_suffix,),
(N1_suffix,),
(W_shape,),
(W_flags,),
(N1_flags,),
(N2_flags,),
(P1_flags,),
(P2_flags,),
)
def make_tagger(vocab, templates):
model = spacy.tagger.TaggerModel(templates)
return spacy.tagger.Tagger(vocab,model)
def read_conll(file_):
def sentences():
words, tags = [], []
for line in file_:
line = line.strip()
if line:
word, tag = line.split('\t')[1::3][:2] # get column 1 and 4 (CoNLL09)
words.append(word)
tags.append(tag)
elif words:
yield words, tags
words, tags = [], []
if words:
yield words, tags
return [ s for s in sentences() ]
def score_model(score, nlp, words, gold_tags):
tokens = nlp.tokenizer.tokens_from_list(words)
assert(len(tokens) == len(gold_tags))
nlp.tagger(tokens)
for token, gold_tag in zip(tokens,gold_tags):
score.score_set(set([token.tag_]),set([gold_tag]))
def train(Language, train_sents, dev_sents, model_dir, n_iter=15, seed=21):
# make shuffling deterministic
random.seed(seed)
# set up directory for model
pos_model_dir = path.join(model_dir, 'pos')
if path.exists(pos_model_dir):
shutil.rmtree(pos_model_dir)
os.mkdir(pos_model_dir)
nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
nlp.tagger = make_tagger(nlp.vocab,default_templates())
print("Itn.\ttrain acc %\tdev acc %")
for itn in range(n_iter):
# train on train set
#train_acc = PRFScore()
correct, total = 0., 0.
for words, gold_tags in train_sents:
tokens = nlp.tokenizer.tokens_from_list(words)
correct += nlp.tagger.train(tokens, gold_tags)
total += len(words)
train_acc = correct/total
# test on dev set
dev_acc = PRFScore()
for words, gold_tags in dev_sents:
score_model(dev_acc, nlp, words, gold_tags)
random.shuffle(train_sents)
print('%d:\t%6.2f\t%6.2f' % (itn, 100*train_acc, 100*dev_acc.precision))
print('end training')
nlp.end_training(model_dir)
print('done')
@plac.annotations(
train_loc=("Location of CoNLL 09 formatted training file"),
dev_loc=("Location of CoNLL 09 formatted development file"),
model_dir=("Location of output model directory"),
eval_only=("Skip training, and only evaluate", "flag", "e", bool),
n_iter=("Number of training iterations", "option", "i", int),
)
def main(train_loc, dev_loc, model_dir, eval_only=False, n_iter=15):
# training
if not eval_only:
with io.open(train_loc, 'r', encoding='utf8') as trainfile_, \
io.open(dev_loc, 'r', encoding='utf8') as devfile_:
train_sents = read_conll(trainfile_)
dev_sents = read_conll(devfile_)
train(German, train_sents, dev_sents, model_dir, n_iter=n_iter)
# testing
with io.open(dev_loc, 'r', encoding='utf8') as file_:
dev_sents = read_conll(file_)
nlp = German(data_dir=model_dir)
dev_acc = PRFScore()
for words, gold_tags in dev_sents:
score_model(dev_acc, nlp, words, gold_tags)
print('POS: %6.2f %%' % (100*dev_acc.precision))
if __name__ == '__main__':
plac.call(main)
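
For reference, a small sketch (made-up sentence, Python 3 spelling) of the CoNLL09 column layout that read_conll() above depends on: column 1 holds the word form and column 4 the gold POS tag, which is exactly what line.split('\t')[1::3][:2] picks out.

import io

conll09 = io.StringIO(
    "1\tDer\tder\tder\tART\tART\n"
    "2\tHund\thund\thund\tNN\tNN\n"
    "3\tschläft\tschlafen\tschlafen\tVVFIN\tVVFIN\n"
    "\n"
)
assert read_conll(conll09) == [(['Der', 'Hund', 'schläft'], ['ART', 'NN', 'VVFIN'])]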

lang_data/de/abbrev.de.tab (new file, 319 lines)
View File

@ -0,0 +1,319 @@
# surface form lemma pos
# multiple values are separated by |
# empty lines and lines starting with # are ignored
'' ''
\") \")
\n \n <nl> SP
\t \t <tab> SP
<space> SP
# example: Wie geht's?
's 's es
'S 'S es
# example: Haste mal 'nen Euro?
'n 'n ein
'ne 'ne eine
'nen 'nen einen
# example: Kommen S nur herein!
s' s' sie
S' S' sie
# example: Da haben wir's!
ich's ich|'s ich|es
du's du|'s du|es
er's er|'s er|es
sie's sie|'s sie|es
wir's wir|'s wir|es
ihr's ihr|'s ihr|es
# example: Die katze auf'm dach.
auf'm auf|'m auf|dem
unter'm unter|'m unter|dem
über'm über|'m über|dem
vor'm vor|'m vor|dem
hinter'm hinter|'m hinter|dem
# persons
B.A. B.A.
B.Sc. B.Sc.
Dipl. Dipl.
Dipl.-Ing. Dipl.-Ing.
Dr. Dr.
Fr. Fr.
Frl. Frl.
Hr. Hr.
Hrn. Hrn.
Frl. Frl.
Prof. Prof.
St. St.
Hrgs. Hrgs.
Hg. Hg.
a.Z. a.Z.
a.D. a.D.
h.c. h.c.
Jr. Jr.
jr. jr.
jun. jun.
sen. sen.
rer. rer.
Ing. Ing.
M.A. M.A.
Mr. Mr.
M.Sc. M.Sc.
nat. nat.
phil. phil.
# companies
Co. Co.
co. co.
Cie. Cie.
A.G. A.G.
G.m.b.H. G.m.b.H.
i.G. i.G.
e.V. e.V.
# popular german abbreviations
Abb. Abb.
Abk. Abk.
Abs. Abs.
Abt. Abt.
abzgl. abzgl.
allg. allg.
a.M. a.M.
Bd. Bd.
betr. betr.
Betr. Betr.
Biol. Biol.
biol. biol.
Bf. Bf.
Bhf. Bhf.
Bsp. Bsp.
bspw. bspw.
bzgl. bzgl.
bzw. bzw.
d.h. d.h.
dgl. dgl.
ebd. ebd.
ehem. ehem.
eigtl. eigtl.
entspr. entspr.
erm. erm.
ev. ev.
evtl. evtl.
Fa. Fa.
Fam. Fam.
geb. geb.
Gebr. Gebr.
gem. gem.
ggf. ggf.
ggü. ggü.
ggfs. ggfs.
gegr. gegr.
Hbf. Hbf.
Hrsg. Hrsg.
hrsg. hrsg.
i.A. i.A.
i.d.R. i.d.R.
inkl. inkl.
insb. insb.
i.O. i.O.
i.Tr. i.Tr.
i.V. i.V.
jur. jur.
kath. kath.
K.O. K.O.
lt. lt.
max. max.
m.E. m.E.
m.M. m.M.
mtl. mtl.
min. min.
mind. mind.
MwSt. MwSt.
Nr. Nr.
o.a. o.a.
o.ä. o.ä.
o.Ä. o.Ä.
o.g. o.g.
o.k. o.k.
O.K. O.K.
Orig. Orig.
orig. orig.
pers. pers.
Pkt. Pkt.
Red. Red.
röm. röm.
s.o. s.o.
sog. sog.
std. std.
stellv. stellv.
Str. Str.
tägl. tägl.
Tel. Tel.
u.a. u.a.
usf. usf.
u.s.w. u.s.w.
usw. usw.
u.U. u.U.
u.v.m. u.v.m.
uvm. uvm.
v.a. v.a.
vgl. vgl.
vllt. vllt.
v.l.n.r. v.l.n.r.
vlt. vlt.
Vol. Vol.
wiss. wiss.
Univ. Univ.
z.B. z.B.
z.b. z.b.
z.Bsp. z.Bsp.
z.T. z.T.
z.Z. z.Z.
zzgl. zzgl.
z.Zt. z.Zt.
# popular latin abbreviations
vs. vs.
adv. adv.
Chr. Chr.
A.C. A.C.
A.D. A.D.
e.g. e.g.
i.e. i.e.
al. al.
p.a. p.a.
P.S. P.S.
q.e.d. q.e.d.
R.I.P. R.I.P.
etc. etc.
incl. incl.
ca. ca.
n.Chr. n.Chr.
p.s. p.s.
v.Chr. v.Chr.
# popular english abbreviations
D.C. D.C.
N.Y. N.Y.
N.Y.C. N.Y.C.
U.S. U.S.
U.S.A. U.S.A.
L.A. L.A.
U.S.S. U.S.S.
# dates & time
Jan. Jan.
Feb. Feb.
Mrz. Mrz.
Mär. Mär.
Apr. Apr.
Jun. Jun.
Jul. Jul.
Aug. Aug.
Sep. Sep.
Sept. Sept.
Okt. Okt.
Nov. Nov.
Dez. Dez.
Mo. Mo.
Di. Di.
Mi. Mi.
Do. Do.
Fr. Fr.
Sa. Sa.
So. So.
Std. Std.
Jh. Jh.
Jhd. Jhd.
# numbers
Tsd. Tsd.
Mio. Mio.
Mrd. Mrd.
# countries & languages
engl. engl.
frz. frz.
lat. lat.
österr. österr.
# smileys
:) :)
<3 <3
;) ;)
(: (:
:( :(
-_- -_-
=) =)
:/ :/
:> :>
;-) ;-)
:Y :Y
:P :P
:-P :-P
:3 :3
=3 =3
xD xD
^_^ ^_^
=] =]
=D =D
<333 <333
:)) :))
:0 :0
-__- -__-
xDD xDD
o_o o_o
o_O o_O
V_V V_V
=[[ =[[
<33 <33
;p ;p
;D ;D
;-p ;-p
;( ;(
:p :p
:] :]
:O :O
:-/ :-/
:-) :-)
:((( :(((
:(( :((
:') :')
(^_^) (^_^)
(= (=
o.O o.O
# single letters
a. a.
b. b.
c. c.
d. d.
e. e.
f. f.
g. g.
h. h.
i. i.
j. j.
k. k.
l. l.
m. m.
n. n.
o. o.
p. p.
q. q.
r. r.
s. s.
t. t.
u. u.
v. v.
w. w.
x. x.
y. y.
z. z.
ä. ä.
ö. ö.
ü. ü.

lang_data/de/gazetteer.json (new file, 194 lines)
View File

@ -0,0 +1,194 @@
{
"Reddit": [
"PRODUCT",
{},
[
[{"lower": "reddit"}]
]
],
"SeptemberElevenAttacks": [
"EVENT",
{},
[
[
{"orth": "9/11"}
],
[
{"lower": "september"},
{"orth": "11"}
]
]
],
"Linux": [
"PRODUCT",
{},
[
[{"lower": "linux"}]
]
],
"Haskell": [
"PRODUCT",
{},
[
[{"lower": "haskell"}]
]
],
"HaskellCurry": [
"PERSON",
{},
[
[
{"lower": "haskell"},
{"lower": "curry"}
]
]
],
"Javascript": [
"PRODUCT",
{},
[
[{"lower": "javascript"}]
]
],
"CSS": [
"PRODUCT",
{},
[
[{"lower": "css"}],
[{"lower": "css3"}]
]
],
"displaCy": [
"PRODUCT",
{},
[
[{"lower": "displacy"}]
]
],
"spaCy": [
"PRODUCT",
{},
[
[{"orth": "spaCy"}]
]
],
"HTML": [
"PRODUCT",
{},
[
[{"lower": "html"}],
[{"lower": "html5"}]
]
],
"Python": [
"PRODUCT",
{},
[
[{"orth": "Python"}]
]
],
"Ruby": [
"PRODUCT",
{},
[
[{"orth": "Ruby"}]
]
],
"Digg": [
"PRODUCT",
{},
[
[{"lower": "digg"}]
]
],
"FoxNews": [
"ORG",
{},
[
[{"orth": "Fox"}],
[{"orth": "News"}]
]
],
"Google": [
"ORG",
{},
[
[{"lower": "google"}]
]
],
"Mac": [
"PRODUCT",
{},
[
[{"lower": "mac"}]
]
],
"Wikipedia": [
"PRODUCT",
{},
[
[{"lower": "wikipedia"}]
]
],
"Windows": [
"PRODUCT",
{},
[
[{"orth": "Windows"}]
]
],
"Dell": [
"ORG",
{},
[
[{"lower": "dell"}]
]
],
"Facebook": [
"ORG",
{},
[
[{"lower": "facebook"}]
]
],
"Blizzard": [
"ORG",
{},
[
[{"orth": "Blizzard"}]
]
],
"Ubuntu": [
"ORG",
{},
[
[{"orth": "Ubuntu"}]
]
],
"Youtube": [
"PRODUCT",
{},
[
[{"lower": "youtube"}]
]
],
"false_positives": [
null,
{},
[
[{"orth": "Shit"}],
[{"orth": "Weed"}],
[{"orth": "Cool"}],
[{"orth": "Btw"}],
[{"orth": "Bah"}],
[{"orth": "Bullshit"}],
[{"orth": "Lol"}],
[{"orth": "Yo"}, {"lower": "dawg"}],
[{"orth": "Yay"}],
[{"orth": "Ahh"}],
[{"orth": "Yea"}],
[{"orth": "Bah"}]
]
]
}

View File

@ -1,5 +1,7 @@
# coding=utf8
import json
import io
import itertools
contractions = {}
@ -262,14 +264,30 @@ def get_token_properties(token, capitalize=False, remove_contractions=False):
props["F"] = token
return props
def create_entry(token, endings, capitalize=False, remove_contractions=False):
properties = []
properties.append(get_token_properties(token, capitalize=capitalize, remove_contractions=remove_contractions))
for e in endings:
properties.append(get_token_properties(e, remove_contractions=remove_contractions))
return properties
FIELDNAMES = ['F','L','pos']
def read_hardcoded(stream):
hc_specials = {}
for line in stream:
line = line.strip()
if line.startswith('#') or not line:
continue
key,_,rest = line.partition('\t')
values = []
for annotation in zip(*[ e.split('|') for e in rest.split('\t') ]):
values.append({ k:v for k,v in itertools.izip_longest(FIELDNAMES,annotation) if v })
hc_specials[key] = values
return hc_specials
def generate_specials():
specials = {}
@ -303,7 +321,10 @@ def generate_specials():
specials[special] = create_entry(token, endings, capitalize=True, remove_contractions=True)
# add in hardcoded specials
specials = dict(specials, **hardcoded_specials)
# changed it so it generates them from a file
with io.open('abbrev.de.tab','r',encoding='utf8') as abbrev_:
hc_specials = read_hardcoded(abbrev_)
specials = dict(specials, **hc_specials)
return specials
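
To make the new read_hardcoded() path concrete: a single abbrev.de.tab row expands into a list of per-token property dicts, keyed by the surface form. A standalone illustration (Python 3 spelling; the script itself is Python 2 and uses itertools.izip_longest), matching the "auf'm" entry that ends up in specials.json below:

from itertools import zip_longest  # izip_longest in the Python 2 script above

FIELDNAMES = ['F', 'L', 'pos']
line = "auf'm\tauf|'m\tauf|dem"
key, _, rest = line.partition('\t')
values = [
    {k: v for k, v in zip_longest(FIELDNAMES, annotation) if v}
    for annotation in zip(*[e.split('|') for e in rest.split('\t')])
]
# key    == "auf'm"
# values == [{'F': 'auf', 'L': 'auf'}, {'F': "'m", 'L': 'dem'}]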

View File

@ -1,3 +1,6 @@
\.\.\.
(?<=[a-z])\.(?=[A-Z])
(?<=[a-zA-Z])-(?=[a-zA-z])
(?<=[a-zöäüßA-ZÖÄÜ"]):(?=[a-zöäüßA-ZÖÄÜ])
(?<=[a-zöäüßA-ZÖÄÜ"])>(?=[a-zöäüßA-ZÖÄÜ])
(?<=[a-zöäüßA-ZÖÄÜ"])<(?=[a-zöäüßA-ZÖÄÜ])
(?<=[a-zöäüßA-ZÖÄÜ"])=(?=[a-zöäüßA-ZÖÄÜ])

View File

@ -5,6 +5,7 @@
{
*
<
>
$
£
@ -20,3 +21,7 @@ a-
....
...
»
_
§

View File

@ -1,27 +1,4 @@
{
"\t": [
{
"F": "\t",
"pos": "SP"
}
],
"\n": [
{
"F": "\n",
"pos": "SP"
}
],
" ": [
{
"F": " ",
"pos": "SP"
}
],
"\")": [
{
"F": "\")"
}
],
"''": [
{
"F": "''"
@ -217,6 +194,11 @@
"F": "<333"
}
],
"<space>": [
{
"F": "SP"
}
],
"=)": [
{
"F": "=)"
@ -267,6 +249,16 @@
"F": "Abk."
}
],
"Abs.": [
{
"F": "Abs."
}
],
"Abt.": [
{
"F": "Abt."
}
],
"Apr.": [
{
"F": "Apr."
@ -277,6 +269,26 @@
"F": "Aug."
}
],
"B.A.": [
{
"F": "B.A."
}
],
"B.Sc.": [
{
"F": "B.Sc."
}
],
"Bd.": [
{
"F": "Bd."
}
],
"Betr.": [
{
"F": "Betr."
}
],
"Bf.": [
{
"F": "Bf."
@ -292,6 +304,11 @@
"F": "Biol."
}
],
"Bsp.": [
{
"F": "Bsp."
}
],
"Chr.": [
{
"F": "Chr."
@ -342,6 +359,16 @@
"F": "Dr."
}
],
"Fa.": [
{
"F": "Fa."
}
],
"Fam.": [
{
"F": "Fam."
}
],
"Feb.": [
{
"F": "Feb."
@ -387,6 +414,16 @@
"F": "Hrgs."
}
],
"Hrn.": [
{
"F": "Hrn."
}
],
"Hrsg.": [
{
"F": "Hrsg."
}
],
"Ing.": [
{
"F": "Ing."
@ -397,11 +434,21 @@
"F": "Jan."
}
],
"Jh.": [
{
"F": "Jh."
}
],
"Jhd.": [
{
"F": "Jhd."
}
],
"Jr.": [
{
"F": "Jr."
}
],
"Jul.": [
{
"F": "Jul."
@ -412,21 +459,61 @@
"F": "Jun."
}
],
"K.O.": [
{
"F": "K.O."
}
],
"L.A.": [
{
"F": "L.A."
}
],
"M.A.": [
{
"F": "M.A."
}
],
"M.Sc.": [
{
"F": "M.Sc."
}
],
"Mi.": [
{
"F": "Mi."
}
],
"Mio.": [
{
"F": "Mio."
}
],
"Mo.": [
{
"F": "Mo."
}
],
"Mr.": [
{
"F": "Mr."
}
],
"Mrd.": [
{
"F": "Mrd."
}
],
"Mrz.": [
{
"F": "Mrz."
}
],
"MwSt.": [
{
"F": "MwSt."
}
],
"M\u00e4r.": [
{
"F": "M\u00e4r."
@ -452,16 +539,31 @@
"F": "Nr."
}
],
"O.K.": [
{
"F": "O.K."
}
],
"Okt.": [
{
"F": "Okt."
}
],
"Orig.": [
{
"F": "Orig."
}
],
"P.S.": [
{
"F": "P.S."
}
],
"Pkt.": [
{
"F": "Pkt."
}
],
"Prof.": [
{
"F": "Prof."
@ -472,6 +574,11 @@
"F": "R.I.P."
}
],
"Red.": [
{
"F": "Red."
}
],
"S'": [
{
"F": "S'",
@ -503,6 +610,41 @@
"F": "St."
}
],
"Std.": [
{
"F": "Std."
}
],
"Str.": [
{
"F": "Str."
}
],
"Tel.": [
{
"F": "Tel."
}
],
"Tsd.": [
{
"F": "Tsd."
}
],
"U.S.": [
{
"F": "U.S."
}
],
"U.S.A.": [
{
"F": "U.S.A."
}
],
"U.S.S.": [
{
"F": "U.S.S."
}
],
"Univ.": [
{
"F": "Univ."
@ -513,6 +655,30 @@
"F": "V_V"
}
],
"Vol.": [
{
"F": "Vol."
}
],
"\\\")": [
{
"F": "\\\")"
}
],
"\\n": [
{
"F": "\\n",
"L": "<nl>",
"pos": "SP"
}
],
"\\t": [
{
"F": "\\t",
"L": "<tab>",
"pos": "SP"
}
],
"^_^": [
{
"F": "^_^"
@ -528,6 +694,11 @@
"F": "a.D."
}
],
"a.M.": [
{
"F": "a.M."
}
],
"a.Z.": [
{
"F": "a.Z."
@ -548,9 +719,15 @@
"F": "al."
}
],
"allg.": [
{
"F": "allg."
}
],
"auf'm": [
{
"F": "auf"
"F": "auf",
"L": "auf"
},
{
"F": "'m",
@ -572,11 +749,31 @@
"F": "biol."
}
],
"bspw.": [
{
"F": "bspw."
}
],
"bzgl.": [
{
"F": "bzgl."
}
],
"bzw.": [
{
"F": "bzw."
}
],
"c.": [
{
"F": "c."
}
],
"ca.": [
{
"F": "ca."
}
],
"co.": [
{
"F": "co."
@ -587,9 +784,20 @@
"F": "d."
}
],
"d.h.": [
{
"F": "d.h."
}
],
"dgl.": [
{
"F": "dgl."
}
],
"du's": [
{
"F": "du"
"F": "du",
"L": "du"
},
{
"F": "'s",
@ -611,19 +819,35 @@
"F": "e.g."
}
],
"ebd.": [
{
"F": "ebd."
}
],
"ehem.": [
{
"F": "ehem."
}
],
"eigtl.": [
{
"F": "eigtl."
}
],
"engl.": [
{
"F": "engl."
}
],
"entspr.": [
{
"F": "entspr."
}
],
"er's": [
{
"F": "er"
"F": "er",
"L": "er"
},
{
"F": "'s",
@ -640,11 +864,26 @@
"F": "etc."
}
],
"ev.": [
{
"F": "ev."
}
],
"evtl.": [
{
"F": "evtl."
}
],
"f.": [
{
"F": "f."
}
],
"frz.": [
{
"F": "frz."
}
],
"g.": [
{
"F": "g."
@ -660,6 +899,11 @@
"F": "gegr."
}
],
"gem.": [
{
"F": "gem."
}
],
"ggf.": [
{
"F": "ggf."
@ -687,23 +931,39 @@
],
"hinter'm": [
{
"F": "hinter"
"F": "hinter",
"L": "hinter"
},
{
"F": "'m",
"L": "dem"
}
],
"hrsg.": [
{
"F": "hrsg."
}
],
"i.": [
{
"F": "i."
}
],
"i.A.": [
{
"F": "i.A."
}
],
"i.G.": [
{
"F": "i.G."
}
],
"i.O.": [
{
"F": "i.O."
}
],
"i.Tr.": [
{
"F": "i.Tr."
@ -714,6 +974,11 @@
"F": "i.V."
}
],
"i.d.R.": [
{
"F": "i.d.R."
}
],
"i.e.": [
{
"F": "i.e."
@ -721,7 +986,8 @@
],
"ich's": [
{
"F": "ich"
"F": "ich",
"L": "ich"
},
{
"F": "'s",
@ -730,7 +996,8 @@
],
"ihr's": [
{
"F": "ihr"
"F": "ihr",
"L": "ihr"
},
{
"F": "'s",
@ -757,6 +1024,11 @@
"F": "j."
}
],
"jr.": [
{
"F": "jr."
}
],
"jun.": [
{
"F": "jun."
@ -772,11 +1044,21 @@
"F": "k."
}
],
"kath.": [
{
"F": "kath."
}
],
"l.": [
{
"F": "l."
}
],
"lat.": [
{
"F": "lat."
}
],
"lt.": [
{
"F": "lt."
@ -787,11 +1069,46 @@
"F": "m."
}
],
"m.E.": [
{
"F": "m.E."
}
],
"m.M.": [
{
"F": "m.M."
}
],
"max.": [
{
"F": "max."
}
],
"min.": [
{
"F": "min."
}
],
"mind.": [
{
"F": "mind."
}
],
"mtl.": [
{
"F": "mtl."
}
],
"n.": [
{
"F": "n."
}
],
"n.Chr.": [
{
"F": "n.Chr."
}
],
"nat.": [
{
"F": "nat."
@ -807,6 +1124,31 @@
"F": "o.O"
}
],
"o.a.": [
{
"F": "o.a."
}
],
"o.g.": [
{
"F": "o.g."
}
],
"o.k.": [
{
"F": "o.k."
}
],
"o.\u00c4.": [
{
"F": "o.\u00c4."
}
],
"o.\u00e4.": [
{
"F": "o.\u00e4."
}
],
"o_O": [
{
"F": "o_O"
@ -817,6 +1159,11 @@
"F": "o_o"
}
],
"orig.": [
{
"F": "orig."
}
],
"p.": [
{
"F": "p."
@ -827,6 +1174,21 @@
"F": "p.a."
}
],
"p.s.": [
{
"F": "p.s."
}
],
"pers.": [
{
"F": "pers."
}
],
"phil.": [
{
"F": "phil."
}
],
"q.": [
{
"F": "q."
@ -847,6 +1209,11 @@
"F": "rer."
}
],
"r\u00f6m.": [
{
"F": "r\u00f6m."
}
],
"s'": [
{
"F": "s'",
@ -858,6 +1225,11 @@
"F": "s."
}
],
"s.o.": [
{
"F": "s.o."
}
],
"sen.": [
{
"F": "sen."
@ -865,23 +1237,49 @@
],
"sie's": [
{
"F": "sie"
"F": "sie",
"L": "sie"
},
{
"F": "'s",
"L": "es"
}
],
"sog.": [
{
"F": "sog."
}
],
"std.": [
{
"F": "std."
}
],
"stellv.": [
{
"F": "stellv."
}
],
"t.": [
{
"F": "t."
}
],
"t\u00e4gl.": [
{
"F": "t\u00e4gl."
}
],
"u.": [
{
"F": "u."
}
],
"u.U.": [
{
"F": "u.U."
}
],
"u.a.": [
{
"F": "u.a."
@ -892,28 +1290,75 @@
"F": "u.s.w."
}
],
"u.v.m.": [
{
"F": "u.v.m."
}
],
"unter'm": [
{
"F": "unter"
"F": "unter",
"L": "unter"
},
{
"F": "'m",
"L": "dem"
}
],
"usf.": [
{
"F": "usf."
}
],
"usw.": [
{
"F": "usw."
}
],
"uvm.": [
{
"F": "uvm."
}
],
"v.": [
{
"F": "v."
}
],
"v.Chr.": [
{
"F": "v.Chr."
}
],
"v.a.": [
{
"F": "v.a."
}
],
"v.l.n.r.": [
{
"F": "v.l.n.r."
}
],
"vgl.": [
{
"F": "vgl."
}
],
"vllt.": [
{
"F": "vllt."
}
],
"vlt.": [
{
"F": "vlt."
}
],
"vor'm": [
{
"F": "vor"
"F": "vor",
"L": "vor"
},
{
"F": "'m",
@ -932,13 +1377,19 @@
],
"wir's": [
{
"F": "wir"
"F": "wir",
"L": "wir"
},
{
"F": "'s",
"L": "es"
}
],
"wiss.": [
{
"F": "wiss."
}
],
"x.": [
{
"F": "x."
@ -969,19 +1420,60 @@
"F": "z.B."
}
],
"z.Bsp.": [
{
"F": "z.Bsp."
}
],
"z.T.": [
{
"F": "z.T."
}
],
"z.Z.": [
{
"F": "z.Z."
}
],
"z.Zt.": [
{
"F": "z.Zt."
}
],
"z.b.": [
{
"F": "z.b."
}
],
"zzgl.": [
{
"F": "zzgl."
}
],
"\u00e4.": [
{
"F": "\u00e4."
}
],
"\u00f6.": [
{
"F": "\u00f6."
}
],
"\u00f6sterr.": [
{
"F": "\u00f6sterr."
}
],
"\u00fc.": [
{
"F": "\u00fc."
}
],
"\u00fcber'm": [
{
"F": "\u00fcber"
"F": "\u00fcber",
"L": "\u00fcber"
},
{
"F": "'m",

View File

@ -13,14 +13,61 @@
;
'
«
_
''
's
'S
s
S
°
\.\.
\.\.\.
\.\.\.\.
(?<=[a-z0-9)\]"'%\)])\.
(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\.
\-\-
´
(?<=[0-9])km²
(?<=[0-9])m²
(?<=[0-9])cm²
(?<=[0-9])mm²
(?<=[0-9])km³
(?<=[0-9])m³
(?<=[0-9])cm³
(?<=[0-9])mm³
(?<=[0-9])ha
(?<=[0-9])km
(?<=[0-9])m
(?<=[0-9])cm
(?<=[0-9])mm
(?<=[0-9])µm
(?<=[0-9])nm
(?<=[0-9])yd
(?<=[0-9])in
(?<=[0-9])ft
(?<=[0-9])kg
(?<=[0-9])g
(?<=[0-9])mg
(?<=[0-9])µg
(?<=[0-9])t
(?<=[0-9])lb
(?<=[0-9])oz
(?<=[0-9])m/s
(?<=[0-9])km/h
(?<=[0-9])mph
(?<=[0-9])°C
(?<=[0-9])°K
(?<=[0-9])°F
(?<=[0-9])hPa
(?<=[0-9])Pa
(?<=[0-9])mbar
(?<=[0-9])mb
(?<=[0-9])T
(?<=[0-9])G
(?<=[0-9])M
(?<=[0-9])K
(?<=[0-9])kb

View File

@ -47,6 +47,7 @@ MOD_NAMES = [
'spacy.syntax._state',
'spacy.tokenizer',
'spacy.syntax.parser',
'spacy.syntax.nonproj',
'spacy.syntax.transition_system',
'spacy.syntax.arc_eager',
'spacy.syntax._parse_features',

View File

@ -6,4 +6,4 @@ from ..language import Language
class German(Language):
pass
lang = 'de'

View File

@ -14,6 +14,8 @@ try:
except ImportError:
import json
from .syntax import nonproj
def tags_to_entities(tags):
entities = []
@ -236,34 +238,14 @@ cdef class GoldParse:
self.heads[i] = self.gold_to_cand[annot_tuples[3][gold_i]]
self.labels[i] = annot_tuples[4][gold_i]
self.ner[i] = annot_tuples[5][gold_i]
# If we have any non-projective arcs, i.e. crossing brackets, consider
# the heads for those words missing in the gold-standard.
# This way, we can train from these sentences
cdef int w1, w2, h1, h2
if make_projective:
heads = list(self.heads)
for w1 in range(self.length):
if heads[w1] is not None:
h1 = heads[w1]
for w2 in range(w1+1, self.length):
if heads[w2] is not None:
h2 = heads[w2]
if _arcs_cross(w1, h1, w2, h2):
self.heads[w1] = None
self.labels[w1] = ''
self.heads[w2] = None
self.labels[w2] = ''
# Check there are no cycles in the dependencies, i.e. we are a tree
for w in range(self.length):
seen = set([w])
head = w
while self.heads[head] != head and self.heads[head] != None:
head = self.heads[head]
if head in seen:
raise Exception("Cycle found: %s" % seen)
seen.add(head)
cycle = nonproj.contains_cycle(self.heads)
if cycle != None:
raise Exception("Cycle found: %s" % cycle)
if make_projective:
proj_heads,_ = nonproj.PseudoProjectivity.projectivize(self.heads,self.labels)
self.heads = proj_heads
self.brackets = {}
for (gold_start, gold_end, label_str) in brackets:
@ -278,25 +260,18 @@ cdef class GoldParse:
@property
def is_projective(self):
heads = list(self.heads)
for w1 in range(self.length):
if heads[w1] is not None:
h1 = heads[w1]
for w2 in range(self.length):
if heads[w2] is not None and _arcs_cross(w1, h1, w2, heads[w2]):
return False
return True
cdef int _arcs_cross(int w1, int h1, int w2, int h2) except -1:
if w1 > h1:
w1, h1 = h1, w1
if w2 > h2:
w2, h2 = h2, w2
if w1 > w2:
w1, h1, w2, h2 = w2, h2, w1, h1
return w1 < w2 < h1 < h2 or w1 < w2 == h2 < h1
return not nonproj.is_nonproj_tree(self.heads)
def is_punct_label(label):
return label == 'P' or label.lower() == 'punct'

spacy/syntax/nonproj.pxd (new file, empty)
View File

spacy/syntax/nonproj.pyx (new file, 200 lines)
View File

@ -0,0 +1,200 @@
from copy import copy
from collections import Counter
from ..tokens.doc cimport Doc
from spacy.attrs import DEP, HEAD
def ancestors(tokenid, heads):
# returns all words going from the word up the path to the root
# the path to root cannot be longer than the number of words in the sentence
# this function ends after at most len(heads) steps
# because it would otherwise loop indefinitely on cycles
head = tokenid
cnt = 0
while heads[head] != head and cnt < len(heads):
head = heads[head]
cnt += 1
yield head
if head == None:
break
def contains_cycle(heads):
# in an acyclic tree, the path from each word following
# the head relation upwards always ends at the root node
for tokenid in range(len(heads)):
seen = set([tokenid])
for ancestor in ancestors(tokenid,heads):
if ancestor in seen:
return seen
seen.add(ancestor)
return None
def is_nonproj_arc(tokenid, heads):
# definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective
# if there is a token k, h < k < d such that h is not
# an ancestor of k. Same for h -> d, h > d
head = heads[tokenid]
if head == tokenid: # root arcs cannot be non-projective
return False
elif head == None: # unattached tokens cannot be non-projective
return False
start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head)
for k in range(start,end):
for ancestor in ancestors(k,heads):
if ancestor == None: # for unattached tokens/subtrees
break
elif ancestor == head: # normal case: k dominated by h
break
else: # head not in ancestors: d -> h is non-projective
return True
return False
def is_nonproj_tree(heads):
# a tree is non-projective if at least one arc is non-projective
return any( is_nonproj_arc(word,heads) for word in range(len(heads)) )
cdef class PseudoProjectivity:
# implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
# for doing pseudo-projective parsing
# implementation uses the HEAD decoration scheme
delimiter = '||'
@classmethod
def decompose(cls, label):
return label.partition(cls.delimiter)[::2]
@classmethod
def is_decorated(cls, label):
return label.find(cls.delimiter) != -1
@classmethod
def preprocess_training_data(cls, gold_tuples, label_freq_cutoff=30):
preprocessed = []
freqs = Counter()
for raw_text, sents in gold_tuples:
prepro_sents = []
for (ids, words, tags, heads, labels, iob), ctnts in sents:
proj_heads,deco_labels = cls.projectivize(heads,labels)
# set the label to ROOT for each root dependent
deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ]
# count label frequencies
if label_freq_cutoff > 0:
freqs.update( label for label in deco_labels if cls.is_decorated(label) )
prepro_sents.append(((ids,words,tags,proj_heads,deco_labels,iob), ctnts))
preprocessed.append((raw_text, prepro_sents))
if label_freq_cutoff > 0:
return cls._filter_labels(preprocessed,label_freq_cutoff,freqs)
return preprocessed
@classmethod
def projectivize(cls, heads, labels):
# use the algorithm by Nivre & Nilsson 2005
# assumes heads to be a proper tree, i.e. connected and cycle-free
# returns a new pair (heads,labels) which encode
# a projective and decorated tree
proj_heads = copy(heads)
smallest_np_arc = cls._get_smallest_nonproj_arc(proj_heads)
if smallest_np_arc == None: # this sentence is already projective
return proj_heads, copy(labels)
while smallest_np_arc != None:
cls._lift(smallest_np_arc, proj_heads)
smallest_np_arc = cls._get_smallest_nonproj_arc(proj_heads)
deco_labels = cls._decorate(heads, proj_heads, labels)
return proj_heads, deco_labels
@classmethod
def deprojectivize(cls, Doc tokens):
# reattach arcs with decorated labels (following HEAD scheme)
# for each decorated arc X||Y, search top-down, left-to-right,
# breadth-first until hitting a Y then make this the new head
parse = tokens.to_array([HEAD, DEP])
labels = [ tokens.vocab.strings[int(p[1])] for p in parse ]
for token in tokens:
if cls.is_decorated(token.dep_):
newlabel,headlabel = cls.decompose(token.dep_)
newhead = cls._find_new_head(token,headlabel)
parse[token.i,1] = tokens.vocab.strings[newlabel]
parse[token.i,0] = newhead.i - token.i
tokens.from_array([HEAD, DEP],parse)
@classmethod
def _decorate(cls, heads, proj_heads, labels):
# uses decoration scheme HEAD from Nivre & Nilsson 2005
assert(len(heads) == len(proj_heads) == len(labels))
deco_labels = []
for tokenid,head in enumerate(heads):
if head != proj_heads[tokenid]:
deco_labels.append('%s%s%s' % (labels[tokenid],cls.delimiter,labels[head]))
else:
deco_labels.append(labels[tokenid])
return deco_labels
@classmethod
def _get_smallest_nonproj_arc(cls, heads):
# return the smallest non-proj arc or None
# where size is defined as the distance between dep and head
# and ties are broken left to right
smallest_size = float('inf')
smallest_np_arc = None
for tokenid,head in enumerate(heads):
size = abs(tokenid-head)
if size < smallest_size and is_nonproj_arc(tokenid,heads):
smallest_size = size
smallest_np_arc = tokenid
return smallest_np_arc
@classmethod
def _lift(cls, tokenid, heads):
# reattaches a word to its grandfather
head = heads[tokenid]
ghead = heads[head]
# attach to ghead if head isn't attached to root else attach to root
heads[tokenid] = ghead if head != ghead else tokenid
@classmethod
def _find_new_head(cls, token, headlabel):
# search through the tree starting from root
# returns the id of the first descendant with the given label
# if there is none, return the current head (no change)
queue = [token.head]
while queue:
next_queue = []
for qtoken in queue:
for child in qtoken.children:
if child == token:
continue
if child.dep_ == headlabel:
return child
next_queue.append(child)
queue = next_queue
return token.head
@classmethod
def _filter_labels(cls, gold_tuples, cutoff, freqs):
# throw away infrequent decorated labels
# can't learn them reliably anyway and keeps label set smaller
filtered = []
for raw_text, sents in gold_tuples:
filtered_sents = []
for (ids, words, tags, heads, labels, iob), ctnts in sents:
filtered_labels = [ cls.decompose(label)[0] if freqs.get(label,cutoff) < cutoff else label for label in labels ]
filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts))
filtered.append((raw_text, filtered_sents))
return filtered
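
To see the projectivize step in action, here is a short worked example using the same toy tree as the test file further below (heads are absolute token indices; a root points to itself). It assumes the spacy.syntax.nonproj extension has been built:

from spacy.syntax.nonproj import PseudoProjectivity, is_nonproj_arc

nonproj_tree = [1, 2, 2, 4, 5, 2, 7, 4, 2]
labels = ['det', 'nsubj', 'root', 'det', 'dobj', 'aux', 'nsubj', 'acl', 'punct']

# the arc 4 -> 7 crosses the arc 2 -> 5, so token 7 is attached non-projectively
assert is_nonproj_arc(7, nonproj_tree)

proj_heads, deco_labels = PseudoProjectivity.projectivize(nonproj_tree, labels)
assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2]  # token 7 lifted to its grandparent, token 5
assert deco_labels[7] == 'acl||dobj'              # decorated with the label of its original head

deprojectivize() later reverses this on parsed Docs by searching below the new head for a token carrying the 'dobj' part of the decorated label.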

View File

@ -15,5 +15,6 @@ cdef class ParserModel(AveragedPerceptron):
cdef class Parser:
cdef readonly ParserModel model
cdef readonly TransitionSystem moves
cdef int _projectivize
cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil

View File

@ -12,12 +12,12 @@ from cpython.exc cimport PyErr_CheckSignals
from libc.stdint cimport uint32_t, uint64_t
from libc.string cimport memset, memcpy
from libc.stdlib cimport malloc, calloc, free
import random
import os.path
from os import path
import shutil
import json
import sys
from .nonproj import PseudoProjectivity
from cymem.cymem cimport Pool, Address
from murmurhash.mrmr cimport hash64
@ -79,9 +79,10 @@ cdef class ParserModel(AveragedPerceptron):
cdef class Parser:
def __init__(self, StringStore strings, transition_system, ParserModel model):
def __init__(self, StringStore strings, transition_system, ParserModel model, int projectivize = 0):
self.moves = transition_system
self.model = model
self._projectivize = projectivize
@classmethod
def from_dir(cls, model_dir, strings, transition_system):
@ -93,9 +94,10 @@ cdef class Parser:
moves = transition_system(strings, cfg.labels)
templates = get_templates(cfg.features)
model = ParserModel(templates)
project = cfg.projectivize if hasattr(cfg,'projectivize') else False
if path.exists(path.join(model_dir, 'model')):
model.load(path.join(model_dir, 'model'))
return cls(strings, moves, model)
return cls(strings, moves, model, project)
@classmethod
def load(cls, pkg_or_str_or_file, vocab):
@ -114,6 +116,9 @@ cdef class Parser:
tokens.is_parsed = True
# Check for KeyboardInterrupt etc. Untested
PyErr_CheckSignals()
# projectivize output
if self._projectivize:
PseudoProjectivity.deprojectivize(tokens)
def pipe(self, stream, int batch_size=1000, int n_threads=2):
cdef Pool mem = Pool()

View File

@ -143,7 +143,7 @@ cdef class Tagger:
@classmethod
def blank(cls, vocab, templates):
model = TaggerModel(N_CONTEXT_FIELDS, templates)
model = TaggerModel(templates)
return cls(vocab, model)
@classmethod
@ -153,10 +153,8 @@ cdef class Tagger:
@classmethod
def from_package(cls, pkg, vocab):
# TODO: templates.json deprecated? not present in latest package
templates = cls.default_templates()
# templates = package.load_utf8(json.load,
# 'pos', 'templates.json',
# default=cls.default_templates())
# templates = cls.default_templates()
templates = pkg.load_json(('pos', 'templates.json'), default=cls.default_templates())
model = TaggerModel(templates)
if pkg.has_file('pos', 'model'):
@ -203,7 +201,7 @@ cdef class Tagger:
nr_class=self.vocab.morphology.n_tags,
nr_feat=self.model.nr_feat)
for i in range(tokens.length):
if tokens.c[i].pos == 0:
if tokens.c[i].pos == 0:
self.model.set_featuresC(&eg.c, tokens.c, i)
self.model.set_scoresC(eg.c.scores,
eg.c.features, eg.c.nr_feat)
@ -221,7 +219,7 @@ cdef class Tagger:
def train(self, Doc tokens, object gold_tag_strs):
assert len(tokens) == len(gold_tag_strs)
for tag in gold_tag_strs:
if tag not in self.tag_names:
if tag != None and tag not in self.tag_names:
msg = ("Unrecognized gold tag: %s. tag_map.json must contain all"
"gold tags, to maintain coarse-grained mapping.")
raise ValueError(msg % tag)
@ -234,10 +232,9 @@ cdef class Tagger:
nr_feat=self.model.nr_feat)
for i in range(tokens.length):
self.model.set_featuresC(&eg.c, tokens.c, i)
eg.set_label(golds[i])
eg.costs = [ 1 if golds[i] not in (c, -1) else 0 for c in xrange(eg.nr_class) ]
self.model.set_scoresC(eg.c.scores,
eg.c.features, eg.c.nr_feat)
self.model.updateC(&eg.c)
self.vocab.morphology.assign_tag(&tokens.c[i], eg.guess)

View File

@ -0,0 +1,136 @@
from __future__ import unicode_literals
import pytest
from spacy.attrs import DEP, HEAD
import numpy
from spacy.syntax.nonproj import ancestors, contains_cycle, is_nonproj_arc, is_nonproj_tree, PseudoProjectivity
def test_ancestors():
tree = [1,2,2,4,5,2,2]
cyclic_tree = [1,2,2,4,5,3,2]
partial_tree = [1,2,2,4,5,None,2]
multirooted_tree = [3,2,0,3,3,7,7,3,7,10,7,10,11,12,18,16,18,17,12,3]
assert([ a for a in ancestors(3,tree) ] == [4,5,2])
assert([ a for a in ancestors(3,cyclic_tree) ] == [4,5,3,4,5,3,4])
assert([ a for a in ancestors(3,partial_tree) ] == [4,5,None])
assert([ a for a in ancestors(17,multirooted_tree) ] == [])
def test_contains_cycle():
tree = [1,2,2,4,5,2,2]
cyclic_tree = [1,2,2,4,5,3,2]
partial_tree = [1,2,2,4,5,None,2]
multirooted_tree = [3,2,0,3,3,7,7,3,7,10,7,10,11,12,18,16,18,17,12,3]
assert(contains_cycle(tree) == None)
assert(contains_cycle(cyclic_tree) == set([3,4,5]))
assert(contains_cycle(partial_tree) == None)
assert(contains_cycle(multirooted_tree) == None)
def test_is_nonproj_arc():
nonproj_tree = [1,2,2,4,5,2,7,4,2]
partial_tree = [1,2,2,4,5,None,7,4,2]
multirooted_tree = [3,2,0,3,3,7,7,3,7,10,7,10,11,12,18,16,18,17,12,3]
assert(is_nonproj_arc(0,nonproj_tree) == False)
assert(is_nonproj_arc(1,nonproj_tree) == False)
assert(is_nonproj_arc(2,nonproj_tree) == False)
assert(is_nonproj_arc(3,nonproj_tree) == False)
assert(is_nonproj_arc(4,nonproj_tree) == False)
assert(is_nonproj_arc(5,nonproj_tree) == False)
assert(is_nonproj_arc(6,nonproj_tree) == False)
assert(is_nonproj_arc(7,nonproj_tree) == True)
assert(is_nonproj_arc(8,nonproj_tree) == False)
assert(is_nonproj_arc(7,partial_tree) == False)
assert(is_nonproj_arc(17,multirooted_tree) == False)
assert(is_nonproj_arc(16,multirooted_tree) == True)
def test_is_nonproj_tree():
proj_tree = [1,2,2,4,5,2,7,5,2]
nonproj_tree = [1,2,2,4,5,2,7,4,2]
partial_tree = [1,2,2,4,5,None,7,4,2]
multirooted_tree = [3,2,0,3,3,7,7,3,7,10,7,10,11,12,18,16,18,17,12,3]
assert(is_nonproj_tree(proj_tree) == False)
assert(is_nonproj_tree(nonproj_tree) == True)
assert(is_nonproj_tree(partial_tree) == False)
assert(is_nonproj_tree(multirooted_tree) == True)
def deprojectivize(proj_heads, deco_labels, EN):
slen = len(proj_heads)
sent = EN.tokenizer.tokens_from_list(['whatever'] * slen)
rel_proj_heads = [ head-i for i,head in enumerate(proj_heads) ]
labelids = [ EN.vocab.strings[label] for label in deco_labels ]
parse = numpy.asarray(zip(rel_proj_heads,labelids), dtype=numpy.int32)
sent.from_array([HEAD,DEP],parse)
PseudoProjectivity.deprojectivize(sent)
parse = sent.to_array([HEAD,DEP])
deproj_heads = [ i+head for i,head in enumerate(parse[:,0]) ]
undeco_labels = [ EN.vocab.strings[int(labelid)] for labelid in parse[:,1] ]
return deproj_heads, undeco_labels
@pytest.mark.models
def test_pseudoprojectivity(EN):
tree = [1,2,2]
nonproj_tree = [1,2,2,4,5,2,7,4,2]
labels = ['det','nsubj','root','det','dobj','aux','nsubj','acl','punct']
nonproj_tree2 = [9,1,3,1,5,6,9,8,6,1,6,12,13,10,1]
labels2 = ['advmod','root','det','nsubj','advmod','det','dobj','det','nmod','aux','nmod','advmod','det','amod','punct']
assert(PseudoProjectivity.decompose('X||Y') == ('X','Y'))
assert(PseudoProjectivity.decompose('X') == ('X',''))
assert(PseudoProjectivity.is_decorated('X||Y') == True)
assert(PseudoProjectivity.is_decorated('X') == False)
PseudoProjectivity._lift(0,tree)
assert(tree == [2,2,2])
np_arc = PseudoProjectivity._get_smallest_nonproj_arc(nonproj_tree)
assert(np_arc == 7)
np_arc = PseudoProjectivity._get_smallest_nonproj_arc(nonproj_tree2)
assert(np_arc == 10)
proj_heads, deco_labels = PseudoProjectivity.projectivize(nonproj_tree,labels)
assert(proj_heads == [1,2,2,4,5,2,7,5,2])
assert(deco_labels == ['det','nsubj','root','det','dobj','aux','nsubj','acl||dobj','punct'])
deproj_heads, undeco_labels = deprojectivize(proj_heads,deco_labels,EN)
assert(deproj_heads == nonproj_tree)
assert(undeco_labels == labels)
proj_heads, deco_labels = PseudoProjectivity.projectivize(nonproj_tree2,labels2)
assert(proj_heads == [1,1,3,1,5,6,9,8,6,1,9,12,13,10,1])
assert(deco_labels == ['advmod||aux','root','det','nsubj','advmod','det','dobj','det','nmod','aux','nmod||dobj','advmod','det','amod','punct'])
deproj_heads, undeco_labels = deprojectivize(proj_heads,deco_labels,EN)
assert(deproj_heads == nonproj_tree2)
assert(undeco_labels == labels2)
# if decoration is wrong such that there is no head with the desired label
# the structure is kept and the label is undecorated
proj_heads = [1,2,2,4,5,2,7,5,2]
deco_labels = ['det','nsubj','root','det','dobj','aux','nsubj','acl||iobj','punct']
deproj_heads, undeco_labels = deprojectivize(proj_heads,deco_labels,EN)
assert(deproj_heads == proj_heads)
assert(undeco_labels == ['det','nsubj','root','det','dobj','aux','nsubj','acl','punct'])
# if there are two potential new heads, the first one is chosen even if it's wrong
proj_heads = [1,1,3,1,5,6,9,8,6,1,9,12,13,10,1]
deco_labels = ['advmod||aux','root','det','aux','advmod','det','dobj','det','nmod','aux','nmod||dobj','advmod','det','amod','punct']
deproj_heads, undeco_labels = deprojectivize(proj_heads,deco_labels,EN)
assert(deproj_heads == [3,1,3,1,5,6,9,8,6,1,6,12,13,10,1])
assert(undeco_labels == ['advmod','root','det','aux','advmod','det','dobj','det','nmod','aux','nmod','advmod','det','amod','punct'])

View File

@ -201,17 +201,9 @@ cdef class Token:
cdef int nr_iter = 0
cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge)
while ptr < self.c:
# If this head is still to the right of us, we can skip to it
# No token that's between this token and this head could be our
# child.
if (ptr.head >= 1) and (ptr + ptr.head) < self.c:
ptr += ptr.head
elif ptr + ptr.head == self.c:
if ptr + ptr.head == self.c:
yield self.doc[ptr - (self.c - self.i)]
ptr += 1
else:
ptr += 1
ptr += 1
nr_iter += 1
# This is ugly, but it's a way to guard out infinite loops
if nr_iter >= 10000000:
@ -226,16 +218,10 @@ cdef class Token:
tokens = []
cdef int nr_iter = 0
while ptr > self.c:
# If this head is still to the right of us, we can skip to it
# No token that's between this token and this head could be our
# child.
if (ptr.head < 0) and ((ptr + ptr.head) > self.c):
ptr += ptr.head
elif ptr + ptr.head == self.c:
if ptr + ptr.head == self.c:
tokens.append(self.doc[ptr - (self.c - self.i)])
ptr -= 1
else:
ptr -= 1
ptr -= 1
nr_iter += 1
if nr_iter >= 10000000:
raise RuntimeError(
"Possibly infinite loop encountered while looking for token.rights")