add tokenizer files for German, add/change code to train a German POS tagger

- add files to specify rules for German tokenization
- change generate_specials.py to generate from an external file (abbrev.de.tab)
- copy gazetteer.json from lang_data/en/

- init_model.py
	- change doc freq threshold to 0
- add train_german_tagger.py
	- expects CoNLL09-formatted input
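
Example invocation (a sketch based on the plac annotations in train_german_tagger.py below; the file names and model directory are placeholders):

    python train_german_tagger.py train.conll09 dev.conll09 models/de -i 15
    python train_german_tagger.py train.conll09 dev.conll09 models/de -e   # evaluate an existing model only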
Wolfgang Seeker 2016-02-18 13:24:20 +01:00
parent 9d8966a2c0
commit eae35e9b27
10 changed files with 1290 additions and 52 deletions

init_model.py

@ -98,7 +98,7 @@ def _read_probs(loc):
return probs, probs['-OOV-']
def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200):
def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200):
if not loc.exists():
print("Warning: Frequencies file not found")
return {}, 0.0
@ -125,7 +125,8 @@ def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200):
doc_freq = int(doc_freq)
freq = int(freq)
if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
word = literal_eval(key)
# word = literal_eval(key)
word = key
smooth_count = counts.smoother(int(freq))
log_smooth_count = math.log(smooth_count)
probs[word] = math.log(smooth_count) - log_total
@ -165,7 +166,7 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
clusters = _read_clusters(src_dir / 'clusters.txt')
probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob')
if not probs:
probs, oov_prob = _read_freqs(src_dir / 'freqs.txt.gz')
probs, oov_prob = _read_freqs(src_dir / 'freqs.txt')
if not probs:
oov_prob = -20
else:
@ -223,9 +224,8 @@ def main(lang_id, lang_data_dir, corpora_dir, model_dir):
copyfile(str(lang_data_dir / 'gazetteer.json'),
str(model_dir / 'vocab' / 'gazetteer.json'))
if (lang_data_dir / 'tag_map.json').exists():
copyfile(str(lang_data_dir / 'tag_map.json'),
str(model_dir / 'vocab' / 'tag_map.json'))
copyfile(str(lang_data_dir / 'tag_map.json'),
str(model_dir / 'vocab' / 'tag_map.json'))
if (lang_data_dir / 'lemma_rules.json').exists():
copyfile(str(lang_data_dir / 'lemma_rules.json'),

train_german_tagger.py (new file)

@ -0,0 +1,160 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals
import os
from os import path
import shutil
import io
import random
import time
import gzip
import ujson
import plac
import cProfile
import pstats
import spacy.util
from spacy.de import German
from spacy.gold import GoldParse
from spacy.tagger import Tagger
from spacy.scorer import PRFScore
from spacy.tagger import P2_orth, P2_cluster, P2_shape, P2_prefix, P2_suffix, P2_pos, P2_lemma, P2_flags
from spacy.tagger import P1_orth, P1_cluster, P1_shape, P1_prefix, P1_suffix, P1_pos, P1_lemma, P1_flags
from spacy.tagger import W_orth, W_cluster, W_shape, W_prefix, W_suffix, W_pos, W_lemma, W_flags
from spacy.tagger import N1_orth, N1_cluster, N1_shape, N1_prefix, N1_suffix, N1_pos, N1_lemma, N1_flags
from spacy.tagger import N2_orth, N2_cluster, N2_shape, N2_prefix, N2_suffix, N2_pos, N2_lemma, N2_flags, N_CONTEXT_FIELDS
def default_templates():
return spacy.tagger.Tagger.default_templates()
def default_templates_without_clusters():
return (
(W_orth,),
(P1_lemma, P1_pos),
(P2_lemma, P2_pos),
(N1_orth,),
(N2_orth,),
(W_suffix,),
(W_prefix,),
(P1_pos,),
(P2_pos,),
(P1_pos, P2_pos),
(P1_pos, W_orth),
(P1_suffix,),
(N1_suffix,),
(W_shape,),
(W_flags,),
(N1_flags,),
(N2_flags,),
(P1_flags,),
(P2_flags,),
)
def make_tagger(vocab, templates):
model = spacy.tagger.TaggerModel(templates)
return spacy.tagger.Tagger(vocab,model)
def read_conll(file_):
def sentences():
words, tags = [], []
for line in file_:
line = line.strip()
if line:
word, tag = line.split('\t')[1::3][:2] # get column 1 and 4 (CoNLL09)
words.append(word)
tags.append(tag)
elif words:
yield words, tags
words, tags = [], []
if words:
yield words, tags
return [ s for s in sentences() ]
def score_model(score, nlp, words, gold_tags):
tokens = nlp.tokenizer.tokens_from_list(words)
assert(len(tokens) == len(gold_tags))
nlp.tagger(tokens)
for token, gold_tag in zip(tokens,gold_tags):
score.score_set(set([token.tag_]),set([gold_tag]))
def train(Language, train_sents, dev_sents, model_dir, n_iter=15, seed=21):
# make shuffling deterministic
random.seed(seed)
# set up directory for model
pos_model_dir = path.join(model_dir, 'pos')
if path.exists(pos_model_dir):
shutil.rmtree(pos_model_dir)
os.mkdir(pos_model_dir)
nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
nlp.tagger = make_tagger(nlp.vocab,default_templates())
print("Itn.\ttrain acc %\tdev acc %")
for itn in range(n_iter):
# train on train set
#train_acc = PRFScore()
correct, total = 0., 0.
for words, gold_tags in train_sents:
tokens = nlp.tokenizer.tokens_from_list(words)
correct += nlp.tagger.train(tokens, gold_tags)
total += len(words)
train_acc = correct/total
# test on dev set
dev_acc = PRFScore()
for words, gold_tags in dev_sents:
score_model(dev_acc, nlp, words, gold_tags)
random.shuffle(train_sents)
print('%d:\t%6.2f\t%6.2f' % (itn, 100*train_acc, 100*dev_acc.precision))
print('end training')
nlp.end_training(model_dir)
print('done')
@plac.annotations(
train_loc=("Location of CoNLL 09 formatted training file"),
dev_loc=("Location of CoNLL 09 formatted development file"),
model_dir=("Location of output model directory"),
eval_only=("Skip training, and only evaluate", "flag", "e", bool),
n_iter=("Number of training iterations", "option", "i", int),
)
def main(train_loc, dev_loc, model_dir, eval_only=False, n_iter=15):
# training
if not eval_only:
with io.open(train_loc, 'r', encoding='utf8') as trainfile_, \
io.open(dev_loc, 'r', encoding='utf8') as devfile_:
train_sents = read_conll(trainfile_)
dev_sents = read_conll(devfile_)
train(German, train_sents, dev_sents, model_dir, n_iter=n_iter)
# testing
with io.open(dev_loc, 'r', encoding='utf8') as file_:
dev_sents = read_conll(file_)
nlp = German(data_dir=model_dir)
dev_acc = PRFScore()
for words, gold_tags in dev_sents:
score_model(dev_acc, nlp, words, gold_tags)
print('POS: %6.2f %%' % (100*dev_acc.precision))
if __name__ == '__main__':
plac.call(main)
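
For reference, a minimal sketch of the input read_conll expects: one tab-separated CoNLL09 token row per line, sentences separated by blank lines, with only the FORM and POS columns (indices 1 and 4 after splitting on tabs) being used.

    # hypothetical CoNLL09 fragment (trailing columns omitted for brevity):
    #   1	Der	der	der	ART	ART
    #   2	Hund	Hund	Hund	NN	NN
    #
    # read_conll would yield one sentence: (['Der', 'Hund'], ['ART', 'NN'])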

lang_data/de/abbrev.de.tab (new file)

@ -0,0 +1,319 @@
# surface form lemma pos
# multiple values are separated by |
# empty lines and lines starting with # are being ignored
'' ''
\") \")
\n \n <nl> SP
\t \t <tab> SP
<space> SP
# example: Wie geht's?
's 's es
'S 'S es
# example: Haste mal 'nen Euro?
'n 'n ein
'ne 'ne eine
'nen 'nen einen
# example: Kommen S nur herein!
s' s' sie
S' S' sie
# example: Da haben wir's!
ich's ich|'s ich|es
du's du|'s du|es
er's er|'s er|es
sie's sie|'s sie|es
wir's wir|'s wir|es
ihr's ihr|'s ihr|es
# example: Die katze auf'm dach.
auf'm auf|'m auf|dem
unter'm unter|'m unter|dem
über'm über|'m über|dem
vor'm vor|'m vor|dem
hinter'm hinter|'m hinter|dem
# persons
B.A. B.A.
B.Sc. B.Sc.
Dipl. Dipl.
Dipl.-Ing. Dipl.-Ing.
Dr. Dr.
Fr. Fr.
Frl. Frl.
Hr. Hr.
Hrn. Hrn.
Frl. Frl.
Prof. Prof.
St. St.
Hrgs. Hrgs.
Hg. Hg.
a.Z. a.Z.
a.D. a.D.
h.c. h.c.
Jr. Jr.
jr. jr.
jun. jun.
sen. sen.
rer. rer.
Ing. Ing.
M.A. M.A.
Mr. Mr.
M.Sc. M.Sc.
nat. nat.
phil. phil.
# companies
Co. Co.
co. co.
Cie. Cie.
A.G. A.G.
G.m.b.H. G.m.b.H.
i.G. i.G.
e.V. e.V.
# popular german abbreviations
Abb. Abb.
Abk. Abk.
Abs. Abs.
Abt. Abt.
abzgl. abzgl.
allg. allg.
a.M. a.M.
Bd. Bd.
betr. betr.
Betr. Betr.
Biol. Biol.
biol. biol.
Bf. Bf.
Bhf. Bhf.
Bsp. Bsp.
bspw. bspw.
bzgl. bzgl.
bzw. bzw.
d.h. d.h.
dgl. dgl.
ebd. ebd.
ehem. ehem.
eigtl. eigtl.
entspr. entspr.
erm. erm.
ev. ev.
evtl. evtl.
Fa. Fa.
Fam. Fam.
geb. geb.
Gebr. Gebr.
gem. gem.
ggf. ggf.
ggü. ggü.
ggfs. ggfs.
gegr. gegr.
Hbf. Hbf.
Hrsg. Hrsg.
hrsg. hrsg.
i.A. i.A.
i.d.R. i.d.R.
inkl. inkl.
insb. insb.
i.O. i.O.
i.Tr. i.Tr.
i.V. i.V.
jur. jur.
kath. kath.
K.O. K.O.
lt. lt.
max. max.
m.E. m.E.
m.M. m.M.
mtl. mtl.
min. min.
mind. mind.
MwSt. MwSt.
Nr. Nr.
o.a. o.a.
o.ä. o.ä.
o.Ä. o.Ä.
o.g. o.g.
o.k. o.k.
O.K. O.K.
Orig. Orig.
orig. orig.
pers. pers.
Pkt. Pkt.
Red. Red.
röm. röm.
s.o. s.o.
sog. sog.
std. std.
stellv. stellv.
Str. Str.
tägl. tägl.
Tel. Tel.
u.a. u.a.
usf. usf.
u.s.w. u.s.w.
usw. usw.
u.U. u.U.
u.v.m. u.v.m.
uvm. uvm.
v.a. v.a.
vgl. vgl.
vllt. vllt.
v.l.n.r. v.l.n.r.
vlt. vlt.
Vol. Vol.
wiss. wiss.
Univ. Univ.
z.B. z.B.
z.b. z.b.
z.Bsp. z.Bsp.
z.T. z.T.
z.Z. z.Z.
zzgl. zzgl.
z.Zt. z.Zt.
# popular latin abbreviations
vs. vs.
adv. adv.
Chr. Chr.
A.C. A.C.
A.D. A.D.
e.g. e.g.
i.e. i.e.
al. al.
p.a. p.a.
P.S. P.S.
q.e.d. q.e.d.
R.I.P. R.I.P.
etc. etc.
incl. incl.
ca. ca.
n.Chr. n.Chr.
p.s. p.s.
v.Chr. v.Chr.
# popular english abbreviations
D.C. D.C.
N.Y. N.Y.
N.Y.C. N.Y.C.
U.S. U.S.
U.S.A. U.S.A.
L.A. L.A.
U.S.S. U.S.S.
# dates & time
Jan. Jan.
Feb. Feb.
Mrz. Mrz.
Mär. Mär.
Apr. Apr.
Jun. Jun.
Jul. Jul.
Aug. Aug.
Sep. Sep.
Sept. Sept.
Okt. Okt.
Nov. Nov.
Dez. Dez.
Mo. Mo.
Di. Di.
Mi. Mi.
Do. Do.
Fr. Fr.
Sa. Sa.
So. So.
Std. Std.
Jh. Jh.
Jhd. Jhd.
# numbers
Tsd. Tsd.
Mio. Mio.
Mrd. Mrd.
# countries & languages
engl. engl.
frz. frz.
lat. lat.
österr. österr.
# smileys
:) :)
<3 <3
;) ;)
(: (:
:( :(
-_- -_-
=) =)
:/ :/
:> :>
;-) ;-)
:Y :Y
:P :P
:-P :-P
:3 :3
=3 =3
xD xD
^_^ ^_^
=] =]
=D =D
<333 <333
:)) :))
:0 :0
-__- -__-
xDD xDD
o_o o_o
o_O o_O
V_V V_V
=[[ =[[
<33 <33
;p ;p
;D ;D
;-p ;-p
;( ;(
:p :p
:] :]
:O :O
:-/ :-/
:-) :-)
:((( :(((
:(( :((
:') :')
(^_^) (^_^)
(= (=
o.O o.O
# single letters
a. a.
b. b.
c. c.
d. d.
e. e.
f. f.
g. g.
h. h.
i. i.
j. j.
k. k.
l. l.
m. m.
n. n.
o. o.
p. p.
q. q.
r. r.
s. s.
t. t.
u. u.
v. v.
w. w.
x. x.
y. y.
z. z.
ä. ä.
ö. ö.
ü. ü.

lang_data/de/gazetteer.json (new file)

@ -0,0 +1,194 @@
{
"Reddit": [
"PRODUCT",
{},
[
[{"lower": "reddit"}]
]
],
"SeptemberElevenAttacks": [
"EVENT",
{},
[
[
{"orth": "9/11"}
],
[
{"lower": "september"},
{"orth": "11"}
]
]
],
"Linux": [
"PRODUCT",
{},
[
[{"lower": "linux"}]
]
],
"Haskell": [
"PRODUCT",
{},
[
[{"lower": "haskell"}]
]
],
"HaskellCurry": [
"PERSON",
{},
[
[
{"lower": "haskell"},
{"lower": "curry"}
]
]
],
"Javascript": [
"PRODUCT",
{},
[
[{"lower": "javascript"}]
]
],
"CSS": [
"PRODUCT",
{},
[
[{"lower": "css"}],
[{"lower": "css3"}]
]
],
"displaCy": [
"PRODUCT",
{},
[
[{"lower": "displacy"}]
]
],
"spaCy": [
"PRODUCT",
{},
[
[{"orth": "spaCy"}]
]
],
"HTML": [
"PRODUCT",
{},
[
[{"lower": "html"}],
[{"lower": "html5"}]
]
],
"Python": [
"PRODUCT",
{},
[
[{"orth": "Python"}]
]
],
"Ruby": [
"PRODUCT",
{},
[
[{"orth": "Ruby"}]
]
],
"Digg": [
"PRODUCT",
{},
[
[{"lower": "digg"}]
]
],
"FoxNews": [
"ORG",
{},
[
[{"orth": "Fox"}],
[{"orth": "News"}]
]
],
"Google": [
"ORG",
{},
[
[{"lower": "google"}]
]
],
"Mac": [
"PRODUCT",
{},
[
[{"lower": "mac"}]
]
],
"Wikipedia": [
"PRODUCT",
{},
[
[{"lower": "wikipedia"}]
]
],
"Windows": [
"PRODUCT",
{},
[
[{"orth": "Windows"}]
]
],
"Dell": [
"ORG",
{},
[
[{"lower": "dell"}]
]
],
"Facebook": [
"ORG",
{},
[
[{"lower": "facebook"}]
]
],
"Blizzard": [
"ORG",
{},
[
[{"orth": "Blizzard"}]
]
],
"Ubuntu": [
"ORG",
{},
[
[{"orth": "Ubuntu"}]
]
],
"Youtube": [
"PRODUCT",
{},
[
[{"lower": "youtube"}]
]
],
"false_positives": [
null,
{},
[
[{"orth": "Shit"}],
[{"orth": "Weed"}],
[{"orth": "Cool"}],
[{"orth": "Btw"}],
[{"orth": "Bah"}],
[{"orth": "Bullshit"}],
[{"orth": "Lol"}],
[{"orth": "Yo"}, {"lower": "dawg"}],
[{"orth": "Yay"}],
[{"orth": "Ahh"}],
[{"orth": "Yea"}],
[{"orth": "Bah"}]
]
]
}
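
Each gazetteer entry maps an ID to [entity label, attribute dict, list of token-pattern alternatives]; per the commit message the file is copied verbatim from lang_data/en/ for now. A hypothetical German-specific entry in the same layout, written as a Python dict purely for illustration:

    # hypothetical entry, same [label, attrs, patterns] layout as the JSON above
    entry = {
        "Bundestag": [
            "ORG",                        # entity label
            {},                           # extra attributes (none)
            [[{"lower": "bundestag"}]]    # one single-token pattern, matched on the lowercased form
        ]
    }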

generate_specials.py

@ -1,5 +1,7 @@
# coding=utf8
import json
import io
import itertools
contractions = {}
@ -262,14 +264,30 @@ def get_token_properties(token, capitalize=False, remove_contractions=False):
props["F"] = token
return props
def create_entry(token, endings, capitalize=False, remove_contractions=False):
properties = []
properties.append(get_token_properties(token, capitalize=capitalize, remove_contractions=remove_contractions))
for e in endings:
properties.append(get_token_properties(e, remove_contractions=remove_contractions))
return properties
FIELDNAMES = ['F','L','pos']
def read_hardcoded(stream):
hc_specials = {}
for line in stream:
line = line.strip()
if line.startswith('#') or not line:
continue
key,_,rest = line.partition('\t')
values = []
for annotation in zip(*[ e.split('|') for e in rest.split('\t') ]):
values.append({ k:v for k,v in itertools.izip_longest(FIELDNAMES,annotation) if v })
hc_specials[key] = values
return hc_specials
def generate_specials():
specials = {}
@ -303,7 +321,10 @@ def generate_specials():
specials[special] = create_entry(token, endings, capitalize=True, remove_contractions=True)
# add in hardcoded specials
specials = dict(specials, **hardcoded_specials)
# changed it so it generates them from a file
with io.open('abbrev.de.tab','r',encoding='utf8') as abbrev_:
hc_specials = read_hardcoded(abbrev_)
specials = dict(specials, **hc_specials)
return specials
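
For illustration, a sketch of what read_hardcoded produces for two lines taken verbatim from abbrev.de.tab (assumes read_hardcoded and FIELDNAMES from this module are in scope); the output matches the regenerated specials entries further down:

    import io
    sample = io.StringIO(u"Dr.\tDr.\nauf'm\tauf|'m\tauf|dem\n")
    print(read_hardcoded(sample))
    # => {u"Dr.": [{u'F': u'Dr.'}],
    #     u"auf'm": [{u'F': u'auf', u'L': u'auf'}, {u'F': u"'m", u'L': u'dem'}]}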

(tokenizer infix rules)

@ -1,3 +1,6 @@
\.\.\.
(?<=[a-z])\.(?=[A-Z])
(?<=[a-zA-Z])-(?=[a-zA-z])
(?<=[a-zöäüßA-ZÖÄÜ"]):(?=[a-zöäüßA-ZÖÄÜ])
(?<=[a-zöäüßA-ZÖÄÜ"])>(?=[a-zöäüßA-ZÖÄÜ])
(?<=[a-zöäüßA-ZÖÄÜ"])<(?=[a-zöäüßA-ZÖÄÜ])
(?<=[a-zöäüßA-ZÖÄÜ"])=(?=[a-zöäüßA-ZÖÄÜ])

(tokenizer prefix rules)

@ -5,6 +5,7 @@
{
*
<
>
$
£
@ -20,3 +21,7 @@ a-
....
...
»
_
§

(tokenizer special cases)

@ -1,27 +1,4 @@
{
"\t": [
{
"F": "\t",
"pos": "SP"
}
],
"\n": [
{
"F": "\n",
"pos": "SP"
}
],
" ": [
{
"F": " ",
"pos": "SP"
}
],
"\")": [
{
"F": "\")"
}
],
"''": [
{
"F": "''"
@ -217,6 +194,11 @@
"F": "<333"
}
],
"<space>": [
{
"F": "SP"
}
],
"=)": [
{
"F": "=)"
@ -267,6 +249,16 @@
"F": "Abk."
}
],
"Abs.": [
{
"F": "Abs."
}
],
"Abt.": [
{
"F": "Abt."
}
],
"Apr.": [
{
"F": "Apr."
@ -277,6 +269,26 @@
"F": "Aug."
}
],
"B.A.": [
{
"F": "B.A."
}
],
"B.Sc.": [
{
"F": "B.Sc."
}
],
"Bd.": [
{
"F": "Bd."
}
],
"Betr.": [
{
"F": "Betr."
}
],
"Bf.": [
{
"F": "Bf."
@ -292,6 +304,11 @@
"F": "Biol."
}
],
"Bsp.": [
{
"F": "Bsp."
}
],
"Chr.": [
{
"F": "Chr."
@ -342,6 +359,16 @@
"F": "Dr."
}
],
"Fa.": [
{
"F": "Fa."
}
],
"Fam.": [
{
"F": "Fam."
}
],
"Feb.": [
{
"F": "Feb."
@ -387,6 +414,16 @@
"F": "Hrgs."
}
],
"Hrn.": [
{
"F": "Hrn."
}
],
"Hrsg.": [
{
"F": "Hrsg."
}
],
"Ing.": [
{
"F": "Ing."
@ -397,11 +434,21 @@
"F": "Jan."
}
],
"Jh.": [
{
"F": "Jh."
}
],
"Jhd.": [
{
"F": "Jhd."
}
],
"Jr.": [
{
"F": "Jr."
}
],
"Jul.": [
{
"F": "Jul."
@ -412,21 +459,61 @@
"F": "Jun."
}
],
"K.O.": [
{
"F": "K.O."
}
],
"L.A.": [
{
"F": "L.A."
}
],
"M.A.": [
{
"F": "M.A."
}
],
"M.Sc.": [
{
"F": "M.Sc."
}
],
"Mi.": [
{
"F": "Mi."
}
],
"Mio.": [
{
"F": "Mio."
}
],
"Mo.": [
{
"F": "Mo."
}
],
"Mr.": [
{
"F": "Mr."
}
],
"Mrd.": [
{
"F": "Mrd."
}
],
"Mrz.": [
{
"F": "Mrz."
}
],
"MwSt.": [
{
"F": "MwSt."
}
],
"M\u00e4r.": [
{
"F": "M\u00e4r."
@ -452,16 +539,31 @@
"F": "Nr."
}
],
"O.K.": [
{
"F": "O.K."
}
],
"Okt.": [
{
"F": "Okt."
}
],
"Orig.": [
{
"F": "Orig."
}
],
"P.S.": [
{
"F": "P.S."
}
],
"Pkt.": [
{
"F": "Pkt."
}
],
"Prof.": [
{
"F": "Prof."
@ -472,6 +574,11 @@
"F": "R.I.P."
}
],
"Red.": [
{
"F": "Red."
}
],
"S'": [
{
"F": "S'",
@ -503,6 +610,41 @@
"F": "St."
}
],
"Std.": [
{
"F": "Std."
}
],
"Str.": [
{
"F": "Str."
}
],
"Tel.": [
{
"F": "Tel."
}
],
"Tsd.": [
{
"F": "Tsd."
}
],
"U.S.": [
{
"F": "U.S."
}
],
"U.S.A.": [
{
"F": "U.S.A."
}
],
"U.S.S.": [
{
"F": "U.S.S."
}
],
"Univ.": [
{
"F": "Univ."
@ -513,6 +655,30 @@
"F": "V_V"
}
],
"Vol.": [
{
"F": "Vol."
}
],
"\\\")": [
{
"F": "\\\")"
}
],
"\\n": [
{
"F": "\\n",
"L": "<nl>",
"pos": "SP"
}
],
"\\t": [
{
"F": "\\t",
"L": "<tab>",
"pos": "SP"
}
],
"^_^": [
{
"F": "^_^"
@ -528,6 +694,11 @@
"F": "a.D."
}
],
"a.M.": [
{
"F": "a.M."
}
],
"a.Z.": [
{
"F": "a.Z."
@ -548,9 +719,15 @@
"F": "al."
}
],
"allg.": [
{
"F": "allg."
}
],
"auf'm": [
{
"F": "auf"
"F": "auf",
"L": "auf"
},
{
"F": "'m",
@ -572,11 +749,31 @@
"F": "biol."
}
],
"bspw.": [
{
"F": "bspw."
}
],
"bzgl.": [
{
"F": "bzgl."
}
],
"bzw.": [
{
"F": "bzw."
}
],
"c.": [
{
"F": "c."
}
],
"ca.": [
{
"F": "ca."
}
],
"co.": [
{
"F": "co."
@ -587,9 +784,20 @@
"F": "d."
}
],
"d.h.": [
{
"F": "d.h."
}
],
"dgl.": [
{
"F": "dgl."
}
],
"du's": [
{
"F": "du"
"F": "du",
"L": "du"
},
{
"F": "'s",
@ -611,19 +819,35 @@
"F": "e.g."
}
],
"ebd.": [
{
"F": "ebd."
}
],
"ehem.": [
{
"F": "ehem."
}
],
"eigtl.": [
{
"F": "eigtl."
}
],
"engl.": [
{
"F": "engl."
}
],
"entspr.": [
{
"F": "entspr."
}
],
"er's": [
{
"F": "er"
"F": "er",
"L": "er"
},
{
"F": "'s",
@ -640,11 +864,26 @@
"F": "etc."
}
],
"ev.": [
{
"F": "ev."
}
],
"evtl.": [
{
"F": "evtl."
}
],
"f.": [
{
"F": "f."
}
],
"frz.": [
{
"F": "frz."
}
],
"g.": [
{
"F": "g."
@ -660,6 +899,11 @@
"F": "gegr."
}
],
"gem.": [
{
"F": "gem."
}
],
"ggf.": [
{
"F": "ggf."
@ -687,23 +931,39 @@
],
"hinter'm": [
{
"F": "hinter"
"F": "hinter",
"L": "hinter"
},
{
"F": "'m",
"L": "dem"
}
],
"hrsg.": [
{
"F": "hrsg."
}
],
"i.": [
{
"F": "i."
}
],
"i.A.": [
{
"F": "i.A."
}
],
"i.G.": [
{
"F": "i.G."
}
],
"i.O.": [
{
"F": "i.O."
}
],
"i.Tr.": [
{
"F": "i.Tr."
@ -714,6 +974,11 @@
"F": "i.V."
}
],
"i.d.R.": [
{
"F": "i.d.R."
}
],
"i.e.": [
{
"F": "i.e."
@ -721,7 +986,8 @@
],
"ich's": [
{
"F": "ich"
"F": "ich",
"L": "ich"
},
{
"F": "'s",
@ -730,7 +996,8 @@
],
"ihr's": [
{
"F": "ihr"
"F": "ihr",
"L": "ihr"
},
{
"F": "'s",
@ -757,6 +1024,11 @@
"F": "j."
}
],
"jr.": [
{
"F": "jr."
}
],
"jun.": [
{
"F": "jun."
@ -772,11 +1044,21 @@
"F": "k."
}
],
"kath.": [
{
"F": "kath."
}
],
"l.": [
{
"F": "l."
}
],
"lat.": [
{
"F": "lat."
}
],
"lt.": [
{
"F": "lt."
@ -787,11 +1069,46 @@
"F": "m."
}
],
"m.E.": [
{
"F": "m.E."
}
],
"m.M.": [
{
"F": "m.M."
}
],
"max.": [
{
"F": "max."
}
],
"min.": [
{
"F": "min."
}
],
"mind.": [
{
"F": "mind."
}
],
"mtl.": [
{
"F": "mtl."
}
],
"n.": [
{
"F": "n."
}
],
"n.Chr.": [
{
"F": "n.Chr."
}
],
"nat.": [
{
"F": "nat."
@ -807,6 +1124,31 @@
"F": "o.O"
}
],
"o.a.": [
{
"F": "o.a."
}
],
"o.g.": [
{
"F": "o.g."
}
],
"o.k.": [
{
"F": "o.k."
}
],
"o.\u00c4.": [
{
"F": "o.\u00c4."
}
],
"o.\u00e4.": [
{
"F": "o.\u00e4."
}
],
"o_O": [
{
"F": "o_O"
@ -817,6 +1159,11 @@
"F": "o_o"
}
],
"orig.": [
{
"F": "orig."
}
],
"p.": [
{
"F": "p."
@ -827,6 +1174,21 @@
"F": "p.a."
}
],
"p.s.": [
{
"F": "p.s."
}
],
"pers.": [
{
"F": "pers."
}
],
"phil.": [
{
"F": "phil."
}
],
"q.": [
{
"F": "q."
@ -847,6 +1209,11 @@
"F": "rer."
}
],
"r\u00f6m.": [
{
"F": "r\u00f6m."
}
],
"s'": [
{
"F": "s'",
@ -858,6 +1225,11 @@
"F": "s."
}
],
"s.o.": [
{
"F": "s.o."
}
],
"sen.": [
{
"F": "sen."
@ -865,23 +1237,49 @@
],
"sie's": [
{
"F": "sie"
"F": "sie",
"L": "sie"
},
{
"F": "'s",
"L": "es"
}
],
"sog.": [
{
"F": "sog."
}
],
"std.": [
{
"F": "std."
}
],
"stellv.": [
{
"F": "stellv."
}
],
"t.": [
{
"F": "t."
}
],
"t\u00e4gl.": [
{
"F": "t\u00e4gl."
}
],
"u.": [
{
"F": "u."
}
],
"u.U.": [
{
"F": "u.U."
}
],
"u.a.": [
{
"F": "u.a."
@ -892,28 +1290,75 @@
"F": "u.s.w."
}
],
"u.v.m.": [
{
"F": "u.v.m."
}
],
"unter'm": [
{
"F": "unter"
"F": "unter",
"L": "unter"
},
{
"F": "'m",
"L": "dem"
}
],
"usf.": [
{
"F": "usf."
}
],
"usw.": [
{
"F": "usw."
}
],
"uvm.": [
{
"F": "uvm."
}
],
"v.": [
{
"F": "v."
}
],
"v.Chr.": [
{
"F": "v.Chr."
}
],
"v.a.": [
{
"F": "v.a."
}
],
"v.l.n.r.": [
{
"F": "v.l.n.r."
}
],
"vgl.": [
{
"F": "vgl."
}
],
"vllt.": [
{
"F": "vllt."
}
],
"vlt.": [
{
"F": "vlt."
}
],
"vor'm": [
{
"F": "vor"
"F": "vor",
"L": "vor"
},
{
"F": "'m",
@ -932,13 +1377,19 @@
],
"wir's": [
{
"F": "wir"
"F": "wir",
"L": "wir"
},
{
"F": "'s",
"L": "es"
}
],
"wiss.": [
{
"F": "wiss."
}
],
"x.": [
{
"F": "x."
@ -969,19 +1420,60 @@
"F": "z.B."
}
],
"z.Bsp.": [
{
"F": "z.Bsp."
}
],
"z.T.": [
{
"F": "z.T."
}
],
"z.Z.": [
{
"F": "z.Z."
}
],
"z.Zt.": [
{
"F": "z.Zt."
}
],
"z.b.": [
{
"F": "z.b."
}
],
"zzgl.": [
{
"F": "zzgl."
}
],
"\u00e4.": [
{
"F": "\u00e4."
}
],
"\u00f6.": [
{
"F": "\u00f6."
}
],
"\u00f6sterr.": [
{
"F": "\u00f6sterr."
}
],
"\u00fc.": [
{
"F": "\u00fc."
}
],
"\u00fcber'm": [
{
"F": "\u00fcber"
"F": "\u00fcber",
"L": "\u00fcber"
},
{
"F": "'m",

(tokenizer suffix rules)

@ -13,14 +13,61 @@
;
'
«
_
''
's
'S
s
S
°
\.\.
\.\.\.
\.\.\.\.
(?<=[a-z0-9)\]"'%\)])\.
(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\.
\-\-
´
(?<=[0-9])km²
(?<=[0-9])m²
(?<=[0-9])cm²
(?<=[0-9])mm²
(?<=[0-9])km³
(?<=[0-9])m³
(?<=[0-9])cm³
(?<=[0-9])mm³
(?<=[0-9])ha
(?<=[0-9])km
(?<=[0-9])m
(?<=[0-9])cm
(?<=[0-9])mm
(?<=[0-9])µm
(?<=[0-9])nm
(?<=[0-9])yd
(?<=[0-9])in
(?<=[0-9])ft
(?<=[0-9])kg
(?<=[0-9])g
(?<=[0-9])mg
(?<=[0-9])µg
(?<=[0-9])t
(?<=[0-9])lb
(?<=[0-9])oz
(?<=[0-9])m/s
(?<=[0-9])km/h
(?<=[0-9])mph
(?<=[0-9])°C
(?<=[0-9])°K
(?<=[0-9])°F
(?<=[0-9])hPa
(?<=[0-9])Pa
(?<=[0-9])mbar
(?<=[0-9])mb
(?<=[0-9])T
(?<=[0-9])G
(?<=[0-9])M
(?<=[0-9])K
(?<=[0-9])kb
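
Similarly, a standalone sketch of one of the new unit suffixes: the lookbehind means the unit is only split off when it directly follows a digit (the trailing $ is added here only to anchor the standalone illustration at the end of the candidate token):

    import re
    unit_suffix = re.compile(r'(?<=[0-9])km/h$')
    print(unit_suffix.search(u'130km/h').group())   # 'km/h', so '130km/h' would split into '130' + 'km/h'
    print(unit_suffix.search(u'Autobahnkm/h'))      # None, no digit before the unit, no split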

(spaCy Tagger class, Cython source)

@ -153,10 +153,8 @@ cdef class Tagger:
@classmethod
def from_package(cls, pkg, vocab):
# TODO: templates.json deprecated? not present in latest package
templates = cls.default_templates()
# templates = package.load_utf8(json.load,
# 'pos', 'templates.json',
# default=cls.default_templates())
# templates = cls.default_templates()
templates = pkg.load_json(('pos', 'templates.json'), default=cls.default_templates())
model = TaggerModel(templates)
if pkg.has_file('pos', 'model'):
@ -203,7 +201,7 @@ cdef class Tagger:
nr_class=self.vocab.morphology.n_tags,
nr_feat=self.model.nr_feat)
for i in range(tokens.length):
if tokens.c[i].pos == 0:
if tokens.c[i].pos == 0:
self.model.set_featuresC(&eg.c, tokens.c, i)
self.model.set_scoresC(eg.c.scores,
eg.c.features, eg.c.nr_feat)
@ -221,7 +219,7 @@ cdef class Tagger:
def train(self, Doc tokens, object gold_tag_strs):
assert len(tokens) == len(gold_tag_strs)
for tag in gold_tag_strs:
if tag not in self.tag_names:
if tag != None and tag not in self.tag_names:
msg = ("Unrecognized gold tag: %s. tag_map.json must contain all"
"gold tags, to maintain coarse-grained mapping.")
raise ValueError(msg % tag)
@ -234,10 +232,9 @@ cdef class Tagger:
nr_feat=self.model.nr_feat)
for i in range(tokens.length):
self.model.set_featuresC(&eg.c, tokens.c, i)
eg.set_label(golds[i])
eg.costs = [ 1 if golds[i] not in (c, -1) else 0 for c in xrange(eg.nr_class) ]
self.model.set_scoresC(eg.c.scores,
eg.c.features, eg.c.nr_feat)
self.model.updateC(&eg.c)
self.vocab.morphology.assign_tag(&tokens.c[i], eg.guess)
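
A minimal pure-Python sketch of the new cost assignment that replaces eg.set_label: the gold class costs 0, every other class costs 1, and a missing gold tag (assumed here to be encoded as -1, as in the surrounding training code) makes every class cost-free, so such tokens do not penalize any prediction:

    def tag_costs(gold, nr_class):
        # cost 0 for the gold class; gold == -1 (no gold tag) makes every class cost 0
        return [1 if gold not in (c, -1) else 0 for c in range(nr_class)]

    print(tag_costs(2, 5))   # [1, 1, 0, 1, 1]
    print(tag_costs(-1, 5))  # [0, 0, 0, 0, 0]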