add tokenizer files for German, add/change code to train German pos tagger

- add files to specify rules for German tokenization - change generate_specials.py to generate from an external file (abbrev.de.tab) - copy gazetteer.json from lang_data/en/ - init_model.py - change doc freq threshold to 0 - add train_german_tagger.py - expects conll09-formatted input
2025-10-22 11:44:16 +03:00 · 2016-02-18 13:24:20 +01:00 · 2016-02-18 13:24:20 +01:00 · eae35e9b27
commit eae35e9b27
parent 9d8966a2c0
10 changed files with 1290 additions and 52 deletions
--- a/bin/init_model.py
+++ b/bin/init_model.py
@ -98,7 +98,7 @@ def _read_probs(loc):
    return probs, probs['-OOV-']
-def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200):
+def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200):
    if not loc.exists():
        print("Warning: Frequencies file not found")
        return {}, 0.0
@ -125,7 +125,8 @@ def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200):
        doc_freq = int(doc_freq)
        freq = int(freq)
        if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
-            word = literal_eval(key)
+#            word = literal_eval(key)
            word = key
            smooth_count = counts.smoother(int(freq))
            log_smooth_count = math.log(smooth_count)
            probs[word] = math.log(smooth_count) - log_total
@ -165,7 +166,7 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
    clusters = _read_clusters(src_dir / 'clusters.txt')
    probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob')
    if not probs:
-        probs, oov_prob = _read_freqs(src_dir / 'freqs.txt.gz')
+        probs, oov_prob = _read_freqs(src_dir / 'freqs.txt')
    if not probs:
        oov_prob = -20
    else:
@ -223,9 +224,8 @@ def main(lang_id, lang_data_dir, corpora_dir, model_dir):
        copyfile(str(lang_data_dir / 'gazetteer.json'),
                 str(model_dir / 'vocab' / 'gazetteer.json'))
-    if (lang_data_dir / 'tag_map.json').exists():
+    copyfile(str(lang_data_dir / 'tag_map.json'),
-        copyfile(str(lang_data_dir / 'tag_map.json'),
+             str(model_dir / 'vocab' / 'tag_map.json'))
                 str(model_dir / 'vocab' / 'tag_map.json'))
    if (lang_data_dir / 'lemma_rules.json').exists():
        copyfile(str(lang_data_dir / 'lemma_rules.json'),
--- a/bin/tagger/train_german_tagger.py
+++ b/bin/tagger/train_german_tagger.py
@ -0,0 +1,160 @@
 #!/usr/bin/env python
 from __future__ import division
 from __future__ import unicode_literals
 import os
 from os import path
 import shutil
 import io
 import random
 import time
 import gzip
 import ujson
 import plac
 import cProfile
 import pstats
 import spacy.util
 from spacy.de import German
 from spacy.gold import GoldParse
 from spacy.tagger import Tagger
 from spacy.scorer import PRFScore
 from spacy.tagger import P2_orth, P2_cluster, P2_shape, P2_prefix, P2_suffix, P2_pos, P2_lemma, P2_flags 
 from spacy.tagger import P1_orth, P1_cluster, P1_shape, P1_prefix, P1_suffix, P1_pos, P1_lemma, P1_flags 
 from spacy.tagger import W_orth, W_cluster, W_shape, W_prefix, W_suffix, W_pos, W_lemma, W_flags
 from spacy.tagger import N1_orth, N1_cluster, N1_shape, N1_prefix, N1_suffix, N1_pos, N1_lemma, N1_flags
 from spacy.tagger import N2_orth, N2_cluster, N2_shape, N2_prefix, N2_suffix, N2_pos, N2_lemma, N2_flags, N_CONTEXT_FIELDS
 def default_templates():
    """Return the feature templates provided by spacy.tagger.Tagger.default_templates()."""
    tagger_cls = spacy.tagger.Tagger
    return tagger_cls.default_templates()
 def default_templates_without_clusters():
    """Return a tuple of tagger feature templates that contains no *_cluster features.

    Each template is a tuple of context-field constants imported from
    spacy.tagger (W = current word, P1/P2 = previous words, N1/N2 = next
    words, judging by the naming -- confirm against spacy.tagger).
    """
    templates = (
        (W_orth,),
        (P1_lemma, P1_pos),
        (P2_lemma, P2_pos),
        (N1_orth,),
        (N2_orth,),
        (W_suffix,),
        (W_prefix,),
        (P1_pos,),
        (P2_pos,),
        (P1_pos, P2_pos),
        (P1_pos, W_orth),
        (P1_suffix,),
        (N1_suffix,),
        (W_shape,),
        (W_flags,),
        (N1_flags,),
        (N2_flags,),
        (P1_flags,),
        (P2_flags,),
    )
    return templates
 def make_tagger(vocab, templates):
    """Create a Tagger for `vocab` around a new TaggerModel built from `templates`."""
    return spacy.tagger.Tagger(vocab, spacy.tagger.TaggerModel(templates))
 def read_conll(file_):
    """Read sentences from an iterable of CoNLL-09 formatted lines.

    Rows are tab-separated; the word is taken from column 1 and the tag from
    column 4 (0-based). Lines that are empty after stripping whitespace
    separate sentences.

    Returns a list of (words, tags) tuples, one per sentence, where `words`
    and `tags` are parallel lists of strings.

    Raises ValueError when a non-empty row has too few columns to supply
    both a word and a tag.
    """
    collected = []
    words, tags = [], []
    for raw_line in file_:
        row = raw_line.strip()
        if not row:
            # A blank line closes the sentence gathered so far, if there is one.
            if words:
                collected.append((words, tags))
                words, tags = [], []
            continue
        # Every third column starting at index 1, first two hits: columns 1 and 4.
        word, tag = row.split('\t')[1::3][:2]
        words.append(word)
        tags.append(tag)
    # The last sentence need not be followed by a blank line.
    if words:
        collected.append((words, tags))
    return collected
 def score_model(score, nlp, words, gold_tags):
    """Tag `words` with nlp.tagger and record every token against its gold tag.

    For each token, score.score_set() is called with the predicted tag and
    the gold tag, each wrapped in a one-element set. The word list must
    produce exactly one token per gold tag.
    """
    doc = nlp.tokenizer.tokens_from_list(words)
    assert len(doc) == len(gold_tags)
    nlp.tagger(doc)
    for predicted, expected in zip(doc, gold_tags):
        score.score_set({predicted.tag_}, {expected})
 def train(Language, train_sents, dev_sents, model_dir, n_iter=15, seed=21):
    """Train a tagger on `train_sents`, printing train/dev accuracy per iteration.

    Language    -- language class, instantiated with data_dir=model_dir
    train_sents -- list of (words, gold_tags) pairs; shuffled IN PLACE after
                   every iteration
    dev_sents   -- iterable of (words, gold_tags) pairs scored after every
                   iteration
    model_dir   -- model directory; any existing 'pos' subdirectory is
                   deleted and recreated empty
    n_iter      -- number of passes over the training data
    seed        -- seed for the random module, so shuffling is reproducible

    Finishes by calling nlp.end_training(model_dir).
    """
    # Seed first so that the per-iteration shuffles are deterministic.
    random.seed(seed)
    # Start from an empty 'pos' directory inside the model directory.
    tagger_dir = path.join(model_dir, 'pos')
    if path.exists(tagger_dir):
        shutil.rmtree(tagger_dir)
    os.mkdir(tagger_dir)
    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    nlp.tagger = make_tagger(nlp.vocab, default_templates())
    print("Itn.\ttrain acc %\tdev acc %")
    for epoch in range(n_iter):
        # Training pass: tagger.train() returns a count that is accumulated
        # and divided by the number of words seen.
        n_correct = 0.
        n_words = 0.
        for words, gold_tags in train_sents:
            doc = nlp.tokenizer.tokens_from_list(words)
            n_correct += nlp.tagger.train(doc, gold_tags)
            n_words += len(words)
        train_acc = n_correct / n_words
        # Evaluation pass on the development set.
        dev_score = PRFScore()
        for words, gold_tags in dev_sents:
            score_model(dev_score, nlp, words, gold_tags)
        random.shuffle(train_sents)
        print('%d:\t%6.2f\t%6.2f' % (epoch, 100 * train_acc, 100 * dev_score.precision))
    print('end training')
    nlp.end_training(model_dir)
    print('done')
@plac.annotations(
    train_loc=("Location of CoNLL 09 formatted training file"),
    dev_loc=("Location of CoNLL 09 formatted development file"),
    model_dir=("Location of output model directory"),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    n_iter=("Number of training iterations", "option", "i", int),
)
def main(train_loc, dev_loc, model_dir, eval_only=False, n_iter=15):
    """Train a German tagger (unless `eval_only`) and print its dev-set score.

    The training and development files are read with read_conll(). After
    the optional training step the model is loaded from `model_dir` and
    scored against the development file.
    """
    if not eval_only:
        # Both files are opened before either is read.
        with io.open(train_loc, 'r', encoding='utf8') as train_file, \
                io.open(dev_loc, 'r', encoding='utf8') as dev_file:
            train_sents = read_conll(train_file)
            dev_sents = read_conll(dev_file)
        train(German, train_sents, dev_sents, model_dir, n_iter=n_iter)
    # Evaluation: load the model from disk and score it on the dev file.
    with io.open(dev_loc, 'r', encoding='utf8') as dev_file:
        dev_sents = read_conll(dev_file)
        nlp = German(data_dir=model_dir)
        dev_score = PRFScore()
        for words, gold_tags in dev_sents:
            score_model(dev_score, nlp, words, gold_tags)
        print('POS: %6.2f %%' % (100 * dev_score.precision))
 # Script entry point: hand main() to plac.call when this file is run directly.
 if __name__ == '__main__':
    plac.call(main)
--- a/lang_data/de/abbrev.de.tab
+++ b/lang_data/de/abbrev.de.tab
@ -0,0 +1,319 @@
 # columns (tab-separated): key, surface form, lemma, pos
 # a key that splits into several tokens lists each token's value separated by |
 # empty lines and lines starting with # are ignored
 ''	''
 \")	\")
 \n	\n	<nl>	SP
 \t	\t	<tab>	SP
 	 	<space>	SP
 # example: Wie geht's?
 's	's	es
 'S	'S	es
 # example: Haste mal 'nen Euro?
 'n	'n	ein
 'ne	'ne	eine
 'nen	'nen	einen
 # example: Kommen S’ nur herein!
 s'	s'	sie
 S'	S'	sie
 # example: Da haben wir's!
 ich's	ich|'s	ich|es
 du's	du|'s	du|es
 er's	er|'s	er|es
 sie's	sie|'s	sie|es
 wir's	wir|'s	wir|es
 ihr's	ihr|'s	ihr|es
 # example: Die Katze auf'm Dach.
 auf'm	auf|'m	auf|dem
 unter'm	unter|'m	unter|dem
 über'm	über|'m	über|dem
 vor'm	vor|'m	vor|dem
 hinter'm	hinter|'m	hinter|dem
 # persons
 B.A.	B.A.
 B.Sc.	B.Sc.
 Dipl.	Dipl.
 Dipl.-Ing.	Dipl.-Ing.
 Dr.	Dr.
 Fr.	Fr.
 Frl.	Frl.
 Hr.	Hr.
 Hrn.	Hrn.
 Frl.	Frl.
 Prof.	Prof.
 St.	St.
 Hrgs.	Hrgs.
 Hg.	Hg.
 a.Z.	a.Z.
 a.D.	a.D.
 h.c.	h.c.
 Jr.	Jr.
 jr.	jr.
 jun.	jun.
 sen.	sen.
 rer.	rer.
 Ing.	Ing.
 M.A.	M.A.
 Mr.	Mr.
 M.Sc.	M.Sc.
 nat.	nat.
 phil.	phil.
 # companies
 Co.	Co.
 co.	co.
 Cie.	Cie.
 A.G.	A.G.
 G.m.b.H.	G.m.b.H.
 i.G.	i.G.
 e.V.	e.V.
 # popular german abbreviations
 Abb.	Abb.
 Abk.	Abk.
 Abs.	Abs.
 Abt.	Abt.
 abzgl.	abzgl.
 allg.	allg.
 a.M.	a.M.
 Bd.	Bd.
 betr.	betr.
 Betr.	Betr.
 Biol.	Biol.
 biol.	biol.
 Bf.	Bf.
 Bhf.	Bhf.
 Bsp.	Bsp.
 bspw.	bspw.
 bzgl.	bzgl.
 bzw.	bzw.
 d.h.	d.h.
 dgl.	dgl.
 ebd.	ebd.
 ehem.	ehem.
 eigtl.	eigtl.
 entspr.	entspr.
 erm.	erm.
 ev.	ev.
 evtl.	evtl.
 Fa.	Fa.
 Fam.	Fam.
 geb.	geb.
 Gebr.	Gebr.
 gem.	gem.
 ggf.	ggf.
 ggü.	ggü.
 ggfs.	ggfs.
 gegr.	gegr.
 Hbf.	Hbf.
 Hrsg.	Hrsg.
 hrsg.	hrsg.
 i.A.	i.A.
 i.d.R.	i.d.R.
 inkl.	inkl.
 insb.	insb.
 i.O.	i.O.
 i.Tr.	i.Tr.
 i.V.	i.V.
 jur.	jur.
 kath.	kath.
 K.O.	K.O.
 lt.	lt.
 max.	max.
 m.E.	m.E.
 m.M.	m.M.
 mtl.	mtl.
 min.	min.
 mind.	mind.
 MwSt.	MwSt.
 Nr.	Nr.
 o.a.	o.a.
 o.ä.	o.ä.
 o.Ä.	o.Ä.
 o.g.	o.g.
 o.k.	o.k.
 O.K.	O.K.
 Orig.	Orig.
 orig.	orig.
 pers.	pers.
 Pkt.	Pkt.
 Red.	Red.
 röm.	röm.
 s.o.	s.o.
 sog.	sog.
 std.	std.
 stellv.	stellv.
 Str.	Str.
 tägl.	tägl.
 Tel.	Tel.
 u.a.	u.a.
 usf.	usf.
 u.s.w.	u.s.w.
 usw.	usw.
 u.U.	u.U.
 u.v.m.	u.v.m.
 uvm.	uvm.
 v.a.	v.a.
 vgl.	vgl.
 vllt.	vllt.
 v.l.n.r.	v.l.n.r.
 vlt.	vlt.
 Vol.	Vol.
 wiss.	wiss.
 Univ.	Univ.
 z.B.	z.B.
 z.b.	z.b.
 z.Bsp.	z.Bsp.
 z.T.	z.T.
 z.Z.	z.Z.
 zzgl.	zzgl.
 z.Zt.	z.Zt.
 # popular latin abbreviations
 vs.	vs.
 adv.	adv.
 Chr.	Chr.
 A.C.	A.C.
 A.D.	A.D.
 e.g.	e.g.
 i.e.	i.e.
 al.	al.
 p.a.	p.a.
 P.S.	P.S.
 q.e.d.	q.e.d.
 R.I.P.	R.I.P.
 etc.	etc.
 incl.	incl.
 ca.	ca.
 n.Chr.	n.Chr.
 p.s.	p.s.
 v.Chr.	v.Chr.
 # popular english abbreviations
 D.C.	D.C.
 N.Y.	N.Y.
 N.Y.C.	N.Y.C.
 U.S.	U.S.
 U.S.A.	U.S.A.
 L.A.	L.A.
 U.S.S.	U.S.S.
 # dates & time
 Jan.	Jan.
 Feb.	Feb.
 Mrz.	Mrz.
 Mär.	Mär.
 Apr.	Apr.
 Jun.	Jun.
 Jul.	Jul.
 Aug.	Aug.
 Sep.	Sep.
 Sept.	Sept.
 Okt.	Okt.
 Nov.	Nov.
 Dez.	Dez.
 Mo.	Mo.
 Di.	Di.
 Mi.	Mi.
 Do.	Do.
 Fr.	Fr.
 Sa.	Sa.
 So.	So.
 Std.	Std.
 Jh.	Jh.
 Jhd.	Jhd.
 # numbers
 Tsd.	Tsd.
 Mio.	Mio.
 Mrd.	Mrd.
 # countries & languages
 engl.	engl.
 frz.	frz.
 lat.	lat.
 österr.	österr.
 # smileys
 :)	:)
 <3	<3
 ;)	;)
 (:	(:
 :(	:(
 -_-	-_-
 =)	=)
 :/	:/
 :>	:>
 ;-)	;-)
 :Y	:Y
 :P	:P
 :-P	:-P
 :3	:3
 =3	=3
 xD	xD
 ^_^	^_^
 =]	=]
 =D	=D
 <333	<333
 :))	:))
 :0	:0
 -__-	-__-
 xDD	xDD
 o_o	o_o
 o_O	o_O
 V_V	V_V
 =[[	=[[
 <33	<33
 ;p	;p
 ;D	;D
 ;-p	;-p
 ;(	;(
 :p	:p
 :]	:]
 :O	:O
 :-/	:-/
 :-)	:-)
 :(((	:(((
 :((	:((
 :')	:')
 (^_^)	(^_^)
 (=	(=
 o.O	o.O
 # single letters
 a.	a.
 b.	b.
 c.	c.
 d.	d.
 e.	e.
 f.	f.
 g.	g.
 h.	h.
 i.	i.
 j.	j.
 k.	k.
 l.	l.
 m.	m.
 n.	n.
 o.	o.
 p.	p.
 q.	q.
 r.	r.
 s.	s.
 t.	t.
 u.	u.
 v.	v.
 w.	w.
 x.	x.
 y.	y.
 z.	z.
 ä.	ä.
 ö.	ö.
 ü.	ü.
--- a/lang_data/de/gazetteer.json
+++ b/lang_data/de/gazetteer.json
@ -0,0 +1,194 @@
 {
 	"Reddit": [
 		"PRODUCT",
 		{},
 		[
 			[{"lower": "reddit"}]
 		]
 	],
 	"SeptemberElevenAttacks": [
 		"EVENT",
 		{},
 		[
 			[
 				{"orth": "9/11"}
 			],
 			[
 				{"lower": "september"},
 				{"orth": "11"}
 			]
 		]
 	],
 	"Linux": [
 		"PRODUCT",
 		{},
 		[
 			[{"lower": "linux"}]
 		]
 	],
 	"Haskell": [
 		"PRODUCT",
 		{},
 		[
 			[{"lower": "haskell"}]
 		]
 	],
 	"HaskellCurry": [
 		"PERSON",
 		{},
 		[
 			[
 				{"lower": "haskell"},
 				{"lower": "curry"}
 			]
 		]
 	],
 	"Javascript": [
 		"PRODUCT",
 		{},
 		[
 			[{"lower": "javascript"}]
 		]
 	],
 	"CSS": [
 		"PRODUCT",
 		{},
 		[
 			[{"lower": "css"}],
 			[{"lower": "css3"}]
 		]
 	],
 	"displaCy": [
 		"PRODUCT",
 		{},
 		[
 			[{"lower": "displacy"}]
 		]
 	],
 	"spaCy": [
 		"PRODUCT",
 		{},
 		[
 			[{"orth": "spaCy"}]
 		]
 	],
    "HTML": [
 		"PRODUCT",
 		{},
 		[
 			[{"lower": "html"}],
 			[{"lower": "html5"}]
 		]
 	],
    "Python": [
        "PRODUCT",
        {},
        [
            [{"orth": "Python"}]
        ]
    ],
    "Ruby": [
        "PRODUCT",
        {},
        [
            [{"orth": "Ruby"}]
        ]
    ],
    "Digg": [
        "PRODUCT",
        {},
        [
            [{"lower": "digg"}]
        ]
    ],
     "FoxNews": [
        "ORG",
        {},
        [
            [{"orth": "Fox"}],
            [{"orth": "News"}]
        ]
    ],
    "Google": [
        "ORG",
        {},
        [
            [{"lower": "google"}]
        ]
    ],
    "Mac": [
        "PRODUCT",
        {},
        [
            [{"lower": "mac"}]
        ]
    ],
    "Wikipedia": [
        "PRODUCT",
        {},
        [
            [{"lower": "wikipedia"}]
        ]
    ],
    "Windows": [
        "PRODUCT",
        {},
        [
            [{"orth": "Windows"}]
        ]
    ],
     "Dell": [
        "ORG",
        {},
        [
            [{"lower": "dell"}]
        ]
    ],
    "Facebook": [
        "ORG",
        {},
        [
            [{"lower": "facebook"}]
        ]
    ],
     "Blizzard": [
        "ORG",
        {},
        [
            [{"orth": "Blizzard"}]
        ]
    ],
    "Ubuntu": [
        "ORG",
        {},
        [
            [{"orth": "Ubuntu"}]
        ]
    ],
    "Youtube": [
        "PRODUCT",
        {},
        [
            [{"lower": "youtube"}]
        ]
    ],
    "false_positives": [
        null,
        {},
        [
            [{"orth": "Shit"}],
            [{"orth": "Weed"}],
            [{"orth": "Cool"}],
            [{"orth": "Btw"}],
            [{"orth": "Bah"}],
            [{"orth": "Bullshit"}],
            [{"orth": "Lol"}],
            [{"orth": "Yo"}, {"lower": "dawg"}],
            [{"orth": "Yay"}],
            [{"orth": "Ahh"}],
            [{"orth": "Yea"}],
            [{"orth": "Bah"}]
        ]
    ]
 }
--- a/lang_data/de/generate_specials.py
+++ b/lang_data/de/generate_specials.py
@ -1,5 +1,7 @@
 # coding=utf8
 import json
 import io
 import itertools
 contractions = {}
@ -262,14 +264,30 @@ def get_token_properties(token, capitalize=False, remove_contractions=False):
    props["F"] = token
    return props
 def create_entry(token, endings, capitalize=False, remove_contractions=False):
    """Return the property dicts for `token` followed by those of each ending.

    `capitalize` is passed on for `token` only; `remove_contractions` is
    passed on for `token` and for every entry of `endings`.
    """
    head = get_token_properties(token, capitalize=capitalize, remove_contractions=remove_contractions)
    tail = [get_token_properties(ending, remove_contractions=remove_contractions) for ending in endings]
    return [head] + tail
 FIELDNAMES = ['F','L','pos']
 # Backslash escapes understood in the table; any other text is taken literally.
 _ESCAPES = (('\\n', '\n'), ('\\t', '\t'), ('\\"', '"'))
 # izip_longest (Python 2) is called zip_longest on Python 3.
 _zip_longest = getattr(itertools, 'zip_longest', None) or itertools.izip_longest
 def _unescape(text):
    """Replace the escapes listed in _ESCAPES by the characters they stand for."""
    for escaped, char in _ESCAPES:
        text = text.replace(escaped, char)
    return text
 def read_hardcoded(stream):
    """Parse the hard-coded special cases from a tab-separated table.

    Each data line is `key <TAB> F [<TAB> L [<TAB> pos]]`. A key that is
    split into several tokens gives one value per token in every column,
    separated by `|`. Lines that are empty (or whitespace only) and lines
    whose first non-blank character is `#` are skipped.

    stream -- iterable of text lines, e.g. an open file

    Returns a dict mapping each key to a list of property dicts, one per
    token, keyed by FIELDNAMES; empty values are left out.
    """
    hc_specials = {}
    for line in stream:
        # Only trailing whitespace is removed: a key may itself be
        # whitespace (the entry for a single space), so stripping the front
        # of the line would shift every column and register the wrong key.
        line = line.rstrip()
        if not line or line.lstrip().startswith('#'):
            continue
        key, _, rest = line.partition('\t')
        # One list per column, holding that column's value for each token.
        columns = [column.split('|') for column in rest.split('\t')]
        values = []
        # Transpose: one tuple of (F, L, pos) values per token.
        for annotation in zip(*columns):
            values.append({k: _unescape(v) for k, v in _zip_longest(FIELDNAMES, annotation) if v})
        # Escapes are decoded after splitting, so an escaped tab in a key or
        # value is never mistaken for a column separator.
        hc_specials[_unescape(key)] = values
    return hc_specials
 def generate_specials():
    specials = {}
@ -303,7 +321,10 @@ def generate_specials():
                specials[special] = create_entry(token, endings, capitalize=True, remove_contractions=True)
    # add in hardcoded specials
-    specials = dict(specials, **hardcoded_specials)
+    # changed it so it generates them from a file
    with io.open('abbrev.de.tab','r',encoding='utf8') as abbrev_:
        hc_specials = read_hardcoded(abbrev_)
    specials = dict(specials, **hc_specials)
    return specials
--- a/lang_data/de/infix.txt
+++ b/lang_data/de/infix.txt
@ -1,3 +1,6 @@
 \.\.\.
 (?<=[a-z])\.(?=[A-Z])
-(?<=[a-zA-Z])-(?=[a-zA-z])
+(?<=[a-zöäüßA-ZÖÄÜ"]):(?=[a-zöäüßA-ZÖÄÜ])
 (?<=[a-zöäüßA-ZÖÄÜ"])>(?=[a-zöäüßA-ZÖÄÜ])
 (?<=[a-zöäüßA-ZÖÄÜ"])<(?=[a-zöäüßA-ZÖÄÜ])
 (?<=[a-zöäüßA-ZÖÄÜ"])=(?=[a-zöäüßA-ZÖÄÜ])
--- a/lang_data/de/prefix.txt
+++ b/lang_data/de/prefix.txt
@ -5,6 +5,7 @@
 {
 *
 <
 >
 $
 £
 „
@ -20,3 +21,7 @@ a-
 ‘
 ....
 ...
 ‚
 »
 _
 §
--- a/lang_data/de/specials.json
+++ b/lang_data/de/specials.json
@ -1,27 +1,4 @@
 {
    "\t": [
        {
            "F": "\t",
            "pos": "SP"
        }
    ],
    "\n": [
        {
            "F": "\n",
            "pos": "SP"
        }
    ],
    " ": [
        {
            "F": " ",
            "pos": "SP"
        }
    ],
    "\")": [
        {
            "F": "\")"
        }
    ],
    "''": [
        {
            "F": "''"
@ -217,6 +194,11 @@
            "F": "<333"
        }
    ],
    "<space>": [
        {
            "F": "SP"
        }
    ],
    "=)": [
        {
            "F": "=)"
@ -267,6 +249,16 @@
            "F": "Abk."
        }
    ],
    "Abs.": [
        {
            "F": "Abs."
        }
    ],
    "Abt.": [
        {
            "F": "Abt."
        }
    ],
    "Apr.": [
        {
            "F": "Apr."
@ -277,6 +269,26 @@
            "F": "Aug."
        }
    ],
    "B.A.": [
        {
            "F": "B.A."
        }
    ],
    "B.Sc.": [
        {
            "F": "B.Sc."
        }
    ],
    "Bd.": [
        {
            "F": "Bd."
        }
    ],
    "Betr.": [
        {
            "F": "Betr."
        }
    ],
    "Bf.": [
        {
            "F": "Bf."
@ -292,6 +304,11 @@
            "F": "Biol."
        }
    ],
    "Bsp.": [
        {
            "F": "Bsp."
        }
    ],
    "Chr.": [
        {
            "F": "Chr."
@ -342,6 +359,16 @@
            "F": "Dr."
        }
    ],
    "Fa.": [
        {
            "F": "Fa."
        }
    ],
    "Fam.": [
        {
            "F": "Fam."
        }
    ],
    "Feb.": [
        {
            "F": "Feb."
@ -387,6 +414,16 @@
            "F": "Hrgs."
        }
    ],
    "Hrn.": [
        {
            "F": "Hrn."
        }
    ],
    "Hrsg.": [
        {
            "F": "Hrsg."
        }
    ],
    "Ing.": [
        {
            "F": "Ing."
@ -397,11 +434,21 @@
            "F": "Jan."
        }
    ],
    "Jh.": [
        {
            "F": "Jh."
        }
    ],
    "Jhd.": [
        {
            "F": "Jhd."
        }
    ],
    "Jr.": [
        {
            "F": "Jr."
        }
    ],
    "Jul.": [
        {
            "F": "Jul."
@ -412,21 +459,61 @@
            "F": "Jun."
        }
    ],
    "K.O.": [
        {
            "F": "K.O."
        }
    ],
    "L.A.": [
        {
            "F": "L.A."
        }
    ],
    "M.A.": [
        {
            "F": "M.A."
        }
    ],
    "M.Sc.": [
        {
            "F": "M.Sc."
        }
    ],
    "Mi.": [
        {
            "F": "Mi."
        }
    ],
    "Mio.": [
        {
            "F": "Mio."
        }
    ],
    "Mo.": [
        {
            "F": "Mo."
        }
    ],
    "Mr.": [
        {
            "F": "Mr."
        }
    ],
    "Mrd.": [
        {
            "F": "Mrd."
        }
    ],
    "Mrz.": [
        {
            "F": "Mrz."
        }
    ],
    "MwSt.": [
        {
            "F": "MwSt."
        }
    ],
    "M\u00e4r.": [
        {
            "F": "M\u00e4r."
@ -452,16 +539,31 @@
            "F": "Nr."
        }
    ],
    "O.K.": [
        {
            "F": "O.K."
        }
    ],
    "Okt.": [
        {
            "F": "Okt."
        }
    ],
    "Orig.": [
        {
            "F": "Orig."
        }
    ],
    "P.S.": [
        {
            "F": "P.S."
        }
    ],
    "Pkt.": [
        {
            "F": "Pkt."
        }
    ],
    "Prof.": [
        {
            "F": "Prof."
@ -472,6 +574,11 @@
            "F": "R.I.P."
        }
    ],
    "Red.": [
        {
            "F": "Red."
        }
    ],
    "S'": [
        {
            "F": "S'",
@ -503,6 +610,41 @@
            "F": "St."
        }
    ],
    "Std.": [
        {
            "F": "Std."
        }
    ],
    "Str.": [
        {
            "F": "Str."
        }
    ],
    "Tel.": [
        {
            "F": "Tel."
        }
    ],
    "Tsd.": [
        {
            "F": "Tsd."
        }
    ],
    "U.S.": [
        {
            "F": "U.S."
        }
    ],
    "U.S.A.": [
        {
            "F": "U.S.A."
        }
    ],
    "U.S.S.": [
        {
            "F": "U.S.S."
        }
    ],
    "Univ.": [
        {
            "F": "Univ."
@ -513,6 +655,30 @@
            "F": "V_V"
        }
    ],
    "Vol.": [
        {
            "F": "Vol."
        }
    ],
    "\\\")": [
        {
            "F": "\\\")"
        }
    ],
    "\\n": [
        {
            "F": "\\n",
            "L": "<nl>",
            "pos": "SP"
        }
    ],
    "\\t": [
        {
            "F": "\\t",
            "L": "<tab>",
            "pos": "SP"
        }
    ],
    "^_^": [
        {
            "F": "^_^"
@ -528,6 +694,11 @@
            "F": "a.D."
        }
    ],
    "a.M.": [
        {
            "F": "a.M."
        }
    ],
    "a.Z.": [
        {
            "F": "a.Z."
@ -548,9 +719,15 @@
            "F": "al."
        }
    ],
    "allg.": [
        {
            "F": "allg."
        }
    ],
    "auf'm": [
        {
-            "F": "auf"
+            "F": "auf",
            "L": "auf"
        },
        {
            "F": "'m",
@ -572,11 +749,31 @@
            "F": "biol."
        }
    ],
    "bspw.": [
        {
            "F": "bspw."
        }
    ],
    "bzgl.": [
        {
            "F": "bzgl."
        }
    ],
    "bzw.": [
        {
            "F": "bzw."
        }
    ],
    "c.": [
        {
            "F": "c."
        }
    ],
    "ca.": [
        {
            "F": "ca."
        }
    ],
    "co.": [
        {
            "F": "co."
@ -587,9 +784,20 @@
            "F": "d."
        }
    ],
    "d.h.": [
        {
            "F": "d.h."
        }
    ],
    "dgl.": [
        {
            "F": "dgl."
        }
    ],
    "du's": [
        {
-            "F": "du"
+            "F": "du",
            "L": "du"
        },
        {
            "F": "'s",
@ -611,19 +819,35 @@
            "F": "e.g."
        }
    ],
    "ebd.": [
        {
            "F": "ebd."
        }
    ],
    "ehem.": [
        {
            "F": "ehem."
        }
    ],
    "eigtl.": [
        {
            "F": "eigtl."
        }
    ],
    "engl.": [
        {
            "F": "engl."
        }
    ],
    "entspr.": [
        {
            "F": "entspr."
        }
    ],
    "er's": [
        {
-            "F": "er"
+            "F": "er",
            "L": "er"
        },
        {
            "F": "'s",
@ -640,11 +864,26 @@
            "F": "etc."
        }
    ],
    "ev.": [
        {
            "F": "ev."
        }
    ],
    "evtl.": [
        {
            "F": "evtl."
        }
    ],
    "f.": [
        {
            "F": "f."
        }
    ],
    "frz.": [
        {
            "F": "frz."
        }
    ],
    "g.": [
        {
            "F": "g."
@ -660,6 +899,11 @@
            "F": "gegr."
        }
    ],
    "gem.": [
        {
            "F": "gem."
        }
    ],
    "ggf.": [
        {
            "F": "ggf."
@ -687,23 +931,39 @@
    ],
    "hinter'm": [
        {
-            "F": "hinter"
+            "F": "hinter",
            "L": "hinter"
        },
        {
            "F": "'m",
            "L": "dem"
        }
    ],
    "hrsg.": [
        {
            "F": "hrsg."
        }
    ],
    "i.": [
        {
            "F": "i."
        }
    ],
    "i.A.": [
        {
            "F": "i.A."
        }
    ],
    "i.G.": [
        {
            "F": "i.G."
        }
    ],
    "i.O.": [
        {
            "F": "i.O."
        }
    ],
    "i.Tr.": [
        {
            "F": "i.Tr."
@ -714,6 +974,11 @@
            "F": "i.V."
        }
    ],
    "i.d.R.": [
        {
            "F": "i.d.R."
        }
    ],
    "i.e.": [
        {
            "F": "i.e."
@ -721,7 +986,8 @@
    ],
    "ich's": [
        {
-            "F": "ich"
+            "F": "ich",
            "L": "ich"
        },
        {
            "F": "'s",
@ -730,7 +996,8 @@
    ],
    "ihr's": [
        {
-            "F": "ihr"
+            "F": "ihr",
            "L": "ihr"
        },
        {
            "F": "'s",
@ -757,6 +1024,11 @@
            "F": "j."
        }
    ],
    "jr.": [
        {
            "F": "jr."
        }
    ],
    "jun.": [
        {
            "F": "jun."
@ -772,11 +1044,21 @@
            "F": "k."
        }
    ],
    "kath.": [
        {
            "F": "kath."
        }
    ],
    "l.": [
        {
            "F": "l."
        }
    ],
    "lat.": [
        {
            "F": "lat."
        }
    ],
    "lt.": [
        {
            "F": "lt."
@ -787,11 +1069,46 @@
            "F": "m."
        }
    ],
    "m.E.": [
        {
            "F": "m.E."
        }
    ],
    "m.M.": [
        {
            "F": "m.M."
        }
    ],
    "max.": [
        {
            "F": "max."
        }
    ],
    "min.": [
        {
            "F": "min."
        }
    ],
    "mind.": [
        {
            "F": "mind."
        }
    ],
    "mtl.": [
        {
            "F": "mtl."
        }
    ],
    "n.": [
        {
            "F": "n."
        }
    ],
    "n.Chr.": [
        {
            "F": "n.Chr."
        }
    ],
    "nat.": [
        {
            "F": "nat."
@ -807,6 +1124,31 @@
            "F": "o.O"
        }
    ],
    "o.a.": [
        {
            "F": "o.a."
        }
    ],
    "o.g.": [
        {
            "F": "o.g."
        }
    ],
    "o.k.": [
        {
            "F": "o.k."
        }
    ],
    "o.\u00c4.": [
        {
            "F": "o.\u00c4."
        }
    ],
    "o.\u00e4.": [
        {
            "F": "o.\u00e4."
        }
    ],
    "o_O": [
        {
            "F": "o_O"
@ -817,6 +1159,11 @@
            "F": "o_o"
        }
    ],
    "orig.": [
        {
            "F": "orig."
        }
    ],
    "p.": [
        {
            "F": "p."
@ -827,6 +1174,21 @@
            "F": "p.a."
        }
    ],
    "p.s.": [
        {
            "F": "p.s."
        }
    ],
    "pers.": [
        {
            "F": "pers."
        }
    ],
    "phil.": [
        {
            "F": "phil."
        }
    ],
    "q.": [
        {
            "F": "q."
@ -847,6 +1209,11 @@
            "F": "rer."
        }
    ],
    "r\u00f6m.": [
        {
            "F": "r\u00f6m."
        }
    ],
    "s'": [
        {
            "F": "s'",
@ -858,6 +1225,11 @@
            "F": "s."
        }
    ],
    "s.o.": [
        {
            "F": "s.o."
        }
    ],
    "sen.": [
        {
            "F": "sen."
@ -865,23 +1237,49 @@
    ],
    "sie's": [
        {
-            "F": "sie"
+            "F": "sie",
            "L": "sie"
        },
        {
            "F": "'s",
            "L": "es"
        }
    ],
    "sog.": [
        {
            "F": "sog."
        }
    ],
    "std.": [
        {
            "F": "std."
        }
    ],
    "stellv.": [
        {
            "F": "stellv."
        }
    ],
    "t.": [
        {
            "F": "t."
        }
    ],
    "t\u00e4gl.": [
        {
            "F": "t\u00e4gl."
        }
    ],
    "u.": [
        {
            "F": "u."
        }
    ],
    "u.U.": [
        {
            "F": "u.U."
        }
    ],
    "u.a.": [
        {
            "F": "u.a."
@ -892,28 +1290,75 @@
            "F": "u.s.w."
        }
    ],
    "u.v.m.": [
        {
            "F": "u.v.m."
        }
    ],
    "unter'm": [
        {
-            "F": "unter"
+            "F": "unter",
            "L": "unter"
        },
        {
            "F": "'m",
            "L": "dem"
        }
    ],
    "usf.": [
        {
            "F": "usf."
        }
    ],
    "usw.": [
        {
            "F": "usw."
        }
    ],
    "uvm.": [
        {
            "F": "uvm."
        }
    ],
    "v.": [
        {
            "F": "v."
        }
    ],
    "v.Chr.": [
        {
            "F": "v.Chr."
        }
    ],
    "v.a.": [
        {
            "F": "v.a."
        }
    ],
    "v.l.n.r.": [
        {
            "F": "v.l.n.r."
        }
    ],
    "vgl.": [
        {
            "F": "vgl."
        }
    ],
    "vllt.": [
        {
            "F": "vllt."
        }
    ],
    "vlt.": [
        {
            "F": "vlt."
        }
    ],
    "vor'm": [
        {
-            "F": "vor"
+            "F": "vor",
            "L": "vor"
        },
        {
            "F": "'m",
@ -932,13 +1377,19 @@
    ],
    "wir's": [
        {
-            "F": "wir"
+            "F": "wir",
            "L": "wir"
        },
        {
            "F": "'s",
            "L": "es"
        }
    ],
    "wiss.": [
        {
            "F": "wiss."
        }
    ],
    "x.": [
        {
            "F": "x."
@ -969,19 +1420,60 @@
            "F": "z.B."
        }
    ],
    "z.Bsp.": [
        {
            "F": "z.Bsp."
        }
    ],
    "z.T.": [
        {
            "F": "z.T."
        }
    ],
    "z.Z.": [
        {
            "F": "z.Z."
        }
    ],
    "z.Zt.": [
        {
            "F": "z.Zt."
        }
    ],
    "z.b.": [
        {
            "F": "z.b."
        }
    ],
    "zzgl.": [
        {
            "F": "zzgl."
        }
    ],
    "\u00e4.": [
        {
            "F": "\u00e4."
        }
    ],
    "\u00f6.": [
        {
            "F": "\u00f6."
        }
    ],
    "\u00f6sterr.": [
        {
            "F": "\u00f6sterr."
        }
    ],
    "\u00fc.": [
        {
            "F": "\u00fc."
        }
    ],
    "\u00fcber'm": [
        {
-            "F": "\u00fcber"
+            "F": "\u00fcber",
            "L": "\u00fcber"
        },
        {
            "F": "'m",
--- a/lang_data/de/suffix.txt
+++ b/lang_data/de/suffix.txt
@ -13,14 +13,61 @@
 ;
 '
 ”
 “
 «
 _
 ''
 's
 'S
 ’s
 ’S
 ’
 ‘
 °
 €
 \.\.
 \.\.\.
 \.\.\.\.
-(?<=[a-z0-9)\]"'%\)])\.
+(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\.
 \-\-
 ´
 (?<=[0-9])km²
 (?<=[0-9])m²
 (?<=[0-9])cm²
 (?<=[0-9])mm²
 (?<=[0-9])km³
 (?<=[0-9])m³
 (?<=[0-9])cm³
 (?<=[0-9])mm³
 (?<=[0-9])ha
 (?<=[0-9])km
 (?<=[0-9])m
 (?<=[0-9])cm
 (?<=[0-9])mm
 (?<=[0-9])µm
 (?<=[0-9])nm
 (?<=[0-9])yd
 (?<=[0-9])in
 (?<=[0-9])ft
 (?<=[0-9])kg
 (?<=[0-9])g
 (?<=[0-9])mg
 (?<=[0-9])µg
 (?<=[0-9])t
 (?<=[0-9])lb
 (?<=[0-9])oz
 (?<=[0-9])m/s
 (?<=[0-9])km/h
 (?<=[0-9])mph
 (?<=[0-9])°C
 (?<=[0-9])°K
 (?<=[0-9])°F
 (?<=[0-9])hPa
 (?<=[0-9])Pa
 (?<=[0-9])mbar
 (?<=[0-9])mb
 (?<=[0-9])T
 (?<=[0-9])G
 (?<=[0-9])M
 (?<=[0-9])K
 (?<=[0-9])kb
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@ -153,10 +153,8 @@ cdef class Tagger:
    @classmethod
    def from_package(cls, pkg, vocab):
        # TODO: templates.json deprecated? not present in latest package
-        templates = cls.default_templates()
+        # templates = cls.default_templates()
-        # templates = package.load_utf8(json.load,
+        templates = pkg.load_json(('pos', 'templates.json'), default=cls.default_templates())
        #     'pos', 'templates.json',
        #     default=cls.default_templates())
        model = TaggerModel(templates)
        if pkg.has_file('pos', 'model'):
@ -203,7 +201,7 @@ cdef class Tagger:
                                  nr_class=self.vocab.morphology.n_tags,
                                  nr_feat=self.model.nr_feat)
        for i in range(tokens.length):
-            if tokens.c[i].pos == 0:
+            if tokens.c[i].pos == 0:                
                self.model.set_featuresC(&eg.c, tokens.c, i)
                self.model.set_scoresC(eg.c.scores,
                    eg.c.features, eg.c.nr_feat)
@ -221,7 +219,7 @@ cdef class Tagger:
    def train(self, Doc tokens, object gold_tag_strs):
        assert len(tokens) == len(gold_tag_strs)
        for tag in gold_tag_strs:
-            if tag not in self.tag_names:
+            if tag != None and tag not in self.tag_names:
                msg = ("Unrecognized gold tag: %s. tag_map.json must contain all"
                       "gold tags, to maintain coarse-grained mapping.")
                raise ValueError(msg % tag)
@ -234,10 +232,9 @@ cdef class Tagger:
            nr_feat=self.model.nr_feat)
        for i in range(tokens.length):
            self.model.set_featuresC(&eg.c, tokens.c, i)
-            eg.set_label(golds[i])
+            eg.costs = [ 1 if golds[i] not in (c, -1) else 0 for c in xrange(eg.nr_class) ]
            self.model.set_scoresC(eg.c.scores,
                eg.c.features, eg.c.nr_feat)
            self.model.updateC(&eg.c)
            self.vocab.morphology.assign_tag(&tokens.c[i], eg.guess)
 {
 *
 <
+>
 $
 £
 „
 ‘
 ....
 ...
+‚
+»
+_
+§