add tokenizer files for German, add/change code to train German pos tagger

- add files to specify rules for German tokenization
- change generate_specials.py to generate from an external file (abbrev.de.tab)
- copy gazetteer.json from lang_data/en/

- init_model.py
	- change doc freq threshold to 0
- add train_german_tagger.py
	- expects conll09-formatted input
This commit is contained in:
Wolfgang Seeker 2016-02-18 13:24:20 +01:00
parent 9d8966a2c0
commit eae35e9b27
10 changed files with 1290 additions and 52 deletions

View File

@ -98,7 +98,7 @@ def _read_probs(loc):
return probs, probs['-OOV-']
def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200):
def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200):
if not loc.exists():
print("Warning: Frequencies file not found")
return {}, 0.0
@ -125,7 +125,8 @@ def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200):
doc_freq = int(doc_freq)
freq = int(freq)
if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
word = literal_eval(key)
# word = literal_eval(key)
word = key
smooth_count = counts.smoother(int(freq))
log_smooth_count = math.log(smooth_count)
probs[word] = math.log(smooth_count) - log_total
@ -165,7 +166,7 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
clusters = _read_clusters(src_dir / 'clusters.txt')
probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob')
if not probs:
probs, oov_prob = _read_freqs(src_dir / 'freqs.txt.gz')
probs, oov_prob = _read_freqs(src_dir / 'freqs.txt')
if not probs:
oov_prob = -20
else:
@ -223,7 +224,6 @@ def main(lang_id, lang_data_dir, corpora_dir, model_dir):
copyfile(str(lang_data_dir / 'gazetteer.json'),
str(model_dir / 'vocab' / 'gazetteer.json'))
if (lang_data_dir / 'tag_map.json').exists():
copyfile(str(lang_data_dir / 'tag_map.json'),
str(model_dir / 'vocab' / 'tag_map.json'))

View File

@ -0,0 +1,160 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals
import os
from os import path
import shutil
import io
import random
import time
import gzip
import ujson
import plac
import cProfile
import pstats
import spacy.util
from spacy.de import German
from spacy.gold import GoldParse
from spacy.tagger import Tagger
from spacy.scorer import PRFScore
from spacy.tagger import P2_orth, P2_cluster, P2_shape, P2_prefix, P2_suffix, P2_pos, P2_lemma, P2_flags
from spacy.tagger import P1_orth, P1_cluster, P1_shape, P1_prefix, P1_suffix, P1_pos, P1_lemma, P1_flags
from spacy.tagger import W_orth, W_cluster, W_shape, W_prefix, W_suffix, W_pos, W_lemma, W_flags
from spacy.tagger import N1_orth, N1_cluster, N1_shape, N1_prefix, N1_suffix, N1_pos, N1_lemma, N1_flags
from spacy.tagger import N2_orth, N2_cluster, N2_shape, N2_prefix, N2_suffix, N2_pos, N2_lemma, N2_flags, N_CONTEXT_FIELDS
def default_templates():
    """Return the feature templates that `spacy.tagger.Tagger` uses by default."""
    tagger_cls = spacy.tagger.Tagger
    return tagger_cls.default_templates()
def default_templates_without_clusters():
    """Return a tagger feature-template set that uses no ``*_cluster`` field.

    Each template is a tuple of context-field ids imported from
    ``spacy.tagger``.  Presumably ``W_`` refers to the current word,
    ``P1_``/``P2_`` to the preceding words and ``N1_``/``N2_`` to the
    following ones -- confirm against ``spacy.tagger``.
    """
    templates = (
        (W_orth,),
        (P1_lemma, P1_pos),
        (P2_lemma, P2_pos),
        (N1_orth,),
        (N2_orth,),

        (W_suffix,),
        (W_prefix,),

        (P1_pos,),
        (P2_pos,),
        (P1_pos, P2_pos),
        (P1_pos, W_orth),
        (P1_suffix,),
        (N1_suffix,),

        (W_shape,),

        (W_flags,),
        (N1_flags,),
        (N2_flags,),
        (P1_flags,),
        (P2_flags,),
    )
    return templates
def make_tagger(vocab, templates):
    """Build a fresh tagger over `vocab` whose model is created from `templates`."""
    return spacy.tagger.Tagger(vocab, spacy.tagger.TaggerModel(templates))
def read_conll(file_):
    """Read tagged sentences from a CoNLL-09 formatted stream.

    Rows are tab-separated; the word form is taken from column 1 and the
    tag from column 4 (0-based).  Blank lines separate sentences, and
    consecutive blank lines do not produce empty sentences.

    Args:
        file_: iterable of text lines (e.g. an open file).

    Returns:
        A list of ``(words, tags)`` tuples, one per sentence, where both
        elements are lists of equal length.

    Raises:
        ValueError: if a non-blank row has too few columns to contain
            both the word form and the tag.
    """
    form_col, tag_col = 1, 4

    def sentences():
        words, tags = [], []
        for line_no, line in enumerate(file_, 1):
            line = line.strip()
            if not line:
                # Sentence boundary: emit what has been collected so far.
                if words:
                    yield words, tags
                    words, tags = [], []
                continue
            columns = line.split('\t')
            if len(columns) <= tag_col:
                raise ValueError(
                    'line %d: expected at least %d tab-separated columns, found %d'
                    % (line_no, tag_col + 1, len(columns)))
            words.append(columns[form_col])
            tags.append(columns[tag_col])
        # The last sentence need not be followed by a blank line.
        if words:
            yield words, tags

    return list(sentences())
def score_model(score, nlp, words, gold_tags):
    """Tag one pre-tokenized sentence with `nlp` and record the result in `score`.

    A document is built from `words` via ``nlp.tokenizer.tokens_from_list``,
    tagged by ``nlp.tagger``, and each predicted ``token.tag_`` is compared
    with the gold tag at the same position through ``score.score_set``.
    """
    doc = nlp.tokenizer.tokens_from_list(words)
    assert len(doc) == len(gold_tags)
    nlp.tagger(doc)
    for token, gold_tag in zip(doc, gold_tags):
        predicted = {token.tag_}
        expected = {gold_tag}
        score.score_set(predicted, expected)
def train(Language, train_sents, dev_sents, model_dir, n_iter=15, seed=21):
    """Train a POS tagger and store it under `model_dir`.

    Args:
        Language: language class; instantiated with ``data_dir=model_dir``
            and with tagger, parser and entity recognizer disabled.
        train_sents: list of ``(words, gold_tags)`` pairs; shuffled in
            place after every iteration.
        dev_sents: iterable of ``(words, gold_tags)`` pairs scored after
            every iteration.
        model_dir: model directory; its ``pos`` sub-directory is deleted
            and re-created before training.
        n_iter: number of passes over the training data.
        seed: seed for `random`, so that shuffling is reproducible.
    """
    random.seed(seed)

    # Always start from an empty 'pos' directory.
    tagger_dir = path.join(model_dir, 'pos')
    if path.exists(tagger_dir):
        shutil.rmtree(tagger_dir)
    os.mkdir(tagger_dir)

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    nlp.tagger = make_tagger(nlp.vocab, default_templates())

    print("Itn.\ttrain acc %\tdev acc %")
    for epoch in range(n_iter):
        # One training pass; the value returned by nlp.tagger.train is
        # accumulated as the count of correctly tagged tokens.
        n_correct = 0.
        n_tokens = 0.
        for words, gold_tags in train_sents:
            doc = nlp.tokenizer.tokens_from_list(words)
            n_correct += nlp.tagger.train(doc, gold_tags)
            n_tokens += len(words)
        train_acc = n_correct / n_tokens

        # Evaluate the current model on the development data.
        dev_score = PRFScore()
        for words, gold_tags in dev_sents:
            score_model(dev_score, nlp, words, gold_tags)

        random.shuffle(train_sents)
        print('%d:\t%6.2f\t%6.2f' % (epoch, 100 * train_acc, 100 * dev_score.precision))

    print('end training')
    nlp.end_training(model_dir)
    print('done')
@plac.annotations(
    train_loc=("Location of CoNLL 09 formatted training file"),
    dev_loc=("Location of CoNLL 09 formatted development file"),
    model_dir=("Location of output model directory"),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    n_iter=("Number of training iterations", "option", "i", int),
)
def main(train_loc, dev_loc, model_dir, eval_only=False, n_iter=15):
    """Train a German tagger (unless `eval_only`) and print its dev-set score.

    After optional training, the development file is read again, a
    `German` pipeline is loaded from `model_dir`, and the precision of
    its tags on the development sentences is printed.
    """
    if not eval_only:
        # Both files are opened before either one is read.
        with io.open(train_loc, 'r', encoding='utf8') as train_file:
            with io.open(dev_loc, 'r', encoding='utf8') as dev_file:
                train_sents = read_conll(train_file)
                dev_sents = read_conll(dev_file)
        train(German, train_sents, dev_sents, model_dir, n_iter=n_iter)

    # Evaluation always uses a pipeline freshly loaded from model_dir.
    with io.open(dev_loc, 'r', encoding='utf8') as dev_file:
        eval_sents = read_conll(dev_file)
    nlp = German(data_dir=model_dir)

    accuracy = PRFScore()
    for words, gold_tags in eval_sents:
        score_model(accuracy, nlp, words, gold_tags)

    print('POS: %6.2f %%' % (100 * accuracy.precision))


if __name__ == '__main__':
    plac.call(main)

319
lang_data/de/abbrev.de.tab Normal file
View File

@ -0,0 +1,319 @@
# surface form lemma pos
# multiple values are separated by |
# empty lines and lines starting with # are ignored
'' ''
\") \")
\n \n <nl> SP
\t \t <tab> SP
<space> SP
# example: Wie geht's?
's 's es
'S 'S es
# example: Haste mal 'nen Euro?
'n 'n ein
'ne 'ne eine
'nen 'nen einen
# example: Kommen S' nur herein!
s' s' sie
S' S' sie
# example: Da haben wir's!
ich's ich|'s ich|es
du's du|'s du|es
er's er|'s er|es
sie's sie|'s sie|es
wir's wir|'s wir|es
ihr's ihr|'s ihr|es
# example: Die Katze auf'm Dach.
auf'm auf|'m auf|dem
unter'm unter|'m unter|dem
über'm über|'m über|dem
vor'm vor|'m vor|dem
hinter'm hinter|'m hinter|dem
# persons
B.A. B.A.
B.Sc. B.Sc.
Dipl. Dipl.
Dipl.-Ing. Dipl.-Ing.
Dr. Dr.
Fr. Fr.
Frl. Frl.
Hr. Hr.
Hrn. Hrn.
Frl. Frl.
Prof. Prof.
St. St.
Hrgs. Hrgs.
Hg. Hg.
a.Z. a.Z.
a.D. a.D.
h.c. h.c.
Jr. Jr.
jr. jr.
jun. jun.
sen. sen.
rer. rer.
Ing. Ing.
M.A. M.A.
Mr. Mr.
M.Sc. M.Sc.
nat. nat.
phil. phil.
# companies
Co. Co.
co. co.
Cie. Cie.
A.G. A.G.
G.m.b.H. G.m.b.H.
i.G. i.G.
e.V. e.V.
# popular german abbreviations
Abb. Abb.
Abk. Abk.
Abs. Abs.
Abt. Abt.
abzgl. abzgl.
allg. allg.
a.M. a.M.
Bd. Bd.
betr. betr.
Betr. Betr.
Biol. Biol.
biol. biol.
Bf. Bf.
Bhf. Bhf.
Bsp. Bsp.
bspw. bspw.
bzgl. bzgl.
bzw. bzw.
d.h. d.h.
dgl. dgl.
ebd. ebd.
ehem. ehem.
eigtl. eigtl.
entspr. entspr.
erm. erm.
ev. ev.
evtl. evtl.
Fa. Fa.
Fam. Fam.
geb. geb.
Gebr. Gebr.
gem. gem.
ggf. ggf.
ggü. ggü.
ggfs. ggfs.
gegr. gegr.
Hbf. Hbf.
Hrsg. Hrsg.
hrsg. hrsg.
i.A. i.A.
i.d.R. i.d.R.
inkl. inkl.
insb. insb.
i.O. i.O.
i.Tr. i.Tr.
i.V. i.V.
jur. jur.
kath. kath.
K.O. K.O.
lt. lt.
max. max.
m.E. m.E.
m.M. m.M.
mtl. mtl.
min. min.
mind. mind.
MwSt. MwSt.
Nr. Nr.
o.a. o.a.
o.ä. o.ä.
o.Ä. o.Ä.
o.g. o.g.
o.k. o.k.
O.K. O.K.
Orig. Orig.
orig. orig.
pers. pers.
Pkt. Pkt.
Red. Red.
röm. röm.
s.o. s.o.
sog. sog.
std. std.
stellv. stellv.
Str. Str.
tägl. tägl.
Tel. Tel.
u.a. u.a.
usf. usf.
u.s.w. u.s.w.
usw. usw.
u.U. u.U.
u.v.m. u.v.m.
uvm. uvm.
v.a. v.a.
vgl. vgl.
vllt. vllt.
v.l.n.r. v.l.n.r.
vlt. vlt.
Vol. Vol.
wiss. wiss.
Univ. Univ.
z.B. z.B.
z.b. z.b.
z.Bsp. z.Bsp.
z.T. z.T.
z.Z. z.Z.
zzgl. zzgl.
z.Zt. z.Zt.
# popular latin abbreviations
vs. vs.
adv. adv.
Chr. Chr.
A.C. A.C.
A.D. A.D.
e.g. e.g.
i.e. i.e.
al. al.
p.a. p.a.
P.S. P.S.
q.e.d. q.e.d.
R.I.P. R.I.P.
etc. etc.
incl. incl.
ca. ca.
n.Chr. n.Chr.
p.s. p.s.
v.Chr. v.Chr.
# popular english abbreviations
D.C. D.C.
N.Y. N.Y.
N.Y.C. N.Y.C.
U.S. U.S.
U.S.A. U.S.A.
L.A. L.A.
U.S.S. U.S.S.
# dates & time
Jan. Jan.
Feb. Feb.
Mrz. Mrz.
Mär. Mär.
Apr. Apr.
Jun. Jun.
Jul. Jul.
Aug. Aug.
Sep. Sep.
Sept. Sept.
Okt. Okt.
Nov. Nov.
Dez. Dez.
Mo. Mo.
Di. Di.
Mi. Mi.
Do. Do.
Fr. Fr.
Sa. Sa.
So. So.
Std. Std.
Jh. Jh.
Jhd. Jhd.
# numbers
Tsd. Tsd.
Mio. Mio.
Mrd. Mrd.
# countries & languages
engl. engl.
frz. frz.
lat. lat.
österr. österr.
# smileys
:) :)
<3 <3
;) ;)
(: (:
:( :(
-_- -_-
=) =)
:/ :/
:> :>
;-) ;-)
:Y :Y
:P :P
:-P :-P
:3 :3
=3 =3
xD xD
^_^ ^_^
=] =]
=D =D
<333 <333
:)) :))
:0 :0
-__- -__-
xDD xDD
o_o o_o
o_O o_O
V_V V_V
=[[ =[[
<33 <33
;p ;p
;D ;D
;-p ;-p
;( ;(
:p :p
:] :]
:O :O
:-/ :-/
:-) :-)
:((( :(((
:(( :((
:') :')
(^_^) (^_^)
(= (=
o.O o.O
# single letters
a. a.
b. b.
c. c.
d. d.
e. e.
f. f.
g. g.
h. h.
i. i.
j. j.
k. k.
l. l.
m. m.
n. n.
o. o.
p. p.
q. q.
r. r.
s. s.
t. t.
u. u.
v. v.
w. w.
x. x.
y. y.
z. z.
ä. ä.
ö. ö.
ü. ü.

194
lang_data/de/gazetteer.json Normal file
View File

@ -0,0 +1,194 @@
{
"Reddit": [
"PRODUCT",
{},
[
[{"lower": "reddit"}]
]
],
"SeptemberElevenAttacks": [
"EVENT",
{},
[
[
{"orth": "9/11"}
],
[
{"lower": "september"},
{"orth": "11"}
]
]
],
"Linux": [
"PRODUCT",
{},
[
[{"lower": "linux"}]
]
],
"Haskell": [
"PRODUCT",
{},
[
[{"lower": "haskell"}]
]
],
"HaskellCurry": [
"PERSON",
{},
[
[
{"lower": "haskell"},
{"lower": "curry"}
]
]
],
"Javascript": [
"PRODUCT",
{},
[
[{"lower": "javascript"}]
]
],
"CSS": [
"PRODUCT",
{},
[
[{"lower": "css"}],
[{"lower": "css3"}]
]
],
"displaCy": [
"PRODUCT",
{},
[
[{"lower": "displacy"}]
]
],
"spaCy": [
"PRODUCT",
{},
[
[{"orth": "spaCy"}]
]
],
"HTML": [
"PRODUCT",
{},
[
[{"lower": "html"}],
[{"lower": "html5"}]
]
],
"Python": [
"PRODUCT",
{},
[
[{"orth": "Python"}]
]
],
"Ruby": [
"PRODUCT",
{},
[
[{"orth": "Ruby"}]
]
],
"Digg": [
"PRODUCT",
{},
[
[{"lower": "digg"}]
]
],
"FoxNews": [
"ORG",
{},
[
[{"orth": "Fox"}],
[{"orth": "News"}]
]
],
"Google": [
"ORG",
{},
[
[{"lower": "google"}]
]
],
"Mac": [
"PRODUCT",
{},
[
[{"lower": "mac"}]
]
],
"Wikipedia": [
"PRODUCT",
{},
[
[{"lower": "wikipedia"}]
]
],
"Windows": [
"PRODUCT",
{},
[
[{"orth": "Windows"}]
]
],
"Dell": [
"ORG",
{},
[
[{"lower": "dell"}]
]
],
"Facebook": [
"ORG",
{},
[
[{"lower": "facebook"}]
]
],
"Blizzard": [
"ORG",
{},
[
[{"orth": "Blizzard"}]
]
],
"Ubuntu": [
"ORG",
{},
[
[{"orth": "Ubuntu"}]
]
],
"Youtube": [
"PRODUCT",
{},
[
[{"lower": "youtube"}]
]
],
"false_positives": [
null,
{},
[
[{"orth": "Shit"}],
[{"orth": "Weed"}],
[{"orth": "Cool"}],
[{"orth": "Btw"}],
[{"orth": "Bah"}],
[{"orth": "Bullshit"}],
[{"orth": "Lol"}],
[{"orth": "Yo"}, {"lower": "dawg"}],
[{"orth": "Yay"}],
[{"orth": "Ahh"}],
[{"orth": "Yea"}],
[{"orth": "Bah"}]
]
]
}

View File

@ -1,5 +1,7 @@
# coding=utf8
import json
import io
import itertools
contractions = {}
@ -262,14 +264,30 @@ def get_token_properties(token, capitalize=False, remove_contractions=False):
props["F"] = token
return props
def create_entry(token, endings, capitalize=False, remove_contractions=False):
    """Return the property dicts for `token` followed by those of each ending.

    `capitalize` is forwarded only for `token` itself; `remove_contractions`
    is forwarded for `token` and for every element of `endings`.
    """
    head = get_token_properties(token, capitalize=capitalize,
                                remove_contractions=remove_contractions)
    tail = [get_token_properties(ending, remove_contractions=remove_contractions)
            for ending in endings]
    return [head] + tail
FIELDNAMES = ['F', 'L', 'pos']

# Backslash escapes recognised inside table cells, for characters that cannot
# be written literally in a line-based, tab-separated file.
_CELL_ESCAPES = {'n': '\n', 't': '\t', '"': '"'}


def _unescape_cell(text):
    """Decode the escapes in _CELL_ESCAPES; any other backslash is kept as-is."""
    if '\\' not in text:
        return text
    decoded = []
    chars = iter(text)
    for char in chars:
        if char == '\\':
            following = next(chars, '')
            decoded.append(_CELL_ESCAPES.get(following, char + following))
        else:
            decoded.append(char)
    return ''.join(decoded)


def _split_cells(line):
    """Split one table row into cells; return [] for blank and comment rows."""
    line = line.rstrip('\r\n')
    content = line.strip()
    if not content or content.startswith('#'):
        return []
    cells = line.split('\t')
    # Stray leading/trailing tabs produce empty cells; drop them.  The row has
    # non-whitespace content, so at least one cell survives.
    while not cells[0]:
        cells.pop(0)
    while not cells[-1]:
        cells.pop()
    # Trim padding around ordinary cells, but keep a cell that consists only
    # of whitespace: it denotes a literal whitespace token (e.g. a space).
    return [cell.strip() or cell for cell in cells]


def read_hardcoded(stream):
    """Parse the tab-separated table of hard-coded tokenizer special cases.

    Each row is ``surface<TAB>form<TAB>lemma<TAB>pos``; trailing columns may
    be omitted.  A surface form that splits into several tokens lists one
    value per token in every column, separated by ``|``.  Columns are zipped
    together, so values without a counterpart in every column are ignored.
    Blank rows and rows starting with ``#`` are skipped.  The escapes ``\\n``,
    ``\\t`` and ``\\"`` are decoded, and a cell made only of whitespace is
    kept verbatim, so whitespace tokens can be listed.

    Args:
        stream: iterable of text lines (e.g. an open file).

    Returns:
        dict mapping each surface form to a list with one dict per token,
        keyed by the names in FIELDNAMES; empty values are left out.
    """
    hc_specials = {}
    for line in stream:
        cells = _split_cells(line)
        if not cells:
            continue
        key = _unescape_cell(cells[0])
        # A row with only a key still yields a single (empty) token entry.
        value_cells = cells[1:] or ['']
        columns = [[_unescape_cell(part) for part in cell.split('|')]
                   for cell in value_cells]
        # zip() pairs values with field names; surplus columns beyond
        # FIELDNAMES are dropped rather than stored under a None key.
        hc_specials[key] = [
            {name: value for name, value in zip(FIELDNAMES, annotation) if value}
            for annotation in zip(*columns)
        ]
    return hc_specials
def generate_specials():
specials = {}
@ -303,7 +321,10 @@ def generate_specials():
specials[special] = create_entry(token, endings, capitalize=True, remove_contractions=True)
# add in hardcoded specials
specials = dict(specials, **hardcoded_specials)
# changed it so it generates them from a file
with io.open('abbrev.de.tab','r',encoding='utf8') as abbrev_:
hc_specials = read_hardcoded(abbrev_)
specials = dict(specials, **hc_specials)
return specials

View File

@ -1,3 +1,6 @@
\.\.\.
(?<=[a-z])\.(?=[A-Z])
(?<=[a-zA-Z])-(?=[a-zA-z])
(?<=[a-zöäüßA-ZÖÄÜ"]):(?=[a-zöäüßA-ZÖÄÜ])
(?<=[a-zöäüßA-ZÖÄÜ"])>(?=[a-zöäüßA-ZÖÄÜ])
(?<=[a-zöäüßA-ZÖÄÜ"])<(?=[a-zöäüßA-ZÖÄÜ])
(?<=[a-zöäüßA-ZÖÄÜ"])=(?=[a-zöäüßA-ZÖÄÜ])

View File

@ -5,6 +5,7 @@
{
*
<
>
$
£
@ -20,3 +21,7 @@ a-
....
...
»
_
§

View File

@ -1,27 +1,4 @@
{
"\t": [
{
"F": "\t",
"pos": "SP"
}
],
"\n": [
{
"F": "\n",
"pos": "SP"
}
],
" ": [
{
"F": " ",
"pos": "SP"
}
],
"\")": [
{
"F": "\")"
}
],
"''": [
{
"F": "''"
@ -217,6 +194,11 @@
"F": "<333"
}
],
"<space>": [
{
"F": "SP"
}
],
"=)": [
{
"F": "=)"
@ -267,6 +249,16 @@
"F": "Abk."
}
],
"Abs.": [
{
"F": "Abs."
}
],
"Abt.": [
{
"F": "Abt."
}
],
"Apr.": [
{
"F": "Apr."
@ -277,6 +269,26 @@
"F": "Aug."
}
],
"B.A.": [
{
"F": "B.A."
}
],
"B.Sc.": [
{
"F": "B.Sc."
}
],
"Bd.": [
{
"F": "Bd."
}
],
"Betr.": [
{
"F": "Betr."
}
],
"Bf.": [
{
"F": "Bf."
@ -292,6 +304,11 @@
"F": "Biol."
}
],
"Bsp.": [
{
"F": "Bsp."
}
],
"Chr.": [
{
"F": "Chr."
@ -342,6 +359,16 @@
"F": "Dr."
}
],
"Fa.": [
{
"F": "Fa."
}
],
"Fam.": [
{
"F": "Fam."
}
],
"Feb.": [
{
"F": "Feb."
@ -387,6 +414,16 @@
"F": "Hrgs."
}
],
"Hrn.": [
{
"F": "Hrn."
}
],
"Hrsg.": [
{
"F": "Hrsg."
}
],
"Ing.": [
{
"F": "Ing."
@ -397,11 +434,21 @@
"F": "Jan."
}
],
"Jh.": [
{
"F": "Jh."
}
],
"Jhd.": [
{
"F": "Jhd."
}
],
"Jr.": [
{
"F": "Jr."
}
],
"Jul.": [
{
"F": "Jul."
@ -412,21 +459,61 @@
"F": "Jun."
}
],
"K.O.": [
{
"F": "K.O."
}
],
"L.A.": [
{
"F": "L.A."
}
],
"M.A.": [
{
"F": "M.A."
}
],
"M.Sc.": [
{
"F": "M.Sc."
}
],
"Mi.": [
{
"F": "Mi."
}
],
"Mio.": [
{
"F": "Mio."
}
],
"Mo.": [
{
"F": "Mo."
}
],
"Mr.": [
{
"F": "Mr."
}
],
"Mrd.": [
{
"F": "Mrd."
}
],
"Mrz.": [
{
"F": "Mrz."
}
],
"MwSt.": [
{
"F": "MwSt."
}
],
"M\u00e4r.": [
{
"F": "M\u00e4r."
@ -452,16 +539,31 @@
"F": "Nr."
}
],
"O.K.": [
{
"F": "O.K."
}
],
"Okt.": [
{
"F": "Okt."
}
],
"Orig.": [
{
"F": "Orig."
}
],
"P.S.": [
{
"F": "P.S."
}
],
"Pkt.": [
{
"F": "Pkt."
}
],
"Prof.": [
{
"F": "Prof."
@ -472,6 +574,11 @@
"F": "R.I.P."
}
],
"Red.": [
{
"F": "Red."
}
],
"S'": [
{
"F": "S'",
@ -503,6 +610,41 @@
"F": "St."
}
],
"Std.": [
{
"F": "Std."
}
],
"Str.": [
{
"F": "Str."
}
],
"Tel.": [
{
"F": "Tel."
}
],
"Tsd.": [
{
"F": "Tsd."
}
],
"U.S.": [
{
"F": "U.S."
}
],
"U.S.A.": [
{
"F": "U.S.A."
}
],
"U.S.S.": [
{
"F": "U.S.S."
}
],
"Univ.": [
{
"F": "Univ."
@ -513,6 +655,30 @@
"F": "V_V"
}
],
"Vol.": [
{
"F": "Vol."
}
],
"\\\")": [
{
"F": "\\\")"
}
],
"\\n": [
{
"F": "\\n",
"L": "<nl>",
"pos": "SP"
}
],
"\\t": [
{
"F": "\\t",
"L": "<tab>",
"pos": "SP"
}
],
"^_^": [
{
"F": "^_^"
@ -528,6 +694,11 @@
"F": "a.D."
}
],
"a.M.": [
{
"F": "a.M."
}
],
"a.Z.": [
{
"F": "a.Z."
@ -548,9 +719,15 @@
"F": "al."
}
],
"allg.": [
{
"F": "allg."
}
],
"auf'm": [
{
"F": "auf"
"F": "auf",
"L": "auf"
},
{
"F": "'m",
@ -572,11 +749,31 @@
"F": "biol."
}
],
"bspw.": [
{
"F": "bspw."
}
],
"bzgl.": [
{
"F": "bzgl."
}
],
"bzw.": [
{
"F": "bzw."
}
],
"c.": [
{
"F": "c."
}
],
"ca.": [
{
"F": "ca."
}
],
"co.": [
{
"F": "co."
@ -587,9 +784,20 @@
"F": "d."
}
],
"d.h.": [
{
"F": "d.h."
}
],
"dgl.": [
{
"F": "dgl."
}
],
"du's": [
{
"F": "du"
"F": "du",
"L": "du"
},
{
"F": "'s",
@ -611,19 +819,35 @@
"F": "e.g."
}
],
"ebd.": [
{
"F": "ebd."
}
],
"ehem.": [
{
"F": "ehem."
}
],
"eigtl.": [
{
"F": "eigtl."
}
],
"engl.": [
{
"F": "engl."
}
],
"entspr.": [
{
"F": "entspr."
}
],
"er's": [
{
"F": "er"
"F": "er",
"L": "er"
},
{
"F": "'s",
@ -640,11 +864,26 @@
"F": "etc."
}
],
"ev.": [
{
"F": "ev."
}
],
"evtl.": [
{
"F": "evtl."
}
],
"f.": [
{
"F": "f."
}
],
"frz.": [
{
"F": "frz."
}
],
"g.": [
{
"F": "g."
@ -660,6 +899,11 @@
"F": "gegr."
}
],
"gem.": [
{
"F": "gem."
}
],
"ggf.": [
{
"F": "ggf."
@ -687,23 +931,39 @@
],
"hinter'm": [
{
"F": "hinter"
"F": "hinter",
"L": "hinter"
},
{
"F": "'m",
"L": "dem"
}
],
"hrsg.": [
{
"F": "hrsg."
}
],
"i.": [
{
"F": "i."
}
],
"i.A.": [
{
"F": "i.A."
}
],
"i.G.": [
{
"F": "i.G."
}
],
"i.O.": [
{
"F": "i.O."
}
],
"i.Tr.": [
{
"F": "i.Tr."
@ -714,6 +974,11 @@
"F": "i.V."
}
],
"i.d.R.": [
{
"F": "i.d.R."
}
],
"i.e.": [
{
"F": "i.e."
@ -721,7 +986,8 @@
],
"ich's": [
{
"F": "ich"
"F": "ich",
"L": "ich"
},
{
"F": "'s",
@ -730,7 +996,8 @@
],
"ihr's": [
{
"F": "ihr"
"F": "ihr",
"L": "ihr"
},
{
"F": "'s",
@ -757,6 +1024,11 @@
"F": "j."
}
],
"jr.": [
{
"F": "jr."
}
],
"jun.": [
{
"F": "jun."
@ -772,11 +1044,21 @@
"F": "k."
}
],
"kath.": [
{
"F": "kath."
}
],
"l.": [
{
"F": "l."
}
],
"lat.": [
{
"F": "lat."
}
],
"lt.": [
{
"F": "lt."
@ -787,11 +1069,46 @@
"F": "m."
}
],
"m.E.": [
{
"F": "m.E."
}
],
"m.M.": [
{
"F": "m.M."
}
],
"max.": [
{
"F": "max."
}
],
"min.": [
{
"F": "min."
}
],
"mind.": [
{
"F": "mind."
}
],
"mtl.": [
{
"F": "mtl."
}
],
"n.": [
{
"F": "n."
}
],
"n.Chr.": [
{
"F": "n.Chr."
}
],
"nat.": [
{
"F": "nat."
@ -807,6 +1124,31 @@
"F": "o.O"
}
],
"o.a.": [
{
"F": "o.a."
}
],
"o.g.": [
{
"F": "o.g."
}
],
"o.k.": [
{
"F": "o.k."
}
],
"o.\u00c4.": [
{
"F": "o.\u00c4."
}
],
"o.\u00e4.": [
{
"F": "o.\u00e4."
}
],
"o_O": [
{
"F": "o_O"
@ -817,6 +1159,11 @@
"F": "o_o"
}
],
"orig.": [
{
"F": "orig."
}
],
"p.": [
{
"F": "p."
@ -827,6 +1174,21 @@
"F": "p.a."
}
],
"p.s.": [
{
"F": "p.s."
}
],
"pers.": [
{
"F": "pers."
}
],
"phil.": [
{
"F": "phil."
}
],
"q.": [
{
"F": "q."
@ -847,6 +1209,11 @@
"F": "rer."
}
],
"r\u00f6m.": [
{
"F": "r\u00f6m."
}
],
"s'": [
{
"F": "s'",
@ -858,6 +1225,11 @@
"F": "s."
}
],
"s.o.": [
{
"F": "s.o."
}
],
"sen.": [
{
"F": "sen."
@ -865,23 +1237,49 @@
],
"sie's": [
{
"F": "sie"
"F": "sie",
"L": "sie"
},
{
"F": "'s",
"L": "es"
}
],
"sog.": [
{
"F": "sog."
}
],
"std.": [
{
"F": "std."
}
],
"stellv.": [
{
"F": "stellv."
}
],
"t.": [
{
"F": "t."
}
],
"t\u00e4gl.": [
{
"F": "t\u00e4gl."
}
],
"u.": [
{
"F": "u."
}
],
"u.U.": [
{
"F": "u.U."
}
],
"u.a.": [
{
"F": "u.a."
@ -892,28 +1290,75 @@
"F": "u.s.w."
}
],
"u.v.m.": [
{
"F": "u.v.m."
}
],
"unter'm": [
{
"F": "unter"
"F": "unter",
"L": "unter"
},
{
"F": "'m",
"L": "dem"
}
],
"usf.": [
{
"F": "usf."
}
],
"usw.": [
{
"F": "usw."
}
],
"uvm.": [
{
"F": "uvm."
}
],
"v.": [
{
"F": "v."
}
],
"v.Chr.": [
{
"F": "v.Chr."
}
],
"v.a.": [
{
"F": "v.a."
}
],
"v.l.n.r.": [
{
"F": "v.l.n.r."
}
],
"vgl.": [
{
"F": "vgl."
}
],
"vllt.": [
{
"F": "vllt."
}
],
"vlt.": [
{
"F": "vlt."
}
],
"vor'm": [
{
"F": "vor"
"F": "vor",
"L": "vor"
},
{
"F": "'m",
@ -932,13 +1377,19 @@
],
"wir's": [
{
"F": "wir"
"F": "wir",
"L": "wir"
},
{
"F": "'s",
"L": "es"
}
],
"wiss.": [
{
"F": "wiss."
}
],
"x.": [
{
"F": "x."
@ -969,19 +1420,60 @@
"F": "z.B."
}
],
"z.Bsp.": [
{
"F": "z.Bsp."
}
],
"z.T.": [
{
"F": "z.T."
}
],
"z.Z.": [
{
"F": "z.Z."
}
],
"z.Zt.": [
{
"F": "z.Zt."
}
],
"z.b.": [
{
"F": "z.b."
}
],
"zzgl.": [
{
"F": "zzgl."
}
],
"\u00e4.": [
{
"F": "\u00e4."
}
],
"\u00f6.": [
{
"F": "\u00f6."
}
],
"\u00f6sterr.": [
{
"F": "\u00f6sterr."
}
],
"\u00fc.": [
{
"F": "\u00fc."
}
],
"\u00fcber'm": [
{
"F": "\u00fcber"
"F": "\u00fcber",
"L": "\u00fcber"
},
{
"F": "'m",

View File

@ -13,14 +13,61 @@
;
'
«
_
''
's
'S
s
S
°
\.\.
\.\.\.
\.\.\.\.
(?<=[a-z0-9)\]"'%\)])\.
(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\.
\-\-
´
(?<=[0-9])km²
(?<=[0-9])m²
(?<=[0-9])cm²
(?<=[0-9])mm²
(?<=[0-9])km³
(?<=[0-9])m³
(?<=[0-9])cm³
(?<=[0-9])mm³
(?<=[0-9])ha
(?<=[0-9])km
(?<=[0-9])m
(?<=[0-9])cm
(?<=[0-9])mm
(?<=[0-9])µm
(?<=[0-9])nm
(?<=[0-9])yd
(?<=[0-9])in
(?<=[0-9])ft
(?<=[0-9])kg
(?<=[0-9])g
(?<=[0-9])mg
(?<=[0-9])µg
(?<=[0-9])t
(?<=[0-9])lb
(?<=[0-9])oz
(?<=[0-9])m/s
(?<=[0-9])km/h
(?<=[0-9])mph
(?<=[0-9])°C
(?<=[0-9])°K
(?<=[0-9])°F
(?<=[0-9])hPa
(?<=[0-9])Pa
(?<=[0-9])mbar
(?<=[0-9])mb
(?<=[0-9])T
(?<=[0-9])G
(?<=[0-9])M
(?<=[0-9])K
(?<=[0-9])kb

View File

@ -153,10 +153,8 @@ cdef class Tagger:
@classmethod
def from_package(cls, pkg, vocab):
# TODO: templates.json deprecated? not present in latest package
templates = cls.default_templates()
# templates = package.load_utf8(json.load,
# 'pos', 'templates.json',
# default=cls.default_templates())
# templates = cls.default_templates()
templates = pkg.load_json(('pos', 'templates.json'), default=cls.default_templates())
model = TaggerModel(templates)
if pkg.has_file('pos', 'model'):
@ -221,7 +219,7 @@ cdef class Tagger:
def train(self, Doc tokens, object gold_tag_strs):
assert len(tokens) == len(gold_tag_strs)
for tag in gold_tag_strs:
if tag not in self.tag_names:
if tag != None and tag not in self.tag_names:
msg = ("Unrecognized gold tag: %s. tag_map.json must contain all"
"gold tags, to maintain coarse-grained mapping.")
raise ValueError(msg % tag)
@ -234,10 +232,9 @@ cdef class Tagger:
nr_feat=self.model.nr_feat)
for i in range(tokens.length):
self.model.set_featuresC(&eg.c, tokens.c, i)
eg.set_label(golds[i])
eg.costs = [ 1 if golds[i] not in (c, -1) else 0 for c in xrange(eg.nr_class) ]
self.model.set_scoresC(eg.c.scores,
eg.c.features, eg.c.nr_feat)
self.model.updateC(&eg.c)
self.vocab.morphology.assign_tag(&tokens.c[i], eg.guess)