spaCy/lang_data/de/generate_specials.py
Wolfgang Seeker eae35e9b27 add tokenizer files for German, add/change code to train German POS tagger
- add files to specify rules for German tokenization
- change generate_specials.py to generate from an external file (abbrev.de.tab)
- copy gazetteer.json from lang_data/en/

- init_model.py
	- change doc freq threshold to 0
- add train_german_tagger.py
	- expects CoNLL09-formatted input
2016-02-18 13:24:20 +01:00

# coding=utf8
import json
import io
import itertools
contractions = {}
# contains the lemmas, parts of speech, number, and tenspect of
# potential tokens generated after splitting contractions off
token_properties = {}
# contains starting tokens with their potential contractions
# each potential contraction has a list of exceptions
# lower - don't generate the lowercase version
# upper - don't generate the uppercase version
# contrLower - don't generate the lowercase version with apostrophe (') removed
# contrUpper - don't generate the uppercase version with apostrophe (') removed
# for example, we don't want to create the word "hell" or "Hell" from "he" + "'ll" so
# we add "contrLower" and "contrUpper" to the exceptions list
starting_tokens = {}
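# illustrative sketch only -- this German data file keeps both dicts empty and
# reads its exceptions from abbrev.de.tab instead; a hypothetical pair of entries
#
#   token_properties = {"er": {"L": "er"}, "'s": {"L": "es"}}
#   starting_tokens = {"er": {"'s": []}}
#
# would make generate_specials() emit the specials "er's", "Er's", "ers" and
# "Ers", since the empty exception list suppresses none of the four variants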
# other specials that don't really have contractions
# so they are hardcoded
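# each value is a list of per-token dicts: "F" is the surface form, "L" the
# lemma and "pos" a tag (e.g. "SP" for whitespace) -- the same F/L/pos fields
# that read_hardcoded() fills from abbrev.de.tab below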
hardcoded_specials = {
"''": [{"F": "''"}],
"\")": [{"F": "\")"}],
"\n": [{"F": "\n", "pos": "SP"}],
"\t": [{"F": "\t", "pos": "SP"}],
" ": [{"F": " ", "pos": "SP"}],
# example: Wie geht's?
"'s": [{"F": "'s", "L": "es"}],
"'S": [{"F": "'S", "L": "es"}],
# example: Haste mal 'nen Euro?
"'n": [{"F": "'n", "L": "ein"}],
"'ne": [{"F": "'ne", "L": "eine"}],
"'nen": [{"F": "'nen", "L": "einen"}],
# example: Kommen S' nur herein!
"s'": [{"F": "s'", "L": "sie"}],
"S'": [{"F": "S'", "L": "sie"}],
# example: Da haben wir's!
"ich's": [{"F": "ich"}, {"F": "'s", "L": "es"}],
"du's": [{"F": "du"}, {"F": "'s", "L": "es"}],
"er's": [{"F": "er"}, {"F": "'s", "L": "es"}],
"sie's": [{"F": "sie"}, {"F": "'s", "L": "es"}],
"wir's": [{"F": "wir"}, {"F": "'s", "L": "es"}],
"ihr's": [{"F": "ihr"}, {"F": "'s", "L": "es"}],
# example: Die Katze auf'm Dach.
"auf'm": [{"F": "auf"}, {"F": "'m", "L": "dem"}],
"unter'm": [{"F": "unter"}, {"F": "'m", "L": "dem"}],
"über'm": [{"F": "über"}, {"F": "'m", "L": "dem"}],
"vor'm": [{"F": "vor"}, {"F": "'m", "L": "dem"}],
"hinter'm": [{"F": "hinter"}, {"F": "'m", "L": "dem"}],
# persons
"Fr.": [{"F": "Fr."}],
"Hr.": [{"F": "Hr."}],
"Frl.": [{"F": "Frl."}],
"Prof.": [{"F": "Prof."}],
"Dr.": [{"F": "Dr."}],
"St.": [{"F": "St."}],
"Hrgs.": [{"F": "Hrgs."}],
"Hg.": [{"F": "Hg."}],
"a.Z.": [{"F": "a.Z."}],
"a.D.": [{"F": "a.D."}],
"A.D.": [{"F": "A.D."}],
"h.c.": [{"F": "h.c."}],
"jun.": [{"F": "jun."}],
"sen.": [{"F": "sen."}],
"rer.": [{"F": "rer."}],
"Dipl.": [{"F": "Dipl."}],
"Ing.": [{"F": "Ing."}],
"Dipl.-Ing.": [{"F": "Dipl.-Ing."}],
# companies
"Co.": [{"F": "Co."}],
"co.": [{"F": "co."}],
"Cie.": [{"F": "Cie."}],
"A.G.": [{"F": "A.G."}],
"G.m.b.H.": [{"F": "G.m.b.H."}],
"i.G.": [{"F": "i.G."}],
"e.V.": [{"F": "e.V."}],
# popular german abbreviations
"ggü.": [{"F": "ggü."}],
"ggf.": [{"F": "ggf."}],
"ggfs.": [{"F": "ggfs."}],
"Gebr.": [{"F": "Gebr."}],
"geb.": [{"F": "geb."}],
"gegr.": [{"F": "gegr."}],
"erm.": [{"F": "erm."}],
"engl.": [{"F": "engl."}],
"ehem.": [{"F": "ehem."}],
"Biol.": [{"F": "Biol."}],
"biol.": [{"F": "biol."}],
"Abk.": [{"F": "Abk."}],
"Abb.": [{"F": "Abb."}],
"abzgl.": [{"F": "abzgl."}],
"Hbf.": [{"F": "Hbf."}],
"Bhf.": [{"F": "Bhf."}],
"Bf.": [{"F": "Bf."}],
"i.V.": [{"F": "i.V."}],
"inkl.": [{"F": "inkl."}],
"insb.": [{"F": "insb."}],
"z.B.": [{"F": "z.B."}],
"i.Tr.": [{"F": "i.Tr."}],
"Jhd.": [{"F": "Jhd."}],
"jur.": [{"F": "jur."}],
"lt.": [{"F": "lt."}],
"nat.": [{"F": "nat."}],
"u.a.": [{"F": "u.a."}],
"u.s.w.": [{"F": "u.s.w."}],
"Nr.": [{"F": "Nr."}],
"Univ.": [{"F": "Univ."}],
"vgl.": [{"F": "vgl."}],
"zzgl.": [{"F": "zzgl."}],
"z.Z.": [{"F": "z.Z."}],
"betr.": [{"F": "betr."}],
"ehem.": [{"F": "ehem."}],
# popular latin abbreviations
"vs.": [{"F": "vs."}],
"adv.": [{"F": "adv."}],
"Chr.": [{"F": "Chr."}],
"A.C.": [{"F": "A.C."}],
"A.D.": [{"F": "A.D."}],
"e.g.": [{"F": "e.g."}],
"i.e.": [{"F": "i.e."}],
"al.": [{"F": "al."}],
"p.a.": [{"F": "p.a."}],
"P.S.": [{"F": "P.S."}],
"q.e.d.": [{"F": "q.e.d."}],
"R.I.P.": [{"F": "R.I.P."}],
"etc.": [{"F": "etc."}],
"incl.": [{"F": "incl."}],
# popular english abbreviations
"D.C.": [{"F": "D.C."}],
"N.Y.": [{"F": "N.Y."}],
"N.Y.C.": [{"F": "N.Y.C."}],
# dates
"Jan.": [{"F": "Jan."}],
"Feb.": [{"F": "Feb."}],
"Mrz.": [{"F": "Mrz."}],
"Mär.": [{"F": "Mär."}],
"Apr.": [{"F": "Apr."}],
"Jun.": [{"F": "Jun."}],
"Jul.": [{"F": "Jul."}],
"Aug.": [{"F": "Aug."}],
"Sep.": [{"F": "Sep."}],
"Sept.": [{"F": "Sept."}],
"Okt.": [{"F": "Okt."}],
"Nov.": [{"F": "Nov."}],
"Dez.": [{"F": "Dez."}],
"Mo.": [{"F": "Mo."}],
"Di.": [{"F": "Di."}],
"Mi.": [{"F": "Mi."}],
"Do.": [{"F": "Do."}],
"Fr.": [{"F": "Fr."}],
"Sa.": [{"F": "Sa."}],
"So.": [{"F": "So."}],
# smileys
":)": [{"F": ":)"}],
"<3": [{"F": "<3"}],
";)": [{"F": ";)"}],
"(:": [{"F": "(:"}],
":(": [{"F": ":("}],
"-_-": [{"F": "-_-"}],
"=)": [{"F": "=)"}],
":/": [{"F": ":/"}],
":>": [{"F": ":>"}],
";-)": [{"F": ";-)"}],
":Y": [{"F": ":Y"}],
":P": [{"F": ":P"}],
":-P": [{"F": ":-P"}],
":3": [{"F": ":3"}],
"=3": [{"F": "=3"}],
"xD": [{"F": "xD"}],
"^_^": [{"F": "^_^"}],
"=]": [{"F": "=]"}],
"=D": [{"F": "=D"}],
"<333": [{"F": "<333"}],
":))": [{"F": ":))"}],
":0": [{"F": ":0"}],
"-__-": [{"F": "-__-"}],
"xDD": [{"F": "xDD"}],
"o_o": [{"F": "o_o"}],
"o_O": [{"F": "o_O"}],
"V_V": [{"F": "V_V"}],
"=[[": [{"F": "=[["}],
"<33": [{"F": "<33"}],
";p": [{"F": ";p"}],
";D": [{"F": ";D"}],
";-p": [{"F": ";-p"}],
";(": [{"F": ";("}],
":p": [{"F": ":p"}],
":]": [{"F": ":]"}],
":O": [{"F": ":O"}],
":-/": [{"F": ":-/"}],
":-)": [{"F": ":-)"}],
":(((": [{"F": ":((("}],
":((": [{"F": ":(("}],
":')": [{"F": ":')"}],
"(^_^)": [{"F": "(^_^)"}],
"(=": [{"F": "(="}],
"o.O": [{"F": "o.O"}],
"a.": [{"F": "a."}],
"b.": [{"F": "b."}],
"c.": [{"F": "c."}],
"d.": [{"F": "d."}],
"e.": [{"F": "e."}],
"f.": [{"F": "f."}],
"g.": [{"F": "g."}],
"h.": [{"F": "h."}],
"i.": [{"F": "i."}],
"j.": [{"F": "j."}],
"k.": [{"F": "k."}],
"l.": [{"F": "l."}],
"m.": [{"F": "m."}],
"n.": [{"F": "n."}],
"o.": [{"F": "o."}],
"p.": [{"F": "p."}],
"q.": [{"F": "q."}],
"r.": [{"F": "r."}],
"s.": [{"F": "s."}],
"t.": [{"F": "t."}],
"u.": [{"F": "u."}],
"v.": [{"F": "v."}],
"w.": [{"F": "w."}],
"x.": [{"F": "x."}],
"y.": [{"F": "y."}],
"z.": [{"F": "z."}],
}


def get_double_contractions(ending):
    endings = []

    ends_with_contraction = any([ending.endswith(contraction) for contraction in contractions])
    while ends_with_contraction:
        for contraction in contractions:
            if ending.endswith(contraction):
                endings.append(contraction)
                # cut off the matched suffix; str.rstrip() would strip characters,
                # not the suffix, and could eat too much for stacked contractions
                ending = ending[:-len(contraction)]
        ends_with_contraction = any([ending.endswith(contraction) for contraction in contractions])

    endings.reverse() # reverse because the last ending is put in the list first
    return endings


def get_token_properties(token, capitalize=False, remove_contractions=False):
    props = dict(token_properties.get(token)) # ensure we copy the dict so we can add the "F" prop
    if capitalize:
        token = token.capitalize()
    if remove_contractions:
        token = token.replace("'", "")
    props["F"] = token
    return props


def create_entry(token, endings, capitalize=False, remove_contractions=False):
    properties = []
    properties.append(get_token_properties(token, capitalize=capitalize, remove_contractions=remove_contractions))
    for e in endings:
        properties.append(get_token_properties(e, remove_contractions=remove_contractions))
    return properties


FIELDNAMES = ['F','L','pos']
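

# read_hardcoded() parses tab-separated lines; abbrev.de.tab itself is not shown
# here, but given the parsing below a line would look roughly like
#
#   auf'm<TAB>auf|'m<TAB>auf|dem
#
# i.e. the special string, then one column per F/L/pos field, with the values
# for the individual tokens separated by "|" (an assumed example, not a line
# copied from the real file)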
def read_hardcoded(stream):
    hc_specials = {}
    for line in stream:
        line = line.strip()
        if line.startswith('#') or not line:
            continue
        key, _, rest = line.partition('\t')
        values = []
        for annotation in zip(*[e.split('|') for e in rest.split('\t')]):
            values.append({k: v for k, v in itertools.izip_longest(FIELDNAMES, annotation) if v})
        hc_specials[key] = values
    return hc_specials


def generate_specials():
    specials = {}

    for token in starting_tokens:
        possible_endings = starting_tokens[token]
        for ending in possible_endings:

            endings = []
            if ending.count("'") > 1:
                endings.extend(get_double_contractions(ending))
            else:
                endings.append(ending)

            exceptions = possible_endings[ending]

            if "lower" not in exceptions:
                special = token + ending
                specials[special] = create_entry(token, endings)

            if "upper" not in exceptions:
                special = token.capitalize() + ending
                specials[special] = create_entry(token, endings, capitalize=True)

            if "contrLower" not in exceptions:
                special = token + ending.replace("'", "")
                specials[special] = create_entry(token, endings, remove_contractions=True)

            if "contrUpper" not in exceptions:
                special = token.capitalize() + ending.replace("'", "")
                specials[special] = create_entry(token, endings, capitalize=True, remove_contractions=True)

    # add in the hardcoded specials; for German they are read from an external
    # file (abbrev.de.tab) rather than defined inline
    with io.open('abbrev.de.tab', 'r', encoding='utf8') as abbrev_:
        hc_specials = read_hardcoded(abbrev_)
    specials = dict(specials, **hc_specials)

    return specials


if __name__ == "__main__":
    specials = generate_specials()

    with open("specials.json", "w") as f:
        json.dump(specials, f, sort_keys=True, indent=4, separators=(',', ': '))