From 9ecb98f30e8ae96b5b7b7a34fca76a10c3b60223 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Tue, 22 Sep 2015 11:56:29 +0200 Subject: [PATCH 1/2] basic german rules --- lang_data/de/generate_specials.py | 313 ++++++++ lang_data/de/morphs.json | 71 ++ lang_data/de/prefix.txt | 1 + lang_data/de/specials.json | 1138 +++++++++++++++++++++++++---- 4 files changed, 1375 insertions(+), 148 deletions(-) create mode 100644 lang_data/de/generate_specials.py diff --git a/lang_data/de/generate_specials.py b/lang_data/de/generate_specials.py new file mode 100644 index 000000000..44e674800 --- /dev/null +++ b/lang_data/de/generate_specials.py @@ -0,0 +1,313 @@ +# coding=utf8 +import json + +contractions = {} + +# contains the lemmas, parts of speech, number, and tenspect of +# potential tokens generated after splitting contractions off +token_properties = {} + +# contains starting tokens with their potential contractions +# each potential contraction has a list of exceptions + # lower - don't generate the lowercase version + # upper - don't generate the uppercase version + # contrLower - don't generate the lowercase version with apostrophe (') removed + # contrUpper - dont' generate the uppercase version with apostrophe (') removed +# for example, we don't want to create the word "hell" or "Hell" from "he" + "'ll" so +# we add "contrLower" and "contrUpper" to the exceptions list +starting_tokens = {} + +# other specials that don't really have contractions +# so they are hardcoded +hardcoded_specials = { + "''": [{"F": "''"}], + "\")": [{"F": "\")"}], + "\n": [{"F": "\n", "pos": "SP"}], + "\t": [{"F": "\t", "pos": "SP"}], + " ": [{"F": " ", "pos": "SP"}], + + # example: Wie geht's? + "'s": [{"F": "'s", "L": "es"}], + "'S": [{"F": "'S", "L": "es"}], + + # example: Haste mal 'nen Euro? + "'n": [{"F": "'n", "L": "ein"}], + "'ne": [{"F": "'ne", "L": "eine"}], + "'nen": [{"F": "'nen", "L": "einen"}], + + # example: Kommen S’ nur herein! + "s'": [{"F": "s'", "L": "sie"}], + "S'": [{"F": "S'", "L": "sie"}], + + # example: Da haben wir's! + "ich's": [{"F": "ich"}, {"F": "'s", "L": "es"}], + "du's": [{"F": "du"}, {"F": "'s", "L": "es"}], + "er's": [{"F": "er"}, {"F": "'s", "L": "es"}], + "sie's": [{"F": "sie"}, {"F": "'s", "L": "es"}], + "wir's": [{"F": "wir"}, {"F": "'s", "L": "es"}], + "ihr's": [{"F": "ihr"}, {"F": "'s", "L": "es"}], + + # example: Die katze auf'm dach. + "auf'm": [{"F": "auf"}, {"F": "'m", "L": "dem"}], + "unter'm": [{"F": "unter"}, {"F": "'m", "L": "dem"}], + "über'm": [{"F": "über"}, {"F": "'m", "L": "dem"}], + "vor'm": [{"F": "vor"}, {"F": "'m", "L": "dem"}], + "hinter'm": [{"F": "hinter"}, {"F": "'m", "L": "dem"}], + + # persons + "Fr.": [{"F": "Fr."}], + "Hr.": [{"F": "Hr."}], + "Frl.": [{"F": "Frl."}], + "Prof.": [{"F": "Prof."}], + "Dr.": [{"F": "Dr."}], + "St.": [{"F": "St."}], + "Hrgs.": [{"F": "Hrgs."}], + "Hg.": [{"F": "Hg."}], + "a.Z.": [{"F": "a.Z."}], + "a.D.": [{"F": "a.D."}], + "A.D.": [{"F": "A.D."}], + "h.c.": [{"F": "h.c."}], + "jun.": [{"F": "jun."}], + "sen.": [{"F": "sen."}], + "rer.": [{"F": "rer."}], + "Dipl.": [{"F": "Dipl."}], + "Ing.": [{"F": "Ing."}], + "Dipl.-Ing.": [{"F": "Dipl.-Ing."}], + + # companies + "Co.": [{"F": "Co."}], + "co.": [{"F": "co."}], + "Cie.": [{"F": "Cie."}], + "A.G.": [{"F": "A.G."}], + "G.m.b.H.": [{"F": "G.m.b.H."}], + "i.G.": [{"F": "i.G."}], + "e.V.": [{"F": "e.V."}], + + # popular german abbreviations + "ggü.": [{"F": "ggü."}], + "ggf.": [{"F": "ggf."}], + "ggfs.": [{"F": "ggfs."}], + "Gebr.": [{"F": "Gebr."}], + "geb.": [{"F": "geb."}], + "gegr.": [{"F": "gegr."}], + "erm.": [{"F": "erm."}], + "engl.": [{"F": "engl."}], + "ehem.": [{"F": "ehem."}], + "Biol.": [{"F": "Biol."}], + "biol.": [{"F": "biol."}], + "Abk.": [{"F": "Abk."}], + "Abb.": [{"F": "Abb."}], + "abzgl.": [{"F": "abzgl."}], + "Hbf.": [{"F": "Hbf."}], + "Bhf.": [{"F": "Bhf."}], + "Bf.": [{"F": "Bf."}], + "i.V.": [{"F": "i.V."}], + "inkl.": [{"F": "inkl."}], + "insb.": [{"F": "insb."}], + "z.B.": [{"F": "z.B."}], + "i.Tr.": [{"F": "i.Tr."}], + "Jhd.": [{"F": "Jhd."}], + "jur.": [{"F": "jur."}], + "lt.": [{"F": "lt."}], + "nat.": [{"F": "nat."}], + "u.a.": [{"F": "u.a."}], + "u.s.w.": [{"F": "u.s.w."}], + "Nr.": [{"F": "Nr."}], + "Univ.": [{"F": "Univ."}], + "vgl.": [{"F": "vgl."}], + "zzgl.": [{"F": "zzgl."}], + "z.Z.": [{"F": "z.Z."}], + "betr.": [{"F": "betr."}], + "ehem.": [{"F": "ehem."}], + + # popular latin abbreviations + "vs.": [{"F": "vs."}], + "adv.": [{"F": "adv."}], + "Chr.": [{"F": "Chr."}], + "A.C.": [{"F": "A.C."}], + "A.D.": [{"F": "A.D."}], + "e.g.": [{"F": "e.g."}], + "i.e.": [{"F": "i.e."}], + "al.": [{"F": "al."}], + "p.a.": [{"F": "p.a."}], + "P.S.": [{"F": "P.S."}], + "q.e.d.": [{"F": "q.e.d."}], + "R.I.P.": [{"F": "R.I.P."}], + "etc.": [{"F": "etc."}], + "incl.": [{"F": "incl."}], + + # popular english abbreviations + "D.C.": [{"F": "D.C."}], + "N.Y.": [{"F": "N.Y."}], + "N.Y.C.": [{"F": "N.Y.C."}], + + # dates + "Jan.": [{"F": "Jan."}], + "Feb.": [{"F": "Feb."}], + "Mrz.": [{"F": "Mrz."}], + "Mär.": [{"F": "Mär."}], + "Apr.": [{"F": "Apr."}], + "Jun.": [{"F": "Jun."}], + "Jul.": [{"F": "Jul."}], + "Aug.": [{"F": "Aug."}], + "Sep.": [{"F": "Sep."}], + "Sept.": [{"F": "Sept."}], + "Okt.": [{"F": "Okt."}], + "Nov.": [{"F": "Nov."}], + "Dez.": [{"F": "Dez."}], + "Mo.": [{"F": "Mo."}], + "Di.": [{"F": "Di."}], + "Mi.": [{"F": "Mi."}], + "Do.": [{"F": "Do."}], + "Fr.": [{"F": "Fr."}], + "Sa.": [{"F": "Sa."}], + "So.": [{"F": "So."}], + + # smileys + ":)": [{"F": ":)"}], + "<3": [{"F": "<3"}], + ";)": [{"F": ";)"}], + "(:": [{"F": "(:"}], + ":(": [{"F": ":("}], + "-_-": [{"F": "-_-"}], + "=)": [{"F": "=)"}], + ":/": [{"F": ":/"}], + ":>": [{"F": ":>"}], + ";-)": [{"F": ";-)"}], + ":Y": [{"F": ":Y"}], + ":P": [{"F": ":P"}], + ":-P": [{"F": ":-P"}], + ":3": [{"F": ":3"}], + "=3": [{"F": "=3"}], + "xD": [{"F": "xD"}], + "^_^": [{"F": "^_^"}], + "=]": [{"F": "=]"}], + "=D": [{"F": "=D"}], + "<333": [{"F": "<333"}], + ":))": [{"F": ":))"}], + ":0": [{"F": ":0"}], + "-__-": [{"F": "-__-"}], + "xDD": [{"F": "xDD"}], + "o_o": [{"F": "o_o"}], + "o_O": [{"F": "o_O"}], + "V_V": [{"F": "V_V"}], + "=[[": [{"F": "=[["}], + "<33": [{"F": "<33"}], + ";p": [{"F": ";p"}], + ";D": [{"F": ";D"}], + ";-p": [{"F": ";-p"}], + ";(": [{"F": ";("}], + ":p": [{"F": ":p"}], + ":]": [{"F": ":]"}], + ":O": [{"F": ":O"}], + ":-/": [{"F": ":-/"}], + ":-)": [{"F": ":-)"}], + ":(((": [{"F": ":((("}], + ":((": [{"F": ":(("}], + ":')": [{"F": ":')"}], + "(^_^)": [{"F": "(^_^)"}], + "(=": [{"F": "(="}], + "o.O": [{"F": "o.O"}], + + "a.": [{"F": "a."}], + "b.": [{"F": "b."}], + "c.": [{"F": "c."}], + "d.": [{"F": "d."}], + "e.": [{"F": "e."}], + "f.": [{"F": "f."}], + "g.": [{"F": "g."}], + "h.": [{"F": "h."}], + "i.": [{"F": "i."}], + "j.": [{"F": "j."}], + "k.": [{"F": "k."}], + "l.": [{"F": "l."}], + "m.": [{"F": "m."}], + "n.": [{"F": "n."}], + "o.": [{"F": "o."}], + "p.": [{"F": "p."}], + "q.": [{"F": "q."}], + "r.": [{"F": "r."}], + "s.": [{"F": "s."}], + "t.": [{"F": "t."}], + "u.": [{"F": "u."}], + "v.": [{"F": "v."}], + "w.": [{"F": "w."}], + "x.": [{"F": "x."}], + "y.": [{"F": "y."}], + "z.": [{"F": "z."}], +} + +def get_double_contractions(ending): + endings = [] + + ends_with_contraction = any([ending.endswith(contraction) for contraction in contractions]) + + while ends_with_contraction: + for contraction in contractions: + if ending.endswith(contraction): + endings.append(contraction) + ending = ending.rstrip(contraction) + ends_with_contraction = any([ending.endswith(contraction) for contraction in contractions]) + + endings.reverse() # reverse because the last ending is put in the list first + return endings + +def get_token_properties(token, capitalize=False, remove_contractions=False): + props = dict(token_properties.get(token)) # ensure we copy the dict so we can add the "F" prop + if capitalize: + token = token.capitalize() + if remove_contractions: + token = token.replace("'", "") + + props["F"] = token + return props + +def create_entry(token, endings, capitalize=False, remove_contractions=False): + + properties = [] + properties.append(get_token_properties(token, capitalize=capitalize, remove_contractions=remove_contractions)) + for e in endings: + properties.append(get_token_properties(e, remove_contractions=remove_contractions)) + return properties + +def generate_specials(): + + specials = {} + + for token in starting_tokens: + possible_endings = starting_tokens[token] + for ending in possible_endings: + + endings = [] + if ending.count("'") > 1: + endings.extend(get_double_contractions(ending)) + else: + endings.append(ending) + + exceptions = possible_endings[ending] + + if "lower" not in exceptions: + special = token + ending + specials[special] = create_entry(token, endings) + + if "upper" not in exceptions: + special = token.capitalize() + ending + specials[special] = create_entry(token, endings, capitalize=True) + + if "contrLower" not in exceptions: + special = token + ending.replace("'", "") + specials[special] = create_entry(token, endings, remove_contractions=True) + + if "contrUpper" not in exceptions: + special = token.capitalize() + ending.replace("'", "") + specials[special] = create_entry(token, endings, capitalize=True, remove_contractions=True) + + # add in hardcoded specials + specials = dict(specials, **hardcoded_specials) + + return specials + +if __name__ == "__main__": + specials = generate_specials() + with open("specials.json", "w") as f: + json.dump(specials, f, sort_keys=True, indent=4, separators=(',', ': ')) diff --git a/lang_data/de/morphs.json b/lang_data/de/morphs.json index e69de29bb..ae024add2 100644 --- a/lang_data/de/morphs.json +++ b/lang_data/de/morphs.json @@ -0,0 +1,71 @@ +{ + "PRP": { + "ich": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 1}, + "meiner": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 2}, + "mir": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 3}, + "mich": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 4}, + "du": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 1}, + "deiner": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 2}, + "dir": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 3}, + "dich": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 4}, + "er": {"L": "-PRON-", "person": 3, "number": 0, "gender": 1, "case": 1}, + "seiner": {"L": "-PRON-", "person": 3, "number": 0, "gender": 1, "case": 2}, + "ihm": {"L": "-PRON-", "person": 3, "number": 0, "gender": 1, "case": 3}, + "ihn": {"L": "-PRON-", "person": 3, "number": 0, "gender": 1, "case": 4}, + "sie": {"L": "-PRON-", "person": 3, "number": 0, "gender": 2, "case": 1}, + "ihrer": {"L": "-PRON-", "person": 3, "number": 0, "gender": 2, "case": 2}, + "ihr": {"L": "-PRON-", "person": 3, "number": 0, "gender": 2, "case": 3}, + "sie": {"L": "-PRON-", "person": 3, "number": 0, "gender": 2, "case": 4}, + "es": {"L": "-PRON-", "person": 3, "number": 0, "gender": 3, "case": 1}, + "seiner": {"L": "-PRON-", "person": 3, "number": 0, "gender": 3, "case": 2}, + "ihm": {"L": "-PRON-", "person": 3, "number": 0, "gender": 3, "case": 3}, + "es": {"L": "-PRON-", "person": 3, "number": 0, "gender": 3, "case": 4}, + "wir": {"L": "-PRON-", "person": 1, "number": 0, "gender": 0, "case": 1}, + "unser": {"L": "-PRON-", "person": 1, "number": 0, "gender": 0, "case": 2}, + "uns": {"L": "-PRON-", "person": 1, "number": 0, "gender": 0, "case": 3}, + "uns": {"L": "-PRON-", "person": 1, "number": 0, "gender": 0, "case": 4}, + "ihr": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 1}, + "euer": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 2}, + "euch": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 3}, + "euch": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 4}, + "sie": {"L": "-PRON-", "person": 3, "number": 0, "gender": 0, "case": 1}, + "ihrer": {"L": "-PRON-", "person": 3, "number": 0, "gender": 0, "case": 2}, + "ihnen": {"L": "-PRON-", "person": 3, "number": 0, "gender": 0, "case": 3}, + "sie": {"L": "-PRON-", "person": 3, "number": 0, "gender": 0, "case": 4} + }, + + "PRP$": { + "mein": {"L": "-PRON-", "person": 1, "number": 0, "gender": 0, "case": 1}, + "meines": {"L": "-PRON-", "person": 1, "number": 0, "gender": 0, "case": 2}, + "meinem": {"L": "-PRON-", "person": 1, "number": 0, "gender": 0, "case": 3}, + "meinen": {"L": "-PRON-", "person": 1, "number": 0, "gender": 0, "case": 4}, + "dein": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 1}, + "deines": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 2}, + "deinem": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 3}, + "deinen": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 4}, + "sein": {"L": "-PRON-", "person": 3, "number": 0, "gender": 1, "case": 1}, + "seines": {"L": "-PRON-", "person": 3, "number": 0, "gender": 1, "case": 2}, + "seinem": {"L": "-PRON-", "person": 3, "number": 0, "gender": 1, "case": 3}, + "seinen": {"L": "-PRON-", "person": 3, "number": 0, "gender": 1, "case": 4}, + "ihr": {"L": "-PRON-", "person": 3, "number": 0, "gender": 2, "case": 1}, + "ihrer": {"L": "-PRON-", "person": 3, "number": 0, "gender": 2, "case": 2}, + "ihrem": {"L": "-PRON-", "person": 3, "number": 0, "gender": 2, "case": 3}, + "ihren": {"L": "-PRON-", "person": 3, "number": 0, "gender": 2, "case": 4}, + "sein": {"L": "-PRON-", "person": 3, "number": 0, "gender": 3, "case": 1}, + "seines": {"L": "-PRON-", "person": 3, "number": 0, "gender": 3, "case": 2}, + "seinem": {"L": "-PRON-", "person": 3, "number": 0, "gender": 3, "case": 3}, + "seinen": {"L": "-PRON-", "person": 3, "number": 0, "gender": 3, "case": 4}, + "unser": {"L": "-PRON-", "person": 1, "number": 0, "gender": 0, "case": 1}, + "unseres": {"L": "-PRON-", "person": 1, "number": 0, "gender": 0, "case": 2}, + "unserem": {"L": "-PRON-", "person": 1, "number": 0, "gender": 0, "case": 3}, + "unseren": {"L": "-PRON-", "person": 1, "number": 0, "gender": 0, "case": 4}, + "euer": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 1}, + "eures": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 2}, + "eurem": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 3}, + "euren": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 4}, + "ihr": {"L": "-PRON-", "person": 3, "number": 0, "gender": 0, "case": 1}, + "ihres": {"L": "-PRON-", "person": 3, "number": 0, "gender": 0, "case": 2}, + "ihrem": {"L": "-PRON-", "person": 3, "number": 0, "gender": 0, "case": 3}, + "ihren": {"L": "-PRON-", "person": 3, "number": 0, "gender": 0, "case": 4} + } +} diff --git a/lang_data/de/prefix.txt b/lang_data/de/prefix.txt index 48c4fc549..1082bef7d 100644 --- a/lang_data/de/prefix.txt +++ b/lang_data/de/prefix.txt @@ -7,6 +7,7 @@ < $ £ +„ “ ' `` diff --git a/lang_data/de/specials.json b/lang_data/de/specials.json index 0e0986339..b8d084503 100644 --- a/lang_data/de/specials.json +++ b/lang_data/de/specials.json @@ -1,149 +1,991 @@ { -"a.m.": [{"F": "a.m."}], -"p.m.": [{"F": "p.m."}], - -"1a.m.": [{"F": "1"}, {"F": "a.m."}], -"2a.m.": [{"F": "2"}, {"F": "a.m."}], -"3a.m.": [{"F": "3"}, {"F": "a.m."}], -"4a.m.": [{"F": "4"}, {"F": "a.m."}], -"5a.m.": [{"F": "5"}, {"F": "a.m."}], -"6a.m.": [{"F": "6"}, {"F": "a.m."}], -"7a.m.": [{"F": "7"}, {"F": "a.m."}], -"8a.m.": [{"F": "8"}, {"F": "a.m."}], -"9a.m.": [{"F": "9"}, {"F": "a.m."}], -"10a.m.": [{"F": "10"}, {"F": "a.m."}], -"11a.m.": [{"F": "11"}, {"F": "a.m."}], -"12a.m.": [{"F": "12"}, {"F": "a.m."}], -"1am": [{"F": "1"}, {"F": "am", "L": "a.m."}], -"2am": [{"F": "2"}, {"F": "am", "L": "a.m."}], -"3am": [{"F": "3"}, {"F": "am", "L": "a.m."}], -"4am": [{"F": "4"}, {"F": "am", "L": "a.m."}], -"5am": [{"F": "5"}, {"F": "am", "L": "a.m."}], -"6am": [{"F": "6"}, {"F": "am", "L": "a.m."}], -"7am": [{"F": "7"}, {"F": "am", "L": "a.m."}], -"8am": [{"F": "8"}, {"F": "am", "L": "a.m."}], -"9am": [{"F": "9"}, {"F": "am", "L": "a.m."}], -"10am": [{"F": "10"}, {"F": "am", "L": "a.m."}], -"11am": [{"F": "11"}, {"F": "am", "L": "a.m."}], -"12am": [{"F": "12"}, {"F": "am", "L": "a.m."}], - - -"1p.m.": [{"F": "1"}, {"F": "p.m."}], -"2p.m.": [{"F": "2"}, {"F": "p.m."}], -"3p.m.": [{"F": "3"}, {"F": "p.m."}], -"4p.m.": [{"F": "4"}, {"F": "p.m."}], -"5p.m.": [{"F": "5"}, {"F": "p.m."}], -"6p.m.": [{"F": "6"}, {"F": "p.m."}], -"7p.m.": [{"F": "7"}, {"F": "p.m."}], -"8p.m.": [{"F": "8"}, {"F": "p.m."}], -"9p.m.": [{"F": "9"}, {"F": "p.m."}], -"10p.m.": [{"F": "10"}, {"F": "p.m."}], -"11p.m.": [{"F": "11"}, {"F": "p.m."}], -"12p.m.": [{"F": "12"}, {"F": "p.m."}], -"1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}], -"2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}], -"3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}], -"4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}], -"5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}], -"6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}], -"7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}], -"8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}], -"9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}], -"10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}], -"11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}], -"12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}], - -"Jan.": [{"F": "Jan.", "L": "Januar"}], -"Feb.": [{"F": "Feb.", "L": "Februar"}], -"Mär.": [{"F": "Mär.", "L": "März"}], -"Apr.": [{"F": "Apr.", "L": "April"}], -"Mai.": [{"F": "Mai.", "L": "Mai"}], -"Jun.": [{"F": "Jun.", "L": "Juni"}], -"Jul.": [{"F": "Jul.", "L": "Juli"}], -"Aug.": [{"F": "Aug.", "L": "August"}], -"Sep.": [{"F": "Sep.", "L": "September"}], -"Sept.": [{"F": "Sept.", "L": "September"}], -"Okt.": [{"F": "Okt.", "L": "Oktober"}], -"Nov.": [{"F": "Nov.", "L": "November"}], -"Dez.": [{"F": "Dez.", "L": "Dezember"}], - -":)": [{"F": ":)"}], -"<3": [{"F": "<3"}], -";)": [{"F": ";)"}], -"(:": [{"F": "(:"}], -":(": [{"F": ":("}], -"-_-": [{"F": "-_-"}], -"=)": [{"F": "=)"}], -":/": [{"F": ":/"}], -":>": [{"F": ":>"}], -";-)": [{"F": ";-)"}], -":Y": [{"F": ":Y"}], -":P": [{"F": ":P"}], -":-P": [{"F": ":-P"}], -":3": [{"F": ":3"}], -"=3": [{"F": "=3"}], -"xD": [{"F": "xD"}], -"^_^": [{"F": "^_^"}], -"=]": [{"F": "=]"}], -"=D": [{"F": "=D"}], -"<333": [{"F": "<333"}], -":))": [{"F": ":))"}], -":0": [{"F": ":0"}], -"-__-": [{"F": "-__-"}], -"xDD": [{"F": "xDD"}], -"o_o": [{"F": "o_o"}], -"o_O": [{"F": "o_O"}], -"V_V": [{"F": "V_V"}], -"=[[": [{"F": "=[["}], -"<33": [{"F": "<33"}], -";p": [{"F": ";p"}], -";D": [{"F": ";D"}], -";-p": [{"F": ";-p"}], -";(": [{"F": ";("}], -":p": [{"F": ":p"}], -":]": [{"F": ":]"}], -":O": [{"F": ":O"}], -":-/": [{"F": ":-/"}], -":-)": [{"F": ":-)"}], -":(((": [{"F": ":((("}], -":((": [{"F": ":(("}], -":')": [{"F": ":')"}], -"(^_^)": [{"F": "(^_^)"}], -"(=": [{"F": "(="}], -"o.O": [{"F": "o.O"}], -"\")": [{"F": "\")"}], -"a.": [{"F": "a."}], -"b.": [{"F": "b."}], -"c.": [{"F": "c."}], -"d.": [{"F": "d."}], -"e.": [{"F": "e."}], -"f.": [{"F": "f."}], -"g.": [{"F": "g."}], -"h.": [{"F": "h."}], -"i.": [{"F": "i."}], -"j.": [{"F": "j."}], -"k.": [{"F": "k."}], -"l.": [{"F": "l."}], -"m.": [{"F": "m."}], -"n.": [{"F": "n."}], -"o.": [{"F": "o."}], -"p.": [{"F": "p."}], -"q.": [{"F": "q."}], -"s.": [{"F": "s."}], -"t.": [{"F": "t."}], -"u.": [{"F": "u."}], -"v.": [{"F": "v."}], -"w.": [{"F": "w."}], -"x.": [{"F": "x."}], -"y.": [{"F": "y."}], -"z.": [{"F": "z."}], - -"z.b.": [{"F": "z.b."}], -"e.h.": [{"F": "I.e."}], -"o.ä.": [{"F": "I.E."}], -"bzw.": [{"F": "bzw."}], -"usw.": [{"F": "usw."}], -"\n": [{"F": "\n", "pos": "SP"}], -"\t": [{"F": "\t", "pos": "SP"}], -" ": [{"F": " ", "pos": "SP"}] -} + "\t": [ + { + "F": "\t", + "pos": "SP" + } + ], + "\n": [ + { + "F": "\n", + "pos": "SP" + } + ], + " ": [ + { + "F": " ", + "pos": "SP" + } + ], + "\")": [ + { + "F": "\")" + } + ], + "''": [ + { + "F": "''" + } + ], + "'S": [ + { + "F": "'S", + "L": "es" + } + ], + "'n": [ + { + "F": "'n", + "L": "ein" + } + ], + "'ne": [ + { + "F": "'ne", + "L": "eine" + } + ], + "'nen": [ + { + "F": "'nen", + "L": "einen" + } + ], + "'s": [ + { + "F": "'s", + "L": "es" + } + ], + "(:": [ + { + "F": "(:" + } + ], + "(=": [ + { + "F": "(=" + } + ], + "(^_^)": [ + { + "F": "(^_^)" + } + ], + "-_-": [ + { + "F": "-_-" + } + ], + "-__-": [ + { + "F": "-__-" + } + ], + ":')": [ + { + "F": ":')" + } + ], + ":(": [ + { + "F": ":(" + } + ], + ":((": [ + { + "F": ":((" + } + ], + ":(((": [ + { + "F": ":(((" + } + ], + ":)": [ + { + "F": ":)" + } + ], + ":))": [ + { + "F": ":))" + } + ], + ":-)": [ + { + "F": ":-)" + } + ], + ":-/": [ + { + "F": ":-/" + } + ], + ":-P": [ + { + "F": ":-P" + } + ], + ":/": [ + { + "F": ":/" + } + ], + ":0": [ + { + "F": ":0" + } + ], + ":3": [ + { + "F": ":3" + } + ], + ":>": [ + { + "F": ":>" + } + ], + ":O": [ + { + "F": ":O" + } + ], + ":P": [ + { + "F": ":P" + } + ], + ":Y": [ + { + "F": ":Y" + } + ], + ":]": [ + { + "F": ":]" + } + ], + ":p": [ + { + "F": ":p" + } + ], + ";(": [ + { + "F": ";(" + } + ], + ";)": [ + { + "F": ";)" + } + ], + ";-)": [ + { + "F": ";-)" + } + ], + ";-p": [ + { + "F": ";-p" + } + ], + ";D": [ + { + "F": ";D" + } + ], + ";p": [ + { + "F": ";p" + } + ], + "<3": [ + { + "F": "<3" + } + ], + "<33": [ + { + "F": "<33" + } + ], + "<333": [ + { + "F": "<333" + } + ], + "=)": [ + { + "F": "=)" + } + ], + "=3": [ + { + "F": "=3" + } + ], + "=D": [ + { + "F": "=D" + } + ], + "=[[": [ + { + "F": "=[[" + } + ], + "=]": [ + { + "F": "=]" + } + ], + "A.C.": [ + { + "F": "A.C." + } + ], + "A.D.": [ + { + "F": "A.D." + } + ], + "A.G.": [ + { + "F": "A.G." + } + ], + "Abb.": [ + { + "F": "Abb." + } + ], + "Abk.": [ + { + "F": "Abk." + } + ], + "Apr.": [ + { + "F": "Apr." + } + ], + "Aug.": [ + { + "F": "Aug." + } + ], + "Bf.": [ + { + "F": "Bf." + } + ], + "Bhf.": [ + { + "F": "Bhf." + } + ], + "Biol.": [ + { + "F": "Biol." + } + ], + "Chr.": [ + { + "F": "Chr." + } + ], + "Cie.": [ + { + "F": "Cie." + } + ], + "Co.": [ + { + "F": "Co." + } + ], + "D.C.": [ + { + "F": "D.C." + } + ], + "Dez.": [ + { + "F": "Dez." + } + ], + "Di.": [ + { + "F": "Di." + } + ], + "Dipl.": [ + { + "F": "Dipl." + } + ], + "Dipl.-Ing.": [ + { + "F": "Dipl.-Ing." + } + ], + "Do.": [ + { + "F": "Do." + } + ], + "Dr.": [ + { + "F": "Dr." + } + ], + "Feb.": [ + { + "F": "Feb." + } + ], + "Fr.": [ + { + "F": "Fr." + } + ], + "Frl.": [ + { + "F": "Frl." + } + ], + "G.m.b.H.": [ + { + "F": "G.m.b.H." + } + ], + "Gebr.": [ + { + "F": "Gebr." + } + ], + "Hbf.": [ + { + "F": "Hbf." + } + ], + "Hg.": [ + { + "F": "Hg." + } + ], + "Hr.": [ + { + "F": "Hr." + } + ], + "Hrgs.": [ + { + "F": "Hrgs." + } + ], + "Ing.": [ + { + "F": "Ing." + } + ], + "Jan.": [ + { + "F": "Jan." + } + ], + "Jhd.": [ + { + "F": "Jhd." + } + ], + "Jul.": [ + { + "F": "Jul." + } + ], + "Jun.": [ + { + "F": "Jun." + } + ], + "Mi.": [ + { + "F": "Mi." + } + ], + "Mo.": [ + { + "F": "Mo." + } + ], + "Mrz.": [ + { + "F": "Mrz." + } + ], + "M\u00e4r.": [ + { + "F": "M\u00e4r." + } + ], + "N.Y.": [ + { + "F": "N.Y." + } + ], + "N.Y.C.": [ + { + "F": "N.Y.C." + } + ], + "Nov.": [ + { + "F": "Nov." + } + ], + "Nr.": [ + { + "F": "Nr." + } + ], + "Okt.": [ + { + "F": "Okt." + } + ], + "P.S.": [ + { + "F": "P.S." + } + ], + "Prof.": [ + { + "F": "Prof." + } + ], + "R.I.P.": [ + { + "F": "R.I.P." + } + ], + "S'": [ + { + "F": "S'", + "L": "sie" + } + ], + "Sa.": [ + { + "F": "Sa." + } + ], + "Sep.": [ + { + "F": "Sep." + } + ], + "Sept.": [ + { + "F": "Sept." + } + ], + "So.": [ + { + "F": "So." + } + ], + "St.": [ + { + "F": "St." + } + ], + "Univ.": [ + { + "F": "Univ." + } + ], + "V_V": [ + { + "F": "V_V" + } + ], + "^_^": [ + { + "F": "^_^" + } + ], + "a.": [ + { + "F": "a." + } + ], + "a.D.": [ + { + "F": "a.D." + } + ], + "a.Z.": [ + { + "F": "a.Z." + } + ], + "abzgl.": [ + { + "F": "abzgl." + } + ], + "adv.": [ + { + "F": "adv." + } + ], + "al.": [ + { + "F": "al." + } + ], + "auf'm": [ + { + "F": "auf" + }, + { + "F": "'m", + "L": "dem" + } + ], + "b.": [ + { + "F": "b." + } + ], + "betr.": [ + { + "F": "betr." + } + ], + "biol.": [ + { + "F": "biol." + } + ], + "c.": [ + { + "F": "c." + } + ], + "co.": [ + { + "F": "co." + } + ], + "d.": [ + { + "F": "d." + } + ], + "du's": [ + { + "F": "du" + }, + { + "F": "'s", + "L": "es" + } + ], + "e.": [ + { + "F": "e." + } + ], + "e.V.": [ + { + "F": "e.V." + } + ], + "e.g.": [ + { + "F": "e.g." + } + ], + "ehem.": [ + { + "F": "ehem." + } + ], + "engl.": [ + { + "F": "engl." + } + ], + "er's": [ + { + "F": "er" + }, + { + "F": "'s", + "L": "es" + } + ], + "erm.": [ + { + "F": "erm." + } + ], + "etc.": [ + { + "F": "etc." + } + ], + "f.": [ + { + "F": "f." + } + ], + "g.": [ + { + "F": "g." + } + ], + "geb.": [ + { + "F": "geb." + } + ], + "gegr.": [ + { + "F": "gegr." + } + ], + "ggf.": [ + { + "F": "ggf." + } + ], + "ggfs.": [ + { + "F": "ggfs." + } + ], + "gg\u00fc.": [ + { + "F": "gg\u00fc." + } + ], + "h.": [ + { + "F": "h." + } + ], + "h.c.": [ + { + "F": "h.c." + } + ], + "hinter'm": [ + { + "F": "hinter" + }, + { + "F": "'m", + "L": "dem" + } + ], + "i.": [ + { + "F": "i." + } + ], + "i.G.": [ + { + "F": "i.G." + } + ], + "i.Tr.": [ + { + "F": "i.Tr." + } + ], + "i.V.": [ + { + "F": "i.V." + } + ], + "i.e.": [ + { + "F": "i.e." + } + ], + "ich's": [ + { + "F": "ich" + }, + { + "F": "'s", + "L": "es" + } + ], + "ihr's": [ + { + "F": "ihr" + }, + { + "F": "'s", + "L": "es" + } + ], + "incl.": [ + { + "F": "incl." + } + ], + "inkl.": [ + { + "F": "inkl." + } + ], + "insb.": [ + { + "F": "insb." + } + ], + "j.": [ + { + "F": "j." + } + ], + "jun.": [ + { + "F": "jun." + } + ], + "jur.": [ + { + "F": "jur." + } + ], + "k.": [ + { + "F": "k." + } + ], + "l.": [ + { + "F": "l." + } + ], + "lt.": [ + { + "F": "lt." + } + ], + "m.": [ + { + "F": "m." + } + ], + "n.": [ + { + "F": "n." + } + ], + "nat.": [ + { + "F": "nat." + } + ], + "o.": [ + { + "F": "o." + } + ], + "o.O": [ + { + "F": "o.O" + } + ], + "o_O": [ + { + "F": "o_O" + } + ], + "o_o": [ + { + "F": "o_o" + } + ], + "p.": [ + { + "F": "p." + } + ], + "p.a.": [ + { + "F": "p.a." + } + ], + "q.": [ + { + "F": "q." + } + ], + "q.e.d.": [ + { + "F": "q.e.d." + } + ], + "r.": [ + { + "F": "r." + } + ], + "rer.": [ + { + "F": "rer." + } + ], + "s'": [ + { + "F": "s'", + "L": "sie" + } + ], + "s.": [ + { + "F": "s." + } + ], + "sen.": [ + { + "F": "sen." + } + ], + "sie's": [ + { + "F": "sie" + }, + { + "F": "'s", + "L": "es" + } + ], + "t.": [ + { + "F": "t." + } + ], + "u.": [ + { + "F": "u." + } + ], + "u.a.": [ + { + "F": "u.a." + } + ], + "u.s.w.": [ + { + "F": "u.s.w." + } + ], + "unter'm": [ + { + "F": "unter" + }, + { + "F": "'m", + "L": "dem" + } + ], + "v.": [ + { + "F": "v." + } + ], + "vgl.": [ + { + "F": "vgl." + } + ], + "vor'm": [ + { + "F": "vor" + }, + { + "F": "'m", + "L": "dem" + } + ], + "vs.": [ + { + "F": "vs." + } + ], + "w.": [ + { + "F": "w." + } + ], + "wir's": [ + { + "F": "wir" + }, + { + "F": "'s", + "L": "es" + } + ], + "x.": [ + { + "F": "x." + } + ], + "xD": [ + { + "F": "xD" + } + ], + "xDD": [ + { + "F": "xDD" + } + ], + "y.": [ + { + "F": "y." + } + ], + "z.": [ + { + "F": "z." + } + ], + "z.B.": [ + { + "F": "z.B." + } + ], + "z.Z.": [ + { + "F": "z.Z." + } + ], + "zzgl.": [ + { + "F": "zzgl." + } + ], + "\u00fcber'm": [ + { + "F": "\u00fcber" + }, + { + "F": "'m", + "L": "dem" + } + ] +} \ No newline at end of file From 911de2ae49ce966b2332ee4888986a4d2da8ce6b Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Tue, 22 Sep 2015 12:29:47 +0200 Subject: [PATCH 2/2] add overseen (?) char --- lang_data/en/generate_specials.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lang_data/en/generate_specials.py b/lang_data/en/generate_specials.py index d86def177..1a8f1ae0b 100644 --- a/lang_data/en/generate_specials.py +++ b/lang_data/en/generate_specials.py @@ -318,6 +318,7 @@ hardcoded_specials = { "o.": [{"F": "o."}], "p.": [{"F": "p."}], "q.": [{"F": "q."}], + "r.": [{"F": "r."}], "s.": [{"F": "s."}], "t.": [{"F": "t."}], "u.": [{"F": "u."}],