mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 13:11:03 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			314 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			314 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # coding=utf8
 | ||
| import json
 | ||
| 
 | ||
| contractions = {}
 | ||
| 
 | ||
| # contains the lemmas, parts of speech, number, and tenspect of
 | ||
| # potential tokens generated after splitting contractions off
 | ||
| token_properties = {}
 | ||
| 
 | ||
| # contains starting tokens with their potential contractions
 | ||
| # each potential contraction has a list of exceptions
 | ||
|     # lower - don't generate the lowercase version
 | ||
|     # upper - don't generate the uppercase version
 | ||
|     # contrLower - don't generate the lowercase version with apostrophe (') removed
 | ||
|     # contrUpper - dont' generate the uppercase version with apostrophe (') removed
 | ||
| # for example, we don't want to create the word "hell" or "Hell" from "he" + "'ll" so 
 | ||
| # we add "contrLower" and "contrUpper" to the exceptions list
 | ||
| starting_tokens = {}
 | ||
| 
 | ||
| # other specials that don't really have contractions
 | ||
| # so they are hardcoded
 | ||
| hardcoded_specials = {
 | ||
|                 "''": [{"F": "''"}],
 | ||
|                 "\")": [{"F": "\")"}],
 | ||
|                 "\n": [{"F": "\n", "pos": "SP"}],
 | ||
|                 "\t": [{"F": "\t", "pos": "SP"}],
 | ||
|                 " ": [{"F": " ", "pos": "SP"}],
 | ||
| 
 | ||
|                 # example: Wie geht's?
 | ||
|                 "'s":  [{"F": "'s", "L": "es"}],
 | ||
|                 "'S":  [{"F": "'S", "L": "es"}],
 | ||
| 
 | ||
|                 # example: Haste mal 'nen Euro?
 | ||
|                 "'n":  [{"F": "'n", "L": "ein"}],
 | ||
|                 "'ne":  [{"F": "'ne", "L": "eine"}],
 | ||
|                 "'nen":  [{"F": "'nen", "L": "einen"}],
 | ||
| 
 | ||
|                 # example: Kommen S’ nur herein!
 | ||
|                 "s'":  [{"F": "s'", "L": "sie"}],
 | ||
|                 "S'":  [{"F": "S'", "L": "sie"}],
 | ||
| 
 | ||
|                 # example: Da haben wir's!
 | ||
|                 "ich's":  [{"F": "ich"}, {"F": "'s", "L": "es"}],
 | ||
|                 "du's":  [{"F": "du"}, {"F": "'s", "L": "es"}],
 | ||
|                 "er's":  [{"F": "er"}, {"F": "'s", "L": "es"}],
 | ||
|                 "sie's":  [{"F": "sie"}, {"F": "'s", "L": "es"}],
 | ||
|                 "wir's":  [{"F": "wir"}, {"F": "'s", "L": "es"}],
 | ||
|                 "ihr's":  [{"F": "ihr"}, {"F": "'s", "L": "es"}],
 | ||
| 
 | ||
|                 # example: Die katze auf'm dach.
 | ||
|                 "auf'm":  [{"F": "auf"}, {"F": "'m", "L": "dem"}],
 | ||
|                 "unter'm":  [{"F": "unter"}, {"F": "'m", "L": "dem"}],
 | ||
|                 "über'm":  [{"F": "über"}, {"F": "'m", "L": "dem"}],
 | ||
|                 "vor'm":  [{"F": "vor"}, {"F": "'m", "L": "dem"}],
 | ||
|                 "hinter'm":  [{"F": "hinter"}, {"F": "'m", "L": "dem"}],
 | ||
| 
 | ||
|                 # persons
 | ||
|                 "Fr.": [{"F": "Fr."}],
 | ||
|                 "Hr.": [{"F": "Hr."}],
 | ||
|                 "Frl.": [{"F": "Frl."}],
 | ||
|                 "Prof.": [{"F": "Prof."}],
 | ||
|                 "Dr.": [{"F": "Dr."}],
 | ||
|                 "St.": [{"F": "St."}],
 | ||
|                 "Hrgs.": [{"F": "Hrgs."}],
 | ||
|                 "Hg.": [{"F": "Hg."}],
 | ||
|                 "a.Z.": [{"F": "a.Z."}],
 | ||
|                 "a.D.": [{"F": "a.D."}],
 | ||
|                 "A.D.": [{"F": "A.D."}],
 | ||
|                 "h.c.": [{"F": "h.c."}],
 | ||
|                 "jun.": [{"F": "jun."}],
 | ||
|                 "sen.": [{"F": "sen."}],
 | ||
|                 "rer.": [{"F": "rer."}],
 | ||
|                 "Dipl.": [{"F": "Dipl."}],
 | ||
|                 "Ing.": [{"F": "Ing."}],
 | ||
|                 "Dipl.-Ing.": [{"F": "Dipl.-Ing."}],
 | ||
| 
 | ||
|                 # companies
 | ||
|                 "Co.": [{"F": "Co."}],
 | ||
|                 "co.": [{"F": "co."}],
 | ||
|                 "Cie.": [{"F": "Cie."}],
 | ||
|                 "A.G.": [{"F": "A.G."}],
 | ||
|                 "G.m.b.H.": [{"F": "G.m.b.H."}],
 | ||
|                 "i.G.": [{"F": "i.G."}],
 | ||
|                 "e.V.": [{"F": "e.V."}],
 | ||
| 
 | ||
|                 # popular german abbreviations
 | ||
|                 "ggü.": [{"F": "ggü."}],
 | ||
|                 "ggf.": [{"F": "ggf."}],
 | ||
|                 "ggfs.": [{"F": "ggfs."}],
 | ||
|                 "Gebr.": [{"F": "Gebr."}],
 | ||
|                 "geb.": [{"F": "geb."}],
 | ||
|                 "gegr.": [{"F": "gegr."}],
 | ||
|                 "erm.": [{"F": "erm."}],
 | ||
|                 "engl.": [{"F": "engl."}],
 | ||
|                 "ehem.": [{"F": "ehem."}],
 | ||
|                 "Biol.": [{"F": "Biol."}],
 | ||
|                 "biol.": [{"F": "biol."}],
 | ||
|                 "Abk.": [{"F": "Abk."}],
 | ||
|                 "Abb.": [{"F": "Abb."}],
 | ||
|                 "abzgl.": [{"F": "abzgl."}],
 | ||
|                 "Hbf.": [{"F": "Hbf."}],
 | ||
|                 "Bhf.": [{"F": "Bhf."}],
 | ||
|                 "Bf.": [{"F": "Bf."}],
 | ||
|                 "i.V.": [{"F": "i.V."}],
 | ||
|                 "inkl.": [{"F": "inkl."}],
 | ||
|                 "insb.": [{"F": "insb."}],
 | ||
|                 "z.B.": [{"F": "z.B."}],
 | ||
|                 "i.Tr.": [{"F": "i.Tr."}],
 | ||
|                 "Jhd.": [{"F": "Jhd."}],
 | ||
|                 "jur.": [{"F": "jur."}],
 | ||
|                 "lt.": [{"F": "lt."}],
 | ||
|                 "nat.": [{"F": "nat."}],
 | ||
|                 "u.a.": [{"F": "u.a."}],
 | ||
|                 "u.s.w.": [{"F": "u.s.w."}],
 | ||
|                 "Nr.": [{"F": "Nr."}],
 | ||
|                 "Univ.": [{"F": "Univ."}],
 | ||
|                 "vgl.": [{"F": "vgl."}],
 | ||
|                 "zzgl.": [{"F": "zzgl."}],
 | ||
|                 "z.Z.": [{"F": "z.Z."}],
 | ||
|                 "betr.": [{"F": "betr."}],
 | ||
|                 "ehem.": [{"F": "ehem."}],
 | ||
| 
 | ||
|                 # popular latin abbreviations
 | ||
|                 "vs.": [{"F": "vs."}],
 | ||
|                 "adv.": [{"F": "adv."}],
 | ||
|                 "Chr.": [{"F": "Chr."}],
 | ||
|                 "A.C.": [{"F": "A.C."}],
 | ||
|                 "A.D.": [{"F": "A.D."}],
 | ||
|                 "e.g.": [{"F": "e.g."}],
 | ||
|                 "i.e.": [{"F": "i.e."}],
 | ||
|                 "al.": [{"F": "al."}],
 | ||
|                 "p.a.": [{"F": "p.a."}],
 | ||
|                 "P.S.": [{"F": "P.S."}],
 | ||
|                 "q.e.d.": [{"F": "q.e.d."}],
 | ||
|                 "R.I.P.": [{"F": "R.I.P."}],
 | ||
|                 "etc.": [{"F": "etc."}],
 | ||
|                 "incl.": [{"F": "incl."}],
 | ||
| 
 | ||
|                 # popular english abbreviations
 | ||
|                 "D.C.": [{"F": "D.C."}],
 | ||
|                 "N.Y.": [{"F": "N.Y."}],
 | ||
|                 "N.Y.C.": [{"F": "N.Y.C."}],
 | ||
| 
 | ||
|                 # dates
 | ||
|                 "Jan.": [{"F": "Jan."}],
 | ||
|                 "Feb.": [{"F": "Feb."}],
 | ||
|                 "Mrz.": [{"F": "Mrz."}],
 | ||
|                 "Mär.": [{"F": "Mär."}],
 | ||
|                 "Apr.": [{"F": "Apr."}],
 | ||
|                 "Jun.": [{"F": "Jun."}],
 | ||
|                 "Jul.": [{"F": "Jul."}],
 | ||
|                 "Aug.": [{"F": "Aug."}],
 | ||
|                 "Sep.": [{"F": "Sep."}],
 | ||
|                 "Sept.": [{"F": "Sept."}],
 | ||
|                 "Okt.": [{"F": "Okt."}],
 | ||
|                 "Nov.": [{"F": "Nov."}],
 | ||
|                 "Dez.": [{"F": "Dez."}],
 | ||
|                 "Mo.": [{"F": "Mo."}],
 | ||
|                 "Di.": [{"F": "Di."}],
 | ||
|                 "Mi.": [{"F": "Mi."}],
 | ||
|                 "Do.": [{"F": "Do."}],
 | ||
|                 "Fr.": [{"F": "Fr."}],
 | ||
|                 "Sa.": [{"F": "Sa."}],
 | ||
|                 "So.": [{"F": "So."}],
 | ||
| 
 | ||
|                 # smileys
 | ||
|                 ":)":    [{"F": ":)"}],
 | ||
|                 "<3":    [{"F": "<3"}],
 | ||
|                 ";)":    [{"F": ";)"}],
 | ||
|                 "(:":    [{"F": "(:"}],
 | ||
|                 ":(":    [{"F": ":("}],
 | ||
|                 "-_-":   [{"F": "-_-"}],
 | ||
|                 "=)":    [{"F": "=)"}],
 | ||
|                 ":/":    [{"F": ":/"}],
 | ||
|                 ":>":    [{"F": ":>"}],
 | ||
|                 ";-)":   [{"F": ";-)"}],
 | ||
|                 ":Y":    [{"F": ":Y"}],
 | ||
|                 ":P":    [{"F": ":P"}],
 | ||
|                 ":-P":   [{"F": ":-P"}],
 | ||
|                 ":3":    [{"F": ":3"}],
 | ||
|                 "=3":    [{"F": "=3"}],
 | ||
|                 "xD":    [{"F": "xD"}],
 | ||
|                 "^_^":   [{"F": "^_^"}],
 | ||
|                 "=]":    [{"F": "=]"}],
 | ||
|                 "=D":    [{"F": "=D"}],
 | ||
|                 "<333":  [{"F": "<333"}],
 | ||
|                 ":))":   [{"F": ":))"}],
 | ||
|                 ":0":    [{"F": ":0"}],
 | ||
|                 "-__-":  [{"F": "-__-"}],
 | ||
|                 "xDD":   [{"F": "xDD"}],
 | ||
|                 "o_o":   [{"F": "o_o"}],
 | ||
|                 "o_O":   [{"F": "o_O"}],
 | ||
|                 "V_V":   [{"F": "V_V"}],
 | ||
|                 "=[[":   [{"F": "=[["}],
 | ||
|                 "<33":   [{"F": "<33"}],
 | ||
|                 ";p":    [{"F": ";p"}],
 | ||
|                 ";D":    [{"F": ";D"}],
 | ||
|                 ";-p":   [{"F": ";-p"}],
 | ||
|                 ";(":    [{"F": ";("}],
 | ||
|                 ":p":    [{"F": ":p"}],
 | ||
|                 ":]":    [{"F": ":]"}],
 | ||
|                 ":O":    [{"F": ":O"}],
 | ||
|                 ":-/":   [{"F": ":-/"}],
 | ||
|                 ":-)":   [{"F": ":-)"}],
 | ||
|                 ":(((":  [{"F": ":((("}],
 | ||
|                 ":((":   [{"F": ":(("}],
 | ||
|                 ":')":   [{"F": ":')"}],
 | ||
|                 "(^_^)": [{"F": "(^_^)"}],
 | ||
|                 "(=":    [{"F": "(="}],
 | ||
|                 "o.O":   [{"F": "o.O"}],
 | ||
| 
 | ||
|                 "a.": [{"F": "a."}],
 | ||
|                 "b.": [{"F": "b."}],
 | ||
|                 "c.": [{"F": "c."}],
 | ||
|                 "d.": [{"F": "d."}],
 | ||
|                 "e.": [{"F": "e."}],
 | ||
|                 "f.": [{"F": "f."}],
 | ||
|                 "g.": [{"F": "g."}],
 | ||
|                 "h.": [{"F": "h."}],
 | ||
|                 "i.": [{"F": "i."}],
 | ||
|                 "j.": [{"F": "j."}],
 | ||
|                 "k.": [{"F": "k."}],
 | ||
|                 "l.": [{"F": "l."}],
 | ||
|                 "m.": [{"F": "m."}],
 | ||
|                 "n.": [{"F": "n."}],
 | ||
|                 "o.": [{"F": "o."}],
 | ||
|                 "p.": [{"F": "p."}],
 | ||
|                 "q.": [{"F": "q."}],
 | ||
|                 "r.": [{"F": "r."}],
 | ||
|                 "s.": [{"F": "s."}],
 | ||
|                 "t.": [{"F": "t."}],
 | ||
|                 "u.": [{"F": "u."}],
 | ||
|                 "v.": [{"F": "v."}],
 | ||
|                 "w.": [{"F": "w."}],
 | ||
|                 "x.": [{"F": "x."}],
 | ||
|                 "y.": [{"F": "y."}],
 | ||
|                 "z.": [{"F": "z."}],
 | ||
| }
 | ||
| 
 | ||
| def get_double_contractions(ending):
 | ||
|     endings = []
 | ||
| 
 | ||
|     ends_with_contraction = any([ending.endswith(contraction) for contraction in contractions])
 | ||
| 
 | ||
|     while ends_with_contraction:
 | ||
|         for contraction in contractions:
 | ||
|             if ending.endswith(contraction):
 | ||
|                 endings.append(contraction)
 | ||
|                 ending = ending.rstrip(contraction)
 | ||
|         ends_with_contraction = any([ending.endswith(contraction) for contraction in contractions])
 | ||
| 
 | ||
|     endings.reverse() # reverse because the last ending is put in the list first
 | ||
|     return endings
 | ||
| 
 | ||
| def get_token_properties(token, capitalize=False, remove_contractions=False):
 | ||
|     props = dict(token_properties.get(token)) # ensure we copy the dict so we can add the "F" prop
 | ||
|     if capitalize:
 | ||
|         token = token.capitalize()
 | ||
|     if remove_contractions:
 | ||
|         token = token.replace("'", "")
 | ||
| 
 | ||
|     props["F"] = token
 | ||
|     return props
 | ||
| 
 | ||
| def create_entry(token, endings, capitalize=False, remove_contractions=False):
 | ||
|     
 | ||
|     properties = []
 | ||
|     properties.append(get_token_properties(token, capitalize=capitalize, remove_contractions=remove_contractions))
 | ||
|     for e in endings:
 | ||
|         properties.append(get_token_properties(e, remove_contractions=remove_contractions))
 | ||
|     return properties
 | ||
| 
 | ||
| def generate_specials():
 | ||
| 
 | ||
|     specials = {}
 | ||
| 
 | ||
|     for token in starting_tokens:
 | ||
|         possible_endings = starting_tokens[token]
 | ||
|         for ending in possible_endings:
 | ||
| 
 | ||
|             endings = []
 | ||
|             if ending.count("'") > 1:
 | ||
|                 endings.extend(get_double_contractions(ending))
 | ||
|             else:
 | ||
|                 endings.append(ending)
 | ||
| 
 | ||
|             exceptions = possible_endings[ending]
 | ||
| 
 | ||
|             if "lower" not in exceptions:
 | ||
|                 special = token + ending
 | ||
|                 specials[special] = create_entry(token, endings)
 | ||
| 
 | ||
|             if "upper" not in exceptions:
 | ||
|                 special = token.capitalize() + ending
 | ||
|                 specials[special] = create_entry(token, endings, capitalize=True)
 | ||
| 
 | ||
|             if "contrLower" not in exceptions:
 | ||
|                 special = token + ending.replace("'", "")
 | ||
|                 specials[special] = create_entry(token, endings, remove_contractions=True)
 | ||
| 
 | ||
|             if "contrUpper" not in exceptions:
 | ||
|                 special = token.capitalize() + ending.replace("'", "")
 | ||
|                 specials[special] = create_entry(token, endings, capitalize=True, remove_contractions=True)
 | ||
| 
 | ||
|     # add in hardcoded specials
 | ||
|     specials = dict(specials, **hardcoded_specials)
 | ||
| 
 | ||
|     return specials
 | ||
| 
 | ||
| if __name__ == "__main__":
 | ||
|     specials = generate_specials()
 | ||
|     with open("specials.json", "w") as f:
 | ||
|         json.dump(specials, f, sort_keys=True, indent=4, separators=(',', ': '))
 |