Use global abbreviation data in languages and remove duplicates

Ines Montani 2017-01-08 20:36:00 +01:00
parent 7c3cb2a652
commit 0dec90e9f7
13 changed files with 35 additions and 124 deletions
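
For orientation, every per-language diff below follows the same pattern: the module builds a TOKENIZER_EXCEPTIONS dict and merges shared data from the package-level language_data module (base.EMOTICONS and the new base.ABBREVIATIONS) into it via update_exc(strings_to_exc(...)). The sketch below is only a rough approximation of how those helpers and the shared list might look; the helper bodies, the ORTH placeholder and the example entries are assumptions, not the actual spaCy implementation.

# Sketch only: assumed shapes of the helpers used throughout this commit.
ORTH = "orth"  # assumption: stand-in for the real ORTH attribute

# assumption: shared abbreviation data now defined once in language_data
ABBREVIATIONS = ["a.", "b.", "c.", "d.", "e.", "z."]

def strings_to_exc(strings):
    # Map each string to a one-token exception that keeps the original text.
    return {string: [{ORTH: string}] for string in strings}

def update_exc(exc, additions):
    # Merge the extra exceptions into the existing dict in place
    # (assumed dict.update semantics).
    exc.update(additions)

# Usage mirroring the per-language __init__ modules in this diff:
TOKENIZER_EXCEPTIONS = {}
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS))
assert "a." in TOKENIZER_EXCEPTIONS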

View File

@@ -9,12 +9,13 @@ from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 TAG_MAP = dict(TAG_MAP)
 STOP_WORDS = set(STOP_WORDS)
+TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))

View File

@@ -516,11 +516,6 @@ TOKENIZER_EXCEPTIONS = {
 ORTH_ONLY = [
-"'",
-"\\\")",
-"<space>",
-"a.",
-"ä.",
 "A.C.",
 "a.D.",
 "A.D.",
@@ -530,24 +525,20 @@ ORTH_ONLY = [
 "Abs.",
 "adv.",
 "al.",
-"b.",
 "B.A.",
 "B.Sc.",
 "betr.",
 "biol.",
 "Biol.",
-"c.",
 "ca.",
 "Chr.",
 "Cie.",
 "co.",
 "Co.",
-"d.",
 "D.C.",
 "Dipl.-Ing.",
 "Dipl.",
 "Dr.",
-"e.",
 "e.g.",
 "e.V.",
 "ehem.",
@@ -555,79 +546,57 @@ ORTH_ONLY = [
 "erm.",
 "etc.",
 "ev.",
-"f.",
-"g.",
 "G.m.b.H.",
 "geb.",
 "Gebr.",
 "gem.",
-"h.",
 "h.c.",
 "Hg.",
 "hrsg.",
 "Hrsg.",
-"i.",
 "i.A.",
 "i.e.",
 "i.G.",
 "i.Tr.",
 "i.V.",
 "Ing.",
-"j.",
 "jr.",
 "Jr.",
 "jun.",
 "jur.",
-"k.",
 "K.O.",
-"l.",
 "L.A.",
 "lat.",
-"m.",
 "M.A.",
 "m.E.",
 "m.M.",
 "M.Sc.",
 "Mr.",
-"n.",
 "N.Y.",
 "N.Y.C.",
 "nat.",
 "ö."
-"o.",
 "o.a.",
 "o.ä.",
 "o.g.",
 "o.k.",
 "O.K.",
-"p.",
 "p.a.",
 "p.s.",
 "P.S.",
 "pers.",
 "phil.",
-"q.",
 "q.e.d.",
-"r.",
 "R.I.P.",
 "rer.",
-"s.",
 "sen.",
 "St.",
 "std.",
-"t.",
-"u.",
-"ü.",
 "u.a.",
 "U.S.",
 "U.S.A.",
 "U.S.S.",
-"v.",
 "Vol.",
 "vs.",
-"w.",
-"wiss.",
-"x.",
-"y.",
-"z."
+"wiss."
 ]

View File

@@ -37,14 +37,16 @@ def get_time_exc(hours):
 return exc
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 TAG_MAP = dict(TAG_MAP)
 STOP_WORDS = set(STOP_WORDS)
+TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
 update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
 update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", ""))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 __all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS", "LEMMA_RULES", "MORPH_RULES"]

View File

@@ -718,39 +718,25 @@ for string in EXCLUDE_EXC:
 ORTH_ONLY = [
 "'d",
-"''",
-"a.",
 "a.m.",
 "Adm.",
-"b.",
 "Bros.",
-"c.",
 "co.",
 "Co.",
 "Corp.",
-"d.",
 "D.C.",
 "Dr.",
-"e.",
 "e.g.",
 "E.g.",
 "E.G.",
-"f.",
-"g.",
 "Gen.",
 "Gov.",
-"h.",
-"i.",
 "i.e.",
 "I.e.",
 "I.E.",
 "Inc.",
-"j.",
 "Jr.",
-"k.",
-"l.",
 "Ltd.",
-"m.",
 "Md.",
 "Messrs.",
 "Mo.",
@@ -758,24 +744,11 @@ ORTH_ONLY = [
 "Mr.",
 "Mrs.",
 "Ms.",
-"n.",
-"o.",
-"p.",
 "p.m.",
 "Ph.D.",
-"q.",
-"r.",
 "Rep.",
 "Rev.",
-"s.",
 "Sen.",
 "St.",
-"t.",
-"u.",
-"v.",
-"vs.",
-"w.",
-"x.",
-"y.",
-"z."
+"vs."
 ]

View File

@@ -40,11 +40,14 @@ def get_time_exc(hours):
 return exc
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 STOP_WORDS = set(STOP_WORDS)
+TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
 update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

View File

@@ -85,55 +85,29 @@ TOKENIZER_EXCEPTIONS = {
 ORTH_ONLY = [
-"a.",
 "a.C.",
 "a.J.C.",
 "apdo.",
 "Av.",
 "Avda.",
-"b.",
-"c.",
 "Cía.",
-"d.",
-"e.",
 "etc.",
-"f.",
-"g.",
 "Gob.",
 "Gral.",
-"h.",
-"i.",
 "Ing.",
-"j.",
 "J.C.",
-"k.",
-"l.",
 "Lic.",
-"m.",
 "m.n.",
-"n.",
 "no.",
 "núm.",
-"o.",
-"p.",
 "P.D.",
 "Prof.",
 "Profa.",
-"q.",
 "q.e.p.d."
-"r.",
-"s.",
 "S.A.",
 "S.L.",
 "s.s.s.",
 "Sr.",
 "Sra.",
-"Srta.",
-"t.",
-"u.",
-"v.",
-"w.",
-"x.",
-"y.",
-"z."
+"Srta."
 ]

View File

@@ -2,13 +2,16 @@
 from __future__ import unicode_literals
 from .. import language_data as base
-from ..language_data import strings_to_exc
+from ..language_data import strings_to_exc, update_exc
 from .stop_words import STOP_WORDS
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 STOP_WORDS = set(STOP_WORDS)
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

View File

@@ -11,13 +11,14 @@ from .tokenizer_exceptions import OTHER_EXC
 from .. import language_data as base
 STOP_WORDS = set(STOP_WORDS)
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES + TOKENIZER_PREFIXES
 TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
 TOKENIZER_INFIXES = TOKENIZER_INFIXES
 # HYPHENS = [six.unichr(cp) for cp in [173, 8211, 8212, 8213, 8722, 9472]]
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(OTHER_EXC))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS))
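
The Hungarian module above now layers four update_exc() calls: the shared base.EMOTICONS and base.ABBREVIATIONS first, then the language-specific OTHER_EXC and ABBREVIATIONS on top. A self-contained toy sketch of that layering follows; the example data and the dict.update-style merge are assumptions, not the real spaCy data or helpers.

# Toy stand-ins for base.EMOTICONS, base.ABBREVIATIONS, OTHER_EXC and the
# Hungarian ABBREVIATIONS list; the merge order mirrors the diff above.
EMOTICONS = [":)", ":("]
BASE_ABBREVIATIONS = ["a.", "b."]
OTHER_EXC = ["-e"]
HU_ABBREVIATIONS = ["dr.", "kb."]

def strings_to_exc(strings):
    # One-token exception per string (assumed shape, see sketch near the top).
    return {s: [{"orth": s}] for s in strings}

exc = strings_to_exc(EMOTICONS)
for extra in (BASE_ABBREVIATIONS, OTHER_EXC, HU_ABBREVIATIONS):
    exc.update(strings_to_exc(extra))  # assumed dict.update-style merge

print(sorted(exc))  # ['-e', ':(', ':)', 'a.', 'b.', 'dr.', 'kb.']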

View File

@@ -111,7 +111,6 @@ Vcs.
 Vhr.
 X.Y.
 Zs.
-a.
 a.C.
 ac.
 adj.
@@ -126,7 +125,6 @@ ang.
 arch.
 at.
 aug.
-b.
 b.a.
 b.s.
 b.sc.
@@ -141,7 +139,6 @@ br.
 bsc.
 bt.
 btk.
-c.
 ca.
 cc.
 cca.
@@ -155,7 +152,6 @@ csc.
 csüt.
 cső.
 ctv.
-d.
 dbj.
 dd.
 ddr.
@@ -170,7 +166,6 @@ dolg.
 dr.
 du.
 dzs.
-e.
 ea.
 ed.
 eff.
@@ -186,7 +181,6 @@ etc.
 ev.
 ezr.
 .
-f.
 f.h.
 f.é.
 fam.
@@ -213,7 +207,6 @@ főig.
 főisk.
 főtörm.
 főv.
-g.
 gazd.
 gimn.
 gk.
@@ -225,7 +218,6 @@ gy.
 gyak.
 gyártm.
 gör.
-h.
 hads.
 hallg.
 hdm.
@@ -266,7 +258,6 @@ isk.
 ism.
 izr.
 .
-j.
 jan.
 jav.
 jegyz.
@@ -278,7 +269,6 @@ jr.
 jvb.
 júl.
 jún.
-k.
 karb.
 kat.
 kb.
@@ -313,7 +303,6 @@ közl.
 közp.
 közt.
 .
-l.
 lat.
 ld.
 legs.
@@ -324,7 +313,6 @@ lt.
 ltd.
 ltp.
 luth.
-m.
 m.a.
 m.s.
 m.sc.
@@ -359,7 +347,6 @@ műh.
 műsz.
 műv.
 művez.
-n.
 nagyker.
 nagys.
 nat.
@@ -372,7 +359,6 @@ ny.
 nyilv.
 nyrt.
 nyug.
-o.
 obj.
 okl.
 okt.
@@ -381,7 +367,6 @@ orsz.
 ort.
 ov.
 ovh.
-p.
 pf.
 pg.
 ph.d
@@ -404,8 +389,6 @@ pság.
 ptk.
 pu.
 .
-q.
-r.
 r.k.
 rac.
 rad.
@@ -420,7 +403,6 @@ rkt.
 rt.
 rtg.
 röv.
-s.
 s.b.
 s.k.
 sa.
@@ -450,7 +432,6 @@ szt.
 szubj.
 szöv.
 szül.
-t.
 tanm.
 tb.
 tbk.
@@ -476,13 +457,11 @@ tvr.
 ty.
 törv.
 .
-u.
 ua.
 ui.
 unit.
 uo.
 uv.
-v.
 vas.
 vb.
 vegy.
@@ -501,9 +480,6 @@ vv.
 vál.
 vízv.
 .
-w.
-y.
-z.
 zrt.
 zs.
 Ész.
@@ -520,7 +496,6 @@ zs.
 évf.
 í.
 ó.
-ö.
 össz.
 ötk.
 özv.
@@ -528,7 +503,6 @@ zs.
 úm.
 ún.
 út.
-ü.
 üag.
 üd.
 üdv.
@@ -544,6 +518,5 @@ zs.
 """.strip().split()
 OTHER_EXC = """
-''
 -e
 """.strip().split()

View File

@@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
 from .stop_words import STOP_WORDS
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 STOP_WORDS = set(STOP_WORDS)
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

View File

@@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
 from .stop_words import STOP_WORDS
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 STOP_WORDS = set(STOP_WORDS)
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

View File

@@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
 from .stop_words import STOP_WORDS
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 STOP_WORDS = set(STOP_WORDS)
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

View File

@@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
 from .stop_words import STOP_WORDS
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 STOP_WORDS = set(STOP_WORDS)
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]