Use global abbreviation data in languages and remove duplicates

Ines Montani 2017-01-08 20:36:00 +01:00
parent 7c3cb2a652
commit 0dec90e9f7
13 changed files with 35 additions and 124 deletions
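
For context on the pattern repeated in the diffs below: each language module turns a list of abbreviation strings into tokenizer exceptions and merges them into its own exception dict, and this commit moves the single-letter forms that several languages duplicated into a shared base.ABBREVIATIONS list so each __init__.py only has to merge it once. The following is a minimal, self-contained sketch of that pattern, not spaCy's actual implementation: the helper names are taken from the diffs, but the bodies and the plain "orth" key are simplified assumptions (the real helpers live in spacy.language_data and key exceptions on the ORTH attribute).

# Minimal sketch of the shared-abbreviation pattern, assuming simplified helpers.
# Hypothetical stand-in for the new global list (e.g. spacy/language_data/abbreviations.py).
ABBREVIATIONS = ["a.", "b.", "c.", "d.", "e.", "f.", "z."]

def strings_to_exc(strings):
    # Map each string onto a one-token exception that keeps its exact surface form.
    return {orth: [{"orth": orth}] for orth in strings}

def update_exc(exc, additions):
    # Merge new exceptions into an existing exception dict in place.
    exc.update(additions)

# What each language's __init__.py now does instead of listing "a.", "b.", ... itself:
TOKENIZER_EXCEPTIONS = strings_to_exc(["e.g.", "i.e."])            # language-specific entries
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS))    # shared single-letter forms

print(sorted(TOKENIZER_EXCEPTIONS))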

View File

@@ -9,12 +9,13 @@ from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
TAG_MAP = dict(TAG_MAP)
STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))

View File

@@ -516,11 +516,6 @@ TOKENIZER_EXCEPTIONS = {
ORTH_ONLY = [
"'",
"\\\")",
"<space>",
"a.",
"ä.",
"A.C.",
"a.D.",
"A.D.",
@@ -530,24 +525,20 @@ ORTH_ONLY = [
"Abs.",
"adv.",
"al.",
"b.",
"B.A.",
"B.Sc.",
"betr.",
"biol.",
"Biol.",
"c.",
"ca.",
"Chr.",
"Cie.",
"co.",
"Co.",
"d.",
"D.C.",
"Dipl.-Ing.",
"Dipl.",
"Dr.",
"e.",
"e.g.",
"e.V.",
"ehem.",
@@ -555,79 +546,57 @@ ORTH_ONLY = [
"erm.",
"etc.",
"ev.",
"f.",
"g.",
"G.m.b.H.",
"geb.",
"Gebr.",
"gem.",
"h.",
"h.c.",
"Hg.",
"hrsg.",
"Hrsg.",
"i.",
"i.A.",
"i.e.",
"i.G.",
"i.Tr.",
"i.V.",
"Ing.",
"j.",
"jr.",
"Jr.",
"jun.",
"jur.",
"k.",
"K.O.",
"l.",
"L.A.",
"lat.",
"m.",
"M.A.",
"m.E.",
"m.M.",
"M.Sc.",
"Mr.",
"n.",
"N.Y.",
"N.Y.C.",
"nat.",
"ö."
"o.",
"o.a.",
"o.ä.",
"o.g.",
"o.k.",
"O.K.",
"p.",
"p.a.",
"p.s.",
"P.S.",
"pers.",
"phil.",
"q.",
"q.e.d.",
"r.",
"R.I.P.",
"rer.",
"s.",
"sen.",
"St.",
"std.",
"t.",
"u.",
"ü.",
"u.a.",
"U.S.",
"U.S.A.",
"U.S.S.",
"v.",
"Vol.",
"vs.",
"w.",
"wiss.",
"x.",
"y.",
"z."
"wiss."
]

View File

@@ -37,14 +37,16 @@ def get_time_exc(hours):
return exc
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
TAG_MAP = dict(TAG_MAP)
STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", ""))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS", "LEMMA_RULES", "MORPH_RULES"]

View File

@@ -718,39 +718,25 @@ for string in EXCLUDE_EXC:
ORTH_ONLY = [
"'d",
"''",
"a.",
"a.m.",
"Adm.",
"b.",
"Bros.",
"c.",
"co.",
"Co.",
"Corp.",
"d.",
"D.C.",
"Dr.",
"e.",
"e.g.",
"E.g.",
"E.G.",
"f.",
"g.",
"Gen.",
"Gov.",
"h.",
"i.",
"i.e.",
"I.e.",
"I.E.",
"Inc.",
"j.",
"Jr.",
"k.",
"l.",
"Ltd.",
"m.",
"Md.",
"Messrs.",
"Mo.",
@@ -758,24 +744,11 @@ ORTH_ONLY = [
"Mr.",
"Mrs.",
"Ms.",
"n.",
"o.",
"p.",
"p.m.",
"Ph.D.",
"q.",
"r.",
"Rep.",
"Rev.",
"s.",
"Sen.",
"St.",
"t.",
"u.",
"v.",
"vs.",
"w.",
"x.",
"y.",
"z."
"vs."
]

View File

@@ -40,11 +40,14 @@ def get_time_exc(hours):
return exc
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

View File

@@ -85,55 +85,29 @@ TOKENIZER_EXCEPTIONS = {
ORTH_ONLY = [
"a.",
"a.C.",
"a.J.C.",
"apdo.",
"Av.",
"Avda.",
"b.",
"c.",
"Cía.",
"d.",
"e.",
"etc.",
"f.",
"g.",
"Gob.",
"Gral.",
"h.",
"i.",
"Ing.",
"j.",
"J.C.",
"k.",
"l.",
"Lic.",
"m.",
"m.n.",
"n.",
"no.",
"núm.",
"o.",
"p.",
"P.D.",
"Prof.",
"Profa.",
"q.",
"q.e.p.d."
"r.",
"s.",
"S.A.",
"S.L.",
"s.s.s.",
"Sr.",
"Sra.",
"Srta.",
"t.",
"u.",
"v.",
"w.",
"x.",
"y.",
"z."
"Srta."
]

View File

@@ -2,13 +2,16 @@
from __future__ import unicode_literals
from .. import language_data as base
from ..language_data import strings_to_exc
from ..language_data import strings_to_exc, update_exc
from .stop_words import STOP_WORDS
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

View File

@@ -11,13 +11,14 @@ from .tokenizer_exceptions import OTHER_EXC
from .. import language_data as base
STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES + TOKENIZER_PREFIXES
TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
TOKENIZER_INFIXES = TOKENIZER_INFIXES
# HYPHENS = [six.unichr(cp) for cp in [173, 8211, 8212, 8213, 8722, 9472]]
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(OTHER_EXC))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS))

View File

@@ -111,7 +111,6 @@ Vcs.
Vhr.
X.Y.
Zs.
a.
a.C.
ac.
adj.
@@ -126,7 +125,6 @@ ang.
arch.
at.
aug.
b.
b.a.
b.s.
b.sc.
@@ -141,7 +139,6 @@ br.
bsc.
bt.
btk.
c.
ca.
cc.
cca.
@@ -155,7 +152,6 @@ csc.
csüt.
cső.
ctv.
d.
dbj.
dd.
ddr.
@@ -170,7 +166,6 @@ dolg.
dr.
du.
dzs.
e.
ea.
ed.
eff.
@@ -186,7 +181,6 @@ etc.
ev.
ezr.
.
f.
f.h.
f.é.
fam.
@@ -213,7 +207,6 @@ főig.
főisk.
főtörm.
főv.
g.
gazd.
gimn.
gk.
@@ -225,7 +218,6 @@ gy.
gyak.
gyártm.
gör.
h.
hads.
hallg.
hdm.
@@ -266,7 +258,6 @@ isk.
ism.
izr.
.
j.
jan.
jav.
jegyz.
@@ -278,7 +269,6 @@ jr.
jvb.
júl.
jún.
k.
karb.
kat.
kb.
@@ -313,7 +303,6 @@ közl.
közp.
közt.
.
l.
lat.
ld.
legs.
@@ -324,7 +313,6 @@ lt.
ltd.
ltp.
luth.
m.
m.a.
m.s.
m.sc.
@@ -359,7 +347,6 @@ műh.
műsz.
műv.
művez.
n.
nagyker.
nagys.
nat.
@@ -372,7 +359,6 @@ ny.
nyilv.
nyrt.
nyug.
o.
obj.
okl.
okt.
@@ -381,7 +367,6 @@ orsz.
ort.
ov.
ovh.
p.
pf.
pg.
ph.d
@@ -404,8 +389,6 @@ pság.
ptk.
pu.
.
q.
r.
r.k.
rac.
rad.
@@ -420,7 +403,6 @@ rkt.
rt.
rtg.
röv.
s.
s.b.
s.k.
sa.
@@ -450,7 +432,6 @@ szt.
szubj.
szöv.
szül.
t.
tanm.
tb.
tbk.
@@ -476,13 +457,11 @@ tvr.
ty.
törv.
.
u.
ua.
ui.
unit.
uo.
uv.
v.
vas.
vb.
vegy.
@@ -501,9 +480,6 @@ vv.
vál.
vízv.
.
w.
y.
z.
zrt.
zs.
Ész.
@@ -520,7 +496,6 @@ zs.
évf.
í.
ó.
ö.
össz.
ötk.
özv.
@@ -528,7 +503,6 @@ zs.
úm.
ún.
út.
ü.
üag.
üd.
üdv.
@@ -544,6 +518,5 @@ zs.
""".strip().split()
OTHER_EXC = """
''
-e
""".strip().split()

View File

@@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
from .stop_words import STOP_WORDS
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

View File

@@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
from .stop_words import STOP_WORDS
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

View File

@@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
from .stop_words import STOP_WORDS
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

View File

@@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
from .stop_words import STOP_WORDS
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]